cpgtools 2.0.0__py3-none-any.whl → 2.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cpgtools might be problematic. Click here for more details.
- cpgmodule/_version.py +1 -0
- cpgmodule/utils.py +35 -0
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/CpG_aggregation.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/CpG_anno_position.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/CpG_anno_probe.py +1 -2
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/CpG_density_gene_centered.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/CpG_distrb_chrom.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/CpG_distrb_gene_centered.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/CpG_distrb_region.py +1 -3
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/CpG_logo.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/CpG_to_gene.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_PCA.py +31 -23
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_UMAP.py +29 -22
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_jitter_plot.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_m_conversion.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_profile_gene_centered.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_profile_region.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_selectNBest.py +9 -6
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_stacked_barplot.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_stats.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_tSNE.py +31 -24
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_topN.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_trichotmize.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/dmc_Bayes.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/dmc_bb.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/dmc_fisher.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/dmc_glm.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/dmc_logit.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/dmc_nonparametric.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/dmc_ttest.py +6 -2
- cpgtools-2.0.3.data/scripts/predict_missing.py +673 -0
- cpgtools-2.0.3.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.3.dist-info/LICENSE +19 -0
- cpgtools-2.0.3.dist-info/METADATA +76 -0
- cpgtools-2.0.3.dist-info/RECORD +101 -0
- {cpgtools-2.0.0.dist-info → cpgtools-2.0.3.dist-info}/WHEEL +1 -1
- cpgtools-2.0.3.dist-info/top_level.txt +3 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +96 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- cpgtools-2.0.0.dist-info/LICENSE.txt +0 -674
- cpgtools-2.0.0.dist-info/METADATA +0 -28
- cpgtools-2.0.0.dist-info/RECORD +0 -64
- cpgtools-2.0.0.dist-info/top_level.txt +0 -2
cpgmodule/_version.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Package version string; the command-line scripts import it from here
# (from cpgmodule._version import __version__) for their "--version" output.
# Kept in step with the released distribution version (2.0.3).
__version__ = "2.0.3"
|
cpgmodule/utils.py
CHANGED
|
@@ -5,6 +5,8 @@ from time import strftime
|
|
|
5
5
|
from bx.intervals import *
|
|
6
6
|
import numpy as np
|
|
7
7
|
from cpgmodule import ireader
|
|
8
|
+
import logging
|
|
9
|
+
|
|
8
10
|
|
|
9
11
|
def revcomp(dna):
|
|
10
12
|
'''reverse complement DNA sequences'''
|
|
@@ -604,4 +606,37 @@ def read_CpG_bed(cpgfile,genefile, bin_count = 100):
|
|
|
604
606
|
|
|
605
607
|
return np.array(cpg_profile).means(axis=0)
|
|
606
608
|
"""
|
|
609
|
+
def config_log(switch, logfile=None):
    """
    Configure the root logger through ``logging.basicConfig``.

    Parameters
    ----------
    switch : bool
        Debugging switch. Only the literal ``True`` selects the DEBUG
        level; every other value selects INFO.
    logfile : str, optional
        Path of the file that receives the log records. When None (the
        default) no ``filename`` is handed to ``logging.basicConfig``.

    Returns
    -------
    None.

    Notes
    -----
    ``logging.basicConfig`` does nothing when the root logger already has
    handlers, so only the first effective call in a process takes effect.

    """
    # Identity test kept on purpose: truthy non-bool values (e.g. 1) still
    # mean INFO, exactly as before.
    level = logging.DEBUG if switch is True else logging.INFO

    settings = {
        "format": "%(asctime)s [%(levelname)s] %(message)s",
        # 24-hour clock. The 12-hour '%I' directive without an AM/PM marker
        # made morning and afternoon timestamps indistinguishable.
        "datefmt": '%Y-%m-%d %H:%M:%S',
        "level": level,
    }
    # Only pass 'filename' when a log file was actually requested.
    if logfile is not None:
        settings["filename"] = logfile

    logging.basicConfig(**settings)
|
|
607
642
|
|
|
@@ -34,6 +34,7 @@ import numpy as np
|
|
|
34
34
|
from scipy.stats import binom
|
|
35
35
|
|
|
36
36
|
from optparse import OptionParser
|
|
37
|
+
from cpgmodule._version import __version__
|
|
37
38
|
from cpgmodule import ireader
|
|
38
39
|
from cpgmodule.utils import *
|
|
39
40
|
from cpgmodule import BED
|
|
@@ -44,7 +45,6 @@ __author__ = "Liguo Wang"
|
|
|
44
45
|
__copyright__ = "Copyleft"
|
|
45
46
|
__credits__ = []
|
|
46
47
|
__license__ = "GPL"
|
|
47
|
-
__version__="2.0.0"
|
|
48
48
|
__maintainer__ = "Liguo Wang"
|
|
49
49
|
__email__ = "wang.liguo@mayo.edu"
|
|
50
50
|
__status__ = "Development"
|
|
@@ -18,6 +18,7 @@ import subprocess
|
|
|
18
18
|
import numpy as np
|
|
19
19
|
from os.path import basename
|
|
20
20
|
from optparse import OptionParser
|
|
21
|
+
from cpgmodule._version import __version__
|
|
21
22
|
from cpgmodule import ireader
|
|
22
23
|
from cpgmodule.utils import *
|
|
23
24
|
from cpgmodule import BED
|
|
@@ -28,7 +29,6 @@ __author__ = "Liguo Wang"
|
|
|
28
29
|
__copyright__ = "Copyleft"
|
|
29
30
|
__credits__ = []
|
|
30
31
|
__license__ = "GPL"
|
|
31
|
-
__version__="0.1.9"
|
|
32
32
|
__maintainer__ = "Liguo Wang"
|
|
33
33
|
__email__ = "wang.liguo@mayo.edu"
|
|
34
34
|
__status__ = "Development"
|
|
@@ -10,13 +10,12 @@ import sys,os
|
|
|
10
10
|
from optparse import OptionParser
|
|
11
11
|
from cpgmodule import ireader
|
|
12
12
|
from cpgmodule.utils import *
|
|
13
|
-
|
|
13
|
+
from cpgmodule._version import __version__
|
|
14
14
|
|
|
15
15
|
__author__ = "Liguo Wang"
|
|
16
16
|
__copyright__ = "Copyleft"
|
|
17
17
|
__credits__ = []
|
|
18
18
|
__license__ = "GPL"
|
|
19
|
-
__version__="2.0.0"
|
|
20
19
|
__maintainer__ = "Liguo Wang"
|
|
21
20
|
__email__ = "wang.liguo@mayo.edu"
|
|
22
21
|
__status__ = "Development"
|
|
@@ -19,12 +19,12 @@ from cpgmodule import ireader
|
|
|
19
19
|
from cpgmodule.utils import *
|
|
20
20
|
from cpgmodule import BED
|
|
21
21
|
from cpgmodule import extend_bed
|
|
22
|
+
from cpgmodule._version import __version__
|
|
22
23
|
|
|
23
24
|
__author__ = "Liguo Wang"
|
|
24
25
|
__copyright__ = "Copyleft"
|
|
25
26
|
__credits__ = []
|
|
26
27
|
__license__ = "GPL"
|
|
27
|
-
__version__="2.0.0"
|
|
28
28
|
__maintainer__ = "Liguo Wang"
|
|
29
29
|
__email__ = "wang.liguo@mayo.edu"
|
|
30
30
|
__status__ = "Development"
|
|
@@ -14,12 +14,12 @@ import numpy as np
|
|
|
14
14
|
from optparse import OptionParser
|
|
15
15
|
from cpgmodule import ireader
|
|
16
16
|
from cpgmodule.utils import *
|
|
17
|
+
from cpgmodule._version import __version__
|
|
17
18
|
|
|
18
19
|
__author__ = "Liguo Wang"
|
|
19
20
|
__copyright__ = "Copyleft"
|
|
20
21
|
__credits__ = []
|
|
21
22
|
__license__ = "GPL"
|
|
22
|
-
__version__="2.0.0"
|
|
23
23
|
__maintainer__ = "Liguo Wang"
|
|
24
24
|
__email__ = "wang.liguo@mayo.edu"
|
|
25
25
|
__status__ = "Development"
|
|
@@ -34,12 +34,12 @@ from optparse import OptionParser
|
|
|
34
34
|
from cpgmodule import ireader
|
|
35
35
|
from cpgmodule.utils import *
|
|
36
36
|
from cpgmodule import BED
|
|
37
|
+
from cpgmodule._version import __version__
|
|
37
38
|
|
|
38
39
|
__author__ = "Liguo Wang"
|
|
39
40
|
__copyright__ = "Copyleft"
|
|
40
41
|
__credits__ = []
|
|
41
42
|
__license__ = "GPL"
|
|
42
|
-
__version__="2.0.0"
|
|
43
43
|
__maintainer__ = "Liguo Wang"
|
|
44
44
|
__email__ = "wang.liguo@mayo.edu"
|
|
45
45
|
__status__ = "Development"
|
|
@@ -23,18 +23,16 @@ import sys,os
|
|
|
23
23
|
import collections
|
|
24
24
|
import subprocess
|
|
25
25
|
import numpy as np
|
|
26
|
-
#import re
|
|
27
26
|
from optparse import OptionParser
|
|
28
27
|
from cpgmodule import ireader
|
|
29
28
|
from cpgmodule.utils import *
|
|
30
29
|
from cpgmodule import BED
|
|
31
|
-
|
|
30
|
+
from cpgmodule._version import __version__
|
|
32
31
|
|
|
33
32
|
__author__ = "Liguo Wang"
|
|
34
33
|
__copyright__ = "Copyleft"
|
|
35
34
|
__credits__ = []
|
|
36
35
|
__license__ = "GPL"
|
|
37
|
-
__version__="2.0.0"
|
|
38
36
|
__maintainer__ = "Liguo Wang"
|
|
39
37
|
__email__ = "wang.liguo@mayo.edu"
|
|
40
38
|
__status__ = "Development"
|
|
@@ -17,12 +17,12 @@ from cpgmodule import ireader
|
|
|
17
17
|
from cpgmodule.utils import *
|
|
18
18
|
from cpgmodule import BED
|
|
19
19
|
from cpgmodule.imotif import PSSM
|
|
20
|
+
from cpgmodule._version import __version__
|
|
20
21
|
|
|
21
22
|
__author__ = "Liguo Wang"
|
|
22
23
|
__copyright__ = "Copyleft"
|
|
23
24
|
__credits__ = []
|
|
24
25
|
__license__ = "GPL"
|
|
25
|
-
__version__="2.0.0"
|
|
26
26
|
__maintainer__ = "Liguo Wang"
|
|
27
27
|
__email__ = "wang.liguo@mayo.edu"
|
|
28
28
|
__status__ = "Development"
|
|
@@ -34,12 +34,12 @@ from optparse import OptionParser
|
|
|
34
34
|
from cpgmodule import ireader
|
|
35
35
|
from cpgmodule.utils import *
|
|
36
36
|
from cpgmodule.region2gene import *
|
|
37
|
+
from cpgmodule._version import __version__
|
|
37
38
|
|
|
38
39
|
__author__ = "Liguo Wang"
|
|
39
40
|
__copyright__ = "Copyleft"
|
|
40
41
|
__credits__ = []
|
|
41
42
|
__license__ = "GPL"
|
|
42
|
-
__version__="2.0.0"
|
|
43
43
|
__maintainer__ = "Liguo Wang"
|
|
44
44
|
__email__ = "wang.liguo@mayo.edu"
|
|
45
45
|
__status__ = "Development"
|
|
@@ -32,6 +32,7 @@ import sys
|
|
|
32
32
|
import subprocess
|
|
33
33
|
from optparse import OptionParser
|
|
34
34
|
from cpgmodule.utils import *
|
|
35
|
+
from cpgmodule._version import __version__
|
|
35
36
|
import pandas as pd
|
|
36
37
|
from sklearn.preprocessing import StandardScaler
|
|
37
38
|
from sklearn.decomposition import PCA
|
|
@@ -40,15 +41,15 @@ __author__ = "Liguo Wang"
|
|
|
40
41
|
__copyright__ = "Copyleft"
|
|
41
42
|
__credits__ = []
|
|
42
43
|
__license__ = "GPL"
|
|
43
|
-
__version__="2.0.0"
|
|
44
44
|
__maintainer__ = "Liguo Wang"
|
|
45
45
|
__email__ = "wang.liguo@mayo.edu"
|
|
46
46
|
__status__ = "Development"
|
|
47
47
|
|
|
48
48
|
def pick_colors(n):
    """
    Return the first `n` entries of a fixed palette of hex color strings.

    Parameters
    ----------
    n : int
        Number of colors requested.

    Returns
    -------
    list of str
        The leading `n` palette entries, in palette order.

    Raises
    ------
    SystemExit
        With exit status 1 when `n` exceeds the palette size; a message is
        written to stderr first.
    """
    my_colors = [
        "#F0A3FF", "#0075DC", "#993F00", "#4C005C", "#191919", "#005C31",
        "#2BCE48", "#FFCC99", "#808080", "#94FFB5", "#8F7C00", "#9DCC00",
        "#C20088", "#003380", "#FFA405", "#FFA8BB", "#426600", "#FF0010",
        "#5EF1F2", "#00998F", "#E0FF66", "#740AFF", "#990000", "#FFFF80",
        "#FFE100", "#FF5005",
    ]
    if n > len(my_colors):
        # Derive the count from the palette so the message cannot drift
        # from the list; with the current palette it still reads "26".
        print("Only support %d different colors" % len(my_colors), file=sys.stderr)
        # Non-zero status: a bare sys.exit() would report success to the shell.
        sys.exit(1)
    return my_colors[0:n]
|
|
54
55
|
|
|
@@ -86,27 +87,30 @@ def main():
|
|
|
86
87
|
df1 = pd.read_csv(options.input_file, index_col = 0, sep="\t")
|
|
87
88
|
|
|
88
89
|
#remove NA and transpose
|
|
89
|
-
df2 = df1.dropna(axis=0, how='any')
|
|
90
|
-
printlog("%d rows with missing values were removed." % (len(df1) - len(df2)))
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
printlog("Transposing data frame ...")
|
|
94
|
-
df2 = df2.T
|
|
95
|
-
#print (df2.head())
|
|
96
|
-
|
|
97
|
-
printlog("Standarizing values ...")
|
|
98
|
-
x = df2.values
|
|
99
|
-
x = StandardScaler().fit_transform(x)
|
|
100
|
-
|
|
90
|
+
df2 = df1.dropna(axis=0, how='any').T
|
|
91
|
+
printlog("%d rows with missing values were removed." % (len(df1.index) - len(df2.columns)))
|
|
92
|
+
|
|
101
93
|
printlog("Reading group file: \"%s\" ..." % (options.group_file))
|
|
102
94
|
group = pd.read_csv(options.group_file, index_col=0, header=0,names=['Sample_ID', 'Group_ID'])
|
|
103
|
-
group.index = group.index.map(str)
|
|
104
|
-
|
|
105
95
|
#check if sample IDs are unique
|
|
106
96
|
if len(group.index) != len(group.index.unique()):
|
|
107
97
|
print ("Sample IDs are not unique", file = sys.stderr)
|
|
108
98
|
sys.exit()
|
|
99
|
+
group.index = group.index.map(str)
|
|
100
|
+
printlog("Group file \"%s\" contains %d samples" % (options.group_file, len(group.index)))
|
|
101
|
+
|
|
102
|
+
printlog("Find common sample IDs between group file and data file ...")
|
|
103
|
+
common_samples = list(set(group.index) & set(df2.index))
|
|
104
|
+
used_df = df2.loc[common_samples]
|
|
105
|
+
(usable_sample, usable_cpg) = used_df.shape
|
|
106
|
+
printlog("Used CpGs: %d, Used samples: %d" % (usable_cpg, usable_sample))
|
|
107
|
+
|
|
109
108
|
|
|
109
|
+
printlog("Standarizing values ...")
|
|
110
|
+
x = used_df.to_numpy()
|
|
111
|
+
x = StandardScaler().fit_transform(x)
|
|
112
|
+
|
|
113
|
+
|
|
110
114
|
group_names = group['Group_ID'].unique().tolist() # a list of unique group names
|
|
111
115
|
color_names = pick_colors(len(group_names)) # a list of unique colors
|
|
112
116
|
group_to_col = dict(zip(group_names, color_names))
|
|
@@ -116,9 +120,9 @@ def main():
|
|
|
116
120
|
pca = PCA(n_components = options.n_components, random_state = 0)
|
|
117
121
|
principalComponents = pca.fit_transform(x)
|
|
118
122
|
pca_names = [str(i)+str(j) for i,j in zip(['PC']*options.n_components,range(1,options.n_components+1))]
|
|
119
|
-
principalDf = pd.DataFrame(data = principalComponents, columns = pca_names, index =
|
|
123
|
+
principalDf = pd.DataFrame(data = principalComponents, columns = pca_names, index = used_df.index)
|
|
120
124
|
|
|
121
|
-
finalDf = pd.concat([principalDf, group], axis = 1, sort=False)
|
|
125
|
+
finalDf = pd.concat([principalDf, group], axis = 1, sort=False, join='inner')
|
|
122
126
|
finalDf.index.name = 'Sample_ID'
|
|
123
127
|
|
|
124
128
|
printlog("Writing PCA results to file: \"%s\" ..." % (options.out_file + '.PCA.tsv'))
|
|
@@ -133,18 +137,22 @@ def main():
|
|
|
133
137
|
|
|
134
138
|
print ('pdf(file=\"%s\", width=8, height=8)' % (options.out_file + '.PCA.pdf'),file=ROUT)
|
|
135
139
|
print ('')
|
|
136
|
-
print ('d = read.table(file=\"%s\", sep="\\t", header=TRUE, comment.char = "", stringsAsFactors=FALSE)'
|
|
140
|
+
print ('d = read.table(file=\"%s\", sep="\\t", header=TRUE, comment.char = "", stringsAsFactors=FALSE)'
|
|
141
|
+
% (options.out_file + '.PCA.tsv'), file=ROUT)
|
|
137
142
|
print ('attach(d)', file=ROUT)
|
|
138
143
|
if options.plot_alpha:
|
|
139
144
|
print ('library(scales)', file=ROUT)
|
|
140
|
-
print ('plot(PC1, PC2, col = alpha(Colors, %f), pch=%d, cex=1.5, main="PCA 2D map"
|
|
145
|
+
print ('plot(PC1, PC2, col = alpha(Colors, %f), pch=%d, cex=1.5, main="PCA 2D map", xlab="PC1 (var. explained: %.2f%%)", ylab="PC2 (var. explained: %.2f%%)")'
|
|
146
|
+
% (options.plot_alpha, pch[options.plot_char], pca_vars[0]*100, pca_vars[1]*100), file=ROUT)
|
|
141
147
|
else:
|
|
142
|
-
print ('plot(PC1, PC2, col = Colors, pch=%d, cex=1.2, main="PCA 2D map"
|
|
148
|
+
# The format string has exactly three conversions (%d, %.2f, %.2f), so it
# must receive exactly three values: plot symbol, then the percentage of
# variance for PC1 and PC2. Extra tuple items make the % operator raise
# "TypeError: not all arguments converted during string formatting".
print ('plot(PC1, PC2, col = Colors, pch=%d, cex=1.2, main="PCA 2D map", xlab="PC1 (var. explained: %.2f%%)", ylab="PC2 (var. explained: %.2f%%)")'
    % (pch[options.plot_char], pca_vars[0]*100, pca_vars[1]*100), file=ROUT)
|
|
143
150
|
|
|
144
151
|
if options.text_label:
|
|
145
152
|
print ('text(PC1, PC2, labels=Sample_ID, col = Colors, cex=0.5, pos=1)', file=ROUT)
|
|
146
153
|
|
|
147
|
-
print ('legend("%s", legend=c(%s), col=c(%s), pch=%d,cex=1)'
|
|
154
|
+
print ('legend("%s", legend=c(%s), col=c(%s), pch=%d,cex=1)'
|
|
155
|
+
% (legend_pos[options.legend_location], ','.join(['"' + str(i) + '"' for i in group_names]), ','.join(['"' + str(group_to_col[i]) + '"' for i in group_names]), pch[options.plot_char]), file=ROUT)
|
|
148
156
|
|
|
149
157
|
|
|
150
158
|
print ('dev.off()', file=ROUT)
|
|
@@ -32,6 +32,7 @@ import pandas as pd
|
|
|
32
32
|
import subprocess
|
|
33
33
|
from optparse import OptionParser
|
|
34
34
|
from cpgmodule.utils import *
|
|
35
|
+
from cpgmodule._version import __version__
|
|
35
36
|
from sklearn.preprocessing import StandardScaler
|
|
36
37
|
#import datatable as dt
|
|
37
38
|
#import seaborn as sns
|
|
@@ -41,15 +42,15 @@ __author__ = "Liguo Wang"
|
|
|
41
42
|
__copyright__ = "Copyleft"
|
|
42
43
|
__credits__ = []
|
|
43
44
|
__license__ = "GPL"
|
|
44
|
-
__version__="2.0.0"
|
|
45
45
|
__maintainer__ = "Liguo Wang"
|
|
46
46
|
__email__ = "wang.liguo@mayo.edu"
|
|
47
47
|
__status__ = "Development"
|
|
48
48
|
|
|
49
49
|
def pick_colors(n):
    """
    Return the first `n` entries of a fixed palette of hex color strings.

    Parameters
    ----------
    n : int
        Number of colors requested.

    Returns
    -------
    list of str
        The leading `n` palette entries, in palette order.

    Raises
    ------
    SystemExit
        With exit status 1 when `n` exceeds the palette size; a message is
        written to stderr first.
    """
    my_colors = [
        "#F0A3FF", "#0075DC", "#993F00", "#4C005C", "#191919", "#005C31",
        "#2BCE48", "#FFCC99", "#808080", "#94FFB5", "#8F7C00", "#9DCC00",
        "#C20088", "#003380", "#FFA405", "#FFA8BB", "#426600", "#FF0010",
        "#5EF1F2", "#00998F", "#E0FF66", "#740AFF", "#990000", "#FFFF80",
        "#FFE100", "#FF5005",
    ]
    if n > len(my_colors):
        # Derive the count from the palette so the message cannot drift
        # from the list; with the current palette it still reads "26".
        print("Only support %d different colors" % len(my_colors), file=sys.stderr)
        # Non-zero status: a bare sys.exit() would report success to the shell.
        sys.exit(1)
    return my_colors[0:n]
|
|
55
56
|
|
|
@@ -99,26 +100,28 @@ def main():
|
|
|
99
100
|
df1 = pd.read_csv(options.input_file, index_col = 0, sep="\t")
|
|
100
101
|
|
|
101
102
|
#remove NA and transpose
|
|
102
|
-
df2 = df1.dropna(axis=0, how='any')
|
|
103
|
-
printlog("%d rows with missing values were removed." % (len(df1) - len(df2)))
|
|
104
|
-
#print (df2.head())
|
|
105
|
-
|
|
106
|
-
printlog("Transposing data frame ...")
|
|
107
|
-
df2 = df2.T
|
|
108
|
-
#print (df2.head())
|
|
109
|
-
|
|
110
|
-
printlog("Standarizing values ...")
|
|
111
|
-
x = df2.values
|
|
112
|
-
x = StandardScaler().fit_transform(x)
|
|
103
|
+
df2 = df1.dropna(axis=0, how='any').T
|
|
104
|
+
printlog("%d rows with missing values were removed." % (len(df1.index) - len(df2.columns)))
|
|
113
105
|
|
|
114
106
|
printlog("Reading group file: \"%s\" ..." % (options.group_file))
|
|
115
107
|
group = pd.read_csv(options.group_file, index_col=0, header=0,names=['Sample_ID', 'Group_ID'])
|
|
116
|
-
group.index = group.index.map(str)
|
|
117
|
-
|
|
118
108
|
#check if sample IDs are unique
|
|
119
109
|
if len(group.index) != len(group.index.unique()):
|
|
120
110
|
print ("Sample IDs are not unique", file = sys.stderr)
|
|
121
111
|
sys.exit()
|
|
112
|
+
group.index = group.index.map(str)
|
|
113
|
+
printlog("Group file \"%s\" contains %d samples" % (options.group_file, len(group.index)))
|
|
114
|
+
|
|
115
|
+
printlog("Find common sample IDs between group file and data file ...")
|
|
116
|
+
common_samples = list(set(group.index) & set(df2.index))
|
|
117
|
+
used_df = df2.loc[common_samples]
|
|
118
|
+
(usable_sample, usable_cpg) = used_df.shape
|
|
119
|
+
printlog("Used CpGs: %d, Used samples: %d" % (usable_cpg, usable_sample))
|
|
120
|
+
|
|
121
|
+
printlog("Standarizing values ...")
|
|
122
|
+
x = used_df.to_numpy()
|
|
123
|
+
x = StandardScaler().fit_transform(x)
|
|
124
|
+
|
|
122
125
|
|
|
123
126
|
group_names = group['Group_ID'].unique().tolist() # a list of unique group names
|
|
124
127
|
color_names = pick_colors(len(group_names)) # a list of unique colors
|
|
@@ -133,9 +136,9 @@ def main():
|
|
|
133
136
|
#pca = PCA(n_components = options.n_components, random_state = 0)
|
|
134
137
|
#principalComponents = pca.fit_transform(x)
|
|
135
138
|
pca_names = [str(i)+str(j) for i,j in zip(['UMAP']*options.n_components,range(1,options.n_components+1))]
|
|
136
|
-
principalDf = pd.DataFrame(data = principalComponents, columns = pca_names, index =
|
|
139
|
+
principalDf = pd.DataFrame(data = principalComponents, columns = pca_names, index = used_df.index)
|
|
137
140
|
|
|
138
|
-
finalDf = pd.concat([principalDf, group], axis = 1, sort=False)
|
|
141
|
+
finalDf = pd.concat([principalDf, group], axis = 1, sort=False, join='inner')
|
|
139
142
|
finalDf.index.name = 'Sample_ID'
|
|
140
143
|
|
|
141
144
|
printlog("Writing UMAP results to file: \"%s\" ..." % (options.out_file + '.UMAP.tsv'))
|
|
@@ -146,18 +149,22 @@ def main():
|
|
|
146
149
|
|
|
147
150
|
print ('pdf(file=\"%s\", width=8, height=8)' % (options.out_file + '.UMAP.pdf'),file=ROUT)
|
|
148
151
|
print ('')
|
|
149
|
-
print ('d = read.table(file=\"%s\", sep="\\t", header=TRUE, comment.char = "", stringsAsFactors=FALSE)'
|
|
152
|
+
print ('d = read.table(file=\"%s\", sep="\\t", header=TRUE, comment.char = "", stringsAsFactors=FALSE)'
|
|
153
|
+
% (options.out_file + '.UMAP.tsv'), file=ROUT)
|
|
150
154
|
print ('attach(d)', file=ROUT)
|
|
151
155
|
if options.plot_alpha:
|
|
152
156
|
print ('library(scales)', file=ROUT)
|
|
153
|
-
print ('plot(UMAP1, UMAP2, col = alpha(Colors, %f), pch=%d, cex=1.5, main="UMAP 2D map", xlab="UMAP_1", ylab="UMAP_2")'
|
|
157
|
+
print ('plot(UMAP1, UMAP2, col = alpha(Colors, %f), pch=%d, cex=1.5, main="UMAP 2D map", xlab="UMAP_1", ylab="UMAP_2")'
|
|
158
|
+
% (options.plot_alpha, pch[options.plot_char]), file=ROUT)
|
|
154
159
|
else:
|
|
155
|
-
print ('plot(UMAP1, UMAP2, col = Colors, pch=%d, cex=1.2, main="UMAP 2D map", xlab="UMAP_1", ylab="UMAP_2")'
|
|
160
|
+
print ('plot(UMAP1, UMAP2, col = Colors, pch=%d, cex=1.2, main="UMAP 2D map", xlab="UMAP_1", ylab="UMAP_2")'
|
|
161
|
+
% pch[options.plot_char], file=ROUT)
|
|
156
162
|
|
|
157
163
|
if options.text_label:
|
|
158
164
|
print ('text(UMAP1, UMAP2, labels=Sample_ID, col = Colors, cex=0.5, pos=1)', file=ROUT)
|
|
159
165
|
|
|
160
|
-
print ('legend("%s", legend=c(%s), col=c(%s), pch=%d,cex=1)'
|
|
166
|
+
print ('legend("%s", legend=c(%s), col=c(%s), pch=%d,cex=1)'
|
|
167
|
+
% (legend_pos[options.legend_location], ','.join(['"' + str(i) + '"' for i in group_names]), ','.join(['"' + str(group_to_col[i]) + '"' for i in group_names]), pch[options.plot_char]), file=ROUT)
|
|
161
168
|
|
|
162
169
|
|
|
163
170
|
print ('dev.off()', file=ROUT)
|
|
@@ -26,6 +26,7 @@ import sys,os
|
|
|
26
26
|
import collections
|
|
27
27
|
import subprocess
|
|
28
28
|
import numpy as np
|
|
29
|
+
from cpgmodule._version import __version__
|
|
29
30
|
from optparse import OptionParser
|
|
30
31
|
from cpgmodule import ireader
|
|
31
32
|
from cpgmodule.utils import *
|
|
@@ -36,7 +37,6 @@ __author__ = "Liguo Wang"
|
|
|
36
37
|
__copyright__ = "Copyleft"
|
|
37
38
|
__credits__ = []
|
|
38
39
|
__license__ = "GPL"
|
|
39
|
-
__version__="2.0.0"
|
|
40
40
|
__maintainer__ = "Liguo Wang"
|
|
41
41
|
__email__ = "wang.liguo@mayo.edu"
|
|
42
42
|
__status__ = "Development"
|
|
@@ -18,6 +18,7 @@ import sys,os
|
|
|
18
18
|
import collections
|
|
19
19
|
import numpy as np
|
|
20
20
|
from scipy import stats
|
|
21
|
+
from cpgmodule._version import __version__
|
|
21
22
|
from optparse import OptionParser
|
|
22
23
|
from cpgmodule import ireader
|
|
23
24
|
from cpgmodule.utils import *
|
|
@@ -26,7 +27,6 @@ __author__ = "Liguo Wang"
|
|
|
26
27
|
__copyright__ = "Copyleft"
|
|
27
28
|
__credits__ = []
|
|
28
29
|
__license__ = "GPL"
|
|
29
|
-
__version__="2.0.0"
|
|
30
30
|
__maintainer__ = "Liguo Wang"
|
|
31
31
|
__email__ = "wang.liguo@mayo.edu"
|
|
32
32
|
__status__ = "Development"
|
|
@@ -21,6 +21,7 @@ import collections
|
|
|
21
21
|
import subprocess
|
|
22
22
|
import numpy as np
|
|
23
23
|
from optparse import OptionParser
|
|
24
|
+
from cpgmodule._version import __version__
|
|
24
25
|
from cpgmodule import ireader
|
|
25
26
|
from cpgmodule.utils import *
|
|
26
27
|
from cpgmodule import BED
|
|
@@ -29,7 +30,6 @@ __author__ = "Liguo Wang"
|
|
|
29
30
|
__copyright__ = "Copyleft"
|
|
30
31
|
__credits__ = []
|
|
31
32
|
__license__ = "GPL"
|
|
32
|
-
__version__="2.0.0"
|
|
33
33
|
__maintainer__ = "Liguo Wang"
|
|
34
34
|
__email__ = "wang.liguo@mayo.edu"
|
|
35
35
|
__status__ = "Development"
|
|
@@ -25,6 +25,7 @@ import collections
|
|
|
25
25
|
import subprocess
|
|
26
26
|
import numpy as np
|
|
27
27
|
from optparse import OptionParser
|
|
28
|
+
from cpgmodule._version import __version__
|
|
28
29
|
from cpgmodule import ireader
|
|
29
30
|
from cpgmodule.utils import *
|
|
30
31
|
from cpgmodule import BED
|
|
@@ -33,7 +34,6 @@ __author__ = "Liguo Wang"
|
|
|
33
34
|
__copyright__ = "Copyleft"
|
|
34
35
|
__credits__ = []
|
|
35
36
|
__license__ = "GPL"
|
|
36
|
-
__version__="2.0.0"
|
|
37
37
|
__maintainer__ = "Liguo Wang"
|
|
38
38
|
__email__ = "wang.liguo@mayo.edu"
|
|
39
39
|
__status__ = "Development"
|
|
@@ -18,6 +18,7 @@ cg_003 0.845065 0.843957 0.840184 0.824286
|
|
|
18
18
|
import sys
|
|
19
19
|
import numpy as np
|
|
20
20
|
from optparse import OptionParser
|
|
21
|
+
from cpgmodule._version import __version__
|
|
21
22
|
from cpgmodule.utils import *
|
|
22
23
|
import pandas as pd
|
|
23
24
|
|
|
@@ -28,7 +29,6 @@ __author__ = "Liguo Wang"
|
|
|
28
29
|
__copyright__ = "Copyleft"
|
|
29
30
|
__credits__ = []
|
|
30
31
|
__license__ = "GPL"
|
|
31
|
-
__version__="2.0.0"
|
|
32
32
|
__maintainer__ = "Liguo Wang"
|
|
33
33
|
__email__ = "wang.liguo@mayo.edu"
|
|
34
34
|
__status__ = "Development"
|
|
@@ -40,8 +40,8 @@ def main():
|
|
|
40
40
|
parser = OptionParser(usage,version="%prog " + __version__)
|
|
41
41
|
parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Tab-separated data frame file containing beta values with the 1st row containing sample IDs and the 1st column containing CpG IDs.")
|
|
42
42
|
parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Comma-separated group file defining the biological groups of each sample.")
|
|
43
|
-
parser.add_option("-
|
|
44
|
-
parser.add_option("-s","--score-function",action="store",type='string', dest="score_function", default='
|
|
43
|
+
parser.add_option("-c","--topK",action="store",type='int', dest="cpg_count", default=100, help="Number of top features to select. default=%default" )
|
|
44
|
+
parser.add_option("-s","--score-function",action="store",type='string', dest="score_function", default='chisq', help="Scoring function used to measure the dependency between features scores and labels. Must be \"chisq\" (chi-squared statistic), \"anova\" (ANOVA F-value), or \"mi\" (mutual information). default=%default" )
|
|
45
45
|
parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
|
|
46
46
|
(options,args)=parser.parse_args()
|
|
47
47
|
|
|
@@ -77,7 +77,7 @@ def main():
|
|
|
77
77
|
|
|
78
78
|
printlog("Reading group file: \"%s\"" % (options.group_file))
|
|
79
79
|
group = pd.read_csv(options.group_file, index_col=0, header=0,names=['Sample_ID', 'Group_ID'])
|
|
80
|
-
a = list(group['Group_ID']) #a is *string labels* for groups: ['Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Cancer', 'Cancer', 'Cancer', 'Cancer']
|
|
80
|
+
a = pd.Series(list(group['Group_ID'])) #a is *string labels* for groups: ['Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Cancer', 'Cancer', 'Cancer', 'Cancer']
|
|
81
81
|
#print (a)
|
|
82
82
|
y, tmp = pd.factorize(a) #y is *numeric labels* for groups: [0 0 0 0 0 1 1 1 1]
|
|
83
83
|
#print (np.array(y))
|
|
@@ -87,12 +87,15 @@ def main():
|
|
|
87
87
|
if options.score_function == 'anova':
|
|
88
88
|
printlog ("Using ANOVA F value to select features ...")
|
|
89
89
|
selector = SelectKBest(f_classif, k = options.cpg_count)
|
|
90
|
-
elif options.score_function == '
|
|
90
|
+
elif options.score_function == 'mi':
|
|
91
91
|
printlog ("Using Mutual Information to select features ...")
|
|
92
92
|
selector = SelectKBest(mutual_info_classif, k = options.cpg_count)
|
|
93
|
-
|
|
93
|
+
elif options.score_function == 'chisq':
|
|
94
94
|
printlog ("Using Chi Square statistic to select features ...")
|
|
95
95
|
selector = SelectKBest(chi2, k = options.cpg_count)
|
|
96
|
+
else:
|
|
97
|
+
printlog("Unknown function: %s" % options.score_function)
|
|
98
|
+
sys.exit(0)
|
|
96
99
|
else:
|
|
97
100
|
printlog("Doing nothing! '-k' >= the total number of features in \"%s\"" % (options.input_file))
|
|
98
101
|
sys.exit(0)
|
|
@@ -28,6 +28,7 @@ import collections
|
|
|
28
28
|
import subprocess
|
|
29
29
|
import numpy as np
|
|
30
30
|
from optparse import OptionParser
|
|
31
|
+
from cpgmodule._version import __version__
|
|
31
32
|
from cpgmodule import ireader
|
|
32
33
|
from cpgmodule.utils import *
|
|
33
34
|
from cpgmodule import BED
|
|
@@ -37,7 +38,6 @@ __author__ = "Liguo Wang"
|
|
|
37
38
|
__copyright__ = "Copyleft"
|
|
38
39
|
__credits__ = []
|
|
39
40
|
__license__ = "GPL"
|
|
40
|
-
__version__="2.0.0"
|
|
41
41
|
__maintainer__ = "Liguo Wang"
|
|
42
42
|
__email__ = "wang.liguo@mayo.edu"
|
|
43
43
|
__status__ = "Development"
|
|
@@ -25,6 +25,7 @@ import collections
|
|
|
25
25
|
import subprocess
|
|
26
26
|
import numpy as np
|
|
27
27
|
from optparse import OptionParser
|
|
28
|
+
from cpgmodule._version import __version__
|
|
28
29
|
from cpgmodule import ireader
|
|
29
30
|
from cpgmodule.utils import *
|
|
30
31
|
from cpgmodule import BED
|
|
@@ -33,7 +34,6 @@ __author__ = "Liguo Wang"
|
|
|
33
34
|
__copyright__ = "Copyleft"
|
|
34
35
|
__credits__ = []
|
|
35
36
|
__license__ = "GPL"
|
|
36
|
-
__version__="2.0.0"
|
|
37
37
|
__maintainer__ = "Liguo Wang"
|
|
38
38
|
__email__ = "wang.liguo@mayo.edu"
|
|
39
39
|
__status__ = "Development"
|