cpgtools 1.12.0__py3-none-any.whl → 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cpgtools might be problematic. Click here for more details.
- cpgmodule/_version.py +1 -0
- cpgmodule/data/__init__.py +0 -0
- cpgmodule/methylClock.py +53 -0
- cpgmodule/utils.py +38 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_aggregation.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_anno_position.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_anno_probe.py +6 -4
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_density_gene_centered.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_chrom.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_gene_centered.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_region.py +1 -3
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_logo.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_to_gene.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_PCA.py +31 -23
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_UMAP.py +29 -22
- cpgtools-2.0.2.data/scripts/beta_imputation.py +604 -0
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_jitter_plot.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_m_conversion.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_profile_gene_centered.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_profile_region.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_selectNBest.py +9 -6
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_stacked_barplot.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_stats.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_tSNE.py +31 -24
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_topN.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_trichotmize.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_Bayes.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_bb.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_fisher.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_glm.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_logit.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_nonparametric.py +1 -1
- {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_ttest.py +3 -3
- cpgtools-2.0.2.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.2.dist-info/LICENSE +19 -0
- cpgtools-2.0.2.dist-info/METADATA +76 -0
- cpgtools-2.0.2.dist-info/RECORD +82 -0
- {cpgtools-1.12.0.dist-info → cpgtools-2.0.2.dist-info}/WHEEL +1 -1
- cpgtools-2.0.2.dist-info/top_level.txt +3 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +76 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- cpgtools-1.12.0.dist-info/LICENSE.txt +0 -674
- cpgtools-1.12.0.dist-info/METADATA +0 -30
- cpgtools-1.12.0.dist-info/RECORD +0 -43
- cpgtools-1.12.0.dist-info/top_level.txt +0 -2
cpgmodule/_version.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "2.0.2"
|
|
File without changes
|
cpgmodule/methylClock.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Created on Fri Nov 25 10:55:14 2022
|
|
5
|
+
|
|
6
|
+
@author: Liguo Wang
|
|
7
|
+
"""
|
|
8
|
+
from cpgmodule import ireader
|
|
9
|
+
#import sys,os
|
|
10
|
+
|
|
11
|
+
class MethylSig:
    """
    Pack a DNA methylation signature file into an object.

    The signature file is read line by line through ``ireader.reader``:

    * lines starting with '#' are skipped;
    * a line starting with 'Intercept' supplies the intercept (second
      whitespace-separated field);
    * every other non-blank line is taken as "<CpG ID> <coefficient>".

    >>> from cpgmodule import methylClock
    >>> a = methylClock.MethylSig(signature_file = 'coefBlup.tsv', signature_name = 'BLUP', signature_info="")
    >>> a.name
    'BLUP'
    >>> a.Intercept
    91.15396
    >>> a.ncpg
    319607
    """

    def __init__(self, signature_file, signature_name, tissues = None, unit = '', signature_info = '', reference = '', pub_link = '', method = ''):
        """
        Parameters
        ----------
        signature_file : path handed to ``ireader.reader``.
        signature_name : stored as ``self.name``.
        tissues : stored as ``self.tissues``; a fresh empty list is used
            when omitted, so instances never share one default list.
        unit, signature_info, reference, pub_link, method : free-form
            metadata stored as ``unit``, ``info``, ``ref``, ``pubmed``
            and ``method`` respectively.
        """
        self.name = signature_name
        self.info = signature_info
        self.tissues = [] if tissues is None else tissues
        self.unit = unit
        # CpG ID -> coefficient, only for lines whose coefficient parsed
        self.coef = {}
        # CpG IDs in file order (includes IDs whose coefficient failed to parse)
        self.cpgs = []
        # Number of CpG lines seen; equals len(self.cpgs), not len(self.coef)
        self.ncpg = 0
        self.Intercept = 0.0
        self.ref = reference
        self.pubmed = pub_link
        self.method = method

        for line in ireader.reader(signature_file):
            if line.startswith('#'):
                continue
            fields = line.split()
            if not fields:
                # Blank line: there is no CpG ID to record.
                continue
            if line.startswith('Intercept'):
                try:
                    self.Intercept = float(fields[1])
                except (IndexError, ValueError):
                    # Missing or non-numeric value: fall back to 0.0
                    self.Intercept = 0.0
                continue

            cpg_id = fields[0]
            self.cpgs.append(cpg_id)
            self.ncpg += 1
            try:
                self.coef[cpg_id] = float(fields[1])
            except (IndexError, ValueError):
                # Keep the ID in cpgs/ncpg but record no coefficient.
                continue
|
cpgmodule/utils.py
CHANGED
|
@@ -5,6 +5,8 @@ from time import strftime
|
|
|
5
5
|
from bx.intervals import *
|
|
6
6
|
import numpy as np
|
|
7
7
|
from cpgmodule import ireader
|
|
8
|
+
import logging
|
|
9
|
+
|
|
8
10
|
|
|
9
11
|
def revcomp(dna):
|
|
10
12
|
'''reverse complement DNA sequences'''
|
|
@@ -266,7 +268,7 @@ def count_over_range(lst, cpg_ranges):
|
|
|
266
268
|
total_count += len(tmp)
|
|
267
269
|
return(total_size,total_count)
|
|
268
270
|
|
|
269
|
-
def read_grp_file1(gfile):
|
|
271
|
+
def read_grp_file1(gfile,na_lab="NA"):
|
|
270
272
|
'''
|
|
271
273
|
read group file. Group file define the biological groups of data matrix file.
|
|
272
274
|
(1) It must has header
|
|
@@ -292,6 +294,8 @@ def read_grp_file1(gfile):
|
|
|
292
294
|
l = l.replace(' ','')
|
|
293
295
|
line_num += 1
|
|
294
296
|
f = l.split(',')
|
|
297
|
+
if f[1] == na_lab:
|
|
298
|
+
continue
|
|
295
299
|
if len(f) < 2:
|
|
296
300
|
print ("Group fle must have 2 columns!", file=sys.stderr)
|
|
297
301
|
sys.exit(1)
|
|
@@ -602,4 +606,37 @@ def read_CpG_bed(cpgfile,genefile, bin_count = 100):
|
|
|
602
606
|
|
|
603
607
|
return np.array(cpg_profile).means(axis=0)
|
|
604
608
|
"""
|
|
609
|
+
def config_log(switch, logfile=None):
    """
    Configure the logging module through ``logging.basicConfig``.

    Parameters
    ----------
    switch : bool
        Debugging switch. A true value selects the DEBUG level,
        anything else selects INFO.
    logfile : str, optional
        Passed to ``basicConfig`` as ``filename``. When None (the
        default) no ``filename`` is given.

    Returns
    -------
    None.

    """
    settings = {
        "format": "%(asctime)s [%(levelname)s] %(message)s",
        # 24-hour clock: the former 12-hour '%I' had no AM/PM marker,
        # which made morning and afternoon timestamps indistinguishable.
        "datefmt": '%Y-%m-%d %H:%M:%S',
        "level": logging.DEBUG if switch else logging.INFO,
    }
    if logfile is not None:
        settings["filename"] = logfile
    logging.basicConfig(**settings)
|
|
605
642
|
|
|
@@ -34,6 +34,7 @@ import numpy as np
|
|
|
34
34
|
from scipy.stats import binom
|
|
35
35
|
|
|
36
36
|
from optparse import OptionParser
|
|
37
|
+
from cpgmodule._version import __version__
|
|
37
38
|
from cpgmodule import ireader
|
|
38
39
|
from cpgmodule.utils import *
|
|
39
40
|
from cpgmodule import BED
|
|
@@ -44,7 +45,6 @@ __author__ = "Liguo Wang"
|
|
|
44
45
|
__copyright__ = "Copyleft"
|
|
45
46
|
__credits__ = []
|
|
46
47
|
__license__ = "GPL"
|
|
47
|
-
__version__="1.12.0"
|
|
48
48
|
__maintainer__ = "Liguo Wang"
|
|
49
49
|
__email__ = "wang.liguo@mayo.edu"
|
|
50
50
|
__status__ = "Development"
|
|
@@ -18,6 +18,7 @@ import subprocess
|
|
|
18
18
|
import numpy as np
|
|
19
19
|
from os.path import basename
|
|
20
20
|
from optparse import OptionParser
|
|
21
|
+
from cpgmodule._version import __version__
|
|
21
22
|
from cpgmodule import ireader
|
|
22
23
|
from cpgmodule.utils import *
|
|
23
24
|
from cpgmodule import BED
|
|
@@ -28,7 +29,6 @@ __author__ = "Liguo Wang"
|
|
|
28
29
|
__copyright__ = "Copyleft"
|
|
29
30
|
__credits__ = []
|
|
30
31
|
__license__ = "GPL"
|
|
31
|
-
__version__="0.1.9"
|
|
32
32
|
__maintainer__ = "Liguo Wang"
|
|
33
33
|
__email__ = "wang.liguo@mayo.edu"
|
|
34
34
|
__status__ = "Development"
|
|
@@ -10,13 +10,12 @@ import sys,os
|
|
|
10
10
|
from optparse import OptionParser
|
|
11
11
|
from cpgmodule import ireader
|
|
12
12
|
from cpgmodule.utils import *
|
|
13
|
-
|
|
13
|
+
from cpgmodule._version import __version__
|
|
14
14
|
|
|
15
15
|
__author__ = "Liguo Wang"
|
|
16
16
|
__copyright__ = "Copyleft"
|
|
17
17
|
__credits__ = []
|
|
18
18
|
__license__ = "GPL"
|
|
19
|
-
__version__="1.12.0"
|
|
20
19
|
__maintainer__ = "Liguo Wang"
|
|
21
20
|
__email__ = "wang.liguo@mayo.edu"
|
|
22
21
|
__status__ = "Development"
|
|
@@ -76,8 +75,11 @@ def main():
|
|
|
76
75
|
for l in ireader.reader(options.input_file):
|
|
77
76
|
line_num += 1
|
|
78
77
|
f = l.split()
|
|
79
|
-
if
|
|
80
|
-
|
|
78
|
+
if line_num == 1:
|
|
79
|
+
if options.header:
|
|
80
|
+
print (l + '\t' + '\t'.join(header), file=OUT)
|
|
81
|
+
else:
|
|
82
|
+
print ('\t'.join(['NA']*len(f)) + '\t' + '\t'.join(header), file=OUT)
|
|
81
83
|
else:
|
|
82
84
|
if options.probe_col >= len(f):
|
|
83
85
|
print ("Error: column ID must be smaller than %d!" % len(f), file=sys.stderr)
|
|
@@ -19,12 +19,12 @@ from cpgmodule import ireader
|
|
|
19
19
|
from cpgmodule.utils import *
|
|
20
20
|
from cpgmodule import BED
|
|
21
21
|
from cpgmodule import extend_bed
|
|
22
|
+
from cpgmodule._version import __version__
|
|
22
23
|
|
|
23
24
|
__author__ = "Liguo Wang"
|
|
24
25
|
__copyright__ = "Copyleft"
|
|
25
26
|
__credits__ = []
|
|
26
27
|
__license__ = "GPL"
|
|
27
|
-
__version__="1.12.0"
|
|
28
28
|
__maintainer__ = "Liguo Wang"
|
|
29
29
|
__email__ = "wang.liguo@mayo.edu"
|
|
30
30
|
__status__ = "Development"
|
|
@@ -14,12 +14,12 @@ import numpy as np
|
|
|
14
14
|
from optparse import OptionParser
|
|
15
15
|
from cpgmodule import ireader
|
|
16
16
|
from cpgmodule.utils import *
|
|
17
|
+
from cpgmodule._version import __version__
|
|
17
18
|
|
|
18
19
|
__author__ = "Liguo Wang"
|
|
19
20
|
__copyright__ = "Copyleft"
|
|
20
21
|
__credits__ = []
|
|
21
22
|
__license__ = "GPL"
|
|
22
|
-
__version__="1.12.0"
|
|
23
23
|
__maintainer__ = "Liguo Wang"
|
|
24
24
|
__email__ = "wang.liguo@mayo.edu"
|
|
25
25
|
__status__ = "Development"
|
|
@@ -34,12 +34,12 @@ from optparse import OptionParser
|
|
|
34
34
|
from cpgmodule import ireader
|
|
35
35
|
from cpgmodule.utils import *
|
|
36
36
|
from cpgmodule import BED
|
|
37
|
+
from cpgmodule._version import __version__
|
|
37
38
|
|
|
38
39
|
__author__ = "Liguo Wang"
|
|
39
40
|
__copyright__ = "Copyleft"
|
|
40
41
|
__credits__ = []
|
|
41
42
|
__license__ = "GPL"
|
|
42
|
-
__version__="1.12.0"
|
|
43
43
|
__maintainer__ = "Liguo Wang"
|
|
44
44
|
__email__ = "wang.liguo@mayo.edu"
|
|
45
45
|
__status__ = "Development"
|
|
@@ -23,18 +23,16 @@ import sys,os
|
|
|
23
23
|
import collections
|
|
24
24
|
import subprocess
|
|
25
25
|
import numpy as np
|
|
26
|
-
#import re
|
|
27
26
|
from optparse import OptionParser
|
|
28
27
|
from cpgmodule import ireader
|
|
29
28
|
from cpgmodule.utils import *
|
|
30
29
|
from cpgmodule import BED
|
|
31
|
-
|
|
30
|
+
from cpgmodule._version import __version__
|
|
32
31
|
|
|
33
32
|
__author__ = "Liguo Wang"
|
|
34
33
|
__copyright__ = "Copyleft"
|
|
35
34
|
__credits__ = []
|
|
36
35
|
__license__ = "GPL"
|
|
37
|
-
__version__="1.12.0"
|
|
38
36
|
__maintainer__ = "Liguo Wang"
|
|
39
37
|
__email__ = "wang.liguo@mayo.edu"
|
|
40
38
|
__status__ = "Development"
|
|
@@ -17,12 +17,12 @@ from cpgmodule import ireader
|
|
|
17
17
|
from cpgmodule.utils import *
|
|
18
18
|
from cpgmodule import BED
|
|
19
19
|
from cpgmodule.imotif import PSSM
|
|
20
|
+
from cpgmodule._version import __version__
|
|
20
21
|
|
|
21
22
|
__author__ = "Liguo Wang"
|
|
22
23
|
__copyright__ = "Copyleft"
|
|
23
24
|
__credits__ = []
|
|
24
25
|
__license__ = "GPL"
|
|
25
|
-
__version__="1.12.0"
|
|
26
26
|
__maintainer__ = "Liguo Wang"
|
|
27
27
|
__email__ = "wang.liguo@mayo.edu"
|
|
28
28
|
__status__ = "Development"
|
|
@@ -34,12 +34,12 @@ from optparse import OptionParser
|
|
|
34
34
|
from cpgmodule import ireader
|
|
35
35
|
from cpgmodule.utils import *
|
|
36
36
|
from cpgmodule.region2gene import *
|
|
37
|
+
from cpgmodule._version import __version__
|
|
37
38
|
|
|
38
39
|
__author__ = "Liguo Wang"
|
|
39
40
|
__copyright__ = "Copyleft"
|
|
40
41
|
__credits__ = []
|
|
41
42
|
__license__ = "GPL"
|
|
42
|
-
__version__="1.12.0"
|
|
43
43
|
__maintainer__ = "Liguo Wang"
|
|
44
44
|
__email__ = "wang.liguo@mayo.edu"
|
|
45
45
|
__status__ = "Development"
|
|
@@ -32,6 +32,7 @@ import sys
|
|
|
32
32
|
import subprocess
|
|
33
33
|
from optparse import OptionParser
|
|
34
34
|
from cpgmodule.utils import *
|
|
35
|
+
from cpgmodule._version import __version__
|
|
35
36
|
import pandas as pd
|
|
36
37
|
from sklearn.preprocessing import StandardScaler
|
|
37
38
|
from sklearn.decomposition import PCA
|
|
@@ -40,15 +41,15 @@ __author__ = "Liguo Wang"
|
|
|
40
41
|
__copyright__ = "Copyleft"
|
|
41
42
|
__credits__ = []
|
|
42
43
|
__license__ = "GPL"
|
|
43
|
-
__version__="1.12.0"
|
|
44
44
|
__maintainer__ = "Liguo Wang"
|
|
45
45
|
__email__ = "wang.liguo@mayo.edu"
|
|
46
46
|
__status__ = "Development"
|
|
47
47
|
|
|
48
48
|
def pick_colors(n):
    """
    Return the first *n* colors of a fixed palette of hex color strings.

    If *n* is larger than the palette, a message is written to stderr
    and the program exits with status 1 (a bare ``sys.exit()`` would
    report success to the calling shell).
    """
    my_colors = [
        "#F0A3FF", "#0075DC", "#993F00", "#4C005C", "#191919", "#005C31",
        "#2BCE48", "#FFCC99", "#808080", "#94FFB5", "#8F7C00", "#9DCC00",
        "#C20088", "#003380", "#FFA405", "#FFA8BB", "#426600", "#FF0010",
        "#5EF1F2", "#00998F", "#E0FF66", "#740AFF", "#990000", "#FFFF80",
        "#FFE100", "#FF5005",
    ]
    if n > len(my_colors):
        # Derive the limit from the palette so the message cannot go stale.
        print ("Only support %d different colors" % len(my_colors), file = sys.stderr)
        sys.exit(1)
    return my_colors[0:n]
|
|
54
55
|
|
|
@@ -86,27 +87,30 @@ def main():
|
|
|
86
87
|
df1 = pd.read_csv(options.input_file, index_col = 0, sep="\t")
|
|
87
88
|
|
|
88
89
|
#remove NA and transpose
|
|
89
|
-
df2 = df1.dropna(axis=0, how='any')
|
|
90
|
-
printlog("%d rows with missing values were removed." % (len(df1) - len(df2)))
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
printlog("Transposing data frame ...")
|
|
94
|
-
df2 = df2.T
|
|
95
|
-
#print (df2.head())
|
|
96
|
-
|
|
97
|
-
printlog("Standarizing values ...")
|
|
98
|
-
x = df2.values
|
|
99
|
-
x = StandardScaler().fit_transform(x)
|
|
100
|
-
|
|
90
|
+
df2 = df1.dropna(axis=0, how='any').T
|
|
91
|
+
printlog("%d rows with missing values were removed." % (len(df1.index) - len(df2.columns)))
|
|
92
|
+
|
|
101
93
|
printlog("Reading group file: \"%s\" ..." % (options.group_file))
|
|
102
94
|
group = pd.read_csv(options.group_file, index_col=0, header=0,names=['Sample_ID', 'Group_ID'])
|
|
103
|
-
group.index = group.index.map(str)
|
|
104
|
-
|
|
105
95
|
#check if sample IDs are unique
|
|
106
96
|
if len(group.index) != len(group.index.unique()):
|
|
107
97
|
print ("Sample IDs are not unique", file = sys.stderr)
|
|
108
98
|
sys.exit()
|
|
99
|
+
group.index = group.index.map(str)
|
|
100
|
+
printlog("Group file \"%s\" contains %d samples" % (options.group_file, len(group.index)))
|
|
101
|
+
|
|
102
|
+
printlog("Find common sample IDs between group file and data file ...")
|
|
103
|
+
common_samples = list(set(group.index) & set(df2.index))
|
|
104
|
+
used_df = df2.loc[common_samples]
|
|
105
|
+
(usable_sample, usable_cpg) = used_df.shape
|
|
106
|
+
printlog("Used CpGs: %d, Used samples: %d" % (usable_cpg, usable_sample))
|
|
107
|
+
|
|
109
108
|
|
|
109
|
+
printlog("Standarizing values ...")
|
|
110
|
+
x = used_df.to_numpy()
|
|
111
|
+
x = StandardScaler().fit_transform(x)
|
|
112
|
+
|
|
113
|
+
|
|
110
114
|
group_names = group['Group_ID'].unique().tolist() # a list of unique group names
|
|
111
115
|
color_names = pick_colors(len(group_names)) # a list of unique colors
|
|
112
116
|
group_to_col = dict(zip(group_names, color_names))
|
|
@@ -116,9 +120,9 @@ def main():
|
|
|
116
120
|
pca = PCA(n_components = options.n_components, random_state = 0)
|
|
117
121
|
principalComponents = pca.fit_transform(x)
|
|
118
122
|
pca_names = [str(i)+str(j) for i,j in zip(['PC']*options.n_components,range(1,options.n_components+1))]
|
|
119
|
-
principalDf = pd.DataFrame(data = principalComponents, columns = pca_names, index =
|
|
123
|
+
principalDf = pd.DataFrame(data = principalComponents, columns = pca_names, index = used_df.index)
|
|
120
124
|
|
|
121
|
-
finalDf = pd.concat([principalDf, group], axis = 1, sort=False)
|
|
125
|
+
finalDf = pd.concat([principalDf, group], axis = 1, sort=False, join='inner')
|
|
122
126
|
finalDf.index.name = 'Sample_ID'
|
|
123
127
|
|
|
124
128
|
printlog("Writing PCA results to file: \"%s\" ..." % (options.out_file + '.PCA.tsv'))
|
|
@@ -133,18 +137,22 @@ def main():
|
|
|
133
137
|
|
|
134
138
|
print ('pdf(file=\"%s\", width=8, height=8)' % (options.out_file + '.PCA.pdf'),file=ROUT)
|
|
135
139
|
print ('')
|
|
136
|
-
print ('d = read.table(file=\"%s\", sep="\\t", header=TRUE, comment.char = "", stringsAsFactors=FALSE)'
|
|
140
|
+
print ('d = read.table(file=\"%s\", sep="\\t", header=TRUE, comment.char = "", stringsAsFactors=FALSE)'
|
|
141
|
+
% (options.out_file + '.PCA.tsv'), file=ROUT)
|
|
137
142
|
print ('attach(d)', file=ROUT)
|
|
138
143
|
if options.plot_alpha:
|
|
139
144
|
print ('library(scales)', file=ROUT)
|
|
140
|
-
print ('plot(PC1, PC2, col = alpha(Colors, %f), pch=%d, cex=1.5, main="PCA 2D map"
|
|
145
|
+
print ('plot(PC1, PC2, col = alpha(Colors, %f), pch=%d, cex=1.5, main="PCA 2D map", xlab="PC1 (var. explained: %.2f%%)", ylab="PC2 (var. explained: %.2f%%)")'
|
|
146
|
+
% (options.plot_alpha, pch[options.plot_char], pca_vars[0]*100, pca_vars[1]*100), file=ROUT)
|
|
141
147
|
else:
|
|
142
|
-
print ('plot(PC1, PC2, col = Colors, pch=%d, cex=1.2, main="PCA 2D map"
|
|
148
|
+
print ('plot(PC1, PC2, col = Colors, pch=%d, cex=1.2, main="PCA 2D map", xlab="PC1 (var. explained: %.2f%%)", ylab="PC2 (var. explained: %.2f%%)")'
|
|
149
|
+
% (pca_vars[0]*100, pca_vars[1]*100, pch[options.plot_char], pca_vars[0]*100, pca_vars[1]*100), file=ROUT)
|
|
143
150
|
|
|
144
151
|
if options.text_label:
|
|
145
152
|
print ('text(PC1, PC2, labels=Sample_ID, col = Colors, cex=0.5, pos=1)', file=ROUT)
|
|
146
153
|
|
|
147
|
-
print ('legend("%s", legend=c(%s), col=c(%s), pch=%d,cex=1)'
|
|
154
|
+
print ('legend("%s", legend=c(%s), col=c(%s), pch=%d,cex=1)'
|
|
155
|
+
% (legend_pos[options.legend_location], ','.join(['"' + str(i) + '"' for i in group_names]), ','.join(['"' + str(group_to_col[i]) + '"' for i in group_names]), pch[options.plot_char]), file=ROUT)
|
|
148
156
|
|
|
149
157
|
|
|
150
158
|
print ('dev.off()', file=ROUT)
|
|
@@ -32,6 +32,7 @@ import pandas as pd
|
|
|
32
32
|
import subprocess
|
|
33
33
|
from optparse import OptionParser
|
|
34
34
|
from cpgmodule.utils import *
|
|
35
|
+
from cpgmodule._version import __version__
|
|
35
36
|
from sklearn.preprocessing import StandardScaler
|
|
36
37
|
#import datatable as dt
|
|
37
38
|
#import seaborn as sns
|
|
@@ -41,15 +42,15 @@ __author__ = "Liguo Wang"
|
|
|
41
42
|
__copyright__ = "Copyleft"
|
|
42
43
|
__credits__ = []
|
|
43
44
|
__license__ = "GPL"
|
|
44
|
-
__version__="1.12.0"
|
|
45
45
|
__maintainer__ = "Liguo Wang"
|
|
46
46
|
__email__ = "wang.liguo@mayo.edu"
|
|
47
47
|
__status__ = "Development"
|
|
48
48
|
|
|
49
49
|
def pick_colors(n):
    """
    Return the first *n* colors of a fixed palette of hex color strings.

    If *n* is larger than the palette, a message is written to stderr
    and the program exits with status 1 (a bare ``sys.exit()`` would
    report success to the calling shell).
    """
    my_colors = [
        "#F0A3FF", "#0075DC", "#993F00", "#4C005C", "#191919", "#005C31",
        "#2BCE48", "#FFCC99", "#808080", "#94FFB5", "#8F7C00", "#9DCC00",
        "#C20088", "#003380", "#FFA405", "#FFA8BB", "#426600", "#FF0010",
        "#5EF1F2", "#00998F", "#E0FF66", "#740AFF", "#990000", "#FFFF80",
        "#FFE100", "#FF5005",
    ]
    if n > len(my_colors):
        # Derive the limit from the palette so the message cannot go stale.
        print ("Only support %d different colors" % len(my_colors), file = sys.stderr)
        sys.exit(1)
    return my_colors[0:n]
|
|
55
56
|
|
|
@@ -99,26 +100,28 @@ def main():
|
|
|
99
100
|
df1 = pd.read_csv(options.input_file, index_col = 0, sep="\t")
|
|
100
101
|
|
|
101
102
|
#remove NA and transpose
|
|
102
|
-
df2 = df1.dropna(axis=0, how='any')
|
|
103
|
-
printlog("%d rows with missing values were removed." % (len(df1) - len(df2)))
|
|
104
|
-
#print (df2.head())
|
|
105
|
-
|
|
106
|
-
printlog("Transposing data frame ...")
|
|
107
|
-
df2 = df2.T
|
|
108
|
-
#print (df2.head())
|
|
109
|
-
|
|
110
|
-
printlog("Standarizing values ...")
|
|
111
|
-
x = df2.values
|
|
112
|
-
x = StandardScaler().fit_transform(x)
|
|
103
|
+
df2 = df1.dropna(axis=0, how='any').T
|
|
104
|
+
printlog("%d rows with missing values were removed." % (len(df1.index) - len(df2.columns)))
|
|
113
105
|
|
|
114
106
|
printlog("Reading group file: \"%s\" ..." % (options.group_file))
|
|
115
107
|
group = pd.read_csv(options.group_file, index_col=0, header=0,names=['Sample_ID', 'Group_ID'])
|
|
116
|
-
group.index = group.index.map(str)
|
|
117
|
-
|
|
118
108
|
#check if sample IDs are unique
|
|
119
109
|
if len(group.index) != len(group.index.unique()):
|
|
120
110
|
print ("Sample IDs are not unique", file = sys.stderr)
|
|
121
111
|
sys.exit()
|
|
112
|
+
group.index = group.index.map(str)
|
|
113
|
+
printlog("Group file \"%s\" contains %d samples" % (options.group_file, len(group.index)))
|
|
114
|
+
|
|
115
|
+
printlog("Find common sample IDs between group file and data file ...")
|
|
116
|
+
common_samples = list(set(group.index) & set(df2.index))
|
|
117
|
+
used_df = df2.loc[common_samples]
|
|
118
|
+
(usable_sample, usable_cpg) = used_df.shape
|
|
119
|
+
printlog("Used CpGs: %d, Used samples: %d" % (usable_cpg, usable_sample))
|
|
120
|
+
|
|
121
|
+
printlog("Standarizing values ...")
|
|
122
|
+
x = used_df.to_numpy()
|
|
123
|
+
x = StandardScaler().fit_transform(x)
|
|
124
|
+
|
|
122
125
|
|
|
123
126
|
group_names = group['Group_ID'].unique().tolist() # a list of unique group names
|
|
124
127
|
color_names = pick_colors(len(group_names)) # a list of unique colors
|
|
@@ -133,9 +136,9 @@ def main():
|
|
|
133
136
|
#pca = PCA(n_components = options.n_components, random_state = 0)
|
|
134
137
|
#principalComponents = pca.fit_transform(x)
|
|
135
138
|
pca_names = [str(i)+str(j) for i,j in zip(['UMAP']*options.n_components,range(1,options.n_components+1))]
|
|
136
|
-
principalDf = pd.DataFrame(data = principalComponents, columns = pca_names, index =
|
|
139
|
+
principalDf = pd.DataFrame(data = principalComponents, columns = pca_names, index = used_df.index)
|
|
137
140
|
|
|
138
|
-
finalDf = pd.concat([principalDf, group], axis = 1, sort=False)
|
|
141
|
+
finalDf = pd.concat([principalDf, group], axis = 1, sort=False, join='inner')
|
|
139
142
|
finalDf.index.name = 'Sample_ID'
|
|
140
143
|
|
|
141
144
|
printlog("Writing UMAP results to file: \"%s\" ..." % (options.out_file + '.UMAP.tsv'))
|
|
@@ -146,18 +149,22 @@ def main():
|
|
|
146
149
|
|
|
147
150
|
print ('pdf(file=\"%s\", width=8, height=8)' % (options.out_file + '.UMAP.pdf'),file=ROUT)
|
|
148
151
|
print ('')
|
|
149
|
-
print ('d = read.table(file=\"%s\", sep="\\t", header=TRUE, comment.char = "", stringsAsFactors=FALSE)'
|
|
152
|
+
print ('d = read.table(file=\"%s\", sep="\\t", header=TRUE, comment.char = "", stringsAsFactors=FALSE)'
|
|
153
|
+
% (options.out_file + '.UMAP.tsv'), file=ROUT)
|
|
150
154
|
print ('attach(d)', file=ROUT)
|
|
151
155
|
if options.plot_alpha:
|
|
152
156
|
print ('library(scales)', file=ROUT)
|
|
153
|
-
print ('plot(UMAP1, UMAP2, col = alpha(Colors, %f), pch=%d, cex=1.5, main="UMAP 2D map", xlab="UMAP_1", ylab="UMAP_2")'
|
|
157
|
+
print ('plot(UMAP1, UMAP2, col = alpha(Colors, %f), pch=%d, cex=1.5, main="UMAP 2D map", xlab="UMAP_1", ylab="UMAP_2")'
|
|
158
|
+
% (options.plot_alpha, pch[options.plot_char]), file=ROUT)
|
|
154
159
|
else:
|
|
155
|
-
print ('plot(UMAP1, UMAP2, col = Colors, pch=%d, cex=1.2, main="UMAP 2D map", xlab="UMAP_1", ylab="UMAP_2")'
|
|
160
|
+
print ('plot(UMAP1, UMAP2, col = Colors, pch=%d, cex=1.2, main="UMAP 2D map", xlab="UMAP_1", ylab="UMAP_2")'
|
|
161
|
+
% pch[options.plot_char], file=ROUT)
|
|
156
162
|
|
|
157
163
|
if options.text_label:
|
|
158
164
|
print ('text(UMAP1, UMAP2, labels=Sample_ID, col = Colors, cex=0.5, pos=1)', file=ROUT)
|
|
159
165
|
|
|
160
|
-
print ('legend("%s", legend=c(%s), col=c(%s), pch=%d,cex=1)'
|
|
166
|
+
print ('legend("%s", legend=c(%s), col=c(%s), pch=%d,cex=1)'
|
|
167
|
+
% (legend_pos[options.legend_location], ','.join(['"' + str(i) + '"' for i in group_names]), ','.join(['"' + str(group_to_col[i]) + '"' for i in group_names]), pch[options.plot_char]), file=ROUT)
|
|
161
168
|
|
|
162
169
|
|
|
163
170
|
print ('dev.off()', file=ROUT)
|