cpgtools 2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. cpgmodule/BED.py +441 -0
  2. cpgmodule/MI.py +193 -0
  3. cpgmodule/__init__.py +0 -0
  4. cpgmodule/_version.py +1 -0
  5. cpgmodule/cgID.py +866897 -0
  6. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  7. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  8. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  9. cpgmodule/data/GA_Bohlin.pkl +0 -0
  10. cpgmodule/data/GA_Haftorn.pkl +0 -0
  11. cpgmodule/data/GA_Knight.pkl +0 -0
  12. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  13. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  14. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  15. cpgmodule/data/GA_Mayne.pkl +0 -0
  16. cpgmodule/data/Hannum.pkl +0 -0
  17. cpgmodule/data/Horvath_2013.pkl +0 -0
  18. cpgmodule/data/Horvath_2018.pkl +0 -0
  19. cpgmodule/data/Levine.pkl +0 -0
  20. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  21. cpgmodule/data/Ped_McEwen.pkl +0 -0
  22. cpgmodule/data/Ped_Wu.pkl +0 -0
  23. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  24. cpgmodule/data/Zhang_EN.pkl +0 -0
  25. cpgmodule/data/__init__.py +0 -0
  26. cpgmodule/extend_bed.py +147 -0
  27. cpgmodule/imotif.py +348 -0
  28. cpgmodule/ireader.py +28 -0
  29. cpgmodule/methylClock.py +53 -0
  30. cpgmodule/padjust.py +58 -0
  31. cpgmodule/region2gene.py +170 -0
  32. cpgmodule/utils.py +642 -0
  33. cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
  34. cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
  35. cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
  36. cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
  37. cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
  38. cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
  39. cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
  40. cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
  41. cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
  42. cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
  43. cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
  44. cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
  45. cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
  46. cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
  47. cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
  48. cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
  49. cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
  50. cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
  51. cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
  52. cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
  53. cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
  54. cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
  55. cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
  56. cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
  57. cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
  58. cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
  59. cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
  60. cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
  61. cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
  62. cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
  63. cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
  64. cpgtools-2.0.5.dist-info/METADATA +59 -0
  65. cpgtools-2.0.5.dist-info/RECORD +104 -0
  66. cpgtools-2.0.5.dist-info/WHEEL +5 -0
  67. cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
  68. cpgtools-2.0.5.dist-info/top_level.txt +5 -0
  69. impyute/__init__.py +3 -0
  70. impyute/contrib/__init__.py +7 -0
  71. impyute/contrib/compare.py +69 -0
  72. impyute/contrib/count_missing.py +30 -0
  73. impyute/contrib/describe.py +63 -0
  74. impyute/cs/__init__.py +11 -0
  75. impyute/cs/buck_iterative.py +82 -0
  76. impyute/cs/central_tendency.py +84 -0
  77. impyute/cs/em.py +52 -0
  78. impyute/cs/fast_knn.py +130 -0
  79. impyute/cs/random.py +27 -0
  80. impyute/dataset/__init__.py +6 -0
  81. impyute/dataset/base.py +137 -0
  82. impyute/dataset/corrupt.py +55 -0
  83. impyute/deletion/__init__.py +5 -0
  84. impyute/deletion/complete_case.py +21 -0
  85. impyute/ops/__init__.py +12 -0
  86. impyute/ops/error.py +9 -0
  87. impyute/ops/inverse_distance_weighting.py +31 -0
  88. impyute/ops/matrix.py +47 -0
  89. impyute/ops/testing.py +20 -0
  90. impyute/ops/util.py +96 -0
  91. impyute/ops/wrapper.py +179 -0
  92. impyute/ts/__init__.py +6 -0
  93. impyute/ts/locf.py +57 -0
  94. impyute/ts/moving_window.py +128 -0
  95. impyutelib.py +890 -0
  96. missingpy/__init__.py +4 -0
  97. missingpy/knnimpute.py +328 -0
  98. missingpy/missforest.py +556 -0
  99. missingpy/pairwise_external.py +315 -0
  100. missingpy/tests/__init__.py +0 -0
  101. missingpy/tests/test_knnimpute.py +605 -0
  102. missingpy/tests/test_missforest.py +409 -0
  103. missingpy/utils.py +124 -0
  104. misspylib.py +565 -0
@@ -0,0 +1,141 @@
1
+ #!python
2
+
3
+ """
4
+ Description
5
+ -----------
6
+ This program annotates CpGs by assigning them to their putative target genes. Follows the
7
+ "Basel plus extension" rules used by GREAT(http://great.stanford.edu/public/html/index.php)
8
+
9
+ * Basal regulatory domain*
10
+ is a user-defined genomic region around the TSS (transcription start site). By default,
11
+ from TSS upstream 5kb to TSS downstream 1Kb is considered as the gene's *basal regulatory
12
+ domain*. When defining a gene's "basal regulatory domain", the other nearby genes will be
13
+ ignored (which means different genes' basal regulatory domains can be overlapped.)
14
+
15
+ * Extended regulatory domain*
16
+ The gene regulatory domain is extended in both directions to the nearest gene's "basal
17
+ regulatory domain" but no more than the maximum extension (default = 1000 kb) in one
18
+ direction.
19
+
20
+ Notes
21
+ -----
22
+ 1. Genes that are assigned to a particular CpG largely depends on gene annotation. A
23
+ "conservative" gene model (such as Refseq curated protein coding genes) is recommended.
24
+ 2. In the gene model, multiple isoforms should be merged into a single gene.
25
+ #=========================================================================================
26
+ """
27
+
28
+
29
+ import sys,os
30
+ import collections
31
+ import subprocess
32
+ import numpy as np
33
+ from optparse import OptionParser
34
+ from cpgmodule import ireader
35
+ from cpgmodule.utils import *
36
+ from cpgmodule.region2gene import *
37
+ from cpgmodule._version import __version__
38
+
39
+ __author__ = "Liguo Wang"
40
+ __copyright__ = "Copyleft"
41
+ __credits__ = []
42
+ __license__ = "GPL"
43
+ __maintainer__ = "Liguo Wang"
44
+ __email__ = "wang.liguo@mayo.edu"
45
+ __status__ = "Development"
46
+
47
def main():
    """Assign each CpG in a BED3+ file to its putative target genes using
    GREAT's "basal plus extension" association rules.

    Writes "<output_prefix>.associated_genes.txt": every BED line is echoed
    with two extra columns -- genes whose basal regulatory domain overlaps
    the CpG, and genes found only via their extended regulatory domain.
    "//" marks "no gene found".  Exits with 101/102/103 when a required
    option is missing.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="BED3+ file specifying the C position. BED3+ file could be a regular text file or compressed file (.gz, .bz2). [required]")
    parser.add_option("-r","--refgene",action="store",type="string",dest="gene_file",help="Reference gene model in BED12 format (https://genome.ucsc.edu/FAQ/FAQformat.html#format1). \"One gene one transcript\" is recommended. Since most genes have multiple transcripts; one can collapse multiple transcripts of the same gene into a single super transcript or select the canonical transcript.")
    parser.add_option("-u","--basal-up",action="store",type="int",dest="basal_up_size",default=5000,help="Size of extension to upstream of TSS (used to define gene's \"basal regulatory domain\"). default=%default (bp)")
    parser.add_option("-d","--basal-down",action="store",type="int",dest="basal_down_size",default=1000,help="Size of extension to downstream of TSS (used to define gene's basal regulatory domain). default=%default (bp)")
    parser.add_option("-e","--extension",action="store",type="int",dest="extension_size",default=1000000,help="Size of extension to both up- and down-stream of TSS (used to define gene's \"extended regulatory domain\"). default=%default (bp)")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file. Two additional columns will be appended to the original BED file with the last column indicating \"genes whose extended regulatory domain are overlapped with the CpG\", the 2nd last column indicating \"genes whose basal regulatory domain are overlapped with the CpG\". [required]")
    (options, args) = parser.parse_args()

    print()

    # Each missing required option has its own exit code (101/102/103).
    if not (options.input_file):
        print(__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.gene_file):
        print(__doc__)
        parser.print_help()
        sys.exit(102)
    if not (options.out_file):
        print(__doc__)
        parser.print_help()
        sys.exit(103)

    def overlapping_genes(domains, chrom, start, end):
        # Return the set of gene names whose domain overlaps [start, end);
        # {'//'} when the chromosome is unknown or nothing overlaps.
        # (Factored out: the basal and extended lookups were duplicated.)
        if chrom not in domains:
            return {'//'}
        hits = domains[chrom].find(start, end)
        if len(hits) == 0:
            return {'//'}
        return set(o.value for o in hits)

    # FIX: manage the output file with a context manager so it is closed
    # (and flushed) even if an exception interrupts processing.
    with open(options.out_file + '.associated_genes.txt', 'w') as FOUT:
        print("#The last column contains genes whose extended regulatory domain are overlapped with the CpG", file=FOUT)
        print("#The 2nd last column contains genes whose basal regulatory domain are overlapped with the CpG", file=FOUT)
        print("#\"//\" indicates no genes are found", file=FOUT)

        printlog("Calculate basal regulatory domain from: \"%s\" ..." % (options.gene_file))
        basal_domains = getBasalDomains(bedfile=options.gene_file, up=options.basal_up_size, down=options.basal_down_size, printit=False)

        printlog("Calculate extended regulatory domain from: \"%s\" ..." % (options.gene_file))
        extended_domains = geteExtendedDomains(basal_ranges=basal_domains, bedfile=options.gene_file, up=options.basal_up_size, down=options.basal_down_size, ext=options.extension_size, printit=False)

        printlog("Assigning CpG to gene ...")
        for l in ireader.reader(options.input_file):
            if l.startswith('#'):
                # Pass comment lines through to the output unchanged.
                print(l, file=FOUT)
                continue
            if l.startswith(('track', 'browser')):
                # Skip UCSC track/browser definition lines.
                continue
            try:
                f = l.split()
                chrom = f[0]
                start = int(f[1])
                end = int(f[2])
            except (IndexError, ValueError):
                # FIX: narrowed from a bare "except:" which also swallowed
                # SystemExit/KeyboardInterrupt; only malformed BED lines
                # (too few fields, non-integer coordinates) are skipped.
                print("Invalid BED line: %s" % l, file=sys.stderr)
                continue

            # Genes whose basal regulatory domain overlaps this CpG.
            basal_genes = overlapping_genes(basal_domains, chrom, start, end)
            # Report only genes found exclusively via the extended domain.
            extend_genes = overlapping_genes(extended_domains, chrom, start, end) - basal_genes
            if len(extend_genes) == 0:
                extend_genes.add('//')
            print(l + '\t' + ';'.join(basal_genes) + '\t' + ';'.join(extend_genes), file=FOUT)


if __name__ == '__main__':
    main()
140
+
141
+
@@ -0,0 +1,188 @@
1
+ #!python
2
+
3
+ """
4
+ Description
5
+ -----------
6
+ This program performs PCA (principal component analysis) for samples.
7
+
8
+ Example of input data file
9
+ ---------------------------
10
+ CpG_ID Sample_01 Sample_02 Sample_03 Sample_04
11
+ cg_001 0.831035 0.878022 0.794427 0.880911
12
+ cg_002 0.249544 0.209949 0.234294 0.236680
13
+ cg_003 0.845065 0.843957 0.840184 0.824286
14
+
15
+ Example of the input group file
16
+ ---------------------------
17
+ Sample,Group
18
+ Sample_01,normal
19
+ Sample_02,normal
20
+ Sample_03,tumor
21
+ Sample_04,tumor
22
+
23
+ Notes
24
+ -----
25
+ * Rows with missing values will be removed
26
+ * Beta values will be standardized into z scores
27
+ * Only the first two components will be visualized
28
+ """
29
+
30
+
31
+ import sys
32
+ import subprocess
33
+ from optparse import OptionParser
34
+ from cpgmodule.utils import *
35
+ from cpgmodule._version import __version__
36
+ import pandas as pd
37
+ from sklearn.preprocessing import StandardScaler
38
+ from sklearn.decomposition import PCA
39
+
40
+ __author__ = "Liguo Wang"
41
+ __copyright__ = "Copyleft"
42
+ __credits__ = []
43
+ __license__ = "GPL"
44
+ __maintainer__ = "Liguo Wang"
45
+ __email__ = "wang.liguo@mayo.edu"
46
+ __status__ = "Development"
47
+
48
+ def pick_colors(n):
49
+ my_colors = [
50
+ "#F0A3FF", "#0075DC", "#993F00", "#4C005C", "#191919", "#005C31", "#2BCE48", "#FFCC99", "#808080", "#94FFB5", "#8F7C00", "#9DCC00", "#C20088", "#003380", "#FFA405", "#FFA8BB", "#426600", "#FF0010", "#5EF1F2", "#00998F", "#E0FF66", "#740AFF", "#990000", "#FFFF80", "#FFE100", "#FF5005"]
51
+ if n > len(my_colors):
52
+ print ("Only support 26 different colors", file = sys.stderr)
53
+ sys.exit()
54
+ return my_colors[0:n]
55
+
56
+
57
+ def main():
58
+ usage="%prog [options]" + "\n"
59
+ parser = OptionParser(usage,version="%prog " + __version__)
60
+ parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",
61
+ help="Tab-separated data frame file containing beta values with the 1st row containing sample IDs and the 1st column containing CpG IDs.")
62
+ parser.add_option("-g","--group",action="store",type="string",dest="group_file",
63
+ help="Comma-separated group file defining the biological groups of each sample. Different groups will be colored differently in the PCA plot. Supports a maximum of 20 groups.")
64
+ parser.add_option("-n","--ncomponent",action="store",type='int', dest="n_components", default=2,
65
+ help="Number of components. default=%default" )
66
+ parser.add_option("-l","--label",action="store_true",default=False,dest="text_label",
67
+ help="If set, sample ids will be added underneath the data point. default=%default")
68
+ parser.add_option("-c","--char",action="store",type='int', default=1, dest="plot_char",
69
+ help="Ploting character: 1 = 'dot', 2 = 'circle'. default=%default")
70
+ parser.add_option("-a","--alpha",action="store",type='float', default=0.5, dest="plot_alpha",
71
+ help="Opacity of dots. default=%default")
72
+ parser.add_option("-x","--loc",action="store",type='int', default=1, dest="legend_location",
73
+ help="Location of legend panel: 1 = 'topright', 2 = 'bottomright', 3 = 'bottomleft', 4 = 'topleft'. default=%default")
74
+ parser.add_option("--loading",action="store_true", default=False, dest="cal_loading",
75
+ help="If set, will generate loading matrix. default=%default")
76
+ parser.add_option("-o","--output",action="store",type='string', dest="out_file",
77
+ help="The prefix of the output file.")
78
+ (options,args)=parser.parse_args()
79
+
80
+ if not (options.input_file):
81
+ print (__doc__)
82
+ parser.print_help()
83
+ sys.exit(101)
84
+ if not (options.group_file):
85
+ print (__doc__)
86
+ parser.print_help()
87
+ sys.exit(101)
88
+ if not (options.out_file):
89
+ print (__doc__)
90
+ parser.print_help()
91
+ sys.exit(103)
92
+ if options.n_components < 2:
93
+ options.n_components = 2
94
+
95
+ pch = {1:20, 2:1}
96
+ legend_pos = {1:'topright', 2: 'bottomright', 3:'bottomleft', 4:'topleft'}
97
+ printlog("Reading input file: \"%s\" ..." % (options.input_file))
98
+ df1 = pd.read_csv(options.input_file, index_col = 0, sep="\t")
99
+
100
+ #remove NA and transpose
101
+ df2 = df1.dropna(axis=0, how='any').T
102
+ printlog("%d rows with missing values were removed." % (len(df1.index) - len(df2.columns)))
103
+
104
+ printlog("Reading group file: \"%s\" ..." % (options.group_file))
105
+ group = pd.read_csv(options.group_file, index_col=0, header=0,names=['Sample_ID', 'Group_ID'])
106
+ #check if sample IDs are unique
107
+ if len(group.index) != len(group.index.unique()):
108
+ print ("Sample IDs are not unique", file = sys.stderr)
109
+ sys.exit()
110
+ group.index = group.index.map(str)
111
+ printlog("Group file \"%s\" contains %d samples" % (options.group_file, len(group.index)))
112
+
113
+ printlog("Find common sample IDs between group file and data file ...")
114
+ common_samples = list(set(group.index) & set(df2.index))
115
+ used_df = df2.loc[common_samples]
116
+ (usable_sample, usable_cpg) = used_df.shape
117
+ printlog("Used CpGs: %d, Used samples: %d" % (usable_cpg, usable_sample))
118
+
119
+
120
+ printlog("Standarizing values ...")
121
+ x = used_df.to_numpy()
122
+ x = StandardScaler().fit_transform(x)
123
+
124
+
125
+ group_names = group['Group_ID'].unique().tolist() # a list of unique group names
126
+ color_names = pick_colors(len(group_names)) # a list of unique colors
127
+ group_to_col = dict(zip(group_names, color_names))
128
+ color_list = [group_to_col[g] for g in group['Group_ID']]
129
+ group['Colors'] = color_list
130
+
131
+ pca = PCA(n_components = options.n_components, random_state = 0)
132
+ principalComponents = pca.fit_transform(x)
133
+ #pca_names = [str(i)+str(j) for i,j in zip(['PC']*options.n_components,range(1,options.n_components+1))]
134
+ pca_names = [f'PC{i+1}' for i in range(options.n_components)]
135
+ principalDf = pd.DataFrame(data = principalComponents, columns = pca_names, index = used_df.index)
136
+
137
+ finalDf = pd.concat([principalDf, group], axis = 1, sort=False, join='inner')
138
+ finalDf.index.name = 'Sample_ID'
139
+
140
+ printlog("Writing PCA results to file: \"%s\" ..." % (options.out_file + '.PCA.tsv'))
141
+ finalDf.to_csv(options.out_file + '.PCA.tsv', sep="\t")
142
+
143
+ # calculate loading matrix
144
+ if options.cal_loading:
145
+ loading_matrix = options.out_file + '.loadings.tsv'
146
+ printlog("Write loadings to matrix to \"%s\"" % loading_matrix)
147
+ loadings = pd.DataFrame(pca.components_, columns=used_df.columns, index=pca_names)
148
+ loadings.T.to_csv(loading_matrix, sep="\t", index=True)
149
+
150
+ pca_vars = pca.explained_variance_ratio_
151
+ for n,v in zip(pca_names, pca_vars):
152
+ print ("Variance explained by %s : %.4f%%" % (n, v*100))
153
+
154
+
155
+ ROUT = open(options.out_file + '.PCA.r','w')
156
+
157
+ print ('pdf(file=\"%s\", width=8, height=8)' % (options.out_file + '.PCA.pdf'),file=ROUT)
158
+ print ('')
159
+ print ('d = read.table(file=\"%s\", sep="\\t", header=TRUE, comment.char = "", stringsAsFactors=FALSE)'
160
+ % (options.out_file + '.PCA.tsv'), file=ROUT)
161
+ print ('attach(d)', file=ROUT)
162
+ if options.plot_alpha:
163
+ print ('library(scales)', file=ROUT)
164
+ print ('plot(PC1, PC2, col = alpha(Colors, %f), pch=%d, cex=1.5, main="PCA 2D map", xlab="PC1 (var. explained: %.2f%%)", ylab="PC2 (var. explained: %.2f%%)")'
165
+ % (options.plot_alpha, pch[options.plot_char], pca_vars[0]*100, pca_vars[1]*100), file=ROUT)
166
+ else:
167
+ print ('plot(PC1, PC2, col = Colors, pch=%d, cex=1.2, main="PCA 2D map", xlab="PC1 (var. explained: %.2f%%)", ylab="PC2 (var. explained: %.2f%%)")'
168
+ % (pca_vars[0]*100, pca_vars[1]*100, pch[options.plot_char], pca_vars[0]*100, pca_vars[1]*100), file=ROUT)
169
+
170
+ if options.text_label:
171
+ print ('text(PC1, PC2, labels=Sample_ID, col = Colors, cex=0.5, pos=1)', file=ROUT)
172
+
173
+ print ('legend("%s", legend=c(%s), col=c(%s), pch=%d,cex=1)'
174
+ % (legend_pos[options.legend_location], ','.join(['"' + str(i) + '"' for i in group_names]), ','.join(['"' + str(group_to_col[i]) + '"' for i in group_names]), pch[options.plot_char]), file=ROUT)
175
+
176
+
177
+ print ('dev.off()', file=ROUT)
178
+ ROUT.close()
179
+
180
+ try:
181
+ subprocess.call("Rscript " + options.out_file + '.PCA.r', shell=True)
182
+ except:
183
+ print ("Cannot generate pdf file from " + options.out_file + '.PCA.r', file=sys.stderr)
184
+ pass
185
+
186
+
187
+ if __name__=='__main__':
188
+ main()
@@ -0,0 +1,181 @@
1
+ #!python
2
+
3
+ """
4
+ Description
5
+ -----------
6
+ This program performs UMAP (Uniform Manifold Approximation and Projection) non-linear dimension reduction.
7
+
8
+ Example of input data file
9
+ ---------------------------
10
+ CpG_ID Sample_01 Sample_02 Sample_03 Sample_04
11
+ cg_001 0.831035 0.878022 0.794427 0.880911
12
+ cg_002 0.249544 0.209949 0.234294 0.236680
13
+ cg_003 0.845065 0.843957 0.840184 0.824286
14
+
15
+ Example of the input group file
16
+ ---------------------------
17
+ Sample,Group
18
+ Sample_01,normal
19
+ Sample_02,normal
20
+ Sample_03,tumor
21
+ Sample_04,tumor
22
+
23
+ Notes
24
+ -----
25
+ * Rows with missing values will be removed
26
+ * Beta values will be standardized into z scores
27
+ * Only the first two components will be visualized
28
+ """
29
+ import os,sys,umap
30
+ import numpy as np
31
+ import pandas as pd
32
+ import subprocess
33
+ from optparse import OptionParser
34
+ from cpgmodule.utils import *
35
+ from cpgmodule._version import __version__
36
+ from sklearn.preprocessing import StandardScaler
37
+ #import datatable as dt
38
+ #import seaborn as sns
39
+
40
+
41
+ __author__ = "Liguo Wang"
42
+ __copyright__ = "Copyleft"
43
+ __credits__ = []
44
+ __license__ = "GPL"
45
+ __maintainer__ = "Liguo Wang"
46
+ __email__ = "wang.liguo@mayo.edu"
47
+ __status__ = "Development"
48
+
49
def pick_colors(n):
    """Return a list of the first *n* colors from a 26-entry distinct palette.

    Prints a message to stderr and exits when *n* exceeds the palette size.
    """
    colors = [
        "#F0A3FF", "#0075DC", "#993F00", "#4C005C", "#191919", "#005C31",
        "#2BCE48", "#FFCC99", "#808080", "#94FFB5", "#8F7C00", "#9DCC00",
        "#C20088", "#003380", "#FFA405", "#FFA8BB", "#426600", "#FF0010",
        "#5EF1F2", "#00998F", "#E0FF66", "#740AFF", "#990000", "#FFFF80",
        "#FFE100", "#FF5005",
    ]
    if n > len(colors):
        print("Only support 26 different colors", file=sys.stderr)
        sys.exit()
    return colors[:n]
56
+
57
+
58
def main():
    """Run UMAP dimension reduction on a beta-value matrix and plot it.

    Reads a tab-separated beta matrix (CpGs x samples) and a comma-separated
    sample-to-group file, standardizes the values, fits a 2-D (or higher)
    UMAP embedding, writes the coordinates to TSV, then generates and runs an
    R script drawing the 2-D map colored by group.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Tab-separated data frame file containing beta values with the 1st row containing sample IDs and the 1st column containing CpG IDs.")
    parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Comma-separated group file defining the biological groups of each sample. Different groups will be colored differently in the 2-dimensional plot. Supports a maximum of 20 groups.")
    parser.add_option("-n","--ncomponent",action="store",type='int', dest="n_components", default=2, help="Number of components. default=%default" )
    parser.add_option("--nneighbors",action="store",type='int', dest="n_neighbors", default=15, help="This parameter controls the size of the local neighborhood UMAP will look at when attempting to learn the manifold structure of the data. Low values of '--nneighbors' will force UMAP to concentrate on local structure, while large values will push UMAP to look at larger neighborhoods of each point when estimating the manifold structure of the data. Choose a value from [2, 200]. default=%default" )
    parser.add_option("--min-dist",action="store",type='float', dest="min_distance", default=0.2, help="This parameter controls how tightly UMAP is allowed to pack points together. Choose a value from [0, 1). default=%default" )
    parser.add_option("-l","--label",action="store_true",default=False,dest="text_label",help="If True, sample ids will be added underneath the data point. default=%default")
    parser.add_option("-c","--char",action="store",type='int', default=1, dest="plot_char",help="Ploting character: 1 = 'dot', 2 = 'circle'. default=%default")
    parser.add_option("-a","--alpha",action="store",type='float', default=0.5, dest="plot_alpha",help="Opacity of dots. default=%default")
    parser.add_option("-x","--loc",action="store",type='int', default=1, dest="legend_location",help="Location of legend panel: 1 = 'topright', 2 = 'bottomright', 3 = 'bottomleft', 4 = 'topleft'. default=%default")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print()
    if not (options.input_file):
        print(__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.group_file):
        # BUG FIX: the group file was used unconditionally below; a missing
        # -g previously crashed with a pandas traceback instead of showing
        # the usage message (matches the sibling beta_PCA.py check).
        print(__doc__)
        parser.print_help()
        sys.exit(102)
    if not (options.out_file):
        print(__doc__)
        parser.print_help()
        sys.exit(103)

    # Clamp tuning parameters to their documented valid ranges.
    if options.n_components < 2:
        options.n_components = 2
    if options.n_neighbors < 2:
        options.n_neighbors = 2
    if options.n_neighbors > 200:
        options.n_neighbors = 200
    if options.min_distance < 0:
        options.min_distance = 0
    if options.min_distance > 1:
        options.min_distance = 1

    pch = {1: 20, 2: 1}  # R plotting characters: 20 = dot, 1 = circle
    legend_pos = {1: 'topright', 2: 'bottomright', 3: 'bottomleft', 4: 'topleft'}

    printlog("Reading input file: \"%s\" ..." % (options.input_file))
    df1 = pd.read_csv(options.input_file, index_col=0, sep="\t")

    # Drop CpGs (rows) with missing values, then transpose to samples x CpGs.
    df2 = df1.dropna(axis=0, how='any').T
    printlog("%d rows with missing values were removed." % (len(df1.index) - len(df2.columns)))

    printlog("Reading group file: \"%s\" ..." % (options.group_file))
    group = pd.read_csv(options.group_file, index_col=0, header=0, names=['Sample_ID', 'Group_ID'])
    # Sample IDs must be unique for the join with the data matrix.
    if len(group.index) != len(group.index.unique()):
        print("Sample IDs are not unique", file=sys.stderr)
        sys.exit()
    group.index = group.index.map(str)
    printlog("Group file \"%s\" contains %d samples" % (options.group_file, len(group.index)))

    printlog("Find common sample IDs between group file and data file ...")
    common_samples = list(set(group.index) & set(df2.index))
    used_df = df2.loc[common_samples]
    (usable_sample, usable_cpg) = used_df.shape
    printlog("Used CpGs: %d, Used samples: %d" % (usable_cpg, usable_sample))

    printlog("Standarizing values ...")
    x = StandardScaler().fit_transform(used_df.to_numpy())

    group_names = group['Group_ID'].unique().tolist()  # unique group names
    color_names = pick_colors(len(group_names))        # one color per group
    group_to_col = dict(zip(group_names, color_names))
    group['Colors'] = [group_to_col[g] for g in group['Group_ID']]

    # Fixed random_state keeps the embedding reproducible between runs.
    reducer = umap.UMAP(n_components=options.n_components, n_neighbors=options.n_neighbors, min_dist=options.min_distance, random_state=99)
    embedding = reducer.fit_transform(x)

    component_names = [f'UMAP{i+1}' for i in range(options.n_components)]
    embeddingDf = pd.DataFrame(data=embedding, columns=component_names, index=used_df.index)

    finalDf = pd.concat([embeddingDf, group], axis=1, sort=False, join='inner')
    finalDf.index.name = 'Sample_ID'

    printlog("Writing UMAP results to file: \"%s\" ..." % (options.out_file + '.UMAP.tsv'))
    finalDf.to_csv(options.out_file + '.UMAP.tsv', sep="\t")

    # Generate the R script that draws the 2-D UMAP map.
    with open(options.out_file + '.UMAP.r', 'w') as ROUT:
        print('pdf(file=\"%s\", width=8, height=8)' % (options.out_file + '.UMAP.pdf'), file=ROUT)
        # FIX: this blank line was previously printed to stdout, not to the
        # R script it was meant to space out.
        print('', file=ROUT)
        print('d = read.table(file=\"%s\", sep="\\t", header=TRUE, comment.char = "", stringsAsFactors=FALSE)'
              % (options.out_file + '.UMAP.tsv'), file=ROUT)
        print('attach(d)', file=ROUT)
        if options.plot_alpha:
            print('library(scales)', file=ROUT)
            print('plot(UMAP1, UMAP2, col = alpha(Colors, %f), pch=%d, cex=1.5, main="UMAP 2D map", xlab="UMAP_1", ylab="UMAP_2")'
                  % (options.plot_alpha, pch[options.plot_char]), file=ROUT)
        else:
            print('plot(UMAP1, UMAP2, col = Colors, pch=%d, cex=1.2, main="UMAP 2D map", xlab="UMAP_1", ylab="UMAP_2")'
                  % pch[options.plot_char], file=ROUT)

        if options.text_label:
            print('text(UMAP1, UMAP2, labels=Sample_ID, col = Colors, cex=0.5, pos=1)', file=ROUT)

        print('legend("%s", legend=c(%s), col=c(%s), pch=%d,cex=1)'
              % (legend_pos[options.legend_location],
                 ','.join(['"' + str(i) + '"' for i in group_names]),
                 ','.join(['"' + str(group_to_col[i]) + '"' for i in group_names]),
                 pch[options.plot_char]), file=ROUT)

        print('dev.off()', file=ROUT)

    try:
        # FIX: argument list with shell=False so an output prefix containing
        # shell metacharacters cannot be interpreted by the shell.
        subprocess.call(["Rscript", options.out_file + '.UMAP.r'])
    except Exception:
        print("Cannot generate pdf file from " + options.out_file + '.UMAP.r', file=sys.stderr)


if __name__ == '__main__':
    main()