cpgtools-2.0.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. cpgmodule/BED.py +441 -0
  2. cpgmodule/MI.py +193 -0
  3. cpgmodule/__init__.py +0 -0
  4. cpgmodule/_version.py +1 -0
  5. cpgmodule/cgID.py +866897 -0
  6. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  7. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  8. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  9. cpgmodule/data/GA_Bohlin.pkl +0 -0
  10. cpgmodule/data/GA_Haftorn.pkl +0 -0
  11. cpgmodule/data/GA_Knight.pkl +0 -0
  12. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  13. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  14. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  15. cpgmodule/data/GA_Mayne.pkl +0 -0
  16. cpgmodule/data/Hannum.pkl +0 -0
  17. cpgmodule/data/Horvath_2013.pkl +0 -0
  18. cpgmodule/data/Horvath_2018.pkl +0 -0
  19. cpgmodule/data/Levine.pkl +0 -0
  20. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  21. cpgmodule/data/Ped_McEwen.pkl +0 -0
  22. cpgmodule/data/Ped_Wu.pkl +0 -0
  23. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  24. cpgmodule/data/Zhang_EN.pkl +0 -0
  25. cpgmodule/data/__init__.py +0 -0
  26. cpgmodule/extend_bed.py +147 -0
  27. cpgmodule/imotif.py +348 -0
  28. cpgmodule/ireader.py +28 -0
  29. cpgmodule/methylClock.py +53 -0
  30. cpgmodule/padjust.py +58 -0
  31. cpgmodule/region2gene.py +170 -0
  32. cpgmodule/utils.py +642 -0
  33. cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
  34. cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
  35. cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
  36. cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
  37. cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
  38. cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
  39. cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
  40. cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
  41. cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
  42. cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
  43. cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
  44. cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
  45. cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
  46. cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
  47. cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
  48. cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
  49. cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
  50. cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
  51. cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
  52. cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
  53. cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
  54. cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
  55. cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
  56. cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
  57. cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
  58. cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
  59. cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
  60. cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
  61. cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
  62. cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
  63. cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
  64. cpgtools-2.0.5.dist-info/METADATA +59 -0
  65. cpgtools-2.0.5.dist-info/RECORD +104 -0
  66. cpgtools-2.0.5.dist-info/WHEEL +5 -0
  67. cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
  68. cpgtools-2.0.5.dist-info/top_level.txt +5 -0
  69. impyute/__init__.py +3 -0
  70. impyute/contrib/__init__.py +7 -0
  71. impyute/contrib/compare.py +69 -0
  72. impyute/contrib/count_missing.py +30 -0
  73. impyute/contrib/describe.py +63 -0
  74. impyute/cs/__init__.py +11 -0
  75. impyute/cs/buck_iterative.py +82 -0
  76. impyute/cs/central_tendency.py +84 -0
  77. impyute/cs/em.py +52 -0
  78. impyute/cs/fast_knn.py +130 -0
  79. impyute/cs/random.py +27 -0
  80. impyute/dataset/__init__.py +6 -0
  81. impyute/dataset/base.py +137 -0
  82. impyute/dataset/corrupt.py +55 -0
  83. impyute/deletion/__init__.py +5 -0
  84. impyute/deletion/complete_case.py +21 -0
  85. impyute/ops/__init__.py +12 -0
  86. impyute/ops/error.py +9 -0
  87. impyute/ops/inverse_distance_weighting.py +31 -0
  88. impyute/ops/matrix.py +47 -0
  89. impyute/ops/testing.py +20 -0
  90. impyute/ops/util.py +96 -0
  91. impyute/ops/wrapper.py +179 -0
  92. impyute/ts/__init__.py +6 -0
  93. impyute/ts/locf.py +57 -0
  94. impyute/ts/moving_window.py +128 -0
  95. impyutelib.py +890 -0
  96. missingpy/__init__.py +4 -0
  97. missingpy/knnimpute.py +328 -0
  98. missingpy/missforest.py +556 -0
  99. missingpy/pairwise_external.py +315 -0
  100. missingpy/tests/__init__.py +0 -0
  101. missingpy/tests/test_knnimpute.py +605 -0
  102. missingpy/tests/test_missforest.py +409 -0
  103. missingpy/utils.py +124 -0
  104. misspylib.py +565 -0
cpgtools-2.0.5.data/scripts/beta_profile_region.py
@@ -0,0 +1,152 @@
+ #!python
+
+ """
+ Description
+ -----------
+ This program calculates the methylation profile (i.e., average
+ beta value) around user-specified genomic regions.
+
+ Example of input BED6+ file
+ ---------------------------
+ chr22 44021512 44021513 cg24055475 0.9231 -
+ chr13 111568382 111568383 cg06540715 0.1071 +
+ chr20 44033594 44033595 cg21482942 0.6122 -
+
+ Example of input BED3+ file
+ ---------------------------
+ chr1 15864 15865
+ chr1 18826 18827
+ chr1 29406 29407
+ """
+
+
+ import sys,os
+ import collections
+ import subprocess
+ import numpy as np
+ from optparse import OptionParser
+ from cpgmodule._version import __version__
+ from cpgmodule import ireader
+ from cpgmodule.utils import *
+ from cpgmodule import BED
+
+ __author__ = "Liguo Wang"
+ __copyright__ = "Copyleft"
+ __credits__ = []
+ __license__ = "GPL"
+ __maintainer__ = "Liguo Wang"
+ __email__ = "wang.liguo@mayo.edu"
+ __status__ = "Development"
+
+ def main():
+
+     usage="%prog [options]" + "\n"
+     parser = OptionParser(usage,version="%prog " + __version__)
+     parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="BED6+ file specifying the C position. This BED file should have at least six columns (Chrom, ChromStart, ChromEnd, Name, Beta_value, Strand). The BED6+ file can be a regular text file or a compressed file (.gz, .bz2).")
+     parser.add_option("-r","--region",action="store",type="string",dest="region_file",help="BED3+ file of genomic regions. This BED file should have at least three columns (Chrom, ChromStart, ChromEnd). If the 6th column does not exist, all regions are considered to be on the \"+\" strand.")
+     parser.add_option("-d","--downstream",action="store",type="int",dest="downstream_size",default=2000,help="Size of extension to downstream. default=%default (bp)")
+     parser.add_option("-u","--upstream",action="store",type="int",dest="upstream_size",default=2000,help="Size of extension to upstream. default=%default (bp)")
+     parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
+     (options,args)=parser.parse_args()
+
+     print ()
+
+     if not (options.input_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(101)
+
+     if not (options.region_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(102)
+
+     if not (options.out_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(103)
+
+     FOUT = open(options.out_file + '.txt','w')
+     ROUT = open(options.out_file + '.r','w')
+     print ("\t".join(["Group","Relative_position(5'->3')", "Average_beta"]), file=FOUT)
+
+     #step1: read CpG file
+     printlog("Reading CpG file: \"%s\"" % (options.input_file))
+     cpg_ranges = read_CpG_bed(options.input_file)
+
+     #step2: read region file
+     printlog("Reading BED file: \"%s\"" % (options.region_file))
+
+     region_list = []
+     for chrom, st, end, strand in read_region_bed(options.region_file):
+         region_list.append((chrom, st, end, strand))
+     region_list = list(set(region_list))
+
+     printlog("Calculate average beta ...")
+     s = coverage_over_range(region_list,cpg_ranges)
+     for i in sorted(s):
+         print ('\t'.join(["User_region", str(i), str(s[i])]), file=FOUT)
+     print ('User_region <- c(%s)' % ','.join([str(s[i]) for i in sorted(s)]), file=ROUT)
+     user_region_datapoints = len(s)
+
+     if options.upstream_size > 0:
+         printlog("Get upstream regions of \"%s\"" % (options.region_file))
+         upstream_region = []
+         for (chrom, st, end, strand) in region_list:
+             if strand == '+':
+                 upstream_st = max(st - options.upstream_size, 0)
+                 upstream_end = st
+                 upstream_region.append((chrom, upstream_st, upstream_end, strand))
+             elif strand == '-':
+                 upstream_st = end
+                 upstream_end = end + options.upstream_size
+                 upstream_region.append((chrom, upstream_st, upstream_end, strand))
+         upstream_region = list(set(upstream_region))
+
+         s = coverage_over_range(upstream_region,cpg_ranges)
+         for i in sorted(s):
+             print ('\t'.join(["Upstream_region", str(i), str(s[i])]), file=FOUT)
+         print ('Upstream_region <- c(%s)' % ','.join([str(s[i]) for i in sorted(s)]), file=ROUT)
+         upstream_datapoints = len(s)
+
+
+     if options.downstream_size > 0:
+         printlog("Get downstream regions of \"%s\"" % (options.region_file))
+         downstream_region = []
+         for (chrom, st, end, strand) in region_list:
+             if strand == '+':
+                 downstream_st = end
+                 downstream_end = end + options.downstream_size
+                 downstream_region.append((chrom, downstream_st, downstream_end, strand))
+             elif strand == '-':
+                 downstream_st = max(st - options.downstream_size, 0)
+                 downstream_end = st
+                 downstream_region.append((chrom, downstream_st, downstream_end, strand))
+         downstream_region = list(set(downstream_region))
+         s = coverage_over_range(downstream_region,cpg_ranges)
+         for i in sorted(s):
+             print ('\t'.join(["Downstream_region", str(i), str(s[i])]), file=FOUT)
+         print ('Downstream_region <- c(%s)' % ','.join([str(s[i]) for i in sorted(s)]), file=ROUT)
+         downstream_datapoints = len(s)
+
+     total_datapoints = upstream_datapoints + downstream_datapoints + user_region_datapoints
+     print('\n')
+     print ('pdf(file=\"%s\", width=6, height=6)' % (options.out_file + '.pdf'),file=ROUT)
+     print ('plot(0:%d, c(Upstream_region, User_region, Downstream_region),ylim=c(0,1), xaxt="n",xlab="", ylab="Average methylation", type="l", col="red")' % (total_datapoints -1), file=ROUT)
+     print ('abline(v = c(%d,%d),col="blue", lty="dashed")' % (upstream_datapoints-1, upstream_datapoints + user_region_datapoints - 1), file=ROUT)
+     print ('abline(h = 0.5,col="grey", lty="dashed")', file=ROUT)
+     print ('text(x=c(%d, %d), y=0.9, cex=0.7, labels=c("Upstream\\n(5\'->3\')","Downstream\\n(5\'->3\')"))' % (50, total_datapoints - 50), file=ROUT)
+     print ('dev.off()',file=ROUT)
+
+     FOUT.close()
+     ROUT.close()
+     try:
+         subprocess.call("Rscript " + options.out_file + '.r', shell=True)
+     except:
+         print ("Cannot generate pdf file from " + options.out_file + '.r', file=sys.stderr)
+         pass
+
+
+
+ if __name__=='__main__':
+     main()
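The flank logic above is strand-aware: upstream and downstream are swapped for "-" strand regions, and coordinates are clamped at zero. A minimal standalone sketch of that convention (the flanks helper is hypothetical, not a cpgtools function; it assumes the usual 0-based, half-open BED coordinates):

# Hypothetical helper, for illustration only (not part of cpgtools): strand-aware
# upstream/downstream flanks of a 0-based, half-open BED interval.
def flanks(chrom, st, end, strand, up=2000, down=2000):
    """Return (upstream, downstream) intervals of a region, honoring strand."""
    if strand == '+':
        upstream = (chrom, max(st - up, 0), st, strand)
        downstream = (chrom, end, end + down, strand)
    else:   # '-' strand: upstream lies to the right, downstream to the left
        upstream = (chrom, end, end + up, strand)
        downstream = (chrom, max(st - down, 0), st, strand)
    return upstream, downstream

print(flanks("chr22", 44021512, 44021513, "-"))
# (('chr22', 44021513, 44023513, '-'), ('chr22', 44019512, 44021512, '-'))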
cpgtools-2.0.5.data/scripts/beta_selectNBest.py
@@ -0,0 +1,116 @@
+ #!python
+
+ """
+ #=========================================================================================
+ Select the K best features according to the K highest scores. Scores can be measured by:
+
+ * ANOVA F-value between label/feature for classification tasks.
+ * Mutual information for a discrete target.
+ * Chi-squared stats of non-negative features for classification tasks.
+
+ Example of input data file
+ ---------------------------
+ CpG_ID Sample_01 Sample_02 Sample_03 Sample_04
+ cg_001 0.831035 0.878022 0.794427 0.880911
+ cg_002 0.249544 0.209949 0.234294 0.236680
+ cg_003 0.845065 0.843957 0.840184 0.824286
+ """
+ import sys
+ import numpy as np
+ from optparse import OptionParser
+ from cpgmodule._version import __version__
+ from cpgmodule.utils import *
+ import pandas as pd
+
+ from sklearn.feature_selection import SelectKBest
+ from sklearn.feature_selection import chi2,f_classif,mutual_info_classif
+
+ __author__ = "Liguo Wang"
+ __copyright__ = "Copyleft"
+ __credits__ = []
+ __license__ = "GPL"
+ __maintainer__ = "Liguo Wang"
+ __email__ = "wang.liguo@mayo.edu"
+ __status__ = "Development"
+
+
+ def main():
+
+     usage="%prog [options]" + "\n"
+     parser = OptionParser(usage,version="%prog " + __version__)
+     parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Tab-separated data frame file containing beta values with the 1st row containing sample IDs and the 1st column containing CpG IDs.")
+     parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Comma-separated group file defining the biological group of each sample.")
+     parser.add_option("-c","--topK",action="store",type='int', dest="cpg_count", default=100, help="Number of top features to select. default=%default" )
+     parser.add_option("-s","--score-function",action="store",type='string', dest="score_function", default='chisq', help="Scoring function used to measure the dependency between features and labels. Must be \"chisq\" (chi-squared statistic), \"anova\" (ANOVA F-value), or \"mi\" (mutual information). default=%default" )
+     parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
+     (options,args)=parser.parse_args()
+
+     print ()
+     if not (options.input_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(101)
+     if not (options.group_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(102)
+     if not (options.out_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(103)
+
+     printlog("Reading input file: \"%s\"" % (options.input_file))
+     df1 = pd.read_csv(options.input_file, index_col = 0, sep="\t")
+     #print (df1)
+
+     #remove any rows with NAs
+     df2 = df1.dropna(axis=0, how='any')
+     printlog("%d rows with missing values were removed." % (len(df1) - len(df2)))
+     #print (df2)
+
+     printlog("Transposing data matrix ... ")
+     df2 = df2.T
+     total_feature = len(df2.columns)
+     printlog("Total number of features: %d " % (total_feature))
+     #print (df2)
+
+
+     printlog("Reading group file: \"%s\"" % (options.group_file))
+     group = pd.read_csv(options.group_file, index_col=0, header=0,names=['Sample_ID', 'Group_ID'])
+     a = pd.Series(list(group['Group_ID'])) #a is *string labels* for groups: ['Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Cancer', 'Cancer', 'Cancer', 'Cancer']
+     #print (a)
+     y, tmp = pd.factorize(a) #y is *numeric labels* for groups: [0 0 0 0 0 1 1 1 1]
+     #print (np.array(y))
+
+     if options.cpg_count < total_feature:
+
+         if options.score_function == 'anova':
+             printlog ("Using ANOVA F value to select features ...")
+             selector = SelectKBest(f_classif, k = options.cpg_count)
+         elif options.score_function == 'mi':
+             printlog ("Using Mutual Information to select features ...")
+             selector = SelectKBest(mutual_info_classif, k = options.cpg_count)
+         elif options.score_function == 'chisq':
+             printlog ("Using Chi Square statistic to select features ...")
+             selector = SelectKBest(chi2, k = options.cpg_count)
+         else:
+             printlog("Unknown function: %s" % options.score_function)
+             sys.exit(0)
+     else:
+         printlog("Doing nothing! '-c' >= the total number of features in \"%s\"" % (options.input_file))
+         sys.exit(0)
+
+
+     selector.fit_transform(df2, np.array(y))
+     cols = selector.get_support(indices=False)
+     selected_data = df2.loc[:,cols]
+     selected_featureNum = len(selected_data.columns)
+     printlog("Total number of selected features : %d " % (selected_featureNum))
+     #print (selected_data)
+
+     printlog("Writing to file: \"%s\"" % (options.out_file + '.selectedFeatures.tsv'))
+     pd.DataFrame.to_csv(selected_data.T, options.out_file + '.selectedFeatures.tsv', sep="\t", index_label="sample")
+
+
+ if __name__=='__main__':
+     main()
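The selection step above is plain scikit-learn SelectKBest with one of three scoring functions. A minimal, self-contained sketch of the same pattern on a made-up 4-sample x 5-CpG beta matrix (sample names, CpG names, and values are invented for illustration):

# Toy example of the SelectKBest pattern; sample/CpG names and values are made up.
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif

beta = pd.DataFrame(
    [[0.83, 0.25, 0.84, 0.10, 0.55],
     [0.88, 0.21, 0.84, 0.12, 0.53],
     [0.20, 0.79, 0.30, 0.90, 0.51],
     [0.18, 0.81, 0.28, 0.88, 0.54]],
    index=["Sample_01", "Sample_02", "Sample_03", "Sample_04"],
    columns=["cg_001", "cg_002", "cg_003", "cg_004", "cg_005"])
y = np.array([0, 0, 1, 1])               # numeric group labels, as returned by pd.factorize

selector = SelectKBest(f_classif, k=2)   # keep the 2 CpGs with the highest ANOVA F-value
selector.fit(beta, y)
print(list(beta.columns[selector.get_support()]))   # ['cg_003', 'cg_004'] for this toy data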
cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py
@@ -0,0 +1,119 @@
+ #!python
+
+ """
+ Description
+ -----------
+ This program creates a stacked barplot for each sample. The stacked barplot shows
+ the proportions of CpGs whose beta values fall into these 4 ranges:
+ * [0.00, 0.25] #first quantile
+ * [0.25, 0.50] #second quantile
+ * [0.50, 0.75] #third quantile
+ * [0.75, 1.00] #fourth quantile
+
+ Example of input data file
+ ---------------------------
+ CpG_ID Sample_01 Sample_02 Sample_03 Sample_04
+ cg_001 0.831035 0.878022 0.794427 0.880911
+ cg_002 0.249544 0.209949 0.234294 0.236680
+ #=========================================================================================
+
+ Note: Please name your sample IDs using only letters [a-z, A-Z], numbers [0-9], and underscores ("_"), and
+ make sure each sample ID starts with a letter.
+
+ """
+
+
+ import sys,os
+ import collections
+ import subprocess
+ import numpy as np
+ from optparse import OptionParser
+ from cpgmodule._version import __version__
+ from cpgmodule import ireader
+ from cpgmodule.utils import *
+ from cpgmodule import BED
+ import pandas as pd
+
+ __author__ = "Liguo Wang"
+ __copyright__ = "Copyleft"
+ __credits__ = []
+ __license__ = "GPL"
+ __maintainer__ = "Liguo Wang"
+ __email__ = "wang.liguo@mayo.edu"
+ __status__ = "Development"
+
+
+ def quarter_count(lst):
+     """
+     count the number of beta values falling into each quarter
+     Note: beta >= 0 and beta <= 1
+     """
+     q1 = 0
+     q2 = 0
+     q3 = 0
+     q4 = 0
+     for i in lst:
+         try:
+             j = float(i)
+         except:
+             continue
+         if not isinstance(j, float):
+             continue
+         if j < 0:
+             continue
+         elif j <= 0.25:
+             q1 += 1
+         elif j <= 0.50:
+             q2 += 1
+         elif j <= 0.75:
+             q3 += 1
+         elif j <= 1:
+             q4 += 1
+         else:
+             continue
+     return [q1, q2, q3, q4]
+
+ def main():
+
+     usage="%prog [options]" + "\n"
+     parser = OptionParser(usage,version="%prog " + __version__)
+     parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Data frame file containing beta values with the 1st row containing sample IDs and the 1st column containing CpG IDs.")
+     parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
+     (options,args)=parser.parse_args()
+
+     print ()
+     if not (options.input_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(101)
+
+     if not (options.out_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(103)
+
+     printlog("Reading beta file: \"%s\"" % (options.input_file))
+     data = pd.read_csv(options.input_file,sep='\t')
+     samples = data.columns[1:]
+
+     ROUT = open(options.out_file + '.r','w')
+     print ('pdf(file=\"%s\", width=10, height=10)' % (options.out_file + '.pdf'),file=ROUT)
+
+     for s in samples:
+         tmp = quarter_count(data[s])
+         print ('%s <- c(%s)' % (s, ','.join([str(i) for i in tmp])), file=ROUT)
+     print ("cc = rev(c('#d7191c', '#fdae61', '#a6d96a', '#1a9641'))", file=ROUT)
+     print ('legend = c("beta [0.00 - 0.25]", "beta [0.25 - 0.50]", "beta [0.50 - 0.75]", "beta [0.75 - 1.00]")', file=ROUT)
+     print ('nm = c(%s)' % ','.join(['"' + s + '"' for s in samples]), file=ROUT)
+     print ('barplot(cbind(%s), col = cc, names.arg = nm, cex.names = 0.8, ylab = "Percentage", ylim=c(0,119), las=2, legend.text = legend)' % (','.join([s + ' * 100/sum(' + s + ')' for s in samples])), file=ROUT)
+     ROUT.close()
+
+     try:
+         subprocess.call("Rscript " + options.out_file + '.r', shell=True)
+     except:
+         print ("Cannot generate pdf file from " + options.out_file + '.r', file=sys.stderr)
+         pass
+
+ if __name__=='__main__':
+     main()
+
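The percentages stacked by the generated R code come directly from quarter_count(). A standalone sketch of the same binning rule on a made-up list of beta values (NaN is included to show that unparsable or out-of-range values are skipped):

# Standalone re-statement of the quarter_count() thresholds on a made-up list of
# beta values; NaN (and anything outside [0, 1]) is skipped, as in the script.
betas = [0.83, 0.25, 0.84, 0.10, 0.55, 0.72, 0.91, float("nan")]

counts = [0, 0, 0, 0]
for b in betas:
    if not (0 <= b <= 1):      # NaN fails the comparison and is skipped
        continue
    if b <= 0.25:
        counts[0] += 1
    elif b <= 0.50:
        counts[1] += 1
    elif b <= 0.75:
        counts[2] += 1
    else:
        counts[3] += 1

total = sum(counts)
print(counts)                                        # [2, 0, 2, 3]
print([round(100 * c / total, 1) for c in counts])   # [28.6, 0.0, 28.6, 42.9]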
cpgtools-2.0.5.data/scripts/beta_stats.py
@@ -0,0 +1,101 @@
+ #!python
+
+ """
+ Description
+ -----------
+ This program gives basic information on CpGs located in each genomic region.
+ It adds six columns to the input BED file:
+ 1. Number of CpGs detected in the genomic region
+ 2. Min methylation level
+ 3. Max methylation level
+ 4. Average methylation level across all CpGs
+ 5. Median methylation level across all CpGs
+ 6. Standard deviation
+
+ Example of input BED6+ file
+ ---------------------------
+ chr22 44021512 44021513 cg24055475 0.9231 -
+ chr13 111568382 111568383 cg06540715 0.1071 +
+ chr20 44033594 44033595 cg21482942 0.6122 -
+ """
+
+
+ import sys,os
+ import collections
+ import subprocess
+ import numpy as np
+ from optparse import OptionParser
+ from cpgmodule._version import __version__
+ from cpgmodule import ireader
+ from cpgmodule.utils import *
+ from cpgmodule import BED
+
+ __author__ = "Liguo Wang"
+ __copyright__ = "Copyleft"
+ __credits__ = []
+ __license__ = "GPL"
+ __maintainer__ = "Liguo Wang"
+ __email__ = "wang.liguo@mayo.edu"
+ __status__ = "Development"
+
+ def main():
+     usage="%prog [options]" + "\n"
+     parser = OptionParser(usage,version="%prog " + __version__)
+     parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="BED6+ file specifying the C position. This BED file should have at least six columns (Chrom, ChromStart, ChromEnd, Name, Beta_value, Strand). Note: the first base in a chromosome is numbered 0. This file can be a regular text file or a compressed file (.gz, .bz2).")
+     parser.add_option("-r","--region",action="store",type="string",dest="region_file",help="BED3+ file of genomic regions. This BED file should have at least 3 columns (Chrom, ChromStart, ChromEnd).")
+     parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
+     (options,args)=parser.parse_args()
+
+     print ()
+
+     if not (options.input_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(101)
+
+     if not (options.region_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(102)
+
+     if not (options.out_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(103)
+
+     FOUT = open(options.out_file + '.txt','w')
+
+     #step1: read CpG file
+     printlog("Reading CpG file: \"%s\"" % (options.input_file))
+     cpg_ranges = read_CpG_bed(options.input_file)
+
+     #step2: read region file
+     printlog("Reading BED file: \"%s\"" % (options.region_file))
+
+     printlog("Writing to: \"%s\"" % (options.out_file + '.txt'))
+     region_list = []
+     for l in ireader.reader(options.region_file):
+         if l.startswith('#'):
+             continue
+         if l.startswith('track'):
+             continue
+         if l.startswith('browser'):
+             continue
+         f = l.split()
+         if len(f) < 3:
+             continue
+         try:
+             chrom = f[0]
+             st = int(f[1])
+             end = int(f[2])
+         except:
+             print (l + '\t' + '\t'.join(['NA']*6), file=FOUT)
+             continue
+         tmp = stats_over_range(cpg_ranges, chrom, st, end)
+         print (l + '\t' + '\t'.join([str(i) for i in tmp]), file=FOUT)
+
+     FOUT.close()
+
+ if __name__=='__main__':
+     main()
+
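stats_over_range() itself lives in cpgmodule.utils and is not shown in this diff; as an illustration only, the six appended columns correspond to the following numpy summary of the beta values falling inside one region (which ddof the package uses for the standard deviation is not visible here, so the sketch keeps numpy's default):

# Illustration only: the six appended columns, computed with numpy for the beta
# values of the CpGs that fall inside one region (values taken from the example above).
import numpy as np

betas_in_region = np.array([0.9231, 0.1071, 0.6122])

stats = [
    len(betas_in_region),          # 1. number of CpGs in the region
    betas_in_region.min(),         # 2. minimum methylation level
    betas_in_region.max(),         # 3. maximum methylation level
    betas_in_region.mean(),        # 4. average methylation level
    np.median(betas_in_region),    # 5. median methylation level
    betas_in_region.std(),         # 6. standard deviation (numpy default, ddof=0)
]
print('\t'.join(str(round(s, 4)) for s in stats))
# 3    0.1071    0.9231    0.5475    0.6122    0.3363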
cpgtools-2.0.5.data/scripts/beta_tSNE.py
@@ -0,0 +1,179 @@
+ #!python
+
+ """
+ Description
+ -----------
+ This program performs t-SNE (t-Distributed Stochastic Neighbor Embedding) analysis for samples.
+
+ Example of input data file
+ ---------------------------
+ CpG_ID Sample_01 Sample_02 Sample_03 Sample_04
+ cg_001 0.831035 0.878022 0.794427 0.880911
+ cg_002 0.249544 0.209949 0.234294 0.236680
+ cg_003 0.845065 0.843957 0.840184 0.824286
+
+ Example of the input group file
+ ---------------------------
+ Sample,Group
+ Sample_01,normal
+ Sample_02,normal
+ Sample_03,tumor
+ Sample_04,tumor
+
+ Notes
+ -----
+ * Rows with missing values will be removed
+ * Beta values will be standardized into z scores
+ * Only the first two components will be visualized
+ * Different perplexity values can result in significantly different results
+ """
+
+
+ import sys
+ import subprocess
+ from optparse import OptionParser
+ from cpgmodule.utils import *
+ from cpgmodule._version import __version__
+ import pandas as pd
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.manifold import TSNE
+
+ __author__ = "Liguo Wang"
+ __copyright__ = "Copyleft"
+ __credits__ = []
+ __license__ = "GPL"
+ __maintainer__ = "Liguo Wang"
+ __email__ = "wang.liguo@mayo.edu"
+ __status__ = "Development"
+
+ def pick_colors(n):
+     my_colors = [
+         "#F0A3FF", "#0075DC", "#993F00", "#4C005C", "#191919", "#005C31", "#2BCE48", "#FFCC99", "#808080", "#94FFB5", "#8F7C00", "#9DCC00", "#C20088", "#003380", "#FFA405", "#FFA8BB", "#426600", "#FF0010", "#5EF1F2", "#00998F", "#E0FF66", "#740AFF", "#990000", "#FFFF80", "#FFE100", "#FF5005"]
+     if n > len(my_colors):
+         print ("Only supports 26 different colors", file = sys.stderr)
+         sys.exit()
+     return my_colors[0:n]
+
+ def main():
+
+     usage="%prog [options]" + "\n"
+     parser = OptionParser(usage,version="%prog " + __version__)
+     parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Tab-separated data frame file containing beta values with the 1st row containing sample IDs and the 1st column containing CpG IDs.")
+     parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Comma-separated group file defining the biological group of each sample. Different groups will be colored differently in the t-SNE plot. Supports a maximum of 20 groups.")
+     parser.add_option("-p","--perplexity",action="store",type='int', dest="perplexity_value", default=5, help="This is a tunable parameter of t-SNE, and has a profound effect on the resulting 2D map. Consider selecting a value between 5 and 50; the selected value should be smaller than the number of samples (i.e., the number of points on the t-SNE 2D map). default=%default" )
+     parser.add_option("-n","--ncomponent",action="store",type='int', dest="n_components", default=2, help="Number of components. default=%default" )
+     parser.add_option("--n_iter",action="store",type='int', dest="n_iterations", default=5000, help="The maximum number of iterations for the optimization. Should be at least 250. default=%default" )
+     parser.add_option("--learning_rate",action="store",type='float', dest="learning_rate", default=200.0, help="The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If the learning rate is too high, the data may look like a ‘ball’ with any point approximately equidistant from its nearest neighbors. If the learning rate is too low, most points may look compressed in a dense cloud with few outliers. If the cost function gets stuck in a bad local minimum, increasing the learning rate may help. default=%default" )
+     parser.add_option("-l","--label",action="store_true",default=False,dest="text_label",help="If set, sample IDs will be added underneath the data points. default=%default")
+     parser.add_option("-c","--char",action="store",type='int', default=1, dest="plot_char",help="Plotting character: 1 = 'dot', 2 = 'circle'. default=%default")
+     parser.add_option("-a","--alpha",action="store",type='float', default=0.5, dest="plot_alpha",help="Opacity of dots. default=%default")
+     parser.add_option("-x","--loc",action="store",type='int', default=1, dest="legend_location",help="Location of legend panel: 1 = 'topright', 2 = 'bottomright', 3 = 'bottomleft', 4 = 'topleft'. default=%default")
+     parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
+
+     (options,args)=parser.parse_args()
+
+     #print (options.text_label)
+     #sys.exit(0)
+     print ()
+     if not (options.input_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(101)
+
+     if not (options.out_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(103)
+     if options.n_components < 2:
+         options.n_components = 2
+
+     pch = {1:20, 2:1}
+     legend_pos = {1:'topright', 2: 'bottomright', 3:'bottomleft', 4:'topleft'}
+
+     printlog("Reading input file: \"%s\" ..." % (options.input_file))
+     df1 = pd.read_csv(options.input_file, index_col = 0, sep="\t")
+
+     n_samples = df1.shape[1]
+     #print (n_samples)
+     if (options.perplexity_value > n_samples):
+         options.perplexity_value = int(n_samples/2)
+     printlog("Perplexity value is set to %d" % options.perplexity_value)
+
+     #remove NA and transpose
+     df2 = df1.dropna(axis=0, how='any').T
+     printlog("%d rows with missing values were removed." % (len(df1.index) - len(df2.columns)))
+     #print (df2.head())
+
+     printlog("Reading group file: \"%s\" ..." % (options.group_file))
+     group = pd.read_csv(options.group_file, index_col=0, header=0,names=['Sample_ID', 'Group_ID'])
+     #check if sample IDs are unique
+     if len(group.index) != len(group.index.unique()):
+         print ("Sample IDs are not unique", file = sys.stderr)
+         sys.exit()
+     group.index = group.index.map(str)
+     printlog("Group file \"%s\" contains %d samples" % (options.group_file, len(group.index)))
+
+     printlog("Find common sample IDs between group file and data file ...")
+     common_samples = list(set(group.index) & set(df2.index))
+     used_df = df2.loc[common_samples]
+     (usable_sample, usable_cpg) = used_df.shape
+     printlog("Used CpGs: %d, Used samples: %d" % (usable_cpg, usable_sample))
+
+
+     printlog("Standardizing values ...")
+     x = used_df.to_numpy()
+     x = StandardScaler().fit_transform(x)
+
+
+     group_names = group['Group_ID'].unique().tolist() # a list of unique group names
+     color_names = pick_colors(len(group_names)) # a list of unique colors
+     group_to_col = dict(zip(group_names, color_names))
+     color_list = [group_to_col[g] for g in group['Group_ID']]
+     group['Colors'] = color_list
+
+
+     tsne = TSNE(n_components = options.n_components, random_state = 0, perplexity = options.perplexity_value, learning_rate = options.learning_rate, max_iter = options.n_iterations)
+     tsne_components = tsne.fit_transform(x)
+     pc_names = [str(i)+str(j) for i,j in zip(['PC']*options.n_components,range(1,options.n_components+1))]
+     principalDf = pd.DataFrame(data = tsne_components, columns = pc_names, index = used_df.index)
+     principalDf.index.name = 'Sample_ID'
+
+     finalDf = pd.concat([principalDf, group], axis=1,sort=False, join='inner')
+     finalDf.index.name = 'Sample_ID'
+
+     printlog("Writing t-SNE results to file: \"%s\" ..." % (options.out_file + '.t-SNE.tsv'))
+     finalDf.to_csv(options.out_file + '.t-SNE.tsv', sep="\t")
+
+
+     ROUT = open(options.out_file + '.t-SNE.r','w')
+
+     print ('pdf(file=\"%s\", width=8, height=8)' % (options.out_file + '.t-SNE.pdf'),file=ROUT)
+     print ('')
+     print ('d = read.table(file=\"%s\", sep="\\t", header=TRUE, comment.char = "", stringsAsFactors=FALSE)' % (options.out_file + '.t-SNE.tsv'), file=ROUT)
+     print ('attach(d)', file=ROUT)
+
+     if options.plot_alpha:
+         print ('library(scales)', file=ROUT)
+         print ('plot(PC1, PC2, col = alpha(Colors, %f), pch=%d, cex=1.5, main="tSNE 2D map", xlab="tSNE1", ylab="tSNE2")'
+             % (options.plot_alpha, pch[options.plot_char]), file=ROUT)
+     else:
+         print ('plot(PC1, PC2, col = Colors, pch=%d, cex=1.2, main="tSNE 2D map", xlab="tSNE1", ylab="tSNE2")'
+             % (pch[options.plot_char]), file=ROUT)
+
+     if options.text_label:
+         print ('text(PC1, PC2, labels=Sample_ID, col = Colors, cex=0.5, pos=1)', file=ROUT)
+     print ('legend("%s", legend=c(%s), col=c(%s), pch=%d,cex=1)' % (legend_pos[options.legend_location], ','.join(['"' + str(i) + '"' for i in group_names]), ','.join(['"' + str(group_to_col[i]) + '"' for i in group_names]), pch[options.plot_char]), file=ROUT)
+
+
+     print ('dev.off()', file=ROUT)
+     ROUT.close()
+
+     try:
+         subprocess.call("Rscript " + options.out_file + '.t-SNE.r', shell=True)
+     except:
+         print ("Cannot generate pdf file from " + options.out_file + '.t-SNE.r', file=sys.stderr)
+         pass
+
+
+ if __name__=='__main__':
+     main()
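The core of the script is the StandardScaler + TSNE pair from scikit-learn. A minimal sketch on random stand-in data; the max_iter keyword used above is only accepted by newer scikit-learn releases (older ones call it n_iter), so the sketch omits it:

# Minimal sketch of the scaling + t-SNE steps on random stand-in data
# (12 samples x 500 CpGs); perplexity must stay below the number of samples.
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

rng = np.random.default_rng(0)
X = rng.random((12, 500))                    # beta-like values in [0, 1)

X_std = StandardScaler().fit_transform(X)    # z-score each CpG across samples
emb = TSNE(n_components=2, perplexity=5, random_state=0).fit_transform(X_std)
print(emb.shape)                             # (12, 2) -> the PC1/PC2 columns of *.t-SNE.tsv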