cpgtools-2.0.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. cpgmodule/BED.py +441 -0
  2. cpgmodule/MI.py +193 -0
  3. cpgmodule/__init__.py +0 -0
  4. cpgmodule/_version.py +1 -0
  5. cpgmodule/cgID.py +866897 -0
  6. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  7. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  8. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  9. cpgmodule/data/GA_Bohlin.pkl +0 -0
  10. cpgmodule/data/GA_Haftorn.pkl +0 -0
  11. cpgmodule/data/GA_Knight.pkl +0 -0
  12. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  13. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  14. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  15. cpgmodule/data/GA_Mayne.pkl +0 -0
  16. cpgmodule/data/Hannum.pkl +0 -0
  17. cpgmodule/data/Horvath_2013.pkl +0 -0
  18. cpgmodule/data/Horvath_2018.pkl +0 -0
  19. cpgmodule/data/Levine.pkl +0 -0
  20. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  21. cpgmodule/data/Ped_McEwen.pkl +0 -0
  22. cpgmodule/data/Ped_Wu.pkl +0 -0
  23. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  24. cpgmodule/data/Zhang_EN.pkl +0 -0
  25. cpgmodule/data/__init__.py +0 -0
  26. cpgmodule/extend_bed.py +147 -0
  27. cpgmodule/imotif.py +348 -0
  28. cpgmodule/ireader.py +28 -0
  29. cpgmodule/methylClock.py +53 -0
  30. cpgmodule/padjust.py +58 -0
  31. cpgmodule/region2gene.py +170 -0
  32. cpgmodule/utils.py +642 -0
  33. cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
  34. cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
  35. cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
  36. cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
  37. cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
  38. cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
  39. cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
  40. cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
  41. cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
  42. cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
  43. cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
  44. cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
  45. cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
  46. cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
  47. cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
  48. cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
  49. cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
  50. cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
  51. cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
  52. cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
  53. cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
  54. cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
  55. cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
  56. cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
  57. cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
  58. cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
  59. cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
  60. cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
  61. cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
  62. cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
  63. cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
  64. cpgtools-2.0.5.dist-info/METADATA +59 -0
  65. cpgtools-2.0.5.dist-info/RECORD +104 -0
  66. cpgtools-2.0.5.dist-info/WHEEL +5 -0
  67. cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
  68. cpgtools-2.0.5.dist-info/top_level.txt +5 -0
  69. impyute/__init__.py +3 -0
  70. impyute/contrib/__init__.py +7 -0
  71. impyute/contrib/compare.py +69 -0
  72. impyute/contrib/count_missing.py +30 -0
  73. impyute/contrib/describe.py +63 -0
  74. impyute/cs/__init__.py +11 -0
  75. impyute/cs/buck_iterative.py +82 -0
  76. impyute/cs/central_tendency.py +84 -0
  77. impyute/cs/em.py +52 -0
  78. impyute/cs/fast_knn.py +130 -0
  79. impyute/cs/random.py +27 -0
  80. impyute/dataset/__init__.py +6 -0
  81. impyute/dataset/base.py +137 -0
  82. impyute/dataset/corrupt.py +55 -0
  83. impyute/deletion/__init__.py +5 -0
  84. impyute/deletion/complete_case.py +21 -0
  85. impyute/ops/__init__.py +12 -0
  86. impyute/ops/error.py +9 -0
  87. impyute/ops/inverse_distance_weighting.py +31 -0
  88. impyute/ops/matrix.py +47 -0
  89. impyute/ops/testing.py +20 -0
  90. impyute/ops/util.py +96 -0
  91. impyute/ops/wrapper.py +179 -0
  92. impyute/ts/__init__.py +6 -0
  93. impyute/ts/locf.py +57 -0
  94. impyute/ts/moving_window.py +128 -0
  95. impyutelib.py +890 -0
  96. missingpy/__init__.py +4 -0
  97. missingpy/knnimpute.py +328 -0
  98. missingpy/missforest.py +556 -0
  99. missingpy/pairwise_external.py +315 -0
  100. missingpy/tests/__init__.py +0 -0
  101. missingpy/tests/test_knnimpute.py +605 -0
  102. missingpy/tests/test_missforest.py +409 -0
  103. missingpy/utils.py +124 -0
  104. misspylib.py +565 -0
cpgtools-2.0.5.data/scripts/beta_combat.py
@@ -0,0 +1,174 @@
+ #!python
+
+ """
+ Description
+ -----------
+ This program corrects batch effect using the "combat" algorithm:
+
+ W. Evan Johnson, et al, Adjusting batch effects in microarray expression data using empirical Bayes methods, Biostatistics, 2007.
+
+ Example of input data file
+ ---------------------------
+ CpG_ID Sample_01 Sample_02 Sample_03 Sample_04
+ cg_001 0.831035 0.878022 0.794427 0.880911
+ cg_002 0.249544 0.209949 0.234294 0.236680
+ cg_003 0.845065 0.843957 0.840184 0.824286
+ ...
+
+ Example of batch file
+ -------------------------------
+ Sample,Group
+ Sample_01,plate_1
+ Sample_02,plate_1
+ Sample_03,plate_2
+ Sample_04,plate_2
+ ...
+
+ Notes
+ -----
+ * Rows with missing values will be removed
+
+ """
+
+
+ import sys
+ import subprocess
+ from optparse import OptionParser
+ from cpgmodule.utils import *
+ from cpgmodule._version import __version__
+ import pandas as pd
+ #from sklearn.preprocessing import StandardScaler
+ #from sklearn.decomposition import PCA
+ from sklearn.impute import KNNImputer
+ from collections import Counter
+ from combat.pycombat import pycombat
+ import matplotlib.pyplot as plt
+ import matplotlib.colors as mcolors
+ from impyutelib import nan_indices
+
+
+ __author__ = "Liguo Wang"
+ __copyright__ = "Copyleft"
+ __credits__ = []
+ __license__ = "GPL"
+ __maintainer__ = "Liguo Wang"
+ __email__ = "wang.liguo@mayo.edu"
+ __status__ = "Development"
+
+ def pick_colors(n):
+     my_colors = list(mcolors.CSS4_COLORS.keys())
+     if n > len(my_colors):
+         print ("Too many colors requested", file = sys.stderr)
+         sys.exit()
+     return my_colors[0:n]
+
+ def box_plot(df, s_colors, out_png, ylab="Beta values", title=""):
+     s_names = df.columns
+     fig, ax = plt.subplots()
+     bplot = ax.boxplot(df, patch_artist=True, tick_labels = s_names)
+     for patch, color in zip(bplot['boxes'], s_colors):
+         patch.set_facecolor(color)
+     ax.set_xticklabels(s_names, rotation='vertical')
+     plt.ylabel(ylab)
+     plt.title(title)
+     plt.savefig(out_png)
+
+ def main():
+
+     usage="%prog [options]" + "\n"
+     parser = OptionParser(usage,version="%prog " + __version__)
+     parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Tab-separated data frame file containing beta values with the 1st row containing sample IDs and the 1st column containing CpG IDs.")
+     parser.add_option("-k",action="store",type="int", default=3, dest="n_neighbors",help="Number of neighbors to use for imputation. default=%default")
+     parser.add_option("--axis",type="choice",choices=[0,1],default=1,dest="axis_choice",help="How to do imputation (using the KNN algorithm) if the input file has missing values. 1: search columns for k nearest neighbours; 0: Search rows for k nearest neighbours. default=%default")
+     parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Comma-separated group file defining the batch groups of each sample.")
+     parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
+     (options,args)=parser.parse_args()
+
+     print ()
+     if not (options.input_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(101)
+     if not (options.group_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(101)
+     if not (options.out_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(103)
+
+     beta_out1 = options.out_file + '.combat.tsv'
+     beta_out2 = options.out_file + '.combat_withNAs.tsv'
+     beta_boxplot_before = options.out_file + '.boxplot.png'
+     beta_boxplot_after = options.out_file + '.boxplot_combat.png'
+
+     printlog("Reading input file: \"%s\" ..." % (options.input_file))
+     df1 = pd.read_csv(options.input_file, index_col = 0, sep="\t")
+
+     # count NA
+     total_na = df1.isna().sum().sum()
+     printlog("The input file: \"%s\" contains %d missing values" % (options.input_file, total_na))
+
+     # if there are any NAs, either impute or remove
+     if total_na > 0:
+         printlog("Imputing missing values using KNN ...")
+         na_locations = nan_indices(df1.to_numpy())
+         imputer = KNNImputer(n_neighbors = options.n_neighbors)
+         if options.axis_choice == 1:
+             input_df = df1.T
+             after = imputer.fit_transform(input_df)
+             df2 = pd.DataFrame(after, index = input_df.index,
+                 columns = input_df.columns).T
+             #output_df = output_df.round(args.decimal)
+         elif options.axis_choice == 0:
+             input_df = df1
+             after = imputer.fit_transform(input_df)
+             df2 = pd.DataFrame(after, index = input_df.index,
+                 columns = input_df.columns)
+         else:
+             raise ValueError("axis only accepts 0 or 1.")
+     else:
+         df2 = df1
+
+     printlog("Reading group file: \"%s\" ..." % (options.group_file))
+     group = pd.read_csv(options.group_file, index_col=0, names=['Sample_ID', 'Group_ID'])
+     #check if sample IDs are unique
+     if len(group.index) != len(group.index.unique()):
+         print ("Sample IDs are not unique", file = sys.stderr)
+         sys.exit()
+     group.index = group.index.map(str)
+     printlog("Group/batch \"%s\" contains %d samples" % (options.group_file, len(group.index)))
+
+     # a list of unique group names, and their frequencies
+     group_info = Counter(group['Group_ID'])
+     print(list(group['Group_ID']))
+     # a list of unique colors
+     color_names = pick_colors(len(group_info))
+     color_list = []
+     for name,count in zip(color_names, list(group_info.values())):
+         color_list.extend([name]*count)
+
+     printlog("Generate boxplot before correction. Save to '%s'" % beta_boxplot_before)
+     box_plot(df2, s_colors=color_list, out_png=beta_boxplot_before, title="Before batch effects correction")
+
+     # remove batch effect
+     printlog("Removing batch effect ...")
+     df_corrected = pycombat(df2, list(group['Group_ID']))
+
+     printlog("Generate boxplot after correction. Save to '%s'" % beta_boxplot_after)
+     box_plot(df_corrected, s_colors=color_list, out_png=beta_boxplot_after, title="After batch effects correction")
+
+     printlog("Save data after combat to \"%s\", keep imputed values ..." % beta_out1)
+     df_corrected.to_csv(beta_out1,sep="\t")
+
+     # add the original NAs (if any) back.
+     # Only perform combat, no KNN imputation
+     if total_na > 0:
+         for i,j in na_locations:
+             df_corrected.iat[i, j] = np.nan
+         printlog("Save data after combat to \"%s\", keep orignal missing values ..." % beta_out2)
+         df_corrected.to_csv(beta_out2,sep="\t",na_rep='NA')
+
+ if __name__=='__main__':
+     main()
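
beta_combat.py therefore chains two steps: KNN imputation of any missing beta values (scikit-learn's KNNImputer, across samples by default) followed by empirical-Bayes batch correction with pycombat, where samples are columns and each sample carries one batch label. Below is a minimal sketch of those two calls, assuming the combat (pycombat) and scikit-learn packages are installed; the file names are hypothetical and the group file follows the example batch file shown in the docstring above:

import pandas as pd
from sklearn.impute import KNNImputer
from combat.pycombat import pycombat

# CpGs in rows, samples in columns ("beta.tsv" and "batch.csv" are hypothetical examples)
beta = pd.read_csv("beta.tsv", sep="\t", index_col=0)
batch = pd.read_csv("batch.csv", index_col=0)        # columns: Sample (index), Group

# Impute missing betas across samples (the script's default --axis 1: transpose, impute, transpose back)
imputed = pd.DataFrame(KNNImputer(n_neighbors=3).fit_transform(beta.T),
                       index=beta.T.index, columns=beta.T.columns).T

# ComBat batch correction: one batch label per sample column
corrected = pycombat(imputed, list(batch["Group"]))
corrected.to_csv("beta.combat.tsv", sep="\t")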
cpgtools-2.0.5.data/scripts/beta_jitter_plot.py
@@ -0,0 +1,107 @@
+ #!python
+
+ """
+ Description
+ -----------
+ This program generates jitter plot (a.k.a. strip chart) and bean plot for each sample (column).
+
+ Notes
+ -----
+ User must install the "beanplot" R library:
+ https://cran.r-project.org/web/packages/beanplot/index.html
+
+ Example of input
+ -----------------
+ CpG_ID Sample_01 Sample_02 Sample_03 Sample_04
+ cg_001 0.831035 0.878022 0.794427 0.880911
+ cg_002 0.249544 0.209949 0.234294 0.236680
+ cg_003 0.845065 0.843957 0.840184 0.824286
+
+ Note: Please name your sample IDs using only "letters" [a-z, A-Z], "numbers" [0-9], and "_"; and
+ your sample ID should start with a letter.
+ """
+
+
+ import sys,os
+ import collections
+ import subprocess
+ import numpy as np
+ from cpgmodule._version import __version__
+ from optparse import OptionParser
+ from cpgmodule import ireader
+ from cpgmodule.utils import *
+ from cpgmodule import BED
+ import pandas as pd
+
+ __author__ = "Liguo Wang"
+ __copyright__ = "Copyleft"
+ __credits__ = []
+ __license__ = "GPL"
+ __maintainer__ = "Liguo Wang"
+ __email__ = "wang.liguo@mayo.edu"
+ __status__ = "Development"
+
+
+ def main():
+
+     usage="%prog [options]" + "\n"
+     parser = OptionParser(usage,version="%prog " + __version__)
+     parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Tab-separated data frame file containing beta values with the 1st row containing sample IDs and the 1st column containing CpG IDs.")
+     parser.add_option("-f","--fraction",action="store",type='float', dest="fraction", default=0.5, help="The fraction of total data points (CpGs) used to generate jitter plot. Decrease this number if the jitter plot is over-crowded. default=%default" )
+     parser.add_option("--width",action="store",type='int', default=800, dest="png_width",help="The width of the output PNG file. default=%default")
+     parser.add_option("--height",action="store",type='int', default=480, dest="png_height",help="The height of the output PNG file. . default=%default")
+     parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
+     (options,args)=parser.parse_args()
+
+     print ()
+     if not (options.input_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(101)
+
+     if not (options.out_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(103)
+     if options.fraction < 0:
+         options.fraction = 0.0
+     if options.fraction > 1:
+         options.fraction = 1.0
+
+     ROUT = open(options.out_file + '.r','w')
+     print ('library(beanplot)', file = ROUT)
+     print ('png(file=\"%s\", width=800, height=480, unit=\"px\")' % (options.out_file + '.png'),file=ROUT)
+
+
+
+     printlog("Reading beta file: \"%s\"" % (options.input_file))
+     df = pd.read_table(options.input_file)
+     samples = df.columns[1:]
+     if options.fraction < 1.0:
+         subset_file = options.out_file + '.sample.tsv'
+         printlog("Sampling subset (%.2f%%) from file: \"%s\"" % (options.fraction*100, options.input_file))
+         subset = df.sample(frac=options.fraction, replace=False, random_state=999)
+         printlog("Saving subset (%.2f%%) to file: \"%s\"" % (options.fraction*100, options.out_file + '.sample.tsv'))
+         subset.to_csv(options.out_file + '.sample.tsv', sep = "\t", index = False)
+
+         print ('d = read.table(file="%s", sep="\\t", header=TRUE)' % (options.out_file + '.sample.tsv'), file=ROUT)
+         print ('ll = list(%s)' % (','.join(['"' + s + '"' + ' = d$' + s for s in samples])), file=ROUT)
+         print ('stripchart(ll,cex=0.1,col="#abd9e9", vertical=T, method=c("jitter"), ylab="Beta value",las=2, jitter=0.3,cex.names = 0.8)', file=ROUT)
+         print ('beanplot(ll, cutmin=0,cutmax=1, border="#d01c8b",what=c(1,1,1,0),col=c(), las=2, add = TRUE)', file=ROUT)
+     else:
+         printlog("Using all data points in file: \"%s\"" % (options.input_file))
+         print ('d = read.table(file="%s", sep="\\t", header=TRUE)' % (options.input_file), file=ROUT)
+         print ('ll = list(%s)' % (','.join(['"' + s + '"' + ' = d$' + s for s in samples])), file=ROUT)
+         print ('stripchart(ll,cex=0.1,col="#abd9e9", vertical=T, method=c("jitter"), ylab="Beta value", las=2, jitter=0.3,cex.names = 0.8)', file=ROUT)
+         print ('beanplot(ll, cutmin=0,cutmax=1, border="#d01c8b",what=c(1,1,1,0),col=c(), las=2, add = TRUE)', file=ROUT)
+     ROUT.close()
+
+     try:
+         subprocess.call("Rscript " + options.out_file + '.r', shell=True)
+     except:
+         print ("Cannot generate pdf file from " + options.out_file + '.r', file=sys.stderr)
+         pass
+
+ if __name__=='__main__':
+     main()
+
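
The only data manipulation beta_jitter_plot.py performs in Python is the optional downsampling of CpG rows (-f/--fraction) before handing the table to R's stripchart/beanplot. A short sketch of just that step, with hypothetical file names:

import pandas as pd

df = pd.read_table("beta.tsv")                                  # 1st column: CpG IDs, then one column per sample
subset = df.sample(frac=0.5, replace=False, random_state=999)   # keep 50% of the CpG rows, reproducibly
subset.to_csv("output.sample.tsv", sep="\t", index=False)       # the generated R script then reads this file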
cpgtools-2.0.5.data/scripts/beta_m_conversion.py
@@ -0,0 +1,105 @@
+ #!python
+
+ """
+ Description
+ -----------
+ Convert Beta-value into M-value or vice versa
+
+ Example of input
+ -----------------
+ CpG_ID Sample_01 Sample_02 Sample_03 Sample_04
+ cg_001 0.831035 0.878022 0.794427 0.880911
+ cg_002 0.249544 0.209949 0.234294 0.236680
+ cg_003 0.845065 0.843957 0.840184 0.824286
+ """
+
+
+ import sys,os
+ import collections
+ import numpy as np
+ from scipy import stats
+ from cpgmodule._version import __version__
+ from optparse import OptionParser
+ from cpgmodule import ireader
+ from cpgmodule.utils import *
+
+ __author__ = "Liguo Wang"
+ __copyright__ = "Copyleft"
+ __credits__ = []
+ __license__ = "GPL"
+ __maintainer__ = "Liguo Wang"
+ __email__ = "wang.liguo@mayo.edu"
+ __status__ = "Development"
+
+
+ def main():
+     usage="%prog [options]"
+     parser = OptionParser(usage,version="%prog " + __version__)
+     parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Tab-separated data frame file containing beta or M values with the 1st row containing sample IDs and the 1st column containing CpG IDs. This file can be a regular text file or compressed file (.gz, .bz2).")
+     parser.add_option("-d","--dtype",action="store",type='string', dest="data_type",default="Beta", help="Input data type either \"Beta\" or \"M\". default=%default")
+     parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The output file.")
+     (options,args)=parser.parse_args()
+
+     print ()
+
+     if not (options.input_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(101)
+     if not (options.data_type):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(101)
+     if not (options.out_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(103)
+
+     FOUT = open(options.out_file, 'w')
+
+     if options.data_type.lower() == "beta":
+         printlog("Convert Beta-value file \"%s\" into M-value file \"%s\" ..." % (options.input_file, options.out_file))
+     elif options.data_type.lower() == "m":
+         printlog("Convert M-value file \"%s\" into Beta-value file \"%s\" ..." % (options.input_file, options.out_file))
+     else:
+         print ("Data type must be \"Beta\" or \"M\"", file=sys.stderr)
+         sys.exit(0)
+
+     line_num = 1
+     for l in ireader.reader(options.input_file):
+         f = l.split()
+         if line_num == 1:
+             print (l, file=FOUT)
+         else:
+             probe_ID = f[0]
+             input_values = f[1:]
+             output_values = []
+             for iv in input_values:
+                 #deal with non-numerical values
+                 try:
+                     if options.data_type.lower() == "beta":
+                         ov = np.log2(float(iv)/(1.0 - float(iv)))
+                     elif options.data_type.lower() == "m":
+                         ov = (2**float(iv))/(2**float(iv) + 1)
+                 except:
+                     ov = np.nan
+                 output_values.append(ov)
+             print (probe_ID + '\t' + '\t'.join([str(i) for i in output_values]), file=FOUT)
+         line_num += 1
+
+     FOUT.close()
+
+ if __name__=='__main__':
+     main()
+
+
+
+
+
+
+
+
+
+
+
+
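
The conversion implemented above is the usual logit2 relationship between the two scales: M = log2(beta / (1 - beta)) and, in the reverse direction, beta = 2^M / (2^M + 1). A vectorized sketch of the same arithmetic on toy values (matching the per-cell computation in the loop above):

import numpy as np

beta = np.array([0.831035, 0.249544, 0.845065])
m = np.log2(beta / (1.0 - beta))     # Beta -> M; 0.831035 maps to about 2.30
beta_back = 2**m / (2**m + 1)        # M -> Beta; recovers the original values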
cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py
@@ -0,0 +1,165 @@
+ #!python
+
+ """
+ Description
+ -----------
+ This program calculates the methylation profile (i.e. average beta value) for genomic regions
+ around genes. These genomic regions include: "5'UTR exon", "CDS exon", "3'UTR exon",
+ "first intron", "internal intron", "last intron", "up-stream intergenic", and
+ "down-stream intergenic".
+
+ Example of input BED6+ file
+ ---------------------------
+ chr22 44021512 44021513 cg24055475 0.9231 -
+ chr13 111568382 111568383 cg06540715 0.1071 +
+ chr20 44033594 44033595 cg21482942 0.6122 -
+ """
+
+
+ import sys,os
+ import collections
+ import subprocess
+ import numpy as np
+ from optparse import OptionParser
+ from cpgmodule._version import __version__
+ from cpgmodule import ireader
+ from cpgmodule.utils import *
+ from cpgmodule import BED
+
+ __author__ = "Liguo Wang"
+ __copyright__ = "Copyleft"
+ __credits__ = []
+ __license__ = "GPL"
+ __maintainer__ = "Liguo Wang"
+ __email__ = "wang.liguo@mayo.edu"
+ __status__ = "Development"
+
+ def main():
+
+     usage="%prog [options]" + "\n"
+     parser = OptionParser(usage,version="%prog " + __version__)
+     parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="BED6+ file specifying the C position. This BED file should have at least 6 columns (Chrom, ChromStart, ChromeEnd, Name, Beta_value, Strand). BED6+ file can be a regular text file or compressed file (.gz, .bz2).")
+     parser.add_option("-r","--refgene",action="store",type="string",dest="gene_file",help="Reference gene model in standard BED12 format (https://genome.ucsc.edu/FAQ/FAQformat.html#format1). \"Strand\" column must exist in order to decide 5' and 3' UTRs, up- and down-stream intergenic regions.")
+     parser.add_option("-d","--downstream",action="store",type="int",dest="downstream_size",default=2000,help="Size of down-stream genomic region added to gene. default=%default (bp)")
+     parser.add_option("-u","--upstream",action="store",type="int",dest="upstream_size",default=2000,help="Size of up-stream genomic region added to gene. default=%default (bp)")
+     parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
+     (options,args)=parser.parse_args()
+
+     print ()
+     if not (options.input_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(101)
+
+     if not (options.gene_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(102)
+
+     if not (options.out_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(103)
+
+     FOUT = open(options.out_file + '.txt','w')
+     ROUT = open(options.out_file + '.r','w')
+     print ("\t".join(["Group","Relative_position(5'->3')", "Average_beta"]), file=FOUT)
+
+     #step1: read CpG file
+     printlog("Reading CpG file: \"%s\"" % (options.input_file))
+     cpg_ranges = read_CpG_bed(options.input_file)
+
+     #step2: read gene file
+     printlog("Reading reference gene model: \"%s\"" % (options.gene_file))
+     ref_gene = BED.ParseBED(options.gene_file)
+
+     group_sizes = [] #number of datapoints in each group
+     printlog("Process upstream regions ...")
+     up_2k = ref_gene.getIntergenic(direction = 'up', size=options.upstream_size)
+     s = coverage_over_range(up_2k,cpg_ranges)
+     group_sizes.append(len(s))
+     for i in sorted(s):
+         print ('\t'.join(['Upstream_intergenic',str(i), str(s[i])]), file=FOUT)
+     print ('Upstream_intergenic_y=c(%s)' % ','.join([str(s[i]) for i in sorted(s)]), file=ROUT)
+
+
+     printlog("Process 5' UTR exons ...")
+     utr5_exons = ref_gene.getUTRs(utr=5)
+     s = coverage_over_range(utr5_exons,cpg_ranges)
+     group_sizes.append(len(s))
+     for i in sorted(s):
+         print ('\t'.join(['Five_prime_UTR',str(i), str(s[i])]), file=FOUT)
+     print ('Five_prime_UTR_y=c(%s)' % ','.join([str(s[i]) for i in sorted(s)]), file=ROUT)
+
+
+     printlog("Process Coding exons ...")
+     cds_exons = ref_gene.getCDSExons()
+     s = coverage_over_range(cds_exons,cpg_ranges)
+     group_sizes.append(len(s))
+     for i in sorted(s):
+         print ('\t'.join(['Coding_exon',str(i), str(s[i])]), file=FOUT)
+     print ('Coding_exon_y=c(%s)' % ','.join([str(s[i]) for i in sorted(s)]), file=ROUT)
+
+     printlog("Process first introns ...")
+     introns = ref_gene.getIntrons(itype='first')
+     s = coverage_over_range(introns,cpg_ranges)
+     group_sizes.append(len(s))
+     for i in sorted(s):
+         print ('\t'.join(['First_intron',str(i), str(s[i])]), file=FOUT)
+     print ('First_intron_y=c(%s)' % ','.join([str(s[i]) for i in sorted(s)]), file=ROUT)
+
+     printlog("Process internal introns ...")
+     introns = ref_gene.getIntrons(itype='internal')
+     s = coverage_over_range(introns,cpg_ranges)
+     group_sizes.append(len(s))
+     for i in sorted(s):
+         print ('\t'.join(['Internal_intron',str(i), str(s[i])]), file=FOUT)
+     print ('Internal_intron_y=c(%s)' % ','.join([str(s[i]) for i in sorted(s)]), file=ROUT)
+
+
+     printlog("Process last introns ...")
+     introns = ref_gene.getIntrons(itype='last')
+     s = coverage_over_range(introns,cpg_ranges)
+     group_sizes.append(len(s))
+     for i in sorted(s):
+         print ('\t'.join(['Last_intron',str(i), str(s[i])]), file=FOUT)
+     print ('Last_intron_y=c(%s)' % ','.join([str(s[i]) for i in sorted(s)]), file=ROUT)
+
+
+     printlog("Process 3' UTR exons ...")
+     utr3_exons = ref_gene.getUTRs(utr=3)
+     s = coverage_over_range(utr3_exons,cpg_ranges)
+     group_sizes.append(len(s))
+     for i in sorted(s):
+         print ('\t'.join(['Three_prime_UTR',str(i), str(s[i])]), file=FOUT)
+     print ('Three_prime_UTR_y=c(%s)' % ','.join([str(s[i]) for i in sorted(s)]), file=ROUT)
+
+     printlog("Process downstream regions ...")
+     down_2k = ref_gene.getIntergenic(direction = 'down', size=options.downstream_size)
+     s = coverage_over_range(down_2k,cpg_ranges)
+     group_sizes.append(len(s))
+     for i in sorted(s):
+         print ('\t'.join(['Downstream_intergenic',str(i), str(s[i])]), file=FOUT)
+     print ('Downstream_intergenic_y=c(%s)' % ','.join([str(s[i]) for i in sorted(s)]), file=ROUT)
+
+     print('\n')
+     print ('pdf(file=\"%s\", width=10, height=5)' % (options.out_file + '.pdf'),file=ROUT)
+     print ('plot(1:%d, c(Upstream_intergenic_y, Five_prime_UTR_y, Coding_exon_y, First_intron_y, Internal_intron_y, Last_intron_y, Three_prime_UTR_y, Downstream_intergenic_y),ylim=c(0,1), xaxt="n",xlab="", ylab="Average methylation", type="l", col="red")' % sum(group_sizes), file=ROUT)
+     print ('abline(v = c(100,201,302,403,504,605,706),col="blue", lty="dashed")', file=ROUT)
+     print ('abline(v = c(%d,%d,%d,%d,%d,%d,%d),col="blue", lty="dashed")' % (sum(group_sizes[0:1]),sum(group_sizes[0:2]),sum(group_sizes[0:3]),sum(group_sizes[0:4]),sum(group_sizes[0:5]),sum(group_sizes[0:6]),sum(group_sizes[0:7])), file=ROUT)
+     print ('abline(h = 0.5,col="grey", lty="dashed")', file=ROUT)
+     print ('text(x=c(%d,%d,%d,%d,%d,%d,%d, %d)+50, y=0.9, cex=0.7, labels=c("Upstream\\n(5\'->3\')", "5\'UTR exon\\n(5\'->3\')","Coding exon\\n(5\'->3\')","First intron\\n(5\'->3\')","Internal intron\\n(5\'->3\')","Last intron\\n(5\'->3\')", "3\'UTR exon\\n(5\'->3\')","Downstream\n(5\'->3\')"))' % (0, sum(group_sizes[0:1]),sum(group_sizes[0:2]),sum(group_sizes[0:3]),sum(group_sizes[0:4]),sum(group_sizes[0:5]),sum(group_sizes[0:6]),sum(group_sizes[0:7])), file=ROUT)
+     print ('dev.off()',file=ROUT)
+
+     FOUT.close()
+     ROUT.close()
+     try:
+         subprocess.call("Rscript " + options.out_file + '.r', shell=True)
+     except:
+         print ("Cannot generate pdf file from " + options.out_file + '.r', file=sys.stderr)
+         pass
+
+
+ if __name__=='__main__':
+     main()
+
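
In this script the per-region aggregation is delegated to coverage_over_range() (imported via cpgmodule.utils), which turns each region class into a series of average beta values indexed by relative position along the region (5'->3'); the generated R code then simply concatenates and plots those series. The sketch below only illustrates that idea under simplified assumptions (100 position bins, strand-aware scaling, a plain dict of CpGs); it is not the package's implementation:

from collections import defaultdict

def position_profile(regions, cpgs, nbins=100):
    """regions: iterable of (chrom, start, end, strand); cpgs: dict mapping chrom -> list of (pos, beta)."""
    sums = defaultdict(float)
    counts = defaultdict(int)
    for chrom, start, end, strand in regions:
        for pos, beta in cpgs.get(chrom, []):
            if start <= pos < end:
                frac = (pos - start) / (end - start)      # relative position within the region
                if strand == '-':
                    frac = 1.0 - frac                     # keep the profile oriented 5'->3'
                b = min(int(frac * nbins), nbins - 1)
                sums[b] += beta
                counts[b] += 1
    return {b: sums[b] / counts[b] for b in sorted(counts)}  # position bin -> average beta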