cpgtools 2.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cpgmodule/BED.py +441 -0
- cpgmodule/MI.py +193 -0
- cpgmodule/__init__.py +0 -0
- cpgmodule/_version.py +1 -0
- cpgmodule/cgID.py +866897 -0
- cpgmodule/data/AltumAge_cpg.pkl +0 -0
- cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
- cpgmodule/data/AltumAge_scaler.pkl +0 -0
- cpgmodule/data/GA_Bohlin.pkl +0 -0
- cpgmodule/data/GA_Haftorn.pkl +0 -0
- cpgmodule/data/GA_Knight.pkl +0 -0
- cpgmodule/data/GA_Lee_CPC.pkl +0 -0
- cpgmodule/data/GA_Lee_RPC.pkl +0 -0
- cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
- cpgmodule/data/GA_Mayne.pkl +0 -0
- cpgmodule/data/Hannum.pkl +0 -0
- cpgmodule/data/Horvath_2013.pkl +0 -0
- cpgmodule/data/Horvath_2018.pkl +0 -0
- cpgmodule/data/Levine.pkl +0 -0
- cpgmodule/data/Lu_DNAmTL.pkl +0 -0
- cpgmodule/data/Ped_McEwen.pkl +0 -0
- cpgmodule/data/Ped_Wu.pkl +0 -0
- cpgmodule/data/Zhang_BLUP.pkl +0 -0
- cpgmodule/data/Zhang_EN.pkl +0 -0
- cpgmodule/data/__init__.py +0 -0
- cpgmodule/extend_bed.py +147 -0
- cpgmodule/imotif.py +348 -0
- cpgmodule/ireader.py +28 -0
- cpgmodule/methylClock.py +53 -0
- cpgmodule/padjust.py +58 -0
- cpgmodule/region2gene.py +170 -0
- cpgmodule/utils.py +642 -0
- cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
- cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
- cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
- cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
- cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
- cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
- cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
- cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
- cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
- cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
- cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
- cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
- cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
- cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
- cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
- cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
- cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
- cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
- cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
- cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
- cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
- cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
- cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
- cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
- cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
- cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.5.dist-info/METADATA +59 -0
- cpgtools-2.0.5.dist-info/RECORD +104 -0
- cpgtools-2.0.5.dist-info/WHEEL +5 -0
- cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
- cpgtools-2.0.5.dist-info/top_level.txt +5 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +96 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- impyutelib.py +890 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- misspylib.py +565 -0
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Description
|
|
5
|
+
-----------
|
|
6
|
+
This program corrects batch effect using the "combat" algorithm:
|
|
7
|
+
|
|
8
|
+
W. Evan Johnson, et al, Adjusting batch effects in microarray expression data using empirical Bayes methods, Biostatistics, 2007.
|
|
9
|
+
|
|
10
|
+
Example of input data file
|
|
11
|
+
---------------------------
|
|
12
|
+
CpG_ID Sample_01 Sample_02 Sample_03 Sample_04
|
|
13
|
+
cg_001 0.831035 0.878022 0.794427 0.880911
|
|
14
|
+
cg_002 0.249544 0.209949 0.234294 0.236680
|
|
15
|
+
cg_003 0.845065 0.843957 0.840184 0.824286
|
|
16
|
+
...
|
|
17
|
+
|
|
18
|
+
Example of batch file
|
|
19
|
+
-------------------------------
|
|
20
|
+
Sample,Group
|
|
21
|
+
Sample_01,plate_1
|
|
22
|
+
Sample_02,plate_1
|
|
23
|
+
Sample_03,plate_2
|
|
24
|
+
Sample_04,plate_2
|
|
25
|
+
...
|
|
26
|
+
|
|
27
|
+
Notes
|
|
28
|
+
-----
|
|
29
|
+
* Missing values are imputed with KNN before batch correction; the '.combat_withNAs.tsv' output restores the original missing values
|
|
30
|
+
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
import sys
|
|
35
|
+
import subprocess
|
|
36
|
+
from optparse import OptionParser
|
|
37
|
+
from cpgmodule.utils import *
|
|
38
|
+
from cpgmodule._version import __version__
|
|
39
|
+
import pandas as pd
|
|
40
|
+
#from sklearn.preprocessing import StandardScaler
|
|
41
|
+
#from sklearn.decomposition import PCA
|
|
42
|
+
from sklearn.impute import KNNImputer
|
|
43
|
+
from collections import Counter
|
|
44
|
+
from combat.pycombat import pycombat
|
|
45
|
+
import matplotlib.pyplot as plt
|
|
46
|
+
import matplotlib.colors as mcolors
|
|
47
|
+
from impyutelib import nan_indices
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
__author__ = "Liguo Wang"
|
|
51
|
+
__copyright__ = "Copyleft"
|
|
52
|
+
__credits__ = []
|
|
53
|
+
__license__ = "GPL"
|
|
54
|
+
__maintainer__ = "Liguo Wang"
|
|
55
|
+
__email__ = "wang.liguo@mayo.edu"
|
|
56
|
+
__status__ = "Development"
|
|
57
|
+
|
|
58
|
+
def pick_colors(n):
    """
    Return the first `n` named colors from matplotlib's CSS4 color table.

    Parameters
    ----------
    n : int
        Number of colors requested.

    Returns
    -------
    list of str
        The first `n` keys of `mcolors.CSS4_COLORS`.

    Notes
    -----
    If `n` is larger than the number of available colors, an error message
    is written to stderr and the program exits with a non-zero status.
    """
    my_colors = list(mcolors.CSS4_COLORS)
    if n > len(my_colors):
        print ("Too many colors requested", file = sys.stderr)
        # A bare sys.exit() reports success (status 0); signal the failure
        # explicitly so that calling shells/pipelines can detect it.
        sys.exit(1)
    return my_colors[:n]
|
|
64
|
+
|
|
65
|
+
def box_plot(df, s_colors, out_png, ylab="Beta values", title=""):
    """
    Draw one box per column of `df` and save the figure to `out_png`.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; the column labels are used as tick labels.
    s_colors : sequence
        Face colors, paired with the boxes in drawing order via zip()
        (extra colors or extra boxes are silently ignored).
    out_png : str
        Path of the image file to write.
    ylab : str
        Label of the y axis.
    title : str
        Figure title.
    """
    s_names = df.columns
    fig, ax = plt.subplots()
    bplot = ax.boxplot(df, patch_artist=True, tick_labels = s_names)
    for patch, color in zip(bplot['boxes'], s_colors):
        patch.set_facecolor(color)
    ax.set_xticklabels(s_names, rotation='vertical')
    ax.set_ylabel(ylab)
    ax.set_title(title)
    fig.savefig(out_png)
    # Release the figure: pyplot keeps every figure alive until it is closed,
    # and this function is called more than once per run.
    plt.close(fig)
|
|
75
|
+
|
|
76
|
+
def main():
    """
    Command-line entry point of the batch-effect correction script.

    Steps performed:
      1. Parse and validate the command-line options.
      2. Read the tab-separated beta-value table (first column is the index).
      3. If the table contains missing values, impute them with KNNImputer
         (column-wise when --axis is 1, row-wise when --axis is 0).
      4. Read the comma-separated group (batch) file and align the batch
         labels to the sample (column) order of the data table.
      5. Save a boxplot before correction, run pycombat, save a boxplot
         after correction.
      6. Write the corrected table; if the input had missing values, also
         write a second table in which those cells are set back to NA.

    Exits with status 101/103 when a required option is missing and with
    status 1 when the group file is inconsistent with the data table.
    """
    usage="%prog [options]" + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Tab-separated data frame file containing beta values with the 1st row containing sample IDs and the 1st column containing CpG IDs.")
    parser.add_option("-k",action="store",type="int", default=3, dest="n_neighbors",help="Number of neighbors to use for imputation. default=%default")
    # optparse's "choice" type compares the raw command-line *string* against
    # `choices`, so integer choices can never match a user-supplied value.
    # Parse the value as an int and validate it explicitly instead.
    parser.add_option("--axis",action="store",type="int",default=1,dest="axis_choice",help="How to do imputation (using the KNN algorithm) if the input file has missing values. 1: search columns for k nearest neighbours; 0: Search rows for k nearest neighbours. default=%default")
    parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Comma-separated group file defining the batch groups of each sample.")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options,args)=parser.parse_args()

    print ()
    if not (options.input_file):
        print (__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.group_file):
        print (__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.out_file):
        print (__doc__)
        parser.print_help()
        sys.exit(103)
    if options.axis_choice not in (0, 1):
        parser.error("--axis only accepts 0 or 1.")

    beta_out1 = options.out_file + '.combat.tsv'
    beta_out2 = options.out_file + '.combat_withNAs.tsv'
    beta_boxplot_before = options.out_file + '.boxplot.png'
    beta_boxplot_after = options.out_file + '.boxplot_combat.png'

    printlog("Reading input file: \"%s\" ..." % (options.input_file))
    df1 = pd.read_csv(options.input_file, index_col = 0, sep="\t")

    # count NA
    total_na = df1.isna().sum().sum()
    printlog("The input file: \"%s\" contains %d missing values" % (options.input_file, total_na))

    # if there are any NAs, impute them (their positions are remembered so
    # that they can be restored in the second output file)
    if total_na > 0:
        printlog("Imputing missing values using KNN ...")
        na_locations = nan_indices(df1.to_numpy())
        imputer = KNNImputer(n_neighbors = options.n_neighbors)
        # axis 1: impute on the transposed table, then transpose back.
        transpose = (options.axis_choice == 1)
        input_df = df1.T if transpose else df1
        after = imputer.fit_transform(input_df)
        df2 = pd.DataFrame(after, index = input_df.index,
                           columns = input_df.columns)
        if transpose:
            df2 = df2.T
    else:
        df2 = df1

    printlog("Reading group file: \"%s\" ..." % (options.group_file))
    group = pd.read_csv(options.group_file, index_col=0, names=['Sample_ID', 'Group_ID'])
    group.index = group.index.map(str)
    #check if sample IDs are unique
    if not group.index.is_unique:
        print ("Sample IDs are not unique", file = sys.stderr)
        sys.exit(1)
    printlog("Group/batch \"%s\" contains %d samples" % (options.group_file, len(group.index)))

    # Align the batch labels to the column order of the data table. Relying
    # on the row order of the group file would silently assign wrong batches
    # (and wrong box colors) whenever the two files are ordered differently.
    # Rows of the group file that do not name a data column (e.g. a header
    # line) are ignored.
    sample_ids = [str(s) for s in df2.columns]
    known_ids = set(group.index)
    missing = [s for s in sample_ids if s not in known_ids]
    if missing:
        print ("Samples not found in the group file: %s" % ",".join(missing), file = sys.stderr)
        sys.exit(1)
    batch = list(group.loc[sample_ids, 'Group_ID'])

    # unique group names (in order of first appearance) and their frequencies
    group_info = Counter(batch)
    print(batch)
    # one color per group, then one color per sample (column)
    color_names = pick_colors(len(group_info))
    color_of = dict(zip(group_info, color_names))
    color_list = [color_of[g] for g in batch]

    printlog("Generate boxplot before correction. Save to '%s'" % beta_boxplot_before)
    box_plot(df2, s_colors=color_list, out_png=beta_boxplot_before, title="Before batch effects correction")

    # remove batch effect
    printlog("Removing batch effect ...")
    df_corrected = pycombat(df2, batch)

    printlog("Generate boxplot after correction. Save to '%s'" % beta_boxplot_after)
    box_plot(df_corrected, s_colors=color_list, out_png=beta_boxplot_after, title="After batch effects correction")

    printlog("Save data after combat to \"%s\", keep imputed values ..." % beta_out1)
    df_corrected.to_csv(beta_out1,sep="\t")

    # add the original NAs (if any) back, so that the second output contains
    # combat-corrected values only (no KNN-imputed values)
    if total_na > 0:
        for i,j in na_locations:
            # float("nan") avoids depending on a numpy alias that this
            # script does not import explicitly.
            df_corrected.iat[i, j] = float("nan")
        printlog("Save data after combat to \"%s\", keep orignal missing values ..." % beta_out2)
        df_corrected.to_csv(beta_out2,sep="\t",na_rep='NA')
|
|
172
|
+
|
|
173
|
+
if __name__=='__main__':
|
|
174
|
+
main()
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Description
|
|
5
|
+
-----------
|
|
6
|
+
This program generates jitter plot (a.k.a. strip chart) and bean plot for each sample (column).
|
|
7
|
+
|
|
8
|
+
Notes
|
|
9
|
+
-----
|
|
10
|
+
User must install the "beanplot" R library:
|
|
11
|
+
https://cran.r-project.org/web/packages/beanplot/index.html
|
|
12
|
+
|
|
13
|
+
Example of input
|
|
14
|
+
-----------------
|
|
15
|
+
CpG_ID Sample_01 Sample_02 Sample_03 Sample_04
|
|
16
|
+
cg_001 0.831035 0.878022 0.794427 0.880911
|
|
17
|
+
cg_002 0.249544 0.209949 0.234294 0.236680
|
|
18
|
+
cg_003 0.845065 0.843957 0.840184 0.824286
|
|
19
|
+
|
|
20
|
+
Note: Please name your sample IDs using only "letters" [a-z, A-Z], "numbers" [0-9], and "_"; and
|
|
21
|
+
your sample ID should start with a letter.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
import sys,os
|
|
26
|
+
import collections
|
|
27
|
+
import subprocess
|
|
28
|
+
import numpy as np
|
|
29
|
+
from cpgmodule._version import __version__
|
|
30
|
+
from optparse import OptionParser
|
|
31
|
+
from cpgmodule import ireader
|
|
32
|
+
from cpgmodule.utils import *
|
|
33
|
+
from cpgmodule import BED
|
|
34
|
+
import pandas as pd
|
|
35
|
+
|
|
36
|
+
__author__ = "Liguo Wang"
|
|
37
|
+
__copyright__ = "Copyleft"
|
|
38
|
+
__credits__ = []
|
|
39
|
+
__license__ = "GPL"
|
|
40
|
+
__maintainer__ = "Liguo Wang"
|
|
41
|
+
__email__ = "wang.liguo@mayo.edu"
|
|
42
|
+
__status__ = "Development"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def main():
    """
    Command-line entry point of the jitter/bean plot script.

    Reads the tab-separated beta-value table, optionally writes a random
    subset of its rows to "<prefix>.sample.tsv", writes an R script
    "<prefix>.r" that draws a strip chart overlaid with a bean plot into
    "<prefix>.png", and finally runs that script with Rscript.

    Exits with status 101/103 when a required option is missing. A failure
    to run Rscript is reported on stderr but does not abort the program.
    """
    usage="%prog [options]" + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Tab-separated data frame file containing beta values with the 1st row containing sample IDs and the 1st column containing CpG IDs.")
    parser.add_option("-f","--fraction",action="store",type='float', dest="fraction", default=0.5, help="The fraction of total data points (CpGs) used to generate jitter plot. Decrease this number if the jitter plot is over-crowded. default=%default" )
    parser.add_option("--width",action="store",type='int', default=800, dest="png_width",help="The width of the output PNG file. default=%default")
    parser.add_option("--height",action="store",type='int', default=480, dest="png_height",help="The height of the output PNG file. . default=%default")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options,args)=parser.parse_args()

    print ()
    if not (options.input_file):
        print (__doc__)
        parser.print_help()
        sys.exit(101)

    if not (options.out_file):
        print (__doc__)
        parser.print_help()
        sys.exit(103)
    # clamp the sampling fraction to [0, 1]
    options.fraction = min(max(options.fraction, 0.0), 1.0)

    r_script = options.out_file + '.r'
    png_file = options.out_file + '.png'

    printlog("Reading beta file: \"%s\"" % (options.input_file))
    df = pd.read_table(options.input_file)
    # the first column holds the CpG IDs; the remaining ones are samples
    samples = df.columns[1:]

    # decide which table the R script will read
    if options.fraction < 1.0:
        data_file = options.out_file + '.sample.tsv'
        printlog("Sampling subset (%.2f%%) from file: \"%s\"" % (options.fraction*100, options.input_file))
        subset = df.sample(frac=options.fraction, replace=False, random_state=999)
        printlog("Saving subset (%.2f%%) to file: \"%s\"" % (options.fraction*100, data_file))
        subset.to_csv(data_file, sep = "\t", index = False)
    else:
        data_file = options.input_file
        printlog("Using all data points in file: \"%s\"" % (options.input_file))

    # the context manager guarantees the script is flushed and closed even
    # if writing fails half-way
    with open(r_script, 'w') as ROUT:
        print ('library(beanplot)', file = ROUT)
        # honour --width/--height (their defaults reproduce the former
        # hard-coded 800x480 canvas)
        print ('png(file=\"%s\", width=%d, height=%d, unit=\"px\")' % (png_file, options.png_width, options.png_height),file=ROUT)
        print ('d = read.table(file="%s", sep="\\t", header=TRUE)' % (data_file), file=ROUT)
        print ('ll = list(%s)' % (','.join(['"' + s + '"' + ' = d$' + s for s in samples])), file=ROUT)
        print ('stripchart(ll,cex=0.1,col="#abd9e9", vertical=T, method=c("jitter"), ylab="Beta value",las=2, jitter=0.3,cex.names = 0.8)', file=ROUT)
        print ('beanplot(ll, cutmin=0,cutmax=1, border="#d01c8b",what=c(1,1,1,0),col=c(), las=2, add = TRUE)', file=ROUT)

    # Pass the arguments as a list (no shell): paths containing spaces or
    # shell metacharacters are handled safely, and a missing Rscript
    # executable raises OSError instead of being hidden by the shell.
    try:
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        status = None
    if status != 0:
        print ("Cannot generate png file from " + r_script, file=sys.stderr)
|
|
104
|
+
|
|
105
|
+
if __name__=='__main__':
|
|
106
|
+
main()
|
|
107
|
+
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Description
|
|
5
|
+
-----------
|
|
6
|
+
Convert Beta-value into M-value or vice versa
|
|
7
|
+
|
|
8
|
+
Example of input
|
|
9
|
+
-----------------
|
|
10
|
+
CpG_ID Sample_01 Sample_02 Sample_03 Sample_04
|
|
11
|
+
cg_001 0.831035 0.878022 0.794427 0.880911
|
|
12
|
+
cg_002 0.249544 0.209949 0.234294 0.236680
|
|
13
|
+
cg_003 0.845065 0.843957 0.840184 0.824286
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
import sys,os
|
|
18
|
+
import collections
|
|
19
|
+
import numpy as np
|
|
20
|
+
from scipy import stats
|
|
21
|
+
from cpgmodule._version import __version__
|
|
22
|
+
from optparse import OptionParser
|
|
23
|
+
from cpgmodule import ireader
|
|
24
|
+
from cpgmodule.utils import *
|
|
25
|
+
|
|
26
|
+
__author__ = "Liguo Wang"
|
|
27
|
+
__copyright__ = "Copyleft"
|
|
28
|
+
__credits__ = []
|
|
29
|
+
__license__ = "GPL"
|
|
30
|
+
__maintainer__ = "Liguo Wang"
|
|
31
|
+
__email__ = "wang.liguo@mayo.edu"
|
|
32
|
+
__status__ = "Development"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _convert_value(raw, to_m):
    """
    Convert one table cell between Beta-value and M-value.

    `raw` is the cell text. When `to_m` is true the cell is treated as a
    Beta-value and log2(beta / (1 - beta)) is returned; otherwise it is
    treated as an M-value and 2^M / (2^M + 1) is returned. Cells that are
    not numbers, or for which the formula cannot be evaluated (division by
    zero, overflow), yield NaN.
    """
    try:
        x = float(raw)
        if to_m:
            return np.log2(x/(1.0 - x))
        return (2**x)/(2**x + 1)
    except (ValueError, ZeroDivisionError, OverflowError):
        return np.nan


def main():
    """
    Command-line entry point of the Beta-value <-> M-value converter.

    The first line of the input file is copied to the output unchanged;
    on every following line the first whitespace-separated field is kept
    as the probe ID and each remaining field is converted. Blank lines
    after the header are skipped.

    Exits with status 101/103 when a required option is missing and with
    status 1 when --dtype is neither "Beta" nor "M".
    """
    usage="%prog [options]"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Tab-separated data frame file containing beta or M values with the 1st row containing sample IDs and the 1st column containing CpG IDs. This file can be a regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-d","--dtype",action="store",type='string', dest="data_type",default="Beta", help="Input data type either \"Beta\" or \"M\". default=%default")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The output file.")
    (options,args)=parser.parse_args()

    print ()

    if not (options.input_file):
        print (__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.data_type):
        print (__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.out_file):
        print (__doc__)
        parser.print_help()
        sys.exit(103)

    # Validate the data type *before* opening the output file, so that a bad
    # option neither truncates an existing file nor leaves a handle open.
    data_type = options.data_type.lower()
    if data_type == "beta":
        printlog("Convert Beta-value file \"%s\" into M-value file \"%s\" ..." % (options.input_file, options.out_file))
    elif data_type == "m":
        printlog("Convert M-value file \"%s\" into Beta-value file \"%s\" ..." % (options.input_file, options.out_file))
    else:
        print ("Data type must be \"Beta\" or \"M\"", file=sys.stderr)
        # this is an error: do not report success to the caller
        sys.exit(1)
    to_m = (data_type == "beta")

    with open(options.out_file, 'w') as FOUT:
        for line_num, l in enumerate(ireader.reader(options.input_file), start=1):
            if line_num == 1:
                # header line: copy as is
                print (l, file=FOUT)
                continue
            f = l.split()
            if not f:
                # blank line: nothing to convert
                continue
            probe_ID = f[0]
            output_values = [_convert_value(iv, to_m) for iv in f[1:]]
            print (probe_ID + '\t' + '\t'.join([str(i) for i in output_values]), file=FOUT)
|
|
91
|
+
|
|
92
|
+
if __name__=='__main__':
|
|
93
|
+
main()
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Description
|
|
5
|
+
-----------
|
|
6
|
+
This program calculates the methylation profile (i.e. average beta value) for genomic regions
|
|
7
|
+
around genes. These genomic regions include: "5'UTR exon", "CDS exon", "3'UTR exon",
|
|
8
|
+
"first intron", "internal intron", "last intron", "up-stream intergenic", and
|
|
9
|
+
"down-stream intergenic".
|
|
10
|
+
|
|
11
|
+
Example of input BED6+ file
|
|
12
|
+
---------------------------
|
|
13
|
+
chr22 44021512 44021513 cg24055475 0.9231 -
|
|
14
|
+
chr13 111568382 111568383 cg06540715 0.1071 +
|
|
15
|
+
chr20 44033594 44033595 cg21482942 0.6122 -
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
import sys,os
|
|
20
|
+
import collections
|
|
21
|
+
import subprocess
|
|
22
|
+
import numpy as np
|
|
23
|
+
from optparse import OptionParser
|
|
24
|
+
from cpgmodule._version import __version__
|
|
25
|
+
from cpgmodule import ireader
|
|
26
|
+
from cpgmodule.utils import *
|
|
27
|
+
from cpgmodule import BED
|
|
28
|
+
|
|
29
|
+
__author__ = "Liguo Wang"
|
|
30
|
+
__copyright__ = "Copyleft"
|
|
31
|
+
__credits__ = []
|
|
32
|
+
__license__ = "GPL"
|
|
33
|
+
__maintainer__ = "Liguo Wang"
|
|
34
|
+
__email__ = "wang.liguo@mayo.edu"
|
|
35
|
+
__status__ = "Development"
|
|
36
|
+
|
|
37
|
+
def main():
    """
    Command-line entry point of the gene-centered methylation profile script.

    Reads the CpG BED file and the reference gene model, then, for each of
    eight region classes (upstream intergenic, 5'UTR exons, coding exons,
    first/internal/last introns, 3'UTR exons, downstream intergenic), calls
    coverage_over_range() and writes its result

      * as "Group <tab> position <tab> value" rows to "<prefix>.txt", and
      * as an R vector "<Group>_y" to "<prefix>.r".

    The R script is completed with plotting commands that write
    "<prefix>.pdf" and is then run with Rscript.

    Exits with status 101/102/103 when a required option is missing. A
    failure to run Rscript is reported on stderr but does not abort the
    program.
    """
    usage="%prog [options]" + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="BED6+ file specifying the C position. This BED file should have at least 6 columns (Chrom, ChromStart, ChromeEnd, Name, Beta_value, Strand). BED6+ file can be a regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-r","--refgene",action="store",type="string",dest="gene_file",help="Reference gene model in standard BED12 format (https://genome.ucsc.edu/FAQ/FAQformat.html#format1). \"Strand\" column must exist in order to decide 5' and 3' UTRs, up- and down-stream intergenic regions.")
    parser.add_option("-d","--downstream",action="store",type="int",dest="downstream_size",default=2000,help="Size of down-stream genomic region added to gene. default=%default (bp)")
    parser.add_option("-u","--upstream",action="store",type="int",dest="upstream_size",default=2000,help="Size of up-stream genomic region added to gene. default=%default (bp)")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options,args)=parser.parse_args()

    print ()
    if not (options.input_file):
        print (__doc__)
        parser.print_help()
        sys.exit(101)

    if not (options.gene_file):
        print (__doc__)
        parser.print_help()
        sys.exit(102)

    if not (options.out_file):
        print (__doc__)
        parser.print_help()
        sys.exit(103)

    r_script = options.out_file + '.r'

    # the context manager closes both outputs even if a step below fails
    with open(options.out_file + '.txt','w') as FOUT, open(r_script,'w') as ROUT:
        print ("\t".join(["Group","Relative_position(5'->3')", "Average_beta"]), file=FOUT)

        #step1: read CpG file
        printlog("Reading CpG file: \"%s\"" % (options.input_file))
        cpg_ranges = read_CpG_bed(options.input_file)

        #step2: read gene file
        printlog("Reading reference gene model: \"%s\"" % (options.gene_file))
        ref_gene = BED.ParseBED(options.gene_file)

        # (log message, group name, function returning the regions), listed
        # in 5'->3' plotting order. The getters are wrapped in lambdas so
        # that each one runs only after its log message has been printed.
        region_classes = [
            ("Process upstream regions ...", 'Upstream_intergenic',
                lambda: ref_gene.getIntergenic(direction = 'up', size=options.upstream_size)),
            ("Process 5' UTR exons ...", 'Five_prime_UTR',
                lambda: ref_gene.getUTRs(utr=5)),
            ("Process Coding exons ...", 'Coding_exon',
                lambda: ref_gene.getCDSExons()),
            ("Process first introns ...", 'First_intron',
                lambda: ref_gene.getIntrons(itype='first')),
            ("Process internal introns ...", 'Internal_intron',
                lambda: ref_gene.getIntrons(itype='internal')),
            ("Process last introns ...", 'Last_intron',
                lambda: ref_gene.getIntrons(itype='last')),
            ("Process 3' UTR exons ...", 'Three_prime_UTR',
                lambda: ref_gene.getUTRs(utr=3)),
            ("Process downstream regions ...", 'Downstream_intergenic',
                lambda: ref_gene.getIntergenic(direction = 'down', size=options.downstream_size)),
        ]

        group_sizes = []    #number of datapoints in each group
        for message, group_name, get_regions in region_classes:
            printlog(message)
            s = coverage_over_range(get_regions(), cpg_ranges)
            group_sizes.append(len(s))
            positions = sorted(s)
            for i in positions:
                print ('\t'.join([group_name, str(i), str(s[i])]), file=FOUT)
            print ('%s_y=c(%s)' % (group_name, ','.join([str(s[i]) for i in positions])), file=ROUT)

        # cumulative number of data points after each of the first 7 groups,
        # i.e. the x positions where one group ends and the next begins
        boundaries = [sum(group_sizes[0:k]) for k in range(1, 8)]

        print('\n')
        print ('pdf(file=\"%s\", width=10, height=5)' % (options.out_file + '.pdf'),file=ROUT)
        print ('plot(1:%d, c(Upstream_intergenic_y, Five_prime_UTR_y, Coding_exon_y, First_intron_y, Internal_intron_y, Last_intron_y, Three_prime_UTR_y, Downstream_intergenic_y),ylim=c(0,1), xaxt="n",xlab="", ylab="Average methylation", type="l", col="red")' % sum(group_sizes), file=ROUT)
        # Separators are derived from the actual group sizes only; a second,
        # hard-coded set of separator positions used to be emitted as well
        # and drew duplicate lines that ignored the real sizes.
        print ('abline(v = c(%d,%d,%d,%d,%d,%d,%d),col="blue", lty="dashed")' % tuple(boundaries), file=ROUT)
        print ('abline(h = 0.5,col="grey", lty="dashed")', file=ROUT)
        # every label uses an escaped "\\n" so the R script stays one
        # statement per line
        print ('text(x=c(%d,%d,%d,%d,%d,%d,%d, %d)+50, y=0.9, cex=0.7, labels=c("Upstream\\n(5\'->3\')", "5\'UTR exon\\n(5\'->3\')","Coding exon\\n(5\'->3\')","First intron\\n(5\'->3\')","Internal intron\\n(5\'->3\')","Last intron\\n(5\'->3\')", "3\'UTR exon\\n(5\'->3\')","Downstream\\n(5\'->3\')"))' % tuple([0] + boundaries), file=ROUT)
        print ('dev.off()',file=ROUT)

    # Pass the arguments as a list (no shell): paths containing spaces or
    # shell metacharacters are handled safely, and a missing Rscript
    # executable raises OSError instead of being hidden by the shell.
    try:
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        status = None
    if status != 0:
        print ("Cannot generate pdf file from " + r_script, file=sys.stderr)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
if __name__=='__main__':
|
|
164
|
+
main()
|
|
165
|
+
|