cpgtools 2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. cpgmodule/BED.py +441 -0
  2. cpgmodule/MI.py +193 -0
  3. cpgmodule/__init__.py +0 -0
  4. cpgmodule/_version.py +1 -0
  5. cpgmodule/cgID.py +866897 -0
  6. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  7. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  8. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  9. cpgmodule/data/GA_Bohlin.pkl +0 -0
  10. cpgmodule/data/GA_Haftorn.pkl +0 -0
  11. cpgmodule/data/GA_Knight.pkl +0 -0
  12. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  13. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  14. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  15. cpgmodule/data/GA_Mayne.pkl +0 -0
  16. cpgmodule/data/Hannum.pkl +0 -0
  17. cpgmodule/data/Horvath_2013.pkl +0 -0
  18. cpgmodule/data/Horvath_2018.pkl +0 -0
  19. cpgmodule/data/Levine.pkl +0 -0
  20. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  21. cpgmodule/data/Ped_McEwen.pkl +0 -0
  22. cpgmodule/data/Ped_Wu.pkl +0 -0
  23. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  24. cpgmodule/data/Zhang_EN.pkl +0 -0
  25. cpgmodule/data/__init__.py +0 -0
  26. cpgmodule/extend_bed.py +147 -0
  27. cpgmodule/imotif.py +348 -0
  28. cpgmodule/ireader.py +28 -0
  29. cpgmodule/methylClock.py +53 -0
  30. cpgmodule/padjust.py +58 -0
  31. cpgmodule/region2gene.py +170 -0
  32. cpgmodule/utils.py +642 -0
  33. cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
  34. cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
  35. cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
  36. cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
  37. cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
  38. cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
  39. cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
  40. cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
  41. cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
  42. cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
  43. cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
  44. cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
  45. cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
  46. cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
  47. cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
  48. cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
  49. cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
  50. cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
  51. cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
  52. cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
  53. cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
  54. cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
  55. cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
  56. cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
  57. cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
  58. cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
  59. cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
  60. cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
  61. cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
  62. cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
  63. cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
  64. cpgtools-2.0.5.dist-info/METADATA +59 -0
  65. cpgtools-2.0.5.dist-info/RECORD +104 -0
  66. cpgtools-2.0.5.dist-info/WHEEL +5 -0
  67. cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
  68. cpgtools-2.0.5.dist-info/top_level.txt +5 -0
  69. impyute/__init__.py +3 -0
  70. impyute/contrib/__init__.py +7 -0
  71. impyute/contrib/compare.py +69 -0
  72. impyute/contrib/count_missing.py +30 -0
  73. impyute/contrib/describe.py +63 -0
  74. impyute/cs/__init__.py +11 -0
  75. impyute/cs/buck_iterative.py +82 -0
  76. impyute/cs/central_tendency.py +84 -0
  77. impyute/cs/em.py +52 -0
  78. impyute/cs/fast_knn.py +130 -0
  79. impyute/cs/random.py +27 -0
  80. impyute/dataset/__init__.py +6 -0
  81. impyute/dataset/base.py +137 -0
  82. impyute/dataset/corrupt.py +55 -0
  83. impyute/deletion/__init__.py +5 -0
  84. impyute/deletion/complete_case.py +21 -0
  85. impyute/ops/__init__.py +12 -0
  86. impyute/ops/error.py +9 -0
  87. impyute/ops/inverse_distance_weighting.py +31 -0
  88. impyute/ops/matrix.py +47 -0
  89. impyute/ops/testing.py +20 -0
  90. impyute/ops/util.py +96 -0
  91. impyute/ops/wrapper.py +179 -0
  92. impyute/ts/__init__.py +6 -0
  93. impyute/ts/locf.py +57 -0
  94. impyute/ts/moving_window.py +128 -0
  95. impyutelib.py +890 -0
  96. missingpy/__init__.py +4 -0
  97. missingpy/knnimpute.py +328 -0
  98. missingpy/missforest.py +556 -0
  99. missingpy/pairwise_external.py +315 -0
  100. missingpy/tests/__init__.py +0 -0
  101. missingpy/tests/test_knnimpute.py +605 -0
  102. missingpy/tests/test_missforest.py +409 -0
  103. missingpy/utils.py +124 -0
  104. misspylib.py +565 -0
@@ -0,0 +1,161 @@
1
+ #!python
2
+
3
+ """
4
+ Description
5
+ -----------
6
+ This program performs differential CpG analysis using Fisher's exact test. It only applies
7
+ to two-sample comparisons with no replicates. If replicates are provided, *methyl reads*
8
+ and *total reads* of all replicates will be summed
9
+
10
+ Example of input data file
11
+ --------------------------
12
+ cgID sample_1 sample_2
13
+ CpG_1 129,170 166,178
14
+ CpG_2 24,77 67,99
15
+
16
+ the number before "," indicates *number of methyl reads*
17
+ the number after "," indicates *number of total reads*
18
+
19
+ Output
20
+ -------
21
+ Three columns ("Odds ratio", "pvalue" and "adjusted pvalue") will be appended to the input data table.
22
+ #=========================================================================================
23
+ """
24
+
25
+
26
+ import sys,os
27
+ import collections
28
+ import subprocess
29
+ import numpy as np
30
+ import re
31
+ from scipy import stats
32
+ from optparse import OptionParser
33
+ from cpgmodule import ireader
34
+ from cpgmodule.utils import *
35
+ from cpgmodule import BED
36
+ from cpgmodule import padjust
37
+ from cpgmodule._version import __version__
38
+
39
+ __author__ = "Liguo Wang"
40
+ __copyright__ = "Copyleft"
41
+ __credits__ = []
42
+ __license__ = "GPL"
43
+ __maintainer__ = "Liguo Wang"
44
+ __email__ = "wang.liguo@mayo.edu"
45
+ __status__ = "Development"
46
+
47
+
48
# Matches a "methyl_count,total_count" cell such as "20,30" (whitespace around
# the comma is tolerated).  Compiled once instead of once per cell.
_COUNT_PAIR = re.compile(r'(\d+)\s*\,\s*(\d+)')


def _sum_reads_by_group(fields, sample_IDs, s2g, group_IDs):
    """Sum methylated / unmethylated read counts per group for one data row.

    Parameters
    ----------
    fields : list of str
        Whitespace-split data row; ``fields[0]`` is the CpG ID and the
        remaining items are "c,n" cells aligned with ``sample_IDs``.
    sample_IDs : list of str
        Sample IDs taken from the header row of the data file.
    s2g : dict
        Maps sample ID -> group ID (built from the group file).
    group_IDs : list
        The group IDs to report; every one gets an entry in the result.

    Returns
    -------
    dict
        ``{group_ID: {'methyl': int, 'unmethyl': int}}``.

    Exits with status 1 when a cell has more methylated reads than total
    reads, or zero total reads.
    """
    totals = {gid: {'methyl': 0, 'unmethyl': 0} for gid in group_IDs}
    for sample, cell in zip(sample_IDs, fields[1:]):
        if sample not in s2g:
            # Column of a sample that is not listed in the group file:
            # it belongs to neither group, so it is ignored (the original
            # code raised KeyError here).
            continue
        hit = _COUNT_PAIR.match(cell)
        if hit is None:
            # Missing value / not in "c,n" form: contributes nothing.
            continue
        c = int(hit.group(1))
        n = int(hit.group(2))
        if n < c or n <= 0:
            printlog("Incorrect data format!")
            print(fields)
            sys.exit(1)
        totals[sample_group(s2g, sample)]['methyl'] += c
        totals[sample_group(s2g, sample)]['unmethyl'] += (n - c)
    return totals


def sample_group(s2g, sample):
    """Return the group ID of *sample* (plain lookup, kept separate for readability)."""
    return s2g[sample]


def main():
    """Command-line entry point: Fisher's exact test for every CpG.

    Reads the group file (``-g``) and the "c,n" data file (``-i``), sums the
    methylated / unmethylated reads of each of the two groups per CpG, runs
    ``scipy.stats.fisher_exact`` on the resulting 2x2 table, adjusts the
    p-values with ``padjust.multiple_testing_correction`` and writes the input
    table with three extra columns to ``<prefix>.pval.txt``.

    Exit codes: 101/102/103 for a missing ``-i``/``-g``/``-o`` option, 1 when
    the group file does not define exactly two groups or a cell is invalid,
    3 when a sample of the group file is absent from the data file.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input_file", action="store", type="string", dest="input_file", help="Data file containing methylation proportions (represented by \"methyl_count,total_count\", eg. \"20,30\") with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). This file can be a regular text file or compressed file (*.gz, *.bz2) or accessible url.")
    parser.add_option("-g", "--group", action="store", type="string", dest="group_file", help="Group file defining the biological groups of each sample. It is a comma-separated 2 columns file with the 1st column containing sample IDs, and the 2nd column containing group IDs. It must have a header row. Sample IDs should match to the \"Data file\".")
    parser.add_option("-o", "--output", action="store", type='string', dest="out_file", help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print()
    # All three options are mandatory; each has its own exit status.
    for value, exit_code in ((options.input_file, 101),
                             (options.group_file, 102),
                             (options.out_file, 103)):
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(exit_code)

    out_path = options.out_file + '.pval.txt'

    printlog("Read group file \"%s\" ..." % (options.group_file))
    (samples, groups) = read_grp_file1(options.group_file)
    s2g = dict(zip(samples, groups))
    g2s = collections.defaultdict(list)
    for sample, gid in s2g.items():
        g2s[gid].append(sample)

    group_IDs = sorted(g2s.keys())
    for gid in group_IDs:
        print("\tGroup %s has %d samples:" % (gid, len(g2s[gid])))
        print('\t\t' + ','.join(g2s[gid]))

    if len(group_IDs) != 2:
        # printlog is called with a single message argument everywhere else
        # in these scripts, so no ``file=`` keyword is passed here.
        printlog("You must have two groups!")
        sys.exit(1)

    # ---- pass 1: compute odds ratio and p-value of every CpG ----
    sample_IDs = None       # stays None until the header row has been read
    probe_list = []
    p_list = []
    or_list = []
    for line in ireader.reader(options.input_file):
        fields = line.split()
        if not fields:
            continue
        if sample_IDs is None:
            sample_IDs = fields[1:]
            header_samples = set(sample_IDs)
            # every sample of the group file must exist in the data file
            for sample in s2g:
                if sample not in header_samples:
                    printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (sample, options.input_file))
                    sys.exit(3)
            continue

        totals = _sum_reads_by_group(fields, sample_IDs, s2g, group_IDs)
        first, second = totals[group_IDs[0]], totals[group_IDs[1]]
        (odds, pval) = stats.fisher_exact([
            [first['methyl'], first['unmethyl']],
            [second['methyl'], second['unmethyl']],
        ])
        probe_list.append(fields[0])
        p_list.append(pval)
        or_list.append(odds)

    printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
    q_list = padjust.multiple_testing_correction(p_list)
    adjusted_p = {}
    for probe, odds, pval, qval in zip(probe_list, or_list, p_list, q_list):
        adjusted_p[probe] = '\t'.join([str(i) for i in (odds, pval, qval)])

    # ---- pass 2: re-read the input and append the three new columns ----
    # The output file is only created once all results exist, and the
    # ``with`` block guarantees it is closed.  Blank lines are skipped exactly
    # as in pass 1 so that header / probe rows stay aligned.
    printlog("Writing to %s" % out_path)
    header_written = False
    with open(out_path, 'w') as FOUT:
        for line in ireader.reader(options.input_file):
            fields = line.split()
            if not fields:
                continue
            if not header_written:
                print(line + '\tOddsRatio\tpval\tadj.pval', file=FOUT)
                header_written = True
            else:
                print(line + '\t' + adjusted_p[fields[0]], file=FOUT)


if __name__=='__main__':
    main()
@@ -0,0 +1,191 @@
1
+ #!python
2
+
3
+ """
4
+ Description
5
+ -----------
6
+ This program performs differential CpG analysis using linear regression model based on
7
+ beta values.
8
+ """
9
+
10
+
11
+ import sys,os
12
+ import collections
13
+ import subprocess
14
+ import numpy as np
15
+ from scipy import stats
16
+ from optparse import OptionParser
17
+ from cpgmodule import ireader
18
+ from cpgmodule.utils import *
19
+ from cpgmodule import BED
20
+ from cpgmodule import padjust
21
+ from cpgmodule._version import __version__
22
+
23
+ __author__ = "Liguo Wang"
24
+ __copyright__ = "Copyleft"
25
+ __credits__ = []
26
+ __license__ = "GPL"
27
+ __maintainer__ = "Liguo Wang"
28
+ __email__ = "wang.liguo@mayo.edu"
29
+ __status__ = "Development"
30
+
31
+
32
def _beta_token(cell):
    """Return the text written into the R script for one beta-value cell.

    Cells that cannot be parsed as a float, and non-finite values, become
    the R literal ``NaN``.  ``str(float('nan'))`` is ``'nan'`` and
    ``str(float('inf'))`` is ``'inf'``; neither is a valid R literal, so
    they must not be written verbatim.
    """
    try:
        value = float(cell)
    except ValueError:
        return "NaN"
    if not np.isfinite(value):
        return "NaN"
    return str(value)


def main():
    """Command-line entry point: per-CpG linear regression on beta values.

    Steps performed:

    1. Read the group/covariate file (``-g``) with ``read_grp_file2``.
    2. Generate ``<prefix>.r``: two R helper functions (``lrf1`` writes the
       results header + first row, ``lrf2`` appends further rows), the
       covariate vectors, and one function call per CpG row of ``-i``.
    3. Run the script with ``Rscript`` (stderr goes to
       ``<prefix>.warnings.txt``); R writes ``<prefix>.results.txt``.
    4. Collect the p-value column of the primary (first) variable, adjust it
       with ``padjust.multiple_testing_correction`` and write
       ``<prefix>.pval.txt`` (input table + ``pval`` and ``adj.pval``).

    Exit codes: 101-103 missing option, 104/105 missing input/group file,
    3 sample of the group file absent from the data file, 1 unknown variable
    type or Rscript could not be run / produced no results.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input_file", action="store", type="string", dest="input_file", help="Data file containing beta values with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). This file can be a regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-g", "--group", action="store", type="string", dest="group_file", help="Group file defining the biological groups of each sample as well as other covariables such as gender, age. The first variable is grouping variable (must be categorical), all the other variables are considered as covariates (can be categorical or continuous). Sample IDs should match to the \"Data file\".")
    parser.add_option("-o", "--output", action="store", type='string', dest="out_file", help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print()
    # All three options are mandatory; each has its own exit status.
    for value, exit_code in ((options.input_file, 101),
                             (options.group_file, 102),
                             (options.out_file, 103)):
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(exit_code)

    if not os.path.isfile(options.input_file):
        print("Input data file \"%s\" does not exist\n" % options.input_file)
        sys.exit(104)
    if not os.path.isfile(options.group_file):
        # report the group file here (the original printed the data file name)
        print("Input group file \"%s\" does not exist\n" % options.group_file)
        sys.exit(105)

    r_path = options.out_file + '.r'
    results_path = options.out_file + '.results.txt'
    warnings_path = options.out_file + '.warnings.txt'
    pval_path = options.out_file + '.pval.txt'

    printlog("Read group file \"%s\" ..." % (options.group_file))
    (samples, cv_names, cvs, v_types) = read_grp_file2(options.group_file)
    for cv_name in cv_names:
        print("%s: %s" % (cv_name, v_types[cv_name]))
        for sample in samples:
            print('\t' + sample + '\t' + cvs[cv_name][sample])

    primary_variable = cv_names[0]
    known_samples = set(samples)

    # ---- generate the R script ----
    with open(r_path, 'w') as ROUT:
        # lrf1: fits the model and (over)writes the results file with a header
        print('lrf1 <- function (cgid, y, %s){' % ','.join(cv_names), file=ROUT)
        print('try(fit <- glm(y ~ %s, family=gaussian))' % ('+'.join(cv_names)), file=ROUT)
        print('pvals <- coef(summary(fit))[,4]', file=ROUT)
        print('coefs <- coef(summary(fit))[,1]', file=ROUT)
        print('write.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1),quote=FALSE, row.names=FALSE, sep="\\t", col.names=c("ID",paste(names(coefs), "coef", sep="."), paste(names(pvals), "pval",sep=".")))' % results_path, file=ROUT)
        print('}', file=ROUT)
        print('\n', file=ROUT)

        # lrf2: same model, appends one row without header
        print('lrf2 <- function (cgid, y,%s){' % ','.join(cv_names), file=ROUT)
        print('try(fit <- glm(y ~ %s, family=gaussian))' % ('+'.join(cv_names)), file=ROUT)
        print('pvals <- coef(summary(fit))[,4]', file=ROUT)
        print('coefs <- coef(summary(fit))[,1]', file=ROUT)
        print('write.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1),quote=FALSE, row.names=FALSE, sep="\\t", col.names=FALSE, append = TRUE)' % results_path, file=ROUT)
        print('}', file=ROUT)
        print('\n', file=ROUT)

        printlog("Processing file \"%s\" ..." % (options.input_file))
        keep = None         # column indexes (into fields[1:]) used in the model
        first_row = True    # the first data row must go through lrf1
        for line in ireader.reader(options.input_file):
            fields = line.split()
            if not fields:
                continue
            if keep is None:
                # header row (first non-blank line)
                header = fields[1:]
                header_set = set(header)
                for s in samples:
                    if s not in header_set:
                        printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
                        sys.exit(3)
                # Only columns whose sample has covariates can enter the
                # model; other columns used to raise KeyError below.
                keep = [idx for idx, s in enumerate(header) if s in known_samples]
                sample_IDs = [header[idx] for idx in keep]
                ignored = [s for s in header if s not in known_samples]
                if ignored:
                    printlog("Ignore samples not found in group file: %s" % ','.join(ignored))
                for cv_name in cv_names:
                    values = ','.join([str(cvs[cv_name][s]) for s in sample_IDs])
                    if v_types[cv_name] == 'continuous':
                        print(cv_name + ' <- c(%s)' % values, file=ROUT)
                    elif v_types[cv_name] == 'categorical':
                        print(cv_name + ' <- as.factor(c(%s))' % values, file=ROUT)
                    else:
                        printlog("unknown vaiable type!")
                        sys.exit(1)
                print('\n', file=ROUT)
                continue

            cg_id = fields[0]
            cells = fields[1:]
            # short rows are padded with NaN so that y keeps the same length
            # as the covariate vectors
            beta_values = [_beta_token(cells[idx]) if idx < len(cells) else "NaN" for idx in keep]
            func = 'lrf1' if first_row else 'lrf2'
            first_row = False
            print('%s(\"%s\", c(%s), %s)' % (func, cg_id, ','.join(beta_values), ','.join(cv_names)), file=ROUT)

    # ---- run R ----
    # A results file left over from an earlier run must not be mistaken for
    # the output of this run if R fails before writing anything.
    if os.path.isfile(results_path):
        os.remove(results_path)

    printlog("Runing Rscript file \"%s\" ..." % r_path)
    try:
        # Argument list + explicit stderr redirection: no shell is involved,
        # so prefixes containing spaces or shell metacharacters are safe, and
        # a missing Rscript executable raises OSError instead of passing
        # unnoticed.
        with open(warnings_path, 'w') as warn_fh:
            status = subprocess.call(['Rscript', r_path], stderr=warn_fh)
    except OSError:
        print("Error: cannot run Rscript: \"%s\"" % r_path, file=sys.stderr)
        sys.exit(1)
    if status != 0:
        # keep going: rows written before the failure are still usable
        printlog("Rscript exited with status %d, see \"%s\"" % (status, warnings_path))
    if not os.path.isfile(results_path):
        print("Error: Rscript did not produce \"%s\"" % results_path, file=sys.stderr)
        sys.exit(1)

    # ---- read the p-value of the primary variable ----
    printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
    primary_v_index = None
    p_list = []
    probe_list = []
    with open(results_path, 'r') as results:
        for row_num, line in enumerate(results, start=1):
            cols = line.strip().split()
            if row_num == 1:
                # when several header names match, the last one is used
                for idx, name in enumerate(cols):
                    if name.startswith(primary_variable) and name.endswith('.pval'):
                        primary_v_index = idx
                if primary_v_index is None:
                    printlog("Cannot find the p-value column of \"%s\" in \"%s\"" % (primary_variable, results_path))
                    break
                continue
            try:
                pv = float(cols[primary_v_index])
            except (ValueError, IndexError):
                # e.g. "NA", or a row with fewer columns than the header
                continue
            if 0 <= pv <= 1:
                p_list.append(pv)
                probe_list.append(cols[0])

    # adjust
    q_list = padjust.multiple_testing_correction(p_list)

    # write
    adjusted_p = {}
    for probe, pval, qval in zip(probe_list, p_list, q_list):
        adjusted_p[probe] = '\t'.join([str(i) for i in (pval, qval)])

    printlog("Writing to %s" % pval_path)
    header_written = False
    with open(pval_path, 'w') as FOUT:
        for line in ireader.reader(options.input_file):
            fields = line.split()
            if not fields:
                # blank lines carry no probe; skipped as in the first pass
                continue
            if not header_written:
                print(line + '\tpval\tadj.pval', file=FOUT)
                header_written = True
            else:
                # probes without a usable p-value are reported as NaN
                print(line + '\t' + adjusted_p.get(fields[0], 'NaN\tNaN'), file=FOUT)


if __name__=='__main__':
    main()
@@ -0,0 +1,226 @@
1
+ #!python
2
+
3
+ """
4
+ Description
5
+ -----------
6
+ This program performs differential CpG analysis using the logistic regression model based on
7
+ methylation proportions (in the form of "c,n" where "c" indicates "Number of reads with
8
+ methylated C", and "n" indicates "Number of total reads". Both c and n are non-negative
9
+ integers and c <= n).
10
+
11
+ Example of input data
12
+ ---------------------
13
+ Below example showing input data on 2 CpGs of 3 groups (A,B, and C)
14
+ with each group has 3 replicates:
15
+
16
+ cgID A_1 A_2 A_3 B_1 B_2 B_3 C_1 C_2 C_3
17
+ CpG_1 129,170 166,178 7,9 16,16 10,10 10,15 11,15 16,22 20,36
18
+ CpG_2 0,77 0,99 0,85 0,77 1,37 3,37 0,42 0,153 0,6
19
+
20
+ """
21
+
22
+
23
+ import sys,os
24
+ import collections
25
+ import subprocess
26
+ import numpy as np
27
+ import re
28
+ from scipy import stats
29
+ from optparse import OptionParser
30
+ from cpgmodule import ireader
31
+ from cpgmodule.utils import *
32
+ from cpgmodule import BED
33
+ from cpgmodule import padjust
34
+ from cpgmodule._version import __version__
35
+
36
+ __author__ = "Liguo Wang"
37
+ __copyright__ = "Copyleft"
38
+ __credits__ = []
39
+ __license__ = "GPL"
40
+ __maintainer__ = "Liguo Wang"
41
+ __email__ = "wang.liguo@mayo.edu"
42
+ __status__ = "Development"
43
+
44
+
45
# Matches a "methyl_count,total_count" cell such as "20,30" (whitespace around
# the comma is tolerated).  Compiled once instead of once per cell.
_COUNT_PAIR = re.compile(r'(\d+)\s*\,\s*(\d+)')


def _count_tokens(cell):
    """Convert one "c,n" cell into the pair of texts written to the R script.

    Returns ``("NaN", "NaN")`` when the cell is not in "c,n" form (missing
    value), ``(str(c), str(n))`` for a valid cell, and ``None`` when the
    counts are inconsistent (more methylated than total reads, or zero total
    reads) so that the caller can abort.
    """
    hit = _COUNT_PAIR.match(cell)
    if hit is None:
        return ("NaN", "NaN")
    c = int(hit.group(1))
    n = int(hit.group(2))
    if n >= c and n > 0:
        return (str(c), str(n))
    return None


def main():
    """Command-line entry point: per-CpG logistic regression on "c,n" counts.

    Steps performed:

    1. Read the group/covariate file (``-g``) with ``read_grp_file2``.
    2. Generate ``<prefix>.r``: two R helper functions (``lrf1`` writes the
       results header + first row, ``lrf2`` appends further rows), the
       covariate vectors, and one function call per CpG row of ``-i``.  The
       GLM family is selected with ``-f`` (1 = quasibinomial, 2 = binomial).
    3. Run the script with ``Rscript`` (stderr goes to
       ``<prefix>.warnings.txt``); R writes ``<prefix>.results.txt``.
    4. Collect the p-value column of the primary (first) variable, adjust it
       with ``padjust.multiple_testing_correction`` and write
       ``<prefix>.pval.txt`` (input table + ``pval`` and ``adj.pval``).

    Exit codes: 101-103 missing option, 104/105 missing input/group file,
    106 invalid ``-f``, 3 sample of the group file absent from the data file,
    1 invalid cell, unknown variable type, or Rscript could not be run /
    produced no results.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input_file", action="store", type="string", dest="input_file", help="Data file containing methylation proportions (represented by \"methyl_count,total_count\", eg. \"20,30\") with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). This file can be a regular text file or compressed file (*.gz, *.bz2) or accessible url.")
    parser.add_option("-g", "--group", action="store", type="string", dest="group_file", help="Group file defining the biological groups of each sample as well as other covariables such as gender, age. The first variable is grouping variable (must be categorical), all the other variables are considered as covariates (can be categorical or continuous). Sample IDs should match to the \"Data file\".")
    parser.add_option("-f", "--family", action="store", type="int", dest="family_func", default=1, help="Error distribution and link function to be used in the GLM model. Can be integer 1 or 2 with 1 = \"quasibinomial\" and 2 = \"binomial\". Default=%default.")
    parser.add_option("-o", "--output", action="store", type='string', dest="out_file", help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print()
    # The three file options are mandatory; each has its own exit status.
    for value, exit_code in ((options.input_file, 101),
                             (options.group_file, 102),
                             (options.out_file, 103)):
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(exit_code)

    if not os.path.isfile(options.input_file):
        print("Input data file \"%s\" does not exist\n" % options.input_file)
        sys.exit(104)
    if not os.path.isfile(options.group_file):
        # report the group file here (the original printed the data file name)
        print("Input group file \"%s\" does not exist\n" % options.group_file)
        sys.exit(105)

    family = {1: 'quasibinomial', 2: 'binomial'}
    if options.family_func not in family:
        print("Incorrect value of '-f'!")
        sys.exit(106)
    family_name = family[options.family_func]

    r_path = options.out_file + '.r'
    results_path = options.out_file + '.results.txt'
    warnings_path = options.out_file + '.warnings.txt'
    pval_path = options.out_file + '.pval.txt'

    printlog("Read group file \"%s\" ..." % (options.group_file))
    (samples, cv_names, cvs, v_types) = read_grp_file2(options.group_file)
    for cv_name in cv_names:
        print("%s: %s" % (cv_name, v_types[cv_name]))
        for sample in samples:
            print('\t' + sample + '\t' + cvs[cv_name][sample])

    primary_variable = cv_names[0]
    known_samples = set(samples)

    # ---- generate the R script ----
    with open(r_path, 'w') as ROUT:
        # lrf1: fits the model and (over)writes the results file with a header
        print('lrf1 <- function (cgid, m,t,%s){' % ','.join(cv_names), file=ROUT)
        print('try(fit <- glm(cbind(m,t - m) ~ %s, family=%s))' % ('+'.join(cv_names), family_name), file=ROUT)
        print('pvals <- coef(summary(fit))[,4]', file=ROUT)
        print('coefs <- coef(summary(fit))[,1]', file=ROUT)
        print('\tif(max(pvals, na.rm=T)>1){pvals = pvals + NA}', file=ROUT)
        print('\tif(sum(m, na.rm=T) == 0){pvals = pvals + NA}', file=ROUT)
        print('write.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1),quote=FALSE, row.names=FALSE, sep="\\t", col.names=c("ID",paste(gsub("2","",names(coefs)), "coef",sep="."), paste(gsub("2","",names(pvals)), "pval",sep=".")))' % results_path, file=ROUT)
        print('}', file=ROUT)
        print('\n', file=ROUT)

        # lrf2: same model, appends one row without header
        print('lrf2 <- function (cgid, m,t,%s){' % ','.join(cv_names), file=ROUT)
        print('try(fit <- glm(cbind(m,t - m) ~ %s, family=%s))' % ('+'.join(cv_names), family_name), file=ROUT)
        print('pvals <- coef(summary(fit))[,4]', file=ROUT)
        print('coefs <- coef(summary(fit))[,1]', file=ROUT)
        print('\tif(max(pvals, na.rm=T)>1){pvals = pvals + NA}', file=ROUT)
        print('\tif(sum(m, na.rm=T) == 0){pvals = pvals + NA}', file=ROUT)
        print('write.table(file=\"%s\",x=matrix(c(cgid, as.vector(coefs), as.vector(pvals)), nrow=1),quote=FALSE, row.names=FALSE, sep="\\t", col.names=FALSE, append = TRUE)' % results_path, file=ROUT)
        print('}', file=ROUT)
        print('\n', file=ROUT)

        printlog("Processing file \"%s\" ..." % (options.input_file))
        keep = None         # column indexes (into fields[1:]) used in the model
        first_row = True    # the first data row must go through lrf1
        for line in ireader.reader(options.input_file):
            fields = line.split()
            if not fields:
                continue
            if keep is None:
                # header row (first non-blank line)
                header = fields[1:]
                header_set = set(header)
                for s in samples:
                    if s not in header_set:
                        printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
                        sys.exit(3)
                # Only columns whose sample has covariates can enter the
                # model; other columns used to raise KeyError below.
                keep = [idx for idx, s in enumerate(header) if s in known_samples]
                sample_IDs = [header[idx] for idx in keep]
                ignored = [s for s in header if s not in known_samples]
                if ignored:
                    printlog("Ignore samples not found in group file: %s" % ','.join(ignored))
                for cv_name in cv_names:
                    values = ','.join([str(cvs[cv_name][s]) for s in sample_IDs])
                    if v_types[cv_name] == 'continuous':
                        print(cv_name + ' <- c(%s)' % values, file=ROUT)
                    elif v_types[cv_name] == 'categorical':
                        print(cv_name + ' <- as.factor(c(%s))' % values, file=ROUT)
                    else:
                        printlog("unknown vaiable type!")
                        sys.exit(1)
                print('\n', file=ROUT)
                continue

            cg_id = fields[0]
            cells = fields[1:]
            methyl_reads = []   # c
            total_reads = []    # n
            for idx in keep:
                # short rows are padded with NaN so that m and t keep the
                # same length as the covariate vectors
                pair = _count_tokens(cells[idx]) if idx < len(cells) else ("NaN", "NaN")
                if pair is None:
                    printlog("Incorrect data format!")
                    print(fields)
                    sys.exit(1)
                methyl_reads.append(pair[0])
                total_reads.append(pair[1])
            func = 'lrf1' if first_row else 'lrf2'
            first_row = False
            print('%s(\"%s\", c(%s), c(%s), %s)' % (func, cg_id, ','.join(methyl_reads), ','.join(total_reads), ','.join(cv_names)), file=ROUT)

    # ---- run R ----
    # A results file left over from an earlier run must not be mistaken for
    # the output of this run if R fails before writing anything.
    if os.path.isfile(results_path):
        os.remove(results_path)

    printlog("Runing Rscript file \"%s\" ..." % r_path)
    try:
        # Argument list + explicit stderr redirection: no shell is involved,
        # so prefixes containing spaces or shell metacharacters are safe, and
        # a missing Rscript executable raises OSError instead of passing
        # unnoticed.
        with open(warnings_path, 'w') as warn_fh:
            status = subprocess.call(['Rscript', r_path], stderr=warn_fh)
    except OSError:
        print("Error: cannot run Rscript: \"%s\"" % r_path, file=sys.stderr)
        sys.exit(1)
    if status != 0:
        # keep going: rows written before the failure are still usable
        printlog("Rscript exited with status %d, see \"%s\"" % (status, warnings_path))
    if not os.path.isfile(results_path):
        print("Error: Rscript did not produce \"%s\"" % results_path, file=sys.stderr)
        sys.exit(1)

    # ---- read the p-value of the primary variable ----
    printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
    primary_v_index = None
    p_list = []
    probe_list = []
    with open(results_path, 'r') as results:
        for row_num, line in enumerate(results, start=1):
            cols = line.strip().split()
            if row_num == 1:
                # when several header names match, the last one is used
                for idx, name in enumerate(cols):
                    if name.startswith(primary_variable) and name.endswith('.pval'):
                        primary_v_index = idx
                if primary_v_index is None:
                    printlog("Cannot find the p-value column of \"%s\" in \"%s\"" % (primary_variable, results_path))
                    break
                continue
            try:
                pv = float(cols[primary_v_index])
            except (ValueError, IndexError):
                # e.g. "NA", or a row with fewer columns than the header
                continue
            if 0 <= pv <= 1:
                p_list.append(pv)
                probe_list.append(cols[0])

    # adjust
    q_list = padjust.multiple_testing_correction(p_list)

    # write
    adjusted_p = {}
    for probe, pval, qval in zip(probe_list, p_list, q_list):
        adjusted_p[probe] = '\t'.join([str(i) for i in (pval, qval)])

    printlog("Writing to %s" % pval_path)
    header_written = False
    with open(pval_path, 'w') as FOUT:
        for line in ireader.reader(options.input_file):
            fields = line.split()
            if not fields:
                # blank lines carry no probe; skipped as in the first pass
                continue
            if not header_written:
                print(line + '\tpval\tadj.pval', file=FOUT)
                header_written = True
            else:
                # probes without a usable p-value are reported as NaN
                print(line + '\t' + adjusted_p.get(fields[0], 'NaN\tNaN'), file=FOUT)


if __name__=='__main__':
    main()
226
+