cpgtools 2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. cpgmodule/BED.py +441 -0
  2. cpgmodule/MI.py +193 -0
  3. cpgmodule/__init__.py +0 -0
  4. cpgmodule/_version.py +1 -0
  5. cpgmodule/cgID.py +866897 -0
  6. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  7. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  8. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  9. cpgmodule/data/GA_Bohlin.pkl +0 -0
  10. cpgmodule/data/GA_Haftorn.pkl +0 -0
  11. cpgmodule/data/GA_Knight.pkl +0 -0
  12. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  13. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  14. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  15. cpgmodule/data/GA_Mayne.pkl +0 -0
  16. cpgmodule/data/Hannum.pkl +0 -0
  17. cpgmodule/data/Horvath_2013.pkl +0 -0
  18. cpgmodule/data/Horvath_2018.pkl +0 -0
  19. cpgmodule/data/Levine.pkl +0 -0
  20. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  21. cpgmodule/data/Ped_McEwen.pkl +0 -0
  22. cpgmodule/data/Ped_Wu.pkl +0 -0
  23. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  24. cpgmodule/data/Zhang_EN.pkl +0 -0
  25. cpgmodule/data/__init__.py +0 -0
  26. cpgmodule/extend_bed.py +147 -0
  27. cpgmodule/imotif.py +348 -0
  28. cpgmodule/ireader.py +28 -0
  29. cpgmodule/methylClock.py +53 -0
  30. cpgmodule/padjust.py +58 -0
  31. cpgmodule/region2gene.py +170 -0
  32. cpgmodule/utils.py +642 -0
  33. cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
  34. cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
  35. cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
  36. cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
  37. cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
  38. cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
  39. cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
  40. cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
  41. cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
  42. cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
  43. cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
  44. cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
  45. cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
  46. cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
  47. cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
  48. cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
  49. cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
  50. cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
  51. cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
  52. cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
  53. cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
  54. cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
  55. cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
  56. cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
  57. cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
  58. cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
  59. cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
  60. cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
  61. cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
  62. cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
  63. cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
  64. cpgtools-2.0.5.dist-info/METADATA +59 -0
  65. cpgtools-2.0.5.dist-info/RECORD +104 -0
  66. cpgtools-2.0.5.dist-info/WHEEL +5 -0
  67. cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
  68. cpgtools-2.0.5.dist-info/top_level.txt +5 -0
  69. impyute/__init__.py +3 -0
  70. impyute/contrib/__init__.py +7 -0
  71. impyute/contrib/compare.py +69 -0
  72. impyute/contrib/count_missing.py +30 -0
  73. impyute/contrib/describe.py +63 -0
  74. impyute/cs/__init__.py +11 -0
  75. impyute/cs/buck_iterative.py +82 -0
  76. impyute/cs/central_tendency.py +84 -0
  77. impyute/cs/em.py +52 -0
  78. impyute/cs/fast_knn.py +130 -0
  79. impyute/cs/random.py +27 -0
  80. impyute/dataset/__init__.py +6 -0
  81. impyute/dataset/base.py +137 -0
  82. impyute/dataset/corrupt.py +55 -0
  83. impyute/deletion/__init__.py +5 -0
  84. impyute/deletion/complete_case.py +21 -0
  85. impyute/ops/__init__.py +12 -0
  86. impyute/ops/error.py +9 -0
  87. impyute/ops/inverse_distance_weighting.py +31 -0
  88. impyute/ops/matrix.py +47 -0
  89. impyute/ops/testing.py +20 -0
  90. impyute/ops/util.py +96 -0
  91. impyute/ops/wrapper.py +179 -0
  92. impyute/ts/__init__.py +6 -0
  93. impyute/ts/locf.py +57 -0
  94. impyute/ts/moving_window.py +128 -0
  95. impyutelib.py +890 -0
  96. missingpy/__init__.py +4 -0
  97. missingpy/knnimpute.py +328 -0
  98. missingpy/missforest.py +556 -0
  99. missingpy/pairwise_external.py +315 -0
  100. missingpy/tests/__init__.py +0 -0
  101. missingpy/tests/test_knnimpute.py +605 -0
  102. missingpy/tests/test_missforest.py +409 -0
  103. missingpy/utils.py +124 -0
  104. misspylib.py +565 -0
@@ -0,0 +1,176 @@
1
+ #!python
2
+
3
+ """
4
+ Description
5
+ -----------
6
+ This program performs differential CpG analysis using the Mann-Whitney U test
7
+ for two group comparison, and the Kruskal-Wallis H-test for multiple group
8
+ comparison.
9
+ """
10
+
11
+
12
+ import sys,os
13
+ import collections
14
+ import subprocess
15
+ import numpy as np
16
+ from scipy import stats
17
+ from optparse import OptionParser
18
+ from cpgmodule import ireader
19
+ from cpgmodule.utils import *
20
+ from cpgmodule import BED
21
+ from cpgmodule import padjust
22
+ from cpgmodule._version import __version__
23
+
24
+ __author__ = "Liguo Wang"
25
+ __copyright__ = "Copyleft"
26
+ __credits__ = []
27
+ __license__ = "GPL"
28
+ __maintainer__ = "Liguo Wang"
29
+ __email__ = "wang.liguo@mayo.edu"
30
+ __status__ = "Development"
31
+
32
def mwu_test(a, b):
    '''
    Two-sided Mann-Whitney U test of two independent samples.

    NaN entries (missing beta values) are dropped from each sample before
    testing, which matches how kruskal_test() treats them.

    Parameters
    ----------
    a, b : sequences of numbers (may contain NaN).

    Returns
    -------
    (p, t) : the p-value and the U statistic. Both are NaN when either
        sample has no usable value or when the test itself fails.
    '''
    p = np.nan
    t = np.nan
    try:
        x = np.asarray(a, dtype=float)
        y = np.asarray(b, dtype=float)
        # A rank test cannot place NaN, so remove missing values up front.
        x = x[~np.isnan(x)]
        y = y[~np.isnan(y)]
        if x.size > 0 and y.size > 0:
            tmp = stats.mannwhitneyu(x, y, alternative='two-sided')
            p = tmp.pvalue
            t = tmp.statistic
    except Exception:
        # Best effort: one untestable probe must not abort the whole run.
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        pass
    return (p,t)
45
+
46
def kruskal_test(*args):
    '''
    Kruskal-Wallis H-test for two or more independent samples.

    Parameters
    ----------
    *args : one sequence of numbers per group. NaN entries are omitted by
        scipy (nan_policy='omit').

    Returns
    -------
    (p, t) : the p-value and the H statistic. Both are NaN when the test
        cannot be computed (for example fewer than two groups).
    '''
    p = np.nan
    t = np.nan
    try:
        tmp = stats.kruskal(*args, nan_policy='omit')
        p = tmp.pvalue
        t = tmp.statistic
    except Exception:
        # Best effort: one untestable probe must not abort the whole run.
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        pass
    return (p,t)
59
+
60
def main():
    '''
    Command-line entry point of the non-parametric differential CpG test.

    Steps:
      1. Parse -i/-g/-o; a missing option prints the module docstring and
         the help text, then exits with status 101, 102 or 103.
      2. Read the group file and report the samples of each group. Exit
         with status 1 when fewer than two groups are defined.
      3. Read the data file. The first non-blank line is the header of
         sample IDs; exit with status 3 when a sample from the group file
         is absent from it. For every other non-blank line, run mwu_test()
         (two groups) or kruskal_test() (three or more groups).
      4. Adjust the usable p-values with padjust.multiple_testing_correction().
      5. Re-read the data file and write "<prefix>.pval.txt": every input
         line followed by the "pval" and "adj.pval" columns.
    '''
    usage="%prog [options]" + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Data file containing beta values with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). Except for the 1st row and 1st column, any non-numerical values will be considered as \"missing values\" and ignored. This file can be a regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Group file defining the biological group of each sample. It is a comma-separated two columns file with the 1st column containing sample IDs, and the 2nd column containing group IDs. It must have a header row. Sample IDs should match to the \"Data file\". Note: automatically switch to use Kruskal-Wallis H-test if more than two groups were defined in this file.")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options,args)=parser.parse_args()

    print ()

    # Each required option has its own exit status.
    required = (
        (options.input_file, 101),
        (options.group_file, 102),
        (options.out_file, 103),
    )
    for value, exit_code in required:
        if not value:
            print (__doc__)
            parser.print_help()
            sys.exit(exit_code)

    out_name = options.out_file + '.pval.txt'

    printlog("Read group file \"%s\" ..." % (options.group_file))
    (samples, groups) = read_grp_file1(options.group_file)
    s2g = dict(zip(samples, groups))
    g2s = collections.defaultdict(list)
    for sample, group in s2g.items():
        g2s[group].append(sample)

    group_IDs = sorted(g2s.keys())
    for gid in group_IDs:
        print ("\tGroup %s has %d samples:" % (gid, len(g2s[gid])))
        print ('\t\t' + ','.join(g2s[gid]))

    if len(group_IDs) < 2:
        printlog("You must have at least two groups!", file=sys.stderr)
        sys.exit(1)
    elif len(group_IDs) == 2:
        printlog("Perfrom Mann-Whitney rank test of two samples ...")
    else:
        printlog("Perfrom Kruskal-Wallis H-test ...")

    sample_IDs = None    # stays None until the header line has been read
    probe_list = []
    p_list = []
    for l in ireader.reader(options.input_file):
        f = l.split()
        if len(f) == 0: continue

        if sample_IDs is None:
            sample_IDs = f[1:]

            # check if sample ID matches
            known = set(sample_IDs)
            for s in s2g:
                if s not in known:
                    printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
                    sys.exit(3)
            continue

        g2values = collections.defaultdict(list)
        probe_ID = f[0]
        for s, raw in zip(sample_IDs, f[1:]):
            #skip if s not in group file
            if s not in s2g:
                continue

            #deal with non-numerical values
            try:
                beta = float(raw)
            except ValueError:
                beta = np.nan
            g2values[s2g[s]].append(beta)

        # Start from NaN so that a row with fewer than two groups (e.g. a
        # truncated line) can never reuse the previous probe's p-value.
        pval = np.nan
        present = [gid for gid in group_IDs if gid in g2values]
        if len(present) == 2:
            (pval,tscore) = mwu_test(np.array(g2values[present[0]]), np.array(g2values[present[1]]))
        elif len(present) >= 3:
            (pval,tscore) = kruskal_test(*[np.array(g2values[gid]) for gid in present])

        # Only usable p-values enter the multiple-testing correction.
        # NOTE(review): padjust's handling of NaN is not visible from here;
        # leaving NaN out keeps the correction independent of it.
        if pval >= 0 and pval <= 1:
            probe_list.append(probe_ID)
            p_list.append(pval)

    printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
    adjusted_p = {}
    q_list = padjust.multiple_testing_correction(p_list)
    for probe_ID, p, q in zip(probe_list, p_list, q_list):
        adjusted_p[probe_ID] = '\t'.join([str(i) for i in (p,q)])

    printlog("Writing to %s" % (out_name))
    # "with" guarantees the handle is closed, including on an error exit.
    with open(out_name, 'w') as FOUT:
        header_written = False
        for l in ireader.reader(options.input_file):
            f = l.split()
            # Skip blank lines exactly as the first pass does.
            if len(f) == 0: continue
            if not header_written:
                print (l + '\tpval\tadj.pval', file=FOUT)
                header_written = True
            else:
                # Probes without a usable p-value get "nan" in both columns.
                print (l + '\t' + adjusted_p.get(f[0], 'nan\tnan'), file=FOUT)
171
+
172
+
173
+
174
+
175
+ if __name__=='__main__':
176
+ main()
@@ -0,0 +1,222 @@
1
+ #!python
2
+
3
+ """
4
+ Description
5
+ -----------
6
+ This program performs differential CpG analysis based on beta values. It uses Student's
7
+ t-test for two-group comparison and ANOVA for multiple groups comparison.
8
+ """
9
+
10
+
11
+ import sys,os
12
+ import collections
13
+ import subprocess
14
+ import numpy as np
15
+ from scipy import stats
16
+ from optparse import OptionParser
17
+ from cpgmodule import ireader
18
+ from cpgmodule.utils import *
19
+ from cpgmodule import BED
20
+ from cpgmodule import padjust
21
+ from cpgmodule._version import __version__
22
+
23
+ __author__ = "Liguo Wang"
24
+ __copyright__ = "Copyleft"
25
+ __credits__ = []
26
+ __license__ = "GPL"
27
+ __maintainer__ = "Liguo Wang"
28
+ __email__ = "wang.liguo@mayo.edu"
29
+ __status__ = "Development"
30
+
31
def standard_ttest(a, b, equalVar=True, nanPolicy='omit'):
    '''
    T-test for the means of two independent samples of scores.

    Parameters
    ----------
    a, b : sequences of numbers.
    equalVar : passed to scipy as "equal_var". True runs the standard
        pooled-variance t-test, False runs Welch's t-test.
    nanPolicy : passed to scipy as "nan_policy"; the default 'omit'
        ignores NaN entries.

    Returns
    -------
    (p, t) : the p-value and the t statistic. Both are NaN when the test
        cannot be computed.
    '''
    p = np.nan
    t = np.nan
    try:
        tmp = stats.ttest_ind(a, b, equal_var = equalVar, nan_policy = nanPolicy)
        p = tmp.pvalue
        t = tmp.statistic
    except Exception:
        # Best effort: one untestable probe must not abort the whole run.
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        pass
    return (p,t)
44
+
45
def paired_ttest(a, b, nanPolicy='omit'):
    '''
    T-test on TWO RELATED samples of scores, a and b.

    Parameters
    ----------
    a, b : sequences of numbers of the same length; element i of "a" is
        paired with element i of "b".
    nanPolicy : passed to scipy as "nan_policy"; the default 'omit'
        ignores NaN entries.

    Returns
    -------
    (p, t) : the p-value and the t statistic. Both are NaN when the test
        cannot be computed (for example when the lengths differ).
    '''
    p = np.nan
    t = np.nan
    try:
        tmp = stats.ttest_rel(a,b, nan_policy = nanPolicy)
        p = tmp.pvalue
        t = tmp.statistic
    except Exception:
        # Best effort: one untestable probe must not abort the whole run.
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        pass

    return (p,t)
59
+
60
def anova(*args):
    '''
    One-way ANOVA: tests the null hypothesis that the groups share the
    same population mean.

    NaN entries (missing beta values) are dropped from every group before
    testing, so they are ignored here just as the t-test helpers ignore
    them through nan_policy='omit'.

    Parameters
    ----------
    *args : one sequence of numbers per group (may contain NaN).

    Returns
    -------
    (p, t) : the p-value and the F statistic. Both are NaN when the test
        cannot be computed.
    '''
    p = np.nan
    t = np.nan
    try:
        groups = []
        for values in args:
            values = np.asarray(values, dtype=float)
            # Any NaN passed to f_oneway would make the whole result NaN.
            groups.append(values[~np.isnan(values)])
        tmp = stats.f_oneway(*groups)
        p = tmp.pvalue
        t = tmp.statistic
    except Exception:
        # Best effort: one untestable probe must not abort the whole run.
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        pass
    return (p,t)
73
+
74
def main():
    '''
    Command-line entry point of the t-test / ANOVA differential CpG test.

    Steps:
      1. Parse -i/-g/-p/-w/-o; a missing -i, -g or -o prints the module
         docstring and the help text, then exits with status 101, 102 or
         103.
      2. Read the group file and report the samples of each group. Exit
         with status 1 when fewer than two groups are defined, and with
         status 2 when a paired test is requested for groups of unequal
         size.
      3. Read the data file. The first non-blank line is the header of
         sample IDs; samples of the group file that are absent from it are
         reported and left out. For every other non-blank line:
           - two groups: delta_beta = mean(group 1) - mean(group 2), with
             groups in sorted order and NaN ignored, then paired_ttest()
             when -p is given, otherwise standard_ttest() (Welch's variant
             only when -w is given);
           - three or more groups: anova().
         Only p-values inside [0, 1] are kept.
      4. Adjust the kept p-values with padjust.multiple_testing_correction().
      5. Re-read the data file and write "<prefix>.pval.txt": every input
         line followed by "delta_beta", "pval" and "adj.pval". A probe
         without a kept p-value gets "n/a" in all three columns.
    '''
    usage="%prog [options]" + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Data file containing beta values with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). Except for the 1st row and 1st column, any non-numerical values will be considered as \"missing values\" and ignored. This file can be a regular text file or compressed file (.gz, .bz2).")
    parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Group file defining the biological group of each sample. It is a comma-separated 2 columns file with the 1st column containing sample IDs, and the 2nd column containing group IDs. It must have a header row. Sample IDs should match to the \"Data file\". Note: automatically switch to use ANOVA if more than 2 groups were defined in this file.")
    parser.add_option("-p","--paired",action="store_true",default=False,dest="paired",help="If True, performs a paired t-test (the paired sampels are matched by the order). If False, performs a standard independent 2 sample t-test. default=%default")
    parser.add_option("-w","--welch",action="store_true",default=False,dest="welch_ttest",help="If True, performs Welch's t-test which does not assume the two samples have equal variance. If False, performs a standard two-sample t-test (i.e. assuming the two samples have equal variance). default=%default")
    parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
    (options,args)=parser.parse_args()

    print ()

    # Each required option has its own exit status.
    required = (
        (options.input_file, 101),
        (options.group_file, 102),
        (options.out_file, 103),
    )
    for value, exit_code in required:
        if not value:
            print (__doc__)
            parser.print_help()
            sys.exit(exit_code)

    out_name = options.out_file + '.pval.txt'

    printlog("Read group file \"%s\" ..." % (options.group_file))
    (ss,gs) = read_grp_file1(options.group_file)

    s2g = {}
    g2s = collections.defaultdict(list)
    for s, g in zip(ss, gs):
        s2g[s] = g
        g2s[g].append(s)

    group_IDs = sorted(g2s.keys())
    for gid in group_IDs:
        print ("\tGroup %s has %d samples:" % (gid, len(g2s[gid])))
        print ('\t\t' + ','.join(g2s[gid]))

    if len(group_IDs) < 2:
        printlog("You must have at least two groups!", file=sys.stderr)
        sys.exit(1)
    elif len(group_IDs) == 2:
        if options.paired:
            printlog("Perfrom paired t-test of two related samples ...")
            if len(g2s[group_IDs[0]]) != len(g2s[group_IDs[1]]):
                printlog("Unequal sample size. Cannot perform paired t-test.")
                sys.exit(2)
        else:
            printlog("Perfrom standard t-test of two independent samples ...")
    else:
        printlog("Perfrom ANOVA ...")

    sample_IDs = None    # stays None until the header line has been read
    probe_list = []
    p_list = []
    delta_beta = {}
    for l in ireader.reader(options.input_file):
        f = l.split()
        if len(f) == 0: continue

        if sample_IDs is None:
            sample_IDs = f[1:]

            # check if sample ID matches
            known = set(sample_IDs)
            for s in s2g:
                if s not in known:
                    printlog("Cannot find sample ID \"%s\" from file \"%s\". Exclude this sample from differential analysis." % (s, options.input_file))
            continue

        g2values = collections.defaultdict(list)
        probe_ID = f[0]
        for s, raw in zip(sample_IDs, f[1:]):
            #skip if s not in group file
            if s not in s2g:
                continue

            #deal with non-numerical values
            try:
                beta = float(raw)
            except ValueError:
                beta = np.nan
            g2values[s2g[s]].append(beta)

        # Start from NaN so that a row with fewer than two groups can never
        # reuse the previous probe's p-value.
        pval = np.nan
        # Groups that really have samples on this row, in sorted order.
        present = [gid for gid in group_IDs if gid in g2values]
        if len(present) == 2:
            a = np.array(g2values[present[0]])
            b = np.array(g2values[present[1]])
            # nanmean ignores missing values, exactly like the tests below
            # (nan_policy='omit'); an all-missing group still yields NaN.
            delta_beta[probe_ID] = np.nanmean(a) - np.nanmean(b)
            if options.paired:
                (pval,tscore) = paired_ttest(a,b)
            else:
                # -w asks for Welch's test, i.e. variances NOT assumed equal.
                (pval,tscore) = standard_ttest(a,b, equalVar = not options.welch_ttest)
        elif len(present) >= 3:
            (pval,tscore) = anova(*[np.array(g2values[gid]) for gid in present])

        # NaN (and anything else outside [0, 1]) fails this check and is
        # kept out of the multiple-testing correction.
        if pval >= 0 and pval <= 1:
            probe_list.append(probe_ID)
            p_list.append(pval)

    printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
    adjusted_p = {}
    q_list = padjust.multiple_testing_correction(p_list)
    for probe_ID, p, q in zip(probe_list, p_list, q_list):
        adjusted_p[probe_ID] = '\t'.join([str(i) for i in (p,q)])

    printlog("Writing to %s" % (out_name))
    # "with" guarantees the handle is closed, including on an error exit.
    with open(out_name, 'w') as FOUT:
        header_written = False
        for l in ireader.reader(options.input_file):
            f = l.split()
            # Skip blank lines exactly as the first pass does.
            if len(f) == 0: continue
            if not header_written:
                print (l + '\tdelta_beta\tpval\tadj.pval', file=FOUT)
                header_written = True
                continue

            probe_ID = f[0]
            if probe_ID in adjusted_p:
                # delta_beta exists only for two-group comparisons.
                delta = delta_beta.get(probe_ID, 'n/a')
                print (l + '\t' + str(delta) + '\t' + adjusted_p[probe_ID], file=FOUT)
            else:
                print (l + '\t' + 'n/a' + '\t' + 'n/a' + '\t' + 'n/a', file=FOUT)
217
+
218
+
219
+
220
+
221
+ if __name__=='__main__':
222
+ main()