cpgtools 2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. cpgmodule/BED.py +441 -0
  2. cpgmodule/MI.py +193 -0
  3. cpgmodule/__init__.py +0 -0
  4. cpgmodule/_version.py +1 -0
  5. cpgmodule/cgID.py +866897 -0
  6. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  7. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  8. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  9. cpgmodule/data/GA_Bohlin.pkl +0 -0
  10. cpgmodule/data/GA_Haftorn.pkl +0 -0
  11. cpgmodule/data/GA_Knight.pkl +0 -0
  12. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  13. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  14. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  15. cpgmodule/data/GA_Mayne.pkl +0 -0
  16. cpgmodule/data/Hannum.pkl +0 -0
  17. cpgmodule/data/Horvath_2013.pkl +0 -0
  18. cpgmodule/data/Horvath_2018.pkl +0 -0
  19. cpgmodule/data/Levine.pkl +0 -0
  20. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  21. cpgmodule/data/Ped_McEwen.pkl +0 -0
  22. cpgmodule/data/Ped_Wu.pkl +0 -0
  23. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  24. cpgmodule/data/Zhang_EN.pkl +0 -0
  25. cpgmodule/data/__init__.py +0 -0
  26. cpgmodule/extend_bed.py +147 -0
  27. cpgmodule/imotif.py +348 -0
  28. cpgmodule/ireader.py +28 -0
  29. cpgmodule/methylClock.py +53 -0
  30. cpgmodule/padjust.py +58 -0
  31. cpgmodule/region2gene.py +170 -0
  32. cpgmodule/utils.py +642 -0
  33. cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
  34. cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
  35. cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
  36. cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
  37. cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
  38. cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
  39. cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
  40. cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
  41. cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
  42. cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
  43. cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
  44. cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
  45. cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
  46. cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
  47. cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
  48. cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
  49. cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
  50. cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
  51. cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
  52. cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
  53. cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
  54. cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
  55. cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
  56. cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
  57. cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
  58. cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
  59. cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
  60. cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
  61. cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
  62. cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
  63. cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
  64. cpgtools-2.0.5.dist-info/METADATA +59 -0
  65. cpgtools-2.0.5.dist-info/RECORD +104 -0
  66. cpgtools-2.0.5.dist-info/WHEEL +5 -0
  67. cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
  68. cpgtools-2.0.5.dist-info/top_level.txt +5 -0
  69. impyute/__init__.py +3 -0
  70. impyute/contrib/__init__.py +7 -0
  71. impyute/contrib/compare.py +69 -0
  72. impyute/contrib/count_missing.py +30 -0
  73. impyute/contrib/describe.py +63 -0
  74. impyute/cs/__init__.py +11 -0
  75. impyute/cs/buck_iterative.py +82 -0
  76. impyute/cs/central_tendency.py +84 -0
  77. impyute/cs/em.py +52 -0
  78. impyute/cs/fast_knn.py +130 -0
  79. impyute/cs/random.py +27 -0
  80. impyute/dataset/__init__.py +6 -0
  81. impyute/dataset/base.py +137 -0
  82. impyute/dataset/corrupt.py +55 -0
  83. impyute/deletion/__init__.py +5 -0
  84. impyute/deletion/complete_case.py +21 -0
  85. impyute/ops/__init__.py +12 -0
  86. impyute/ops/error.py +9 -0
  87. impyute/ops/inverse_distance_weighting.py +31 -0
  88. impyute/ops/matrix.py +47 -0
  89. impyute/ops/testing.py +20 -0
  90. impyute/ops/util.py +96 -0
  91. impyute/ops/wrapper.py +179 -0
  92. impyute/ts/__init__.py +6 -0
  93. impyute/ts/locf.py +57 -0
  94. impyute/ts/moving_window.py +128 -0
  95. impyutelib.py +890 -0
  96. missingpy/__init__.py +4 -0
  97. missingpy/knnimpute.py +328 -0
  98. missingpy/missforest.py +556 -0
  99. missingpy/pairwise_external.py +315 -0
  100. missingpy/tests/__init__.py +0 -0
  101. missingpy/tests/test_knnimpute.py +605 -0
  102. missingpy/tests/test_missforest.py +409 -0
  103. missingpy/utils.py +124 -0
  104. misspylib.py +565 -0
@@ -0,0 +1,99 @@
1
+ #!python
2
+
3
+ """
4
+ #=========================================================================================
5
+ This program picks the top N rows (according to standard deviation) from the input file.
6
+ The resulting file can be used for clustering/PCA analysis.
7
+
8
+ Example of input data file
9
+ ---------------------------
10
+ CpG_ID Sample_01 Sample_02 Sample_03 Sample_04
11
+ cg_001 0.831035 0.878022 0.794427 0.880911
12
+ cg_002 0.249544 0.209949 0.234294 0.236680
13
+ cg_003 0.845065 0.843957 0.840184 0.824286
14
+ """
15
+
16
+ import sys,os
17
+ import collections
18
+ import subprocess
19
+ import numpy as np
20
+ from optparse import OptionParser
21
+ from cpgmodule._version import __version__
22
+ from cpgmodule import ireader
23
+ from cpgmodule.utils import *
24
+ from cpgmodule import BED
25
+ import pandas as pd
26
+
27
+ __author__ = "Liguo Wang"
28
+ __copyright__ = "Copyleft"
29
+ __credits__ = []
30
+ __license__ = "GPL"
31
+ __maintainer__ = "Liguo Wang"
32
+ __email__ = "wang.liguo@mayo.edu"
33
+ __status__ = "Development"
34
+
35
+
36
def main():
    """Command-line entry point.

    Reads a tab-separated beta-value matrix (rows = CpGs, columns =
    samples), drops rows containing missing values, ranks the rows by
    either standard deviation or mean (``-s/--score``), then writes:

    * ``<out>.sortedStdev.tsv`` / ``<out>.sortedMean.tsv`` — the full
      matrix sorted decreasingly by the chosen score, with the score as
      an extra column;
    * ``<out>.sortedStdev.topN.tsv`` / ``<out>.sortedMean.topN.tsv`` —
      the top ``-c/--count`` rows with the score column removed.

    Exits 101 when ``-i`` is missing, 103 when ``-o`` is missing, and
    102 when ``-s`` is neither 'std' nor 'mean'.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input_file", action="store", type="string", dest="input_file", help="Tab-separated data frame file containing beta values with the 1st row containing sample IDs and the 1st column containing CpG IDs.")
    parser.add_option("-c", "--count", action="store", type='int', dest="cpg_count", default=1000, help="Number of most variable CpGs (ranked by standard deviation) to keep. default=%default")
    parser.add_option("-s", "--score", action="store", type='string', dest="score_type", default='std', help="The type of score used to rank CpGs. Must be one of 'std' or 'mean'. default=%default")
    parser.add_option("-o", "--output", action="store", type='string', dest="out_file", help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print()
    if not (options.input_file):
        print(__doc__)
        parser.print_help()
        sys.exit(101)

    if not (options.out_file):
        print(__doc__)
        parser.print_help()
        sys.exit(103)

    # Validate the score type up front. The original silently produced no
    # output at all when -s was anything other than 'std' or 'mean'.
    score_type = options.score_type.lower()
    if score_type == 'std':
        label, phrase, tag = 'Stdev', 'standard deviation', 'sortedStdev'
    elif score_type == 'mean':
        label, phrase, tag = 'Mean', 'mean', 'sortedMean'
    else:
        print("Error: -s/--score must be one of 'std' or 'mean', not '%s'" % options.score_type, file=sys.stderr)
        parser.print_help()
        sys.exit(102)

    printlog("Reading input file: \"%s\"" % (options.input_file))
    df1 = pd.read_csv(options.input_file, index_col=0, sep="\t")

    # Rows with missing values cannot be scored reliably; drop them.
    # .copy() makes df2 an independent frame so adding the score column
    # below cannot raise a SettingWithCopy warning.
    df2 = df1.dropna(axis=0, how='any').copy()
    printlog("%d rows with missing values were removed." % (len(df1) - len(df2)))

    # Score every row with the chosen statistic (single code path for both
    # score types; the original duplicated the whole pipeline per branch).
    df2[label] = df2.std(axis=1) if score_type == 'std' else df2.mean(axis=1)

    printlog("Sorting by the %s (decreasingly) ... " % phrase)
    df3 = df2.sort_values(by=[label], ascending=False)

    sorted_file = options.out_file + '.' + tag + '.tsv'
    printlog("Data frame with sorted %s is saved to file: %s" % (label, sorted_file))
    df3.to_csv(sorted_file, sep="\t", float_format='%.6f')

    # Keep the top N rows and strip the helper score column again.
    topn_file = options.out_file + '.' + tag + '.topN.tsv'
    df4 = df3[0:options.cpg_count].drop(label, axis=1)
    printlog("Top %d rows of Data frame is saved to file: %s" % (options.cpg_count, topn_file))
    df4.to_csv(topn_file, sep="\t", float_format='%.6f')


if __name__ == '__main__':
    main()
@@ -0,0 +1,190 @@
1
+ #!python
2
+
3
+ """
4
+ Description
5
+ -----------
6
+ This program uses the Bayesian Gaussian Mixture model (BGMM) to trichotmize beta values into
7
+ three status:
8
+ * Un-methylated (labeled as "0" in result file)
9
+ * Semi- or particial-methylated (labeled as "1" in result file)
10
+ * Full-methylated (labeled as "2" in result file)
11
+ * unassigned (labeled as "-1" in result file)
12
+ """
13
+
14
+
15
+ import sys,os
16
+ import collections
17
+ import numpy as np
18
+ from optparse import OptionParser
19
+ from sklearn import mixture
20
+ from time import strftime
21
+ from cpgmodule._version import __version__
22
+ from cpgmodule import ireader
23
+ from cpgmodule.utils import *
24
+ import pandas as pd
25
+
26
+ __author__ = "Liguo Wang"
27
+ __copyright__ = "Copyleft"
28
+ __credits__ = []
29
+ __license__ = "GPL"
30
+ __maintainer__ = "Liguo Wang"
31
+ __email__ = "wang.liguo@mayo.edu"
32
+ __status__ = "Development"
33
+
34
+
35
def load_data(infile):
    """Load a beta-value matrix from *infile* into a pandas DataFrame.

    The file is tab-separated plain text:
      * the first row holds sample IDs (must be unique),
      * the first column holds probe IDs (must be unique),
      * every remaining cell is a beta-value.

    Example::

        Probe       sample_1  sample_2  sample_3 ...
        cg09835024  0.0547    0.1187    0.0625   ...
        cg25813447  0.428     0.3746    0.0666   ...
        cg07779434  0.3713    0.4194    0.0493   ...

    Rows containing any missing value are dropped; counts of the kept
    samples and probes are reported to stderr.
    """
    printlog("Reading input file: \"%s\"" % infile)
    raw = pd.read_csv(infile, index_col=0, sep="\t")

    # Probes with missing beta-values cannot be modeled downstream; drop them.
    clean = raw.dropna(axis=0, how='any')
    printlog("%d rows with missing values were removed." % (len(raw) - len(clean)))

    print("\tTotal samples: %d" % (len(clean.columns)), file=sys.stderr)
    print("\tTotal probes: %d" % len(clean), file=sys.stderr)
    return clean
60
+
61
def build_GMM(d, rnd):
    """Fit one Bayesian Gaussian Mixture model (BGMM) per sample.

    Parameters
    ----------
    d : pandas.DataFrame
        Beta-value matrix returned by ``load_data`` (rows = probes,
        columns = samples).
    rnd : int
        Seed for the random number generator. Re-running with the same
        seed reproduces identical fits.

    Returns
    -------
    dict
        Maps each sample ID to its fitted
        ``sklearn.mixture.BayesianGaussianMixture`` with 3 components
        (intended to capture un-, semi- and full-methylation).
    """
    # A plain dict is the right container here: each sample key is
    # assigned exactly once, so the original defaultdict(list) was
    # misleading (values are fitted models, never lists).
    bgmm_models = {}
    for s_id in sorted(d.columns):
        printlog("Building Bayesian Gaussian Mixture model for subject: %s ...\r" % s_id)
        bgmm = mixture.BayesianGaussianMixture(
            n_components=3, covariance_type='full',
            max_iter=50000, tol=0.001, random_state=rnd)
        # Beta values are reshaped to a (n_probes, 1) column vector as
        # required by scikit-learn's fit API.
        bgmm_models[s_id] = bgmm.fit(d[s_id].values.reshape(-1, 1))
    return bgmm_models
75
+
76
+
77
def summary_GMM(m):
    """Write a per-sample summary of the fitted BGMM models to ``summary_report.txt``.

    Three sections are produced: the component means (sorted ascending, so
    the columns read Unmethyl / SemiMethyl / Methyl), the component weights
    (ordered to match those mean-sorted columns), and the convergence
    status with the number of iterations.

    Parameters
    ----------
    m : dict
        Sample ID -> fitted ``BayesianGaussianMixture``, as returned by
        ``build_GMM``.
    """
    printlog("Summarize GMM models ...")

    # Context manager guarantees the report file is closed even on error
    # (the original left the handle open if an exception occurred).
    with open('summary_report.txt', 'w') as FOUT:

        print("\n\n#means of components", file=FOUT)
        print("Subject_ID\tUnmethyl\tSemiMethyl\tMethyl", file=FOUT)
        for k, v in m.items():
            print(k + '\t' + '\t'.join([str(i) for i in sorted(v.means_[:, 0])]), file=FOUT)

        print("\n\n#Weights of components", file=FOUT)
        print("Subject_ID\tUnmethyl\tSemiMethyl\tMethyl", file=FOUT)
        for k, v in m.items():
            # BUG FIX: the weights must appear in the same order as the
            # mean-sorted columns above. The original printed
            # sorted(v.weights_) — sorting by weight *value* — which
            # decouples each weight from its component, so e.g. the
            # "Unmethyl" column could show the Methyl component's weight.
            order = np.argsort(v.means_[:, 0])
            print(k + '\t' + '\t'.join([str(v.weights_[i]) for i in order]), file=FOUT)

        print("\n\n#Converge status and n_iter", file=FOUT)
        print("Subject_ID\tConverged\tn_iter", file=FOUT)
        for k, v in m.items():
            print(k + '\t' + '\t'.join([str(i) for i in (v.converged_, v.n_iter_)]), file=FOUT)

    printlog("Reports were saved into \"summary_report.txt\".")
104
+
105
def trichotmize(d, m, prob_cutoff):
    """Classify each probe's beta-value into a methylation status per sample.

    Each probe is assigned to the mixture component with the highest
    posterior probability; components are labeled by the ranking of
    their means:
      '0' : un-methylation        (component with the smallest mean)
      '2' : full-methylation      (component with the largest mean)
      '1' : semi-/partial-methylation (the remaining component)
    A probe whose best component is the semi-methylation one is only
    accepted when its posterior probability reaches *prob_cutoff*;
    otherwise it is reported as '-1' (unassigned).

    One result file per sample is written: "<sample_id>.results.txt".

    Parameters
    ----------
    d : pandas.DataFrame
        Beta values returned by ``load_data`` (rows = probes).
    m : dict
        Sample ID -> fitted BGMM, returned by ``build_GMM``.
    prob_cutoff : float
        Minimum posterior probability for accepting a semi-methylation call.
    """
    probe_IDs = list(d.index)

    for s_id in sorted(m.keys()):
        printlog("Writing to \"%s\" ..." % (s_id + ".results.txt"))

        # Map component index (0..2) to its methylation label according
        # to the ranking of the component means.
        methyl_labels = {}
        component_means = m[s_id].means_[:, 0]
        betas = d[s_id]
        for idx, val in enumerate(component_means):
            if val == max(component_means):
                methyl_labels[idx] = '2'  # full methyl
            elif val == min(component_means):
                methyl_labels[idx] = '0'  # un-methyl
            else:
                methyl_labels[idx] = '1'  # semi-methyl

        # Posterior probability of every probe under each component,
        # e.g. [[4.3e-35, 9.5e-01, 4.5e-02], ...].
        probs = m[s_id].predict_proba(d[s_id].values.reshape(-1, 1))

        # 'with' guarantees the per-sample file is closed even on error.
        with open(s_id + ".results.txt", 'w') as FOUT:
            print("#Prob_of_0: Probability of CpG belonging to un-methylation group", file=FOUT)
            print("#Prob_of_1: Probability of CpG belonging to semi- or particial-methylation group", file=FOUT)
            print("#Prob_of_2: Probability of CpG belonging to full-methylation group", file=FOUT)
            print("#Assigned_lable: -1 = 'unassigned', 0 = 'un-methylation', 1 = 'semi- or particial-methylation', 2 = 'full-methylation'", file=FOUT)
            print("Probe_ID" + '\tBeta_value\t' + '\t'.join(['Prob_of_' + methyl_labels[0], 'Prob_of_' + methyl_labels[1], 'Prob_of_' + methyl_labels[2]]) + '\t' + 'Assigned_lable', file=FOUT)

            for probe_ID, beta, p in zip(probe_IDs, betas, probs):
                p_list = list(p)
                # Compute the winning component once (the original
                # re-evaluated p_list.index(max(p_list)) per branch).
                best_p = max(p_list)
                assigned = methyl_labels[p_list.index(best_p)]
                # Semi-methylation calls are only trusted above the cutoff.
                if assigned == '1' and best_p < prob_cutoff:
                    assigned = '-1'
                print(probe_ID + '\t' + str(beta) + '\t' + '\t'.join([str(i) for i in p_list]) + '\t' + assigned, file=FOUT)
151
+
152
def main():
    """Command-line entry point.

    Parses options, loads the beta-value matrix, fits one Bayesian
    Gaussian Mixture model per sample, optionally writes
    ``summary_report.txt``, then trichotomizes every probe into
    un-/semi-/full-methylated status (one result file per sample).
    """
    print(__doc__)
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input_file", action="store", type="string", dest="input_file", help="Input plain text file containing beta values with the 1st row containing sample IDs (must be unique) and the 1st column containing probe IDs (must be unique).")
    parser.add_option("-c", "--prob-cut", action="store", type="float", dest="prob_cutoff", default=0.95, help="Probability cutoff to assign a probe into \"semi- or particial-methylated\" class. default=%default")
    parser.add_option("-r", "--report", action="store_true", dest="report_summary", default=False, help="If True, generates \"summary_report.txt\" file. default=%default")
    parser.add_option("-s", "--seed", action="store", type='int', dest="random_state", default=99, help="The seed used by the random number generator. default=%default")
    (options, args) = parser.parse_args()

    print()
    if not (options.input_file):
        parser.print_help()
        # BUG FIX: exit non-zero on error. The original exited 0 here,
        # which made shell pipelines treat a missing argument as success.
        sys.exit(1)
    if not os.path.exists(options.input_file):
        print('\n\n' + options.input_file + " does NOT exists" + '\n', file=sys.stderr)
        sys.exit(1)

    infile = options.input_file

    # step1: read beta value file
    dat = load_data(infile)

    # step2: build BGMM models (seeded for reproducibility)
    GMMs = build_GMM(dat, rnd=options.random_state)

    # step3: summarize BGMM models (optional)
    if options.report_summary:
        summary_GMM(GMMs)

    # step4: classification
    trichotmize(dat, GMMs, options.prob_cutoff)


if __name__ == '__main__':
    main()