cpgtools 2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. cpgmodule/BED.py +441 -0
  2. cpgmodule/MI.py +193 -0
  3. cpgmodule/__init__.py +0 -0
  4. cpgmodule/_version.py +1 -0
  5. cpgmodule/cgID.py +866897 -0
  6. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  7. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  8. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  9. cpgmodule/data/GA_Bohlin.pkl +0 -0
  10. cpgmodule/data/GA_Haftorn.pkl +0 -0
  11. cpgmodule/data/GA_Knight.pkl +0 -0
  12. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  13. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  14. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  15. cpgmodule/data/GA_Mayne.pkl +0 -0
  16. cpgmodule/data/Hannum.pkl +0 -0
  17. cpgmodule/data/Horvath_2013.pkl +0 -0
  18. cpgmodule/data/Horvath_2018.pkl +0 -0
  19. cpgmodule/data/Levine.pkl +0 -0
  20. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  21. cpgmodule/data/Ped_McEwen.pkl +0 -0
  22. cpgmodule/data/Ped_Wu.pkl +0 -0
  23. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  24. cpgmodule/data/Zhang_EN.pkl +0 -0
  25. cpgmodule/data/__init__.py +0 -0
  26. cpgmodule/extend_bed.py +147 -0
  27. cpgmodule/imotif.py +348 -0
  28. cpgmodule/ireader.py +28 -0
  29. cpgmodule/methylClock.py +53 -0
  30. cpgmodule/padjust.py +58 -0
  31. cpgmodule/region2gene.py +170 -0
  32. cpgmodule/utils.py +642 -0
  33. cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
  34. cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
  35. cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
  36. cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
  37. cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
  38. cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
  39. cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
  40. cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
  41. cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
  42. cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
  43. cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
  44. cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
  45. cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
  46. cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
  47. cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
  48. cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
  49. cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
  50. cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
  51. cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
  52. cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
  53. cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
  54. cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
  55. cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
  56. cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
  57. cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
  58. cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
  59. cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
  60. cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
  61. cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
  62. cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
  63. cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
  64. cpgtools-2.0.5.dist-info/METADATA +59 -0
  65. cpgtools-2.0.5.dist-info/RECORD +104 -0
  66. cpgtools-2.0.5.dist-info/WHEEL +5 -0
  67. cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
  68. cpgtools-2.0.5.dist-info/top_level.txt +5 -0
  69. impyute/__init__.py +3 -0
  70. impyute/contrib/__init__.py +7 -0
  71. impyute/contrib/compare.py +69 -0
  72. impyute/contrib/count_missing.py +30 -0
  73. impyute/contrib/describe.py +63 -0
  74. impyute/cs/__init__.py +11 -0
  75. impyute/cs/buck_iterative.py +82 -0
  76. impyute/cs/central_tendency.py +84 -0
  77. impyute/cs/em.py +52 -0
  78. impyute/cs/fast_knn.py +130 -0
  79. impyute/cs/random.py +27 -0
  80. impyute/dataset/__init__.py +6 -0
  81. impyute/dataset/base.py +137 -0
  82. impyute/dataset/corrupt.py +55 -0
  83. impyute/deletion/__init__.py +5 -0
  84. impyute/deletion/complete_case.py +21 -0
  85. impyute/ops/__init__.py +12 -0
  86. impyute/ops/error.py +9 -0
  87. impyute/ops/inverse_distance_weighting.py +31 -0
  88. impyute/ops/matrix.py +47 -0
  89. impyute/ops/testing.py +20 -0
  90. impyute/ops/util.py +96 -0
  91. impyute/ops/wrapper.py +179 -0
  92. impyute/ts/__init__.py +6 -0
  93. impyute/ts/locf.py +57 -0
  94. impyute/ts/moving_window.py +128 -0
  95. impyutelib.py +890 -0
  96. missingpy/__init__.py +4 -0
  97. missingpy/knnimpute.py +328 -0
  98. missingpy/missforest.py +556 -0
  99. missingpy/pairwise_external.py +315 -0
  100. missingpy/tests/__init__.py +0 -0
  101. missingpy/tests/test_knnimpute.py +605 -0
  102. missingpy/tests/test_missforest.py +409 -0
  103. missingpy/utils.py +124 -0
  104. misspylib.py +565 -0
@@ -0,0 +1,99 @@
1
+ #!python
2
+
3
+ """
4
+ #=========================================================================================
5
+ This program picks the top N rows (according to standard deviation) from the input file.
6
+ The resulting file can be used for clustering/PCA analysis.
7
+
8
+ Example of input data file
9
+ ---------------------------
10
+ CpG_ID Sample_01 Sample_02 Sample_03 Sample_04
11
+ cg_001 0.831035 0.878022 0.794427 0.880911
12
+ cg_002 0.249544 0.209949 0.234294 0.236680
13
+ cg_003 0.845065 0.843957 0.840184 0.824286
14
+ """
15
+
16
+ import sys,os
17
+ import collections
18
+ import subprocess
19
+ import numpy as np
20
+ from optparse import OptionParser
21
+ from cpgmodule._version import __version__
22
+ from cpgmodule import ireader
23
+ from cpgmodule.utils import *
24
+ from cpgmodule import BED
25
+ import pandas as pd
26
+
27
+ __author__ = "Liguo Wang"
28
+ __copyright__ = "Copyleft"
29
+ __credits__ = []
30
+ __license__ = "GPL"
31
+ __maintainer__ = "Liguo Wang"
32
+ __email__ = "wang.liguo@mayo.edu"
33
+ __status__ = "Development"
34
+
35
+
36
def main():
    """Command-line entry point.

    Reads a tab-separated beta-value matrix (rows = CpGs, columns =
    samples), drops rows containing missing values, ranks the rows by
    either standard deviation or mean (``-s/--score``), then writes:

    * ``<out>.sortedStdev.tsv`` / ``<out>.sortedMean.tsv`` — the full
      matrix sorted decreasingly by the chosen score, with the score as
      an extra column;
    * ``<out>.sortedStdev.topN.tsv`` / ``<out>.sortedMean.topN.tsv`` —
      the top ``-c/--count`` rows with the score column removed.

    Exits 101 when ``-i`` is missing, 103 when ``-o`` is missing, and
    102 when ``-s`` is neither 'std' nor 'mean'.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input_file", action="store", type="string", dest="input_file", help="Tab-separated data frame file containing beta values with the 1st row containing sample IDs and the 1st column containing CpG IDs.")
    parser.add_option("-c", "--count", action="store", type='int', dest="cpg_count", default=1000, help="Number of most variable CpGs (ranked by standard deviation) to keep. default=%default")
    parser.add_option("-s", "--score", action="store", type='string', dest="score_type", default='std', help="The type of score used to rank CpGs. Must be one of 'std' or 'mean'. default=%default")
    parser.add_option("-o", "--output", action="store", type='string', dest="out_file", help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print()
    if not (options.input_file):
        print(__doc__)
        parser.print_help()
        sys.exit(101)

    if not (options.out_file):
        print(__doc__)
        parser.print_help()
        sys.exit(103)

    # Validate the score type up front. The original silently produced no
    # output at all when -s was anything other than 'std' or 'mean'.
    score_type = options.score_type.lower()
    if score_type == 'std':
        label, phrase, tag = 'Stdev', 'standard deviation', 'sortedStdev'
    elif score_type == 'mean':
        label, phrase, tag = 'Mean', 'mean', 'sortedMean'
    else:
        print("Error: -s/--score must be one of 'std' or 'mean', not '%s'" % options.score_type, file=sys.stderr)
        parser.print_help()
        sys.exit(102)

    printlog("Reading input file: \"%s\"" % (options.input_file))
    df1 = pd.read_csv(options.input_file, index_col=0, sep="\t")

    # Rows with missing values cannot be scored reliably; drop them.
    # .copy() makes df2 an independent frame so adding the score column
    # below cannot raise a SettingWithCopy warning.
    df2 = df1.dropna(axis=0, how='any').copy()
    printlog("%d rows with missing values were removed." % (len(df1) - len(df2)))

    # Score every row with the chosen statistic (single code path for both
    # score types; the original duplicated the whole pipeline per branch).
    df2[label] = df2.std(axis=1) if score_type == 'std' else df2.mean(axis=1)

    printlog("Sorting by the %s (decreasingly) ... " % phrase)
    df3 = df2.sort_values(by=[label], ascending=False)

    sorted_file = options.out_file + '.' + tag + '.tsv'
    printlog("Data frame with sorted %s is saved to file: %s" % (label, sorted_file))
    df3.to_csv(sorted_file, sep="\t", float_format='%.6f')

    # Keep the top N rows and strip the helper score column again.
    topn_file = options.out_file + '.' + tag + '.topN.tsv'
    df4 = df3[0:options.cpg_count].drop(label, axis=1)
    printlog("Top %d rows of Data frame is saved to file: %s" % (options.cpg_count, topn_file))
    df4.to_csv(topn_file, sep="\t", float_format='%.6f')


if __name__ == '__main__':
    main()
@@ -0,0 +1,190 @@
1
+ #!python
2
+
3
+ """
4
+ Description
5
+ -----------
6
+ This program uses the Bayesian Gaussian Mixture model (BGMM) to trichotmize beta values into
7
+ three status:
8
+ * Un-methylated (labeled as "0" in result file)
9
+ * Semi- or particial-methylated (labeled as "1" in result file)
10
+ * Full-methylated (labeled as "2" in result file)
11
+ * unassigned (labeled as "-1" in result file)
12
+ """
13
+
14
+
15
+ import sys,os
16
+ import collections
17
+ import numpy as np
18
+ from optparse import OptionParser
19
+ from sklearn import mixture
20
+ from time import strftime
21
+ from cpgmodule._version import __version__
22
+ from cpgmodule import ireader
23
+ from cpgmodule.utils import *
24
+ import pandas as pd
25
+
26
+ __author__ = "Liguo Wang"
27
+ __copyright__ = "Copyleft"
28
+ __credits__ = []
29
+ __license__ = "GPL"
30
+ __maintainer__ = "Liguo Wang"
31
+ __email__ = "wang.liguo@mayo.edu"
32
+ __status__ = "Development"
33
+
34
+
35
def load_data(infile):
    """Load a beta-value matrix from *infile* into a pandas DataFrame.

    The file is tab-separated plain text:
      * the first row holds sample IDs (must be unique),
      * the first column holds probe IDs (must be unique),
      * every remaining cell is a beta-value.

    Example::

        Probe       sample_1  sample_2  sample_3 ...
        cg09835024  0.0547    0.1187    0.0625   ...
        cg25813447  0.428     0.3746    0.0666   ...
        cg07779434  0.3713    0.4194    0.0493   ...

    Rows containing any missing value are dropped; counts of the kept
    samples and probes are reported to stderr.
    """
    printlog("Reading input file: \"%s\"" % infile)
    raw = pd.read_csv(infile, index_col=0, sep="\t")

    # Probes with missing beta-values cannot be modeled downstream; drop them.
    clean = raw.dropna(axis=0, how='any')
    printlog("%d rows with missing values were removed." % (len(raw) - len(clean)))

    print("\tTotal samples: %d" % (len(clean.columns)), file=sys.stderr)
    print("\tTotal probes: %d" % len(clean), file=sys.stderr)
    return clean
60
+
61
def build_GMM(d, rnd):
    """Fit one Bayesian Gaussian Mixture model (BGMM) per sample.

    Parameters
    ----------
    d : pandas.DataFrame
        Beta-value matrix returned by ``load_data`` (rows = probes,
        columns = samples).
    rnd : int
        Seed for the random number generator. Re-running with the same
        seed reproduces identical fits.

    Returns
    -------
    dict
        Maps each sample ID to its fitted
        ``sklearn.mixture.BayesianGaussianMixture`` with 3 components
        (intended to capture un-, semi- and full-methylation).
    """
    # A plain dict is the right container here: each sample key is
    # assigned exactly once, so the original defaultdict(list) was
    # misleading (values are fitted models, never lists).
    bgmm_models = {}
    for s_id in sorted(d.columns):
        printlog("Building Bayesian Gaussian Mixture model for subject: %s ...\r" % s_id)
        bgmm = mixture.BayesianGaussianMixture(
            n_components=3, covariance_type='full',
            max_iter=50000, tol=0.001, random_state=rnd)
        # Beta values are reshaped to a (n_probes, 1) column vector as
        # required by scikit-learn's fit API.
        bgmm_models[s_id] = bgmm.fit(d[s_id].values.reshape(-1, 1))
    return bgmm_models
75
+
76
+
77
def summary_GMM(m):
    """Write a per-sample summary of the fitted BGMM models to ``summary_report.txt``.

    Three sections are produced: the component means (sorted ascending, so
    the columns read Unmethyl / SemiMethyl / Methyl), the component weights
    (ordered to match those mean-sorted columns), and the convergence
    status with the number of iterations.

    Parameters
    ----------
    m : dict
        Sample ID -> fitted ``BayesianGaussianMixture``, as returned by
        ``build_GMM``.
    """
    printlog("Summarize GMM models ...")

    # Context manager guarantees the report file is closed even on error
    # (the original left the handle open if an exception occurred).
    with open('summary_report.txt', 'w') as FOUT:

        print("\n\n#means of components", file=FOUT)
        print("Subject_ID\tUnmethyl\tSemiMethyl\tMethyl", file=FOUT)
        for k, v in m.items():
            print(k + '\t' + '\t'.join([str(i) for i in sorted(v.means_[:, 0])]), file=FOUT)

        print("\n\n#Weights of components", file=FOUT)
        print("Subject_ID\tUnmethyl\tSemiMethyl\tMethyl", file=FOUT)
        for k, v in m.items():
            # BUG FIX: the weights must appear in the same order as the
            # mean-sorted columns above. The original printed
            # sorted(v.weights_) — sorting by weight *value* — which
            # decouples each weight from its component, so e.g. the
            # "Unmethyl" column could show the Methyl component's weight.
            order = np.argsort(v.means_[:, 0])
            print(k + '\t' + '\t'.join([str(v.weights_[i]) for i in order]), file=FOUT)

        print("\n\n#Converge status and n_iter", file=FOUT)
        print("Subject_ID\tConverged\tn_iter", file=FOUT)
        for k, v in m.items():
            print(k + '\t' + '\t'.join([str(i) for i in (v.converged_, v.n_iter_)]), file=FOUT)

    printlog("Reports were saved into \"summary_report.txt\".")
104
+
105
def trichotmize(d, m, prob_cutoff):
    """Classify each probe's beta-value into a methylation status per sample.

    Each probe is assigned to the mixture component with the highest
    posterior probability; components are labeled by the ranking of
    their means:
      '0' : un-methylation        (component with the smallest mean)
      '2' : full-methylation      (component with the largest mean)
      '1' : semi-/partial-methylation (the remaining component)
    A probe whose best component is the semi-methylation one is only
    accepted when its posterior probability reaches *prob_cutoff*;
    otherwise it is reported as '-1' (unassigned).

    One result file per sample is written: "<sample_id>.results.txt".

    Parameters
    ----------
    d : pandas.DataFrame
        Beta values returned by ``load_data`` (rows = probes).
    m : dict
        Sample ID -> fitted BGMM, returned by ``build_GMM``.
    prob_cutoff : float
        Minimum posterior probability for accepting a semi-methylation call.
    """
    probe_IDs = list(d.index)

    for s_id in sorted(m.keys()):
        printlog("Writing to \"%s\" ..." % (s_id + ".results.txt"))

        # Map component index (0..2) to its methylation label according
        # to the ranking of the component means.
        methyl_labels = {}
        component_means = m[s_id].means_[:, 0]
        betas = d[s_id]
        for idx, val in enumerate(component_means):
            if val == max(component_means):
                methyl_labels[idx] = '2'  # full methyl
            elif val == min(component_means):
                methyl_labels[idx] = '0'  # un-methyl
            else:
                methyl_labels[idx] = '1'  # semi-methyl

        # Posterior probability of every probe under each component,
        # e.g. [[4.3e-35, 9.5e-01, 4.5e-02], ...].
        probs = m[s_id].predict_proba(d[s_id].values.reshape(-1, 1))

        # 'with' guarantees the per-sample file is closed even on error.
        with open(s_id + ".results.txt", 'w') as FOUT:
            print("#Prob_of_0: Probability of CpG belonging to un-methylation group", file=FOUT)
            print("#Prob_of_1: Probability of CpG belonging to semi- or particial-methylation group", file=FOUT)
            print("#Prob_of_2: Probability of CpG belonging to full-methylation group", file=FOUT)
            print("#Assigned_lable: -1 = 'unassigned', 0 = 'un-methylation', 1 = 'semi- or particial-methylation', 2 = 'full-methylation'", file=FOUT)
            print("Probe_ID" + '\tBeta_value\t' + '\t'.join(['Prob_of_' + methyl_labels[0], 'Prob_of_' + methyl_labels[1], 'Prob_of_' + methyl_labels[2]]) + '\t' + 'Assigned_lable', file=FOUT)

            for probe_ID, beta, p in zip(probe_IDs, betas, probs):
                p_list = list(p)
                # Compute the winning component once (the original
                # re-evaluated p_list.index(max(p_list)) per branch).
                best_p = max(p_list)
                assigned = methyl_labels[p_list.index(best_p)]
                # Semi-methylation calls are only trusted above the cutoff.
                if assigned == '1' and best_p < prob_cutoff:
                    assigned = '-1'
                print(probe_ID + '\t' + str(beta) + '\t' + '\t'.join([str(i) for i in p_list]) + '\t' + assigned, file=FOUT)
151
+
152
def main():
    """Command-line entry point.

    Parses options, loads the beta-value matrix, fits one Bayesian
    Gaussian Mixture model per sample, optionally writes
    ``summary_report.txt``, then trichotomizes every probe into
    un-/semi-/full-methylated status (one result file per sample).
    """
    print(__doc__)
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input_file", action="store", type="string", dest="input_file", help="Input plain text file containing beta values with the 1st row containing sample IDs (must be unique) and the 1st column containing probe IDs (must be unique).")
    parser.add_option("-c", "--prob-cut", action="store", type="float", dest="prob_cutoff", default=0.95, help="Probability cutoff to assign a probe into \"semi- or particial-methylated\" class. default=%default")
    parser.add_option("-r", "--report", action="store_true", dest="report_summary", default=False, help="If True, generates \"summary_report.txt\" file. default=%default")
    parser.add_option("-s", "--seed", action="store", type='int', dest="random_state", default=99, help="The seed used by the random number generator. default=%default")
    (options, args) = parser.parse_args()

    print()
    if not (options.input_file):
        parser.print_help()
        # BUG FIX: exit non-zero on error. The original exited 0 here,
        # which made shell pipelines treat a missing argument as success.
        sys.exit(1)
    if not os.path.exists(options.input_file):
        print('\n\n' + options.input_file + " does NOT exists" + '\n', file=sys.stderr)
        sys.exit(1)

    infile = options.input_file

    # step1: read beta value file
    dat = load_data(infile)

    # step2: build BGMM models (seeded for reproducibility)
    GMMs = build_GMM(dat, rnd=options.random_state)

    # step3: summarize BGMM models (optional)
    if options.report_summary:
        summary_GMM(GMMs)

    # step4: classification
    trichotmize(dat, GMMs, options.prob_cutoff)


if __name__ == '__main__':
    main()