PyPI - cpgtools - Versions diffs - 2.0.5__py3-none-any.whl - Mend

cpgtools 2.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (104) hide show

cpgmodule/BED.py +441 -0
cpgmodule/MI.py +193 -0
cpgmodule/__init__.py +0 -0
cpgmodule/_version.py +1 -0
cpgmodule/cgID.py +866897 -0
cpgmodule/data/AltumAge_cpg.pkl +0 -0
cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
cpgmodule/data/AltumAge_scaler.pkl +0 -0
cpgmodule/data/GA_Bohlin.pkl +0 -0
cpgmodule/data/GA_Haftorn.pkl +0 -0
cpgmodule/data/GA_Knight.pkl +0 -0
cpgmodule/data/GA_Lee_CPC.pkl +0 -0
cpgmodule/data/GA_Lee_RPC.pkl +0 -0
cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
cpgmodule/data/GA_Mayne.pkl +0 -0
cpgmodule/data/Hannum.pkl +0 -0
cpgmodule/data/Horvath_2013.pkl +0 -0
cpgmodule/data/Horvath_2018.pkl +0 -0
cpgmodule/data/Levine.pkl +0 -0
cpgmodule/data/Lu_DNAmTL.pkl +0 -0
cpgmodule/data/Ped_McEwen.pkl +0 -0
cpgmodule/data/Ped_Wu.pkl +0 -0
cpgmodule/data/Zhang_BLUP.pkl +0 -0
cpgmodule/data/Zhang_EN.pkl +0 -0
cpgmodule/data/__init__.py +0 -0
cpgmodule/extend_bed.py +147 -0
cpgmodule/imotif.py +348 -0
cpgmodule/ireader.py +28 -0
cpgmodule/methylClock.py +53 -0
cpgmodule/padjust.py +58 -0
cpgmodule/region2gene.py +170 -0
cpgmodule/utils.py +642 -0
cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
cpgtools-2.0.5.dist-info/METADATA +59 -0
cpgtools-2.0.5.dist-info/RECORD +104 -0
cpgtools-2.0.5.dist-info/WHEEL +5 -0
cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
cpgtools-2.0.5.dist-info/top_level.txt +5 -0
impyute/__init__.py +3 -0
impyute/contrib/__init__.py +7 -0
impyute/contrib/compare.py +69 -0
impyute/contrib/count_missing.py +30 -0
impyute/contrib/describe.py +63 -0
impyute/cs/__init__.py +11 -0
impyute/cs/buck_iterative.py +82 -0
impyute/cs/central_tendency.py +84 -0
impyute/cs/em.py +52 -0
impyute/cs/fast_knn.py +130 -0
impyute/cs/random.py +27 -0
impyute/dataset/__init__.py +6 -0
impyute/dataset/base.py +137 -0
impyute/dataset/corrupt.py +55 -0
impyute/deletion/__init__.py +5 -0
impyute/deletion/complete_case.py +21 -0
impyute/ops/__init__.py +12 -0
impyute/ops/error.py +9 -0
impyute/ops/inverse_distance_weighting.py +31 -0
impyute/ops/matrix.py +47 -0
impyute/ops/testing.py +20 -0
impyute/ops/util.py +96 -0
impyute/ops/wrapper.py +179 -0
impyute/ts/__init__.py +6 -0
impyute/ts/locf.py +57 -0
impyute/ts/moving_window.py +128 -0
impyutelib.py +890 -0
missingpy/__init__.py +4 -0
missingpy/knnimpute.py +328 -0
missingpy/missforest.py +556 -0
missingpy/pairwise_external.py +315 -0
missingpy/tests/__init__.py +0 -0
missingpy/tests/test_knnimpute.py +605 -0
missingpy/tests/test_missforest.py +409 -0
missingpy/utils.py +124 -0
misspylib.py +565 -0

cpgtools-2.0.5.data/scripts/dmc_nonparametric.py ADDED Viewed

@@ -0,0 +1,176 @@
+#!python
+"""
+Description
+-----------
+This program performs differential CpG analysis using the  Mann-Whitney U test
+for two group comparison, and the Kruskal-Wallis H-test for multiple group
+comparison.
+"""
+import sys,os
+import collections
+import subprocess
+import numpy as np
+from scipy import stats
+from optparse import OptionParser
+from cpgmodule import ireader
+from cpgmodule.utils import *
+from cpgmodule import BED
+from cpgmodule import padjust
+from cpgmodule._version import __version__
+__author__ = "Liguo Wang"
+__copyright__ = "Copyleft"
+__credits__ = []
+__license__ = "GPL"
+__maintainer__ = "Liguo Wang"
+__email__ = "wang.liguo@mayo.edu"
+__status__ = "Development"
+def mwu_test(a, b):
+	'''
+	mann-whitney U test of two samples.
+	'''
+	p = np.nan
+	t = np.nan
+	try:
+		tmp = stats.mannwhitneyu(a, b,  alternative='two-sided')
+		p = tmp.pvalue
+		t = tmp.statistic
+	except:
+		pass
+	return (p,t)
+def kruskal_test(*args):
+	'''
+	Compute the Kruskal-Wallis H-test for independent samples
+	'''
+	p = np.nan
+	t = np.nan
+	try:
+		tmp = stats.kruskal(*args, nan_policy='omit')
+		p = tmp.pvalue
+		t = tmp.statistic
+	except:
+		pass
+	return (p,t)
+def main():
+	usage="%prog [options]" + "\n"
+	parser = OptionParser(usage,version="%prog " + __version__)
+	parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Data file containing beta values with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). Except for the 1st row and 1st column, any non-numerical values will be considered as \"missing values\" and ignored. This file can be a regular text file or compressed file (.gz, .bz2).")
+	parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Group file defining the biological group of each sample. It is a comma-separated two columns file with the 1st column containing sample IDs, and the 2nd column containing group IDs. It must have a header row. Sample IDs should match to the \"Data file\". Note: automatically switch to use  Kruskal-Wallis H-test if more than two groups were defined in this file.")
+	parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
+	(options,args)=parser.parse_args()
+	print ()
+	if not (options.input_file):
+		print (__doc__)
+		parser.print_help()
+		sys.exit(101)
+	if not (options.group_file):
+		print (__doc__)
+		parser.print_help()
+		sys.exit(102)
+	if not (options.out_file):
+		print (__doc__)
+		parser.print_help()
+		sys.exit(103)
+	FOUT = open(options.out_file + '.pval.txt','w')
+	printlog("Read group file \"%s\" ..." % (options.group_file))
+	(s,g) = read_grp_file1(options.group_file)
+	s2g = dict(zip(s,g))
+	g2s = collections.defaultdict(list)
+	for k,v in s2g.items():
+		g2s[v].append(k)
+	group_IDs = sorted(g2s.keys())
+	for g in group_IDs:
+		print ("\tGroup %s has %d samples:" % (g, len(g2s[g])))
+		print ('\t\t' + ','.join(g2s[g]))
+	if len(group_IDs) < 2:
+		printlog("You must have at least two groups!", file=sys.stderr)
+		sys.exit(1)
+	elif len(group_IDs) == 2:
+		printlog("Perfrom Mann-Whitney rank test of two samples ...")
+	elif len(group_IDs) >= 3:
+		printlog("Perfrom Kruskal-Wallis H-test ...")
+	line_num = 1
+	probe_list = []
+	p_list = []
+	for l in ireader.reader(options.input_file):
+		f = l.split()
+		if len(f) == 0: continue
+		if line_num == 1:
+			sample_IDs = f[1:]
+			# check if sample ID matches
+			for s in s2g:
+				if s not in sample_IDs:
+					printlog("Cannot find sample ID \"%s\" from file \"%s\"" % (s, options.input_file))
+					sys.exit(3)
+		else:
+			g2values = collections.defaultdict(list)
+			probe_ID = f[0]
+			beta_values = f[1:]
+			for s,b in zip(sample_IDs, beta_values):
+				#deal with non-numerical values
+				try:
+					b = float(b)
+				except:
+					b = np.nan
+				#skip if s not in group file
+				if s not in s2g:
+					continue
+				gid = s2g[s]
+				g2values[gid].append(b)
+			if len(g2values) == 2:
+				a = np.array(g2values[group_IDs[0]])
+				b = np.array(g2values[group_IDs[1]])
+				(pval,tscore) = mwu_test(a,b)
+			elif len(g2values) >= 3:
+				tmp = []
+				for g in group_IDs:
+					tmp.append(np.array(g2values[g]))
+				(pval,tscore) = kruskal_test(*tmp)
+			probe_list.append(probe_ID)
+			p_list.append(pval)
+		line_num += 1
+	printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
+	adjusted_p = {}
+	q_list =  padjust.multiple_testing_correction(p_list)
+	for id,p,q in zip(probe_list, p_list, q_list):
+		adjusted_p[id] = '\t'.join([str(i) for i in (p,q)])
+	printlog("Writing to %s" % (options.out_file + '.pval.txt'))
+	line_num = 1
+	for l in ireader.reader(options.input_file):
+		if line_num == 1:
+			print (l + '\tpval\tadj.pval', file=FOUT)
+		else:
+			f = l.split()
+			probe_ID = f[0]
+			print (l + '\t' + adjusted_p[probe_ID], file=FOUT)
+		line_num += 1
+	FOUT.close()
+if __name__=='__main__':
+	main()

cpgtools-2.0.5.data/scripts/dmc_ttest.py ADDED Viewed

@@ -0,0 +1,222 @@
+#!python
+"""
+Description
+-----------
+This program performs differential CpG analysis based on beta values. It uses Student's
+t-test for two-group comparison and ANOVA for multiple groups comparison.
+"""
+import sys,os
+import collections
+import subprocess
+import numpy as np
+from scipy import stats
+from optparse import OptionParser
+from cpgmodule import ireader
+from cpgmodule.utils import *
+from cpgmodule import BED
+from cpgmodule import padjust
+from cpgmodule._version import __version__
+__author__ = "Liguo Wang"
+__copyright__ = "Copyleft"
+__credits__ = []
+__license__ = "GPL"
+__maintainer__ = "Liguo Wang"
+__email__ = "wang.liguo@mayo.edu"
+__status__ = "Development"
+def standard_ttest(a, b, equalVar=True, nanPolicy='omit'):
+	'''
+	Calculate the T-test for the means of two independent samples of scores.
+	'''
+	p = np.nan
+	t = np.nan
+	try:
+		tmp = stats.ttest_ind(a, b, equal_var = equalVar, nan_policy = nanPolicy)
+		p = tmp.pvalue
+		t = tmp.statistic
+	except:
+		pass
+	return (p,t)
+def paired_ttest(a, b, nanPolicy='omit'):
+	'''
+	Calculate the T-test on TWO RELATED samples of scores, a and b.
+	'''
+	p = np.nan
+	t = np.nan
+	try:
+		tmp = stats.ttest_rel(a,b, nan_policy = nanPolicy)
+		p = tmp.pvalue
+		t = tmp.statistic
+	except:
+		pass
+	return (p,t)
+def anova(*args):
+	'''
+	The one-way ANOVA tests the null hypothesis that three or more groups have the same population mean
+	'''
+	p = np.nan
+	t = np.nan
+	try:
+		tmp = stats.f_oneway(*args)
+		p = tmp.pvalue
+		t = tmp.statistic
+	except:
+		pass
+	return (p,t)
+def main():
+	usage="%prog [options]" + "\n"
+	parser = OptionParser(usage,version="%prog " + __version__)
+	parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Data file containing beta values with the 1st row containing sample IDs (must be unique) and the 1st column containing CpG positions or probe IDs (must be unique). Except for the 1st row and 1st column, any non-numerical values will be considered as \"missing values\" and ignored. This file can be a regular text file or compressed file (.gz, .bz2).")
+	parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Group file defining the biological group of each sample. It is a comma-separated 2 columns file with the 1st column containing sample IDs, and the 2nd column containing group IDs.  It must have a header row. Sample IDs should match to the \"Data file\". Note: automatically switch to use ANOVA if more than 2 groups were defined in this file.")
+	parser.add_option("-p","--paired",action="store_true",default=False,dest="paired",help="If True, performs a paired t-test (the paired sampels are matched by the order). If False, performs a standard independent 2 sample t-test. default=%default")
+	parser.add_option("-w","--welch",action="store_true",default=False,dest="welch_ttest",help="If True, performs Welch's t-test which does not assume the two samples have equal variance.  If False, performs a standard two-sample t-test (i.e. assuming the two samples have equal variance). default=%default")
+	parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
+	(options,args)=parser.parse_args()
+	print ()
+	#print (options.paired)
+	#print (options.welch_ttest)
+	#sys.exit()
+	if not (options.input_file):
+		print (__doc__)
+		parser.print_help()
+		sys.exit(101)
+	if not (options.group_file):
+		print (__doc__)
+		parser.print_help()
+		sys.exit(102)
+	if not (options.out_file):
+		print (__doc__)
+		parser.print_help()
+		sys.exit(103)
+	FOUT = open(options.out_file + '.pval.txt','w')
+	#ROUT = open(options.out_file + '.r','w')
+	printlog("Read group file \"%s\" ..." % (options.group_file))
+	(ss,gs) = read_grp_file1(options.group_file)
+	s2g = {}
+	for s,g in zip(ss,gs):
+		s2g[s] = g
+	g2s = collections.defaultdict(list)
+	for s,g in zip(ss, gs):
+		g2s[g].append(s)
+	group_IDs = sorted(g2s.keys())
+	for g in group_IDs:
+		print ("\tGroup %s has %d samples:" % (g, len(g2s[g])))
+		print ('\t\t' + ','.join(g2s[g]))
+	if len(group_IDs) < 2:
+		printlog("You must have at least two groups!", file=sys.stderr)
+		sys.exit(1)
+	elif (len(group_IDs) == 2) and (options.paired is True):
+		printlog("Perfrom paired t-test of two related samples ...")
+		if len(g2s[group_IDs[0]]) != len(g2s[group_IDs[1]]):
+			printlog("Unequal sample size. Cannot perform paired t-test.")
+			sys.exit(2)
+	elif (len(group_IDs) == 2) and (options.paired is False):
+		printlog("Perfrom standard t-test of two independent samples ...")
+	elif len(group_IDs) >= 3:
+		printlog("Perfrom ANOVA ...")
+	line_num = 1
+	probe_list = []
+	p_list = []
+	delta_beta = {}
+	for l in ireader.reader(options.input_file):
+		f = l.split()
+		if len(f) == 0: continue
+		if line_num == 1:
+			sample_IDs = f[1:]
+			# check if sample ID matches
+			for s in s2g:
+				if s not in sample_IDs:
+					printlog("Cannot find sample ID \"%s\" from file \"%s\". Exclude this sample from differential analysis." % (s, options.input_file))
+					#sys.exit(3)
+		else:
+			g2values = collections.defaultdict(list)
+			probe_ID = f[0]
+			beta_values = f[1:]
+			for s,b in zip(sample_IDs, beta_values):
+				#deal with non-numerical values
+				try:
+					b = float(b)
+				except:
+					b = np.nan
+				#skip if s not in group file
+				if s not in s2g:
+					continue
+				gid = s2g[s]
+				g2values[gid].append(b)
+			if len(g2values) == 2:
+				a = np.array(g2values[group_IDs[0]])
+				b = np.array(g2values[group_IDs[1]])
+				try:
+					delta_beta[probe_ID] = np.mean(a) - np.mean(b)
+				except:
+					delta_beta[probe_ID] = np.nan
+				if options.paired:
+					(pval,tscore) = paired_ttest(a,b)
+				else:
+					(pval,tscore) = standard_ttest(a,b, equalVar = options.welch_ttest)
+			elif len(g2values) >= 3:
+				tmp = []
+				for g in group_IDs:
+					tmp.append(np.array(g2values[g]))
+				(pval,tscore) = anova(*tmp)
+			if pval >= 0 and pval <= 1:
+				probe_list.append(probe_ID)
+				p_list.append(pval)
+			else:
+				continue
+		line_num += 1
+	printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
+	adjusted_p = {}
+	q_list =  padjust.multiple_testing_correction(p_list)
+	for id,p,q in zip(probe_list, p_list, q_list):
+		adjusted_p[id] = '\t'.join([str(i) for i in (p,q)])
+	printlog("Writing to %s" % (options.out_file + '.pval.txt'))
+	line_num = 1
+	for l in ireader.reader(options.input_file):
+		if line_num == 1:
+			print (l + '\tdelta_beta\tpval\tadj.pval', file=FOUT)
+		else:
+			f = l.split()
+			probe_ID = f[0]
+			if probe_ID in delta_beta:
+				pass
+			else:
+				delta_beta[probe_ID] = 'n/a'
+			try:
+				print (l + '\t' + str(delta_beta[probe_ID]) + '\t' + adjusted_p[probe_ID], file=FOUT)
+			except:
+				print (l + '\t' + 'n/a' + '\t' + 'n/a' + '\t' + 'n/a', file=FOUT)
+		line_num += 1
+	FOUT.close()
+if __name__=='__main__':
+	main()