PyPI - cpgtools - Versions diffs - 2.0.2__tar.gz → 2.0.4__tar.gz - Mend

cpgtools 2.0.2.tar.gz → 2.0.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cpgtools might be problematic. Click here for more details.

Files changed (111) hide show

{cpgtools-2.0.2 → cpgtools-2.0.4}/MANIFEST.in RENAMED Viewed

@@ -4,8 +4,8 @@ include PKG-INFO
 include LICENSE
 include distribute_setup.py
-recursive-include lib *.pyx
-recursive-include lib *.py
-recursive-include lib *.pkl
-recursive-include bin *
+recursive-include src *.pyx
+recursive-include src *.py
+recursive-include src *.pkl
+recursive-include scripts *
 recursive-include doc *

{cpgtools-2.0.2 → cpgtools-2.0.4}/PKG-INFO RENAMED Viewed

@@ -1,35 +1,15 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: cpgtools
-Version: 2.0.2
+Version: 2.0.4
 Summary: Tools to analyze and visualize DNA methylation data
 Author-email: Liguo Wang <wangliguo78@gmail.com>
 Maintainer-email: Liguo Wang <wangliguo78@gmail.com>
-License: Copyright (c) 2024 The Python Packaging Authority
-        Permission is hereby granted, free of charge, to any person obtaining a copy
-        of this software and associated documentation files (the "Software"), to deal
-        in the Software without restriction, including without limitation the rights
-        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-        copies of the Software, and to permit persons to whom the Software is
-        furnished to do so, subject to the following conditions:
-        The above copyright notice and this permission notice shall be included in all
-        copies or substantial portions of the Software.
-        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-        SOFTWARE.
+License-Expression: MIT
 Project-URL: Documentation, https://cpgtools.readthedocs.io/en/latest/index.html
 Project-URL: Repository, https://github.com/liguowang/cpgtools.git
 Keywords: DNA methylation,EPIC,450K,850K,935K,RRBS,WGBS
 Classifier: Programming Language :: Python :: 3
 Classifier: Development Status :: 4 - Beta
-Classifier: License :: OSI Approved :: MIT License
 Classifier: Environment :: Console
 Classifier: Intended Audience :: Science/Research
 Classifier: Operating System :: MacOS :: MacOS X
@@ -37,7 +17,7 @@ Classifier: Operating System :: POSIX
 Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
 Requires-Python: >=3.5
 Description-Content-Type: text/markdown
-License-File: LICENSE
+License-File: LICENSE.txt
 Requires-Dist: numpy
 Requires-Dist: scipy
 Requires-Dist: scikit-learn
@@ -46,6 +26,8 @@ Requires-Dist: bx-python
 Requires-Dist: pandas
 Requires-Dist: umap-learn
 Requires-Dist: fancyimpute
+Requires-Dist: pycombat
+Dynamic: license-file
 ## Install CpGtools using [pip](https://pip.pypa.io/en/stable/)

{cpgtools-2.0.2 → cpgtools-2.0.4}/pyproject.toml RENAMED Viewed

@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 #Project's meta data
 [project]
-version = "2.0.2"
+version = "2.0.4"
 name = "cpgtools"
 authors = [
   {name="Liguo Wang", email="wangliguo78@gmail.com"},
@@ -15,7 +15,7 @@ maintainers = [
 ]
 description = "Tools to analyze and visualize DNA methylation data"
 readme = "README.md"
-license = {file = "LICENSE"}
+license = "MIT"
 requires-python = ">=3.5"
 dependencies = [
@@ -27,12 +27,12 @@ dependencies = [
 	"pandas",
 	"umap-learn",
 	"fancyimpute",
+	"pycombat",
 ]
 classifiers=[
 	"Programming Language :: Python :: 3",
 	'Development Status :: 4 - Beta',
-	"License :: OSI Approved :: MIT License",
 	'Environment :: Console',
 	'Intended Audience :: Science/Research',
 	'Operating System :: MacOS :: MacOS X',

cpgtools-2.0.4/scripts/.DS_Store ADDED Viewed

Binary file

{cpgtools-2.0.2 → cpgtools-2.0.4}/scripts/beta_PCA.py RENAMED Viewed

@@ -73,7 +73,10 @@ def main():
 		print (__doc__)
 		parser.print_help()
 		sys.exit(101)
+	if not (options.group_file):
+		print (__doc__)
+		parser.print_help()
+		sys.exit(101)
 	if not (options.out_file):
 		print (__doc__)
 		parser.print_help()

cpgtools-2.0.4/scripts/beta_remove_batch_effects.py ADDED Viewed

@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+"""
+Description
+-----------
+This program corrects batch effect.
+Example of input data file
+---------------------------
+CpG_ID	Sample_01	Sample_02	Sample_03	Sample_04
+cg_001	0.831035	0.878022	0.794427	0.880911
+cg_002	0.249544	0.209949	0.234294	0.236680
+cg_003	0.845065	0.843957	0.840184	0.824286
+...
+Example of batch file
+-------------------------------
+Sample,Group
+Sample_01,plate_1
+Sample_02,plate_1
+Sample_03,plate_2
+Sample_04,plate_2
+...
+Notes
+-----
+* Rows with missing values will be removed
+"""
+import sys
+import subprocess
+from optparse import OptionParser
+from cpgmodule.utils import *
+from cpgmodule._version import __version__
+import pandas as pd
+#from sklearn.preprocessing import StandardScaler
+#from sklearn.decomposition import PCA
+from collections import Counter
+from combat.pycombat import pycombat
+import matplotlib.pyplot as plt
+import matplotlib.colors as mcolors
+__author__ = "Liguo Wang"
+__copyright__ = "Copyleft"
+__credits__ = []
+__license__ = "GPL"
+__maintainer__ = "Liguo Wang"
+__email__ = "wang.liguo@mayo.edu"
+__status__ = "Development"
+def pick_colors(n):
+	my_colors = list(mcolors.CSS4_COLORS.keys())
+	if n > len(my_colors):
+		print ("Too many colors requested", file = sys.stderr)
+		sys.exit()
+	return my_colors[0:n]
+def box_plot(df, s_colors,  out_png, ylab="Beta values", title=""):
+	s_names = df.columns
+	fig, ax = plt.subplots()
+	bplot = ax.boxplot(df, patch_artist=True, tick_labels = s_names)
+	for patch, color in zip(bplot['boxes'], s_colors):
+		patch.set_facecolor(color)
+	ax.set_xticklabels(s_names, rotation='vertical')
+	plt.ylabel(ylab)
+	plt.title(title)
+	plt.savefig(out_png)
+def main():
+	usage="%prog [options]" + "\n"
+	parser = OptionParser(usage,version="%prog " + __version__)
+	parser.add_option("-i","--input_file",action="store",type="string",dest="input_file",help="Tab-separated data frame file containing beta values with the 1st row containing sample IDs and the 1st column containing CpG IDs.")
+	parser.add_option("-g","--group",action="store",type="string",dest="group_file",help="Comma-separated group file defining the batch groups of each sample.")
+	parser.add_option("-o","--output",action="store",type='string', dest="out_file",help="The prefix of the output file.")
+	(options,args)=parser.parse_args()
+	print ()
+	if not (options.input_file):
+		print (__doc__)
+		parser.print_help()
+		sys.exit(101)
+	if not (options.group_file):
+		print (__doc__)
+		parser.print_help()
+		sys.exit(101)
+	if not (options.out_file):
+		print (__doc__)
+		parser.print_help()
+		sys.exit(103)
+	beta_out = options.out_file + '.combat.tsv'
+	beta_boxplot_before = options.out_file + '.boxplot.png'
+	beta_boxplot_after = options.out_file + '.boxplot_combat.png'
+	printlog("Reading input file: \"%s\" ..." % (options.input_file))
+	df1 = pd.read_csv(options.input_file, index_col = 0, sep="\t")
+	#remove NA and transpose
+	df2 = df1.dropna(axis=0, how='any')
+	printlog("%d rows with missing values were removed." % (len(df1.index) - len(df2.index)))
+	printlog("Reading group file: \"%s\" ..." % (options.group_file))
+	group = pd.read_csv(options.group_file, index_col=0, names=['Sample_ID', 'Group_ID'])
+	#check if sample IDs are unique
+	if len(group.index) != len(group.index.unique()):
+		print ("Sample IDs are not unique", file = sys.stderr)
+		sys.exit()
+	group.index = group.index.map(str)
+	printlog("Group/batch \"%s\" contains %d samples" % (options.group_file, len(group.index)))
+	# a list of unique group names, and their frequencies
+	group_info = Counter(group['Group_ID'])
+	print(list(group['Group_ID']))
+	# a list of unique colors
+	color_names = pick_colors(len(group_info))
+	color_list = []
+	for name,count in zip(color_names, list(group_info.values())):
+		color_list.extend([name]*count)
+	printlog("Generate boxplot before correction. Save to '%s'" % beta_boxplot_before)
+	box_plot(df2, s_colors=color_list, out_png=beta_boxplot_before, title="Before batch effects correction")
+	# remove batch effect
+	printlog("Removing batch effect ...")
+	df_corrected = pycombat(df2, list(group['Group_ID']))
+	df_corrected.to_csv(beta_out,sep="\t")
+	printlog("Generate boxplot after correction. Save to '%s'" % beta_boxplot_after)
+	box_plot(df_corrected, s_colors=color_list, out_png=beta_boxplot_after, title="After batch effects correction")
+if __name__=='__main__':
+	main()

{cpgtools-2.0.2 → cpgtools-2.0.4}/scripts/dmc_ttest.py RENAMED Viewed

@@ -189,7 +189,7 @@ def main():
 			else:
 				continue
 		line_num += 1
 	printlog("Perfrom Benjamini-Hochberg (aka FDR) correction ...")
 	adjusted_p = {}
 	q_list =  padjust.multiple_testing_correction(p_list)
@@ -204,6 +204,10 @@ def main():
 		else:
 			f = l.split()
 			probe_ID = f[0]
+			if probe_ID in delta_beta:
+				pass
+			else:
+				delta_beta[probe_ID] = 'n/a'
 			try:
 				print (l + '\t' + str(delta_beta[probe_ID]) + '\t' + adjusted_p[probe_ID], file=FOUT)
 			except:

cpgtools-2.0.2/scripts/beta_imputation.py → cpgtools-2.0.4/scripts/predict_missing.py RENAMED Viewed

@@ -20,6 +20,7 @@ from impyute.cs.em import em
 from impyute.ops.util import toy_df,insert_na
 from impyute.cs.random import random_impute
 from impyute.cs.buck_iterative import buck_iterative
+from impyute.ts.moving_window import moving_window
 from missingpy import MissForest
 #use pip to install fancyimpute
@@ -64,6 +65,7 @@ def nafiller():
                     the same row or column.",
         'FillRef': "Impute missing values using values from an external \
                     reference dataset.",
+        'MW': "Interpolate the missing values with moving window.",
         'KNN': "Impute missing values using scikit-learn's KNNImputer function. \
                 Note: slow for large datasets.",
         'KNN2': "Impute missing values using KNN2",
@@ -98,6 +100,7 @@ def nafiller():
     FillMax_parser = sub_parsers.add_parser('FillMax', help=commands['FillMax'])
     FillRand_parser = sub_parsers.add_parser('FillRand', help=commands['FillRand'])
     FillRef_parser = sub_parsers.add_parser('FillRef', help=commands['FillRef'])
+    MW_parser = sub_parsers.add_parser('MW', help=commands['MW'])
     KNN_parser = sub_parsers.add_parser('KNN', help=commands['KNN'])
     fKNN_parser = sub_parsers.add_parser('fKNN', help=commands['fKNN'])
     EM_parser = sub_parsers.add_parser('EM', help=commands['EM'])
@@ -116,7 +119,7 @@ def nafiller():
         'output', type=str, metavar='out_df',
         help="Output data frame.")
     DropNA_parser.add_argument(
-        '-a', '--axis', type=int, choices=range(2), default=0,
+        '--axis', type=int, choices=range(2), default=0,
         help="0 : drop rows with any missing values, 1 : drop columns with \
             missing values. Default: 0")
     DropNA_parser.add_argument(
@@ -130,7 +133,7 @@ def nafiller():
         'output', type=str, metavar='out_df',
         help="Output data frame.")
     FillValue_parser.add_argument(
-        '-s', '--score', type=float, default=0.0,
+        '--score', type=float, default=0.0,
         help="The value uesd to fill all NAs.")
     FillValue_parser.add_argument(
         '--decimal', type=int, default=5,
@@ -143,7 +146,7 @@ def nafiller():
         'output', type=str, metavar='out_df',
         help="Output data frame.")
     FillMean_parser.add_argument(
-        '-a', '--axis', type=int, choices=range(2), default=1,
+        '--axis', type=int, choices=range(2), default=1,
         help="0 means column, 1 means row. Default: fill NAs with row means")
     FillMean_parser.add_argument(
         '--decimal', type=int, default=5,
@@ -156,7 +159,7 @@ def nafiller():
         'output', type=str, metavar='out_df',
         help="Output data frame.")
     FillMedian_parser.add_argument(
-        '-a', '--axis', type=int, choices=range(2), default=1,
+        '--axis', type=int, choices=range(2), default=1,
         help="0 means column, 1 means row. Default: fill NAs with row medians")
     FillMedian_parser.add_argument(
         '--decimal', type=int, default=5,
@@ -169,7 +172,7 @@ def nafiller():
         'output', type=str, metavar='out_df',
         help="Output data frame.")
     FillMin_parser.add_argument(
-        '-a', '--axis', type=int, choices=range(2), default=1,
+        '--axis', type=int, choices=range(2), default=1,
         help="0 means column, 1 means row. Default: fill NAs with the minimum value of the rows.")
     FillMin_parser.add_argument(
         '--decimal', type=int, default=5,
@@ -182,7 +185,7 @@ def nafiller():
         'output', type=str, metavar='out_df',
         help="Output data frame.")
     FillMax_parser.add_argument(
-        '-a', '--axis', type=int, choices=range(2), default=1,
+        '--axis', type=int, choices=range(2), default=1,
         help="0 means column, 1 means row. Default: fill NAs with the maximum value of the rows.")
     FillMax_parser.add_argument(
         '--decimal', type=int, default=5,
@@ -195,7 +198,7 @@ def nafiller():
         'output', type=str, metavar='out_df',
         help="Output data frame.")
     FillRand_parser.add_argument(
-        '-a', '--axis', type=int, choices=range(2), default=1,
+        '--axis', type=int, choices=range(2), default=1,
         help="0 means column, 1 means row. Default: fill NAs with values randomly selected from rows.")
     FillRand_parser.add_argument(
         '--decimal', type=int, default=5,
@@ -208,12 +211,50 @@ def nafiller():
         'output', type=str, metavar='out_df',
         help="Output data frame.")
     FillRef_parser.add_argument(
-        '-r', '--ref', type=str,
+        '--ref', type=str,
         help="File name of the external reference.")
     FillRef_parser.add_argument(
         '--decimal', type=int, default=5,
         help="Number of decimal places to round each column to. default: %(default)s")
+    MW_parser.add_argument(
+        'input', type=str, metavar='input_df',
+        help="Input data frame.")
+    MW_parser.add_argument(
+        'output', type=str, metavar='out_df',
+        help="Output data frame.")
+    MW_parser.add_argument(
+        '--nindex', type=int, choices=[0, -1, None],
+        default=None,
+        help="Null index. Index of the null value inside the moving average window. \
+            See impyute documentation for details. default: %(default)s")
+    MW_parser.add_argument(
+        '--wsize', type=int, default=5,
+        help="Size of the moving average window/area of values being used \
+            for each local imputation. This number includes the missing value. \
+            default: %(default)s")
+    MW_parser.add_argument(
+        '--errors', type=str, choices=["raise", "coerce", "ignore"],
+        default='coerce',
+        help="Errors will occur with the indexing of the windows - for \
+            example if there is a nan at data[x][0] and `nindex` is set to \
+            -1 or there is a nan at data[x][-1] and `nindex` is set to 0. `\
+            'raise' will raise an error, `coerce` will try again using an \
+            nindex set to the middle and `ignore` will just leave it as a \
+            nan  default: %(default)s")
+    MW_parser.add_argument(
+        '--func', type=str, choices=["mean", "median"],
+        default='mean',
+        help="Function to summerzie values within the moving window. \
+            default: %(default)s")
+    MW_parser.add_argument(
+        '--axis', type=int, choices=range(2), default=1,
+        help="0 means column, 1 means row. Default: fill missing value \
+            with windows moving on rows.")
+    MW_parser.add_argument(
+        '--decimal', type=int, default=5,
+        help="Number of decimal places to round each column to. default: %(default)s")
     KNN_parser.add_argument(
         'input', type=str, metavar='input_df',
         help="Input data frame.")
@@ -306,20 +347,28 @@ def nafiller():
         'output', type=str, metavar='out_df',
         help="Name of the output data frame.")
     ToyDf_parser.add_argument(
-        '-r', '--nrow', type=int, default=10,
+        '--nrow', type=int, default=10,
         help="Number of rows. default: %(default)s")
     ToyDf_parser.add_argument(
-        '-c', '--ncol', type=int, default=10,
+        '--ncol', type=int, default=10,
         help="Number of columns. default: %(default)s")
     ToyDf_parser.add_argument(
-        '--na', type=int, default=5,
+        '--nmiss', type=float, default=5,
         help="Number of missing values ingested into the dataframe. default: %(default)s")
     ToyDf_parser.add_argument(
-        '-s', '--seed', type=int, default=123,
+        '--seed', type=int, default=123,
         help="Seed used to initialize a pseudorandom number generator. default: %(default)s")
     ToyDf_parser.add_argument(
         '--prefix', type=str, default='s',
-        help="Prefix of the column names, a series numbers will be appended to the prefix. default: %(default)s")
+        help="Prefix of the column names, a series numbers will be appended to \
+            the prefix. If this is set to None, as np.ndarray rather than pd.dataframe \
+            will be returned. default: %(default)s")
+    ToyDf_parser.add_argument(
+        '--min', type=float, default=0.0,
+        help="The minimum value. default: %(default)s")
+    ToyDf_parser.add_argument(
+        '--max', type=float, default=1.0,
+        help="The maximum value. default: %(default)s")
     ToyDf_parser.add_argument(
         '--decimal', type=int, default=5,
         help="Number of decimal places to round each column to. default: %(default)s")
@@ -331,10 +380,10 @@ def nafiller():
         'output', type=str, metavar='out_df',
         help="Output data frame.")
     InsertNA_parser.add_argument(
-        '--na', type=int,
+        '--nmiss', type=int,
         help="Number of missing values ingested into the dataframe.")
     InsertNA_parser.add_argument(
-        '-s', '--seed', type=int, default=123,
+        '--seed', type=int, default=123,
         help="Seed used to initialize a pseudorandom number generator. default: %(default)s")
     InsertNA_parser.add_argument(
         '--decimal', type=int, default=5,
@@ -483,6 +532,25 @@ def nafiller():
                     continue
             output_df = input_df
             output_df = output_df.round(args.decimal)
+            output_df.to_csv(args.output, sep="\t", na_rep="NaN")
+            logging.info("File \"%s\" contains %d missing values ..." %
+                        (args.output, output_df.isna().sum().sum()))
+        elif command.lower() == 'mw':
+            input_df = read_df(args.input)
+            logging.info("File \"%s\" contains %d missing values ..." %
+                        (args.input, input_df.isna().sum().sum()))
+            logging.info("Replace missing values using moving window on %s ..." % axis_name[args.axis])
+            if args.axis == 1:
+                output_df = moving_window(
+                    input_df, nindex = args.nindex, wsize=args.wsize,
+                    errors=args.errors, func=getattr(np, args.func))
+                output_df = output_df.round(args.decimal)
+            elif args.axis == 0:
+                output_df = moving_window(
+                    input_df.T, nindex = args.nindex, wsize=args.wsize,
+                    errors=args.errors, func=getattr(np, args.func))
+                output_df = output_df.round(args.decimal).T
             output_df.to_csv(args.output, sep="\t", na_rep="NaN")
             logging.info("File \"%s\" contains %d missing values ..." %
                         (args.output, output_df.isna().sum().sum()))
@@ -576,9 +644,10 @@ def nafiller():
         elif command.lower() == 'toydf':
             logging.info("Generate toy dataframe ...")
-            output_df = toy_df(nrow = args.nrow, ncol = args.ncol,
-                               n_miss = args.na, sample_prefix=args.prefix,
-                               seed=args.seed)
+            output_df = toy_df(n_rows = args.nrow, n_cols = args.ncol,
+                               missingness = args.nmiss, sample_prefix=args.prefix,
+                               min_val = args.min, max_val = args.max,
+                               rand_seed=args.seed)
             #print(output_df)
             output_df = output_df.round(args.decimal)
             output_df.to_csv(args.output, sep="\t", na_rep="NaN")
@@ -589,7 +658,7 @@ def nafiller():
             logging.info("File \"%s\" contains %d missing values ..." %
                          (args.input, input_df.isna().sum().sum()))
             logging.info("Insert %d NAs into dataframe ..." % args.na)
-            output_df = insert_na(df=input_df, n_miss=args.na, seed=args.seed)
+            output_df = insert_na(df=input_df, n_miss=args.nmiss, seed=args.seed)
             output_df = output_df.round(args.decimal)
             output_df.to_csv(args.output, sep="\t", na_rep="NaN")
             logging.info("File \"%s\" contains %d missing values ..." %

{cpgtools-2.0.2 → cpgtools-2.0.4}/setup.py RENAMED Viewed

@@ -11,7 +11,6 @@ script_files = [
 "scripts/CpG_to_gene.py",
 "scripts/beta_PCA.py",
 "scripts/beta_UMAP.py",
-"scripts/beta_imputation.py",
 "scripts/beta_jitter_plot.py",
 "scripts/beta_m_conversion.py",
 "scripts/beta_profile_gene_centered.py",
@@ -22,6 +21,7 @@ script_files = [
 "scripts/beta_tSNE.py",
 "scripts/beta_topN.py",
 "scripts/beta_trichotmize.py",
+"scripts/beta_remove_batch_effects.py",
 "scripts/dmc_Bayes.py",
 "scripts/dmc_bb.py",
 "scripts/dmc_fisher.py",
@@ -30,6 +30,7 @@ script_files = [
 "scripts/dmc_nonparametric.py",
 "scripts/dmc_ttest.py",
 "scripts/predict_sex.py",
+"scripts/predict_missing.py",
 ]
 if __name__ == "__main__":

cpgtools-2.0.4/src/cpgmodule/data/AltumAge_cpg.pkl ADDED Viewed

Binary file

cpgtools-2.0.4/src/cpgmodule/data/AltumAge_multi_platform_cpgs.pkl ADDED Viewed

Binary file

cpgtools-2.0.4/src/cpgmodule/data/AltumAge_scaler.pkl ADDED Viewed

Binary file

cpgtools-2.0.4/src/cpgmodule/data/GA_Bohlin.pkl ADDED Viewed

Binary file

cpgtools-2.0.4/src/cpgmodule/data/GA_Haftorn.pkl ADDED Viewed

Binary file

cpgtools-2.0.4/src/cpgmodule/data/GA_Knight.pkl ADDED Viewed

Binary file

cpgtools-2.0.4/src/cpgmodule/data/GA_Lee_CPC.pkl ADDED Viewed

Binary file

cpgtools-2.0.4/src/cpgmodule/data/GA_Lee_RPC.pkl ADDED Viewed

Binary file

cpgtools-2.0.4/src/cpgmodule/data/GA_Lee_refined_RPC.pkl ADDED Viewed

Binary file

cpgtools-2.0.4/src/cpgmodule/data/GA_Mayne.pkl ADDED Viewed

Binary file

cpgtools-2.0.4/src/cpgmodule/data/Hannum.pkl ADDED Viewed

Binary file

cpgtools-2.0.4/src/cpgmodule/data/Horvath_2013.pkl ADDED Viewed

Binary file

cpgtools-2.0.4/src/cpgmodule/data/Horvath_2018.pkl ADDED Viewed

Binary file

cpgtools-2.0.4/src/cpgmodule/data/Levine.pkl ADDED Viewed

Binary file

cpgtools-2.0.4/src/cpgmodule/data/Lu_DNAmTL.pkl ADDED Viewed

Binary file

cpgtools-2.0.4/src/cpgmodule/data/Ped_McEwen.pkl ADDED Viewed

Binary file

cpgtools-2.0.4/src/cpgmodule/data/Ped_Wu.pkl ADDED Viewed

Binary file

cpgtools-2.0.4/src/cpgmodule/data/Zhang_BLUP.pkl ADDED Viewed

Binary file

cpgtools-2.0.4/src/cpgmodule/data/Zhang_EN.pkl ADDED Viewed

Binary file

{cpgtools-2.0.2 → cpgtools-2.0.4}/src/cpgtools.egg-info/PKG-INFO RENAMED Viewed

@@ -1,35 +1,15 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: cpgtools
-Version: 2.0.2
+Version: 2.0.4
 Summary: Tools to analyze and visualize DNA methylation data
 Author-email: Liguo Wang <wangliguo78@gmail.com>
 Maintainer-email: Liguo Wang <wangliguo78@gmail.com>
-License: Copyright (c) 2024 The Python Packaging Authority
-        Permission is hereby granted, free of charge, to any person obtaining a copy
-        of this software and associated documentation files (the "Software"), to deal
-        in the Software without restriction, including without limitation the rights
-        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-        copies of the Software, and to permit persons to whom the Software is
-        furnished to do so, subject to the following conditions:
-        The above copyright notice and this permission notice shall be included in all
-        copies or substantial portions of the Software.
-        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-        SOFTWARE.
+License-Expression: MIT
 Project-URL: Documentation, https://cpgtools.readthedocs.io/en/latest/index.html
 Project-URL: Repository, https://github.com/liguowang/cpgtools.git
 Keywords: DNA methylation,EPIC,450K,850K,935K,RRBS,WGBS
 Classifier: Programming Language :: Python :: 3
 Classifier: Development Status :: 4 - Beta
-Classifier: License :: OSI Approved :: MIT License
 Classifier: Environment :: Console
 Classifier: Intended Audience :: Science/Research
 Classifier: Operating System :: MacOS :: MacOS X
@@ -37,7 +17,7 @@ Classifier: Operating System :: POSIX
 Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
 Requires-Python: >=3.5
 Description-Content-Type: text/markdown
-License-File: LICENSE
+License-File: LICENSE.txt
 Requires-Dist: numpy
 Requires-Dist: scipy
 Requires-Dist: scikit-learn
@@ -46,6 +26,8 @@ Requires-Dist: bx-python
 Requires-Dist: pandas
 Requires-Dist: umap-learn
 Requires-Dist: fancyimpute
+Requires-Dist: pycombat
+Dynamic: license-file
 ## Install CpGtools using [pip](https://pip.pypa.io/en/stable/)

{cpgtools-2.0.2 → cpgtools-2.0.4}/src/cpgtools.egg-info/SOURCES.txt RENAMED Viewed

@@ -1,9 +1,10 @@
-LICENSE
+LICENSE.txt
 MANIFEST.in
 README.md
 distribute_setup.py
 pyproject.toml
 setup.py
+scripts/.DS_Store
 scripts/CpG_aggregation.py
 scripts/CpG_anno_position.py
 scripts/CpG_anno_probe.py
@@ -15,11 +16,11 @@ scripts/CpG_logo.py
 scripts/CpG_to_gene.py
 scripts/beta_PCA.py
 scripts/beta_UMAP.py
-scripts/beta_imputation.py
 scripts/beta_jitter_plot.py
 scripts/beta_m_conversion.py
 scripts/beta_profile_gene_centered.py
 scripts/beta_profile_region.py
+scripts/beta_remove_batch_effects.py
 scripts/beta_selectNBest.py
 scripts/beta_stacked_barplot.py
 scripts/beta_stats.py
@@ -33,6 +34,7 @@ scripts/dmc_glm.py
 scripts/dmc_logit.py
 scripts/dmc_nonparametric.py
 scripts/dmc_ttest.py
+scripts/predict_missing.py
 scripts/predict_sex.py
 src/cpgmodule/BED.py
 src/cpgmodule/MI.py
@@ -46,6 +48,25 @@ src/cpgmodule/methylClock.py
 src/cpgmodule/padjust.py
 src/cpgmodule/region2gene.py
 src/cpgmodule/utils.py
+src/cpgmodule/data/AltumAge_cpg.pkl
+src/cpgmodule/data/AltumAge_multi_platform_cpgs.pkl
+src/cpgmodule/data/AltumAge_scaler.pkl
+src/cpgmodule/data/GA_Bohlin.pkl
+src/cpgmodule/data/GA_Haftorn.pkl
+src/cpgmodule/data/GA_Knight.pkl
+src/cpgmodule/data/GA_Lee_CPC.pkl
+src/cpgmodule/data/GA_Lee_RPC.pkl
+src/cpgmodule/data/GA_Lee_refined_RPC.pkl
+src/cpgmodule/data/GA_Mayne.pkl
+src/cpgmodule/data/Hannum.pkl
+src/cpgmodule/data/Horvath_2013.pkl
+src/cpgmodule/data/Horvath_2018.pkl
+src/cpgmodule/data/Levine.pkl
+src/cpgmodule/data/Lu_DNAmTL.pkl
+src/cpgmodule/data/Ped_McEwen.pkl
+src/cpgmodule/data/Ped_Wu.pkl
+src/cpgmodule/data/Zhang_BLUP.pkl
+src/cpgmodule/data/Zhang_EN.pkl
 src/cpgmodule/data/__init__.py
 src/cpgtools.egg-info/PKG-INFO
 src/cpgtools.egg-info/SOURCES.txt

{cpgtools-2.0.2 → cpgtools-2.0.4}/src/cpgtools.egg-info/requires.txt RENAMED Viewed

@@ -6,3 +6,4 @@ bx-python
 pandas
 umap-learn
 fancyimpute
+pycombat

{cpgtools-2.0.2 → cpgtools-2.0.4}/src/impyute/ops/util.py RENAMED Viewed

@@ -41,20 +41,40 @@ def execute_fn_with_args_and_or_kwargs(fn, args, kwargs):
     except TypeError:
         return fn(*args)
-def toy_df(nrow, ncol, n_miss, sample_prefix, seed):
-    """
-    Make a dataFrame (nrow x ncol) with random values between 0 and 1, add
-    some missing values (n_miss). Generate a toy dataframe for testing purposes.
-    """
-    np.random.seed(seed)
-    data = np.random.rand(nrow*ncol).reshape((nrow, ncol)).astype(float)
-    x_ind = np.random.choice(nrow, n_miss)
-    y_ind = np.random.choice(ncol, n_miss)
-    for x,y in zip(x_ind, y_ind):
-        data[x][y] =  np.nan
-    colNames = [sample_prefix + '_' + str(i) for i in range(0,ncol)]
-    df = pd.DataFrame(data, columns=colNames)
-    return df
+def toy_df(n_rows=20, n_cols=5, missingness=0.2, min_val=0, max_val=1,
+              missing_value=np.nan, rand_seed=1234, sample_prefix=None):
+    """Generate an array or DataFrame with NaNs"""
+    np.random.seed(rand_seed)
+    X = np.random.uniform(
+        low = min_val, high = max_val, size = n_rows * n_cols).reshape(n_rows, n_cols).astype(
+        float)
+    # check missingness
+    if missingness > 0:
+        # If missingness >= 1 then use it as approximate (see below) count
+        if missingness >= 1:
+            n_missing = int(missingness)
+        else:
+            n_missing = int(missingness * n_rows * n_cols)
+            print(n_missing)
+    # Introduce NaNs until n_miss "NAs" are inserted.
+    missing_count = 0
+    for i,j in zip(np.random.choice(n_rows, n_missing), np.random.choice(n_cols, n_missing)):
+        if np.isnan(X[i][j]):
+            continue
+        else:
+            X[i][j] = missing_value
+            missing_count += 1
+        if missing_count >= n_missing:
+            break
+    # check sample_prefix
+    if sample_prefix is None:
+        return X
+    else:
+        colNames = [sample_prefix + '_' + str(i) for i in range(0, n_cols)]
+        return pd.DataFrame(X, columns=colNames)
 def insert_na(df, n_miss, seed):
     np.random.seed(seed)