cpgtools 2.0.0__py3-none-any.whl → 2.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.

Files changed (75)
  1. cpgmodule/_version.py +1 -0
  2. cpgmodule/utils.py +35 -0
  3. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/CpG_aggregation.py +1 -1
  4. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/CpG_anno_position.py +1 -1
  5. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/CpG_anno_probe.py +1 -2
  6. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/CpG_density_gene_centered.py +1 -1
  7. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/CpG_distrb_chrom.py +1 -1
  8. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/CpG_distrb_gene_centered.py +1 -1
  9. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/CpG_distrb_region.py +1 -3
  10. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/CpG_logo.py +1 -1
  11. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/CpG_to_gene.py +1 -1
  12. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_PCA.py +31 -23
  13. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_UMAP.py +29 -22
  14. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_jitter_plot.py +1 -1
  15. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_m_conversion.py +1 -1
  16. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_profile_gene_centered.py +1 -1
  17. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_profile_region.py +1 -1
  18. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_selectNBest.py +9 -6
  19. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_stacked_barplot.py +1 -1
  20. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_stats.py +1 -1
  21. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_tSNE.py +31 -24
  22. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_topN.py +1 -1
  23. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/beta_trichotmize.py +1 -1
  24. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/dmc_Bayes.py +1 -1
  25. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/dmc_bb.py +1 -1
  26. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/dmc_fisher.py +1 -1
  27. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/dmc_glm.py +1 -1
  28. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/dmc_logit.py +1 -1
  29. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/dmc_nonparametric.py +1 -1
  30. {cpgtools-2.0.0.data → cpgtools-2.0.3.data}/scripts/dmc_ttest.py +6 -2
  31. cpgtools-2.0.3.data/scripts/predict_missing.py +673 -0
  32. cpgtools-2.0.3.data/scripts/predict_sex.py +126 -0
  33. cpgtools-2.0.3.dist-info/LICENSE +19 -0
  34. cpgtools-2.0.3.dist-info/METADATA +76 -0
  35. cpgtools-2.0.3.dist-info/RECORD +101 -0
  36. {cpgtools-2.0.0.dist-info → cpgtools-2.0.3.dist-info}/WHEEL +1 -1
  37. cpgtools-2.0.3.dist-info/top_level.txt +3 -0
  38. impyute/__init__.py +3 -0
  39. impyute/contrib/__init__.py +7 -0
  40. impyute/contrib/compare.py +69 -0
  41. impyute/contrib/count_missing.py +30 -0
  42. impyute/contrib/describe.py +63 -0
  43. impyute/cs/__init__.py +11 -0
  44. impyute/cs/buck_iterative.py +82 -0
  45. impyute/cs/central_tendency.py +84 -0
  46. impyute/cs/em.py +52 -0
  47. impyute/cs/fast_knn.py +130 -0
  48. impyute/cs/random.py +27 -0
  49. impyute/dataset/__init__.py +6 -0
  50. impyute/dataset/base.py +137 -0
  51. impyute/dataset/corrupt.py +55 -0
  52. impyute/deletion/__init__.py +5 -0
  53. impyute/deletion/complete_case.py +21 -0
  54. impyute/ops/__init__.py +12 -0
  55. impyute/ops/error.py +9 -0
  56. impyute/ops/inverse_distance_weighting.py +31 -0
  57. impyute/ops/matrix.py +47 -0
  58. impyute/ops/testing.py +20 -0
  59. impyute/ops/util.py +96 -0
  60. impyute/ops/wrapper.py +179 -0
  61. impyute/ts/__init__.py +6 -0
  62. impyute/ts/locf.py +57 -0
  63. impyute/ts/moving_window.py +128 -0
  64. missingpy/__init__.py +4 -0
  65. missingpy/knnimpute.py +328 -0
  66. missingpy/missforest.py +556 -0
  67. missingpy/pairwise_external.py +315 -0
  68. missingpy/tests/__init__.py +0 -0
  69. missingpy/tests/test_knnimpute.py +605 -0
  70. missingpy/tests/test_missforest.py +409 -0
  71. missingpy/utils.py +124 -0
  72. cpgtools-2.0.0.dist-info/LICENSE.txt +0 -674
  73. cpgtools-2.0.0.dist-info/METADATA +0 -28
  74. cpgtools-2.0.0.dist-info/RECORD +0 -64
  75. cpgtools-2.0.0.dist-info/top_level.txt +0 -2
cpgtools-2.0.3.data/scripts/predict_sex.py ADDED
@@ -0,0 +1,126 @@
+ #!python
+
+ """
+ #==============================================================================
+ Predict sex based on the semi-methylation (also known as genomic imprinting)
+ ratio. This method leverages the fact that, due to X chromosome inactivation,
+ females have a higher proportion of semi-methylated CpGs on their X chromosomes.
+ A log2(ratio) greater than 0 indicates a female, while a log2(ratio) less than
+ 0 indicates a male.
+
+ Example of input data file
+ ---------------------------
+ CpG_ID Sample_01 Sample_02 Sample_03 Sample_04
+ cg_001 0.831035 0.878022 0.794427 0.880911
+ cg_002 0.249544 0.209949 0.234294 0.236680
+ cg_003 0.845065 0.843957 0.840184 0.824286
+
+ Example of output file
+ ----------------------
+ Sample_ID log2_SM_ratio Predicted_sex
+ Sample_01 -2.249628052954919 Male
+ Sample_02 -2.2671726671830674 Male
+ Sample_03 1.4530581933290616 Female
+ Sample_04 1.4808015115356654 Female
+
+ ...
+
+ """
+ import sys
+ import numpy as np
+ from optparse import OptionParser
+ from cpgmodule.utils import printlog
+ from cpgmodule import ireader
+ import pandas as pd
+ from cpgmodule._version import __version__
+
+ __author__ = "Liguo Wang"
+ __copyright__ = "Copyleft"
+ __credits__ = []
+ __license__ = "GPL"
+ __maintainer__ = "Liguo Wang"
+ __email__ = "wang.liguo@mayo.edu"
+ __status__ = "Development"
+
+
+ def main():
+
+     usage="%prog [options]" + "\n"
+     parser = OptionParser(usage,version="%prog " + __version__)
+     parser.add_option("-i","--input_file",action="store", type="string",dest="input_file", help="Tab-separated data frame file containing beta values with the 1st row containing sample IDs and the 1st column containing CpG IDs.")
+     parser.add_option("-x","--xprobe",action="store", type="string",dest="xprobe_file", help="File with CpG IDs mapped to the X chromosome, with one probe listed per row.")
+     parser.add_option("-c","--cut",action="store", type='float', dest="cutoff", default=0.0, help="The cutoff of log2(SM ratio) to determine the sex prediction. Log2(SM ratio) greater than this cutoff indicates a female, while a log2(ratio) less than this cutoff indicates a male. default=%default")
+     parser.add_option("-o","--output",action="store", type='string', dest="out_file", help="The prefix of the output file.")
+     (options,args)=parser.parse_args()
+
+     print ()
+     if not (options.input_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(101)
+     if not (options.xprobe_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(102)
+     if not (options.out_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(103)
+
+     printlog("Reading X probes from: \"%s\"" % (options.xprobe_file))
+     x_cpgs = set()
+     for l in ireader.reader(options.xprobe_file):
+         l = l.strip()
+         if l.startswith('#'):
+             continue
+         x_cpgs.add(l)
+     printlog("Total %d X probes loaded." % len(x_cpgs))
+
+     printlog("Reading input file: \"%s\"" % (options.input_file))
+     df1 = pd.read_csv(options.input_file, index_col = 0, sep="\t")
+     #print (df1)
+
+     #remove any rows with NAs
+     df2 = df1.dropna(axis=0, how='any')
+     printlog("%d CpGs with missing values were removed." % (len(df1) - len(df2)))
+     #print (df2)
+
+     sample_cpg_ids = df2.index
+     sample_names = df2.columns
+     found_x_cpgs = list(x_cpgs & set(sample_cpg_ids))
+     printlog("Found %d CpGs located on the chrX from file: %s" % (len(found_x_cpgs), options.input_file))
+
+     # only X probes in df3
+     df3 = df2.loc[list(found_x_cpgs)]
+     #pd.DataFrame.to_csv(df3, options.out_file + '.tmp.tsv', sep="\t", index_label="sample")
+
+     low_beta_range = [0, 0.2]
+     mid_beta_range = [0.3, 0.7]
+     high_beta_range = [0.8, 1.0]
+
+     output = {}
+     for s in sample_names:
+         output[s] = {}
+         low_beta_count = pd.cut(df3[s], low_beta_range).count()
+         mid_beta_count = pd.cut(df3[s], mid_beta_range).count()
+         high_beta_count = pd.cut(df3[s], high_beta_range).count()
+         try:
+             ratio = np.log2(mid_beta_count/(low_beta_count + high_beta_count))
+         except:
+             ratio = np.nan
+         output[s]['log2_SM_ratio'] = ratio
+
+         if ratio > options.cutoff:
+             output[s]['Predicted_sex'] = 'Female'
+         elif ratio < options.cutoff:
+             output[s]['Predicted_sex'] = 'Male'
+         else:
+             output[s]['Predicted_sex'] = 'Unknown'
+     df_out = pd.DataFrame(output).T
+
+     outfile = options.out_file + '.predicted_sex.tsv'
+     printlog("Writing to file: \"%s\"" % outfile)
+     pd.DataFrame.to_csv(df_out, outfile, sep="\t", index_label="Sample_ID")
+
+ if __name__=='__main__':
+     main()
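The sex call above boils down to counting chrX probes per beta band and taking a log2 ratio. A minimal standalone sketch of that calculation (not part of the package), assuming a pandas DataFrame of beta values already restricted to chrX probes with one column per sample; the band edges and the 0.0 cutoff mirror the defaults hard-coded in predict_sex.py:

```python
import numpy as np
import pandas as pd

def log2_sm_ratio_calls(x_betas: pd.DataFrame, cutoff: float = 0.0) -> pd.DataFrame:
    """Sketch of the semi-methylation ratio used by predict_sex.py.

    x_betas: beta values for chrX probes only (rows = CpGs, columns = samples),
    with rows containing missing values already dropped.
    """
    rows = {}
    for sample in x_betas.columns:
        b = x_betas[sample]
        low = ((b > 0.0) & (b <= 0.2)).sum()    # fully unmethylated probes
        mid = ((b > 0.3) & (b <= 0.7)).sum()    # semi-methylated probes
        high = ((b > 0.8) & (b <= 1.0)).sum()   # fully methylated probes
        denom = low + high
        ratio = np.log2(mid / denom) if denom > 0 and mid > 0 else float("nan")
        sex = "Female" if ratio > cutoff else "Male" if ratio < cutoff else "Unknown"
        rows[sample] = {"log2_SM_ratio": ratio, "Predicted_sex": sex}
    return pd.DataFrame(rows).T
```

For instance, a sample with 40 chrX probes in the 0.3–0.7 band and 10 in the two extreme bands gives log2(40/10) = 2, which is above the default cutoff of 0 and is therefore called Female.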
cpgtools-2.0.3.dist-info/LICENSE ADDED
@@ -0,0 +1,19 @@
+ Copyright (c) 2024 The Python Packaging Authority
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
cpgtools-2.0.3.dist-info/METADATA ADDED
@@ -0,0 +1,76 @@
+ Metadata-Version: 2.1
+ Name: cpgtools
+ Version: 2.0.3
+ Summary: Tools to analyze and visualize DNA methylation data
+ Author-email: Liguo Wang <wangliguo78@gmail.com>
+ Maintainer-email: Liguo Wang <wangliguo78@gmail.com>
+ License: Copyright (c) 2024 The Python Packaging Authority
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+ Project-URL: Documentation, https://cpgtools.readthedocs.io/en/latest/index.html
+ Project-URL: Repository, https://github.com/liguowang/cpgtools.git
+ Keywords: DNA methylation,EPIC,450K,850K,935K,RRBS,WGBS
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Development Status :: 4 - Beta
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Environment :: Console
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Operating System :: MacOS :: MacOS X
+ Classifier: Operating System :: POSIX
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+ Requires-Python: >=3.5
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: numpy
+ Requires-Dist: scipy
+ Requires-Dist: scikit-learn
+ Requires-Dist: weblogo
+ Requires-Dist: bx-python
+ Requires-Dist: pandas
+ Requires-Dist: umap-learn
+ Requires-Dist: fancyimpute
+
+ ## Install CpGtools using [pip](https://pip.pypa.io/en/stable/)
+
+ 1. (Optional) Create Virtual Environments (Note: `venv` is available in Python 3.3 and later. You can also use [virtualenv](https://packaging.python.org/en/latest/key_projects/#virtualenv))
+
+ `$ python3 -m venv my_env` (will create a directory called my_env)
+
+ `$ source my_env/bin/activate`
+
+ 2. Install CpGtools
+
+ `$ pip install cpgtools`
+
+ or
+
+ `$ pip install git+https://github.com/liguowang/cpgtools.git`
+
+ 3. Upgrade
+
+ `$ pip install cpgtools --upgrade`
+
+ 4. Uninstall
+
+ `$ pip uninstall -y cpgtools`
+
+ ## Documentation
+
+ https://cpgtools.readthedocs.io/en/latest/
cpgtools-2.0.3.dist-info/RECORD ADDED
@@ -0,0 +1,101 @@
+ cpgmodule/BED.py,sha256=ZNAhGZPFMFD4UpVwGTDOM0hMLnfXF52zerdMb21sH6s,12675
+ cpgmodule/MI.py,sha256=pkUWzKXYeyRuGsGKZdFVxHzFWikkuJavdQ_4fsnN3U4,8649
+ cpgmodule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ cpgmodule/_version.py,sha256=tATvJM5shAzfspHYjdVwpV2w3-gDA119NlEYi5X2lFY,22
+ cpgmodule/cgID.py,sha256=DLG5Djm6JP0d6MiFvP_keUXv_rYxESWrISuVClTs8kw,16017914
+ cpgmodule/extend_bed.py,sha256=sjRxbRJFRumjYHwd6_AMTAYBG29yXQPYp1lDjTv2Q_Q,4648
+ cpgmodule/imotif.py,sha256=j7_MQFnowGsVTGCRQHHLUnEzSD0HehZPEHKKZ68Jv24,9227
+ cpgmodule/ireader.py,sha256=NRTdFnQvd1AEkYchfgvTVTixBVwUjn8xNyswc0OWU1Q,720
+ cpgmodule/methylClock.py,sha256=HyMk3vpVwR3yrkLCPv9fVJs6JCeSchbJEBe7i69jdSs,1202
+ cpgmodule/padjust.py,sha256=mvbQ9_crn_S39yvWrv_JPoU5OxZGHaw7cV3deBxTPdk,2389
+ cpgmodule/region2gene.py,sha256=iuSYa2-ki-qbL1TqqAlTXGT7g-j1uNIA1y7hzMwVM2U,5347
+ cpgmodule/utils.py,sha256=NgKT3aJlRT5pQKKO3_e0WB2_u93pY13F-k-r0pvgKno,16095
+ cpgmodule/data/AltumAge_cpg.pkl,sha256=T0pfBWrzQO00-z85QNE2CP0ntU0mtF2WkZeEPiArqTw,264691
+ cpgmodule/data/AltumAge_multi_platform_cpgs.pkl,sha256=Bo7ZG6AuxXUmLH6dCFfrlYkiTkeLjyqRgcD7yw4mNZ0,264816
+ cpgmodule/data/AltumAge_scaler.pkl,sha256=aDMcC4l0wZJGDi6_MeSpdNoykndfgsUtVKEd6mKf9T0,325509
+ cpgmodule/data/GA_Bohlin.pkl,sha256=W72ra6APUZDNPmVJ-kRuaVzpiKGgI_K7nDqSFx3nf3w,2667
+ cpgmodule/data/GA_Haftorn.pkl,sha256=ZFCJ2h0DWEaa-pHKdsSuWbQGL1JMwDXBtDyhx9DlZLc,4622
+ cpgmodule/data/GA_Knight.pkl,sha256=dsKeIt7PxCcw_tcVObtVKB3KqtF7bPH-vQqbClWI5Tc,25322
+ cpgmodule/data/GA_Lee_CPC.pkl,sha256=WfeI1bGnIS5_AIX7qnspyGYUgzGOAJ9JJ9CGQjLgkFo,25373
+ cpgmodule/data/GA_Lee_RPC.pkl,sha256=rLhr-sboWqIZ6Y08zODh7bgwXBa4SNkE4ZZaIyQTaYc,25382
+ cpgmodule/data/GA_Lee_refined_RPC.pkl,sha256=bXiZIgFSoX7Sr0ai1zHzGXv-EMGBglvx8z5vr3IpccA,25348
+ cpgmodule/data/GA_Mayne.pkl,sha256=CUfwiVIywJZwTb0PhRuW-6TQ5H1kO7tR6KHepyzBULg,1899
+ cpgmodule/data/Hannum.pkl,sha256=gXJxy6S8lBCUdr_T7ZM3R6hqKI22yev5-2giCPCCdLc,2049
+ cpgmodule/data/Horvath_2013.pkl,sha256=qwP614VYmmaESKanQ680sYgaHSWJrP73im_f5duzKgU,8313
+ cpgmodule/data/Horvath_2018.pkl,sha256=IQcFNIg-z-Mrq8vEoHI509c64Dt1wtyHInU9YwsYP-w,9225
+ cpgmodule/data/Levine.pkl,sha256=OhasOuEAvbDblZfGGFmu82kDqdo5CQ2DjZ8hfCofhvI,11751
+ cpgmodule/data/Lu_DNAmTL.pkl,sha256=SdjT9x8A0GbnKoMlDhgslmmINJ8e5Asyk9KQlw_EtJo,3548
+ cpgmodule/data/Ped_McEwen.pkl,sha256=-hxOAqFUxORmPLV_FKYZxjOFtabxPzbbzfY6JTitq8k,2654
+ cpgmodule/data/Ped_Wu.pkl,sha256=gjXtpdpQ8QVJs1I6UM-VPAvQjNd-JDoy5DzDL-gP-Cg,2924
+ cpgmodule/data/Zhang_BLUP.pkl,sha256=dpH0L3qTxMAJ50XtTeNMmqx1khZkZyrH204E37fE37E,7038226
+ cpgmodule/data/Zhang_EN.pkl,sha256=RFEOmBQGXZvIwQ7tOYT8itOPU87V2Y02vJA2PfY6TnI,11844
+ cpgmodule/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ cpgtools-2.0.3.data/scripts/CpG_aggregation.py,sha256=iSSUanCPlTkT3f6Df_f_b6zEP2Ixzse9zFwJLL5z2Qw,7278
+ cpgtools-2.0.3.data/scripts/CpG_anno_position.py,sha256=j8l4v7SpGFaJ3pBVE5qK8coMkxe5h4p1bwmhcPnhxMk,4643
+ cpgtools-2.0.3.data/scripts/CpG_anno_probe.py,sha256=bACac7x9zX2E1QJnUUmNKS6YNKg_f4K-jxwl_v93--4,3171
+ cpgtools-2.0.3.data/scripts/CpG_density_gene_centered.py,sha256=JM2PrvHCnGAvymBbbIBr30AfXwu69WgJkJNy6PCmCrg,5266
+ cpgtools-2.0.3.data/scripts/CpG_distrb_chrom.py,sha256=q3xZ1AH4JEif1DQRG6UXiiUb4jZyt2khzTZld1jaXnA,6207
+ cpgtools-2.0.3.data/scripts/CpG_distrb_gene_centered.py,sha256=tLfalDRzklBcNfZxhABjhprcBlk4HgUyxXxznOGVTN8,7736
+ cpgtools-2.0.3.data/scripts/CpG_distrb_region.py,sha256=nJGgZLhZe2kYCPnxuW--EemxOcoReu2hdkIAgl-7UAE,5391
+ cpgtools-2.0.3.data/scripts/CpG_logo.py,sha256=U8RxYPmakKii1xmEgG22tKfdZBPpjhZbAfKxgF_Z-O8,4597
+ cpgtools-2.0.3.data/scripts/CpG_to_gene.py,sha256=fciNtI5N6fO-jZX2eyKgiAhDIsus4SljYSCbEQMeTHI,6114
+ cpgtools-2.0.3.data/scripts/beta_PCA.py,sha256=cRlqGC5CQlpsmzhAwy2AoI4_lLwjFh_rvn7ObNbHgYg,7155
+ cpgtools-2.0.3.data/scripts/beta_UMAP.py,sha256=SJfxtCsM1NTK5rYnZjtXiDj_x1kJiYZvR1NGHKo0IRI,8079
+ cpgtools-2.0.3.data/scripts/beta_jitter_plot.py,sha256=Knja1n1rpD0qe3FHkNPax0p6BYsmAtEQBFB1wBdpHyY,4389
+ cpgtools-2.0.3.data/scripts/beta_m_conversion.py,sha256=992tJI0IzkphUaEr_9_CveSRPjKdnyYkCTO10D_xkbg,2848
+ cpgtools-2.0.3.data/scripts/beta_profile_gene_centered.py,sha256=Jb9mm8y-1cKH_EepRFhUH5mfBdoAt0zpN_VXETRnQek,7312
+ cpgtools-2.0.3.data/scripts/beta_profile_region.py,sha256=GHDPyUFR9XJm2CK0_9wCKossMTWWvn4VwYCCSA8kn2E,5971
+ cpgtools-2.0.3.data/scripts/beta_selectNBest.py,sha256=Uu_MvQUm6Zc8MGKuGUEooJ-IL7C1eg_LNRqQsNdLaWs,4638
+ cpgtools-2.0.3.data/scripts/beta_stacked_barplot.py,sha256=2fcypD_BE4XqK-vl8hHsgyWqvp3I9oLvg8tF2dd5QZ0,3415
+ cpgtools-2.0.3.data/scripts/beta_stats.py,sha256=WBh3Aquk_AHeUcteLPtt2Q2yKpqu5aBA38zaFz_7mVI,3021
+ cpgtools-2.0.3.data/scripts/beta_tSNE.py,sha256=0UTxNxyaDb-iNqbeIfFKj9DRURTQcHeJdltr-iY72XM,8450
+ cpgtools-2.0.3.data/scripts/beta_topN.py,sha256=gmbmJZoGJt3QljlRHUA8LaKSIurdCNgtn2J9LAIQNCo,3914
+ cpgtools-2.0.3.data/scripts/beta_trichotmize.py,sha256=T594UfSCDJHRFyGvov7qetLQs10WU73PGVVCDNrwNPg,7143
+ cpgtools-2.0.3.data/scripts/dmc_Bayes.py,sha256=imqKv5x8s_plXQPL3_d9OpqSI-imrSfUj-k39Zf1XqA,13735
+ cpgtools-2.0.3.data/scripts/dmc_bb.py,sha256=pOEEHT-hT--r6IswZwQFyGri77OJqbW1y9VVG8pzzpg,8407
+ cpgtools-2.0.3.data/scripts/dmc_fisher.py,sha256=t18smNJAtipmDm3xgUg62ccKsFQWcdKBo4Tm2cxx43s,5163
+ cpgtools-2.0.3.data/scripts/dmc_glm.py,sha256=_vJiZhbGA-Rv5v5oBU9N9lGND8MTEVAFHHFqW5hgfSw,6516
+ cpgtools-2.0.3.data/scripts/dmc_logit.py,sha256=V35wyBgcdiUbsR_NDnEDfs6F2yPyz11ryZvYkkV6Ess,8488
+ cpgtools-2.0.3.data/scripts/dmc_nonparametric.py,sha256=eFyUy7jXl8bPyySKZLEi3LVxYkDOKp9S4XBXevYvbVw,4945
+ cpgtools-2.0.3.data/scripts/dmc_ttest.py,sha256=DDmWi1udo1gqOAQvFTeRxhIJuAgEmakSNU7P4LKTT7U,6786
+ cpgtools-2.0.3.data/scripts/predict_missing.py,sha256=pNSfOD9i7LNLG9BGa80W2-bsCi6qcUlIwoiLnBLbPlo,32465
+ cpgtools-2.0.3.data/scripts/predict_sex.py,sha256=ojoMJ6XwBsE1kGpxzKGZ4TZPs5JcUwI_C_9ieEQbB9c,4755
+ impyute/__init__.py,sha256=Q07nw2fDjLm_c3EQWMpSo8nZCfC-fvs2lpRG5uha-gg,87
+ impyute/contrib/__init__.py,sha256=DgGgN0iBoMfHjzr9edz-ZgtJ2KeyIDlyV1t8hxhcM2c,209
+ impyute/contrib/compare.py,sha256=psWygE7PD4Cky4MoTJLAgjaEkTXwqemv3NTKkKVaapo,2852
+ impyute/contrib/count_missing.py,sha256=KkCpYeuDP5jRxCcURJbne_k-7Zq1k31oq2aFHpqygx8,742
+ impyute/contrib/describe.py,sha256=LI6Xage19-DEEDZVNqaaOjemRAVqVkSlH4mxjjubw-A,2033
+ impyute/cs/__init__.py,sha256=ArkuAVHm0z4jl3_UK_FUBF8i2vZUvAZpfVFyryUTm18,373
+ impyute/cs/buck_iterative.py,sha256=ceoQ2ij64pXHy7NszErTTRb9h2wctF0CMBexhc6YZSA,3009
+ impyute/cs/central_tendency.py,sha256=xHwTVeDL-N-O5refvP536FiVEDODR7s-cgX2Ao9LsfI,2127
+ impyute/cs/em.py,sha256=SwvVNMYK3llATR2r1baVCA0hy_fwJIkGD110du9Woos,1631
+ impyute/cs/fast_knn.py,sha256=CI4RKMcrbXPi7izUtWo-lnq1BuLYwl8bvbx62dj8Qn0,6246
+ impyute/cs/random.py,sha256=-Ti4BwMwUGdsciKTp8OVAgYK6TzGP-R4Nw62UY_xqAs,594
+ impyute/dataset/__init__.py,sha256=GK-ktaf86PLZOUlO_oOhvc-55SX1YFLcpjcpgf7a87c,192
+ impyute/dataset/base.py,sha256=bNYszhYP62fSeDe8w_Ij4yhEf-dv6NiNGcCdC22OkuY,4445
+ impyute/dataset/corrupt.py,sha256=uGCnpofJWfWZYBucQMrwT9yOSpEQAaZSG010Jv6ks1I,1491
+ impyute/deletion/__init__.py,sha256=ztemVB8Q6LRaxSv8DDdFzZlGUVUQbsVRA7gvcs_vMhM,124
+ impyute/deletion/complete_case.py,sha256=2xBhqNOSmEeW-sBQvl3TVJCcf2Ybmkr8HXNk9xrQ2J0,395
+ impyute/ops/__init__.py,sha256=ZP_rQQ0qATGEibUA9I8D1JfD1kRT2Xqw8SsOMPIddog,275
+ impyute/ops/error.py,sha256=lyXJcIDyfdrj_JJbsEI18z-t4UKi1mpLTyt3iS2YvTs,226
+ impyute/ops/inverse_distance_weighting.py,sha256=88hmyc-dWltP66MbjrW-VHBcVgPOI0V0BkrV30B-Lyc,1017
+ impyute/ops/matrix.py,sha256=-eniwqvbvwXNrYw8PmIMD95uMcPMq6HE6OBOyjzR-Vk,837
+ impyute/ops/testing.py,sha256=3ZRVL1sc5IIQLPW2GrD6-lF4_nVnSLo7SBW4rfaOMLA,415
+ impyute/ops/util.py,sha256=167Xpwib86kT09-pZnJg-VhyVD9r-z7Gkd_L42A2-EY,2838
+ impyute/ops/wrapper.py,sha256=bQj3r9WsZxqNw51y3OobyyXPPxCeISUv2jb-Ga2C_-M,6022
+ impyute/ts/__init__.py,sha256=veePCYSBfo1t5Ahh3wrVCNveizcniXZYhjyS7ahkIhI,145
+ impyute/ts/locf.py,sha256=qusLWfBrepKzFIkeGxIS0S5KnqRgZde2CV6mcF5nf4c,1876
+ impyute/ts/moving_window.py,sha256=G3Z4ohhqZXAVPflptuOhktwucbW3tg8FySb7gTtBraE,4991
+ missingpy/__init__.py,sha256=H_yX0CtfoG2UaJPRcoZKv0AFhi4n-1hHmfIwfqQXoPk,96
+ missingpy/knnimpute.py,sha256=WxniKg6-3pvwJa9mipiCZBqkRL_GGHUcxDu-7b57x3k,13456
+ missingpy/missforest.py,sha256=UYRf2YANPwxvRPruN3P6xlhJ5JPZB1x8o4xVhYfIp74,24696
+ missingpy/pairwise_external.py,sha256=XU_vW-rMCPD3aH9k0UrP_p90QUHDZqe3QZPoOQmu2vo,13221
+ missingpy/utils.py,sha256=sMU4OGHPvRgReT8e_HqGCYvjAJZkE_qlAPDzLo-_M6U,4360
+ missingpy/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ missingpy/tests/test_knnimpute.py,sha256=rR1OPZ4-IzcIKgWmSEN7UEPvVSRwJe0KU_wTxPr46k0,17527
+ missingpy/tests/test_missforest.py,sha256=5YfKZf_xdy9RcXmnlFz7cJDqdnspJDzzrxLnVVWJi3A,13725
+ cpgtools-2.0.3.dist-info/LICENSE,sha256=NAIE1kmjlmRNJ1BwR9m9i0jXHmQqqujTnEyIBuIxvwM,1074
+ cpgtools-2.0.3.dist-info/METADATA,sha256=N1FqwvSLFmK2DvVyYY4oHcqbo6Vj1B4ii48vESlNPkU,2943
+ cpgtools-2.0.3.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+ cpgtools-2.0.3.dist-info/top_level.txt,sha256=L6IX1ORvZ1JssvdU8qOtU99-vbMK9ZSIgYg4zH-rL5Y,28
+ cpgtools-2.0.3.dist-info/RECORD,,
{cpgtools-2.0.0.dist-info → cpgtools-2.0.3.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.38.4)
+ Generator: setuptools (75.5.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
 
cpgtools-2.0.3.dist-info/top_level.txt ADDED
@@ -0,0 +1,3 @@
+ cpgmodule
+ impyute
+ missingpy
impyute/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """ Imputations for cross-sectional and time-series data. """
+
+ __all__ = ["cs", "ts"]
impyute/contrib/__init__.py ADDED
@@ -0,0 +1,7 @@
+ """ Volatile code. Expect stuff in this to change. """
+
+ from .describe import describe
+ from .count_missing import count_missing
+ from .compare import compare
+
+ __all__ = ["describe", "count_missing", "compare"]
impyute/contrib/compare.py ADDED
@@ -0,0 +1,69 @@
+ """impyute.contrib.compare.py"""
+ import importlib
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import accuracy_score
+ # pylint: disable=too-many-locals, dangerous-default-value
+
+ def compare(imputed, classifiers=["sklearn.svm.SVC"], log_path=None):
+     """
+     Given an imputed dataset with labels and a list of supervised machine
+     learning model, find accuracy score of all model/imputation pairs.
+
+     Parameters
+     ----------
+     imputed: [(str, np.ndarray), (str, np.ndarray)...]
+         List of tuples containing (imputation_name, imputed_data) where
+         `imputation_name` is a string and `imputed_data` is a tuple where
+         `imputed_data`[0] is the data, X and `imputed_data`[1] is the label, y
+     classifiers: [str, str...str] (optional)
+         Provide a list of classifiers to run imputed data sets on. Right now,
+         it ONLY works with sklearn, the format should be like so:
+         `sklearn.SUBMODULE.FUNCTION`. More generally its
+         'MODULE.SUBMODULE.FUNCTION'. If providing a custom classifier, make
+         sure to add the file location to sys.path first and the classifier
+         should also be structured like sklearn (with a `fit` and `predict`
+         method).
+     log_path: str (optional)
+         To write results to a file, provide a relative path
+
+     Returns
+     -------
+     results.txt
+         Classification results on imputed data
+
+     """
+     clfs = []
+     for clf_name in classifiers:
+         mod_name, smod_name, fn_name = clf_name.split(".")
+         try:
+             mod = importlib.import_module("{}.{}".format(mod_name, smod_name))
+             fn = getattr(mod, fn_name)
+             clfs.append([fn_name, fn])
+         except ModuleNotFoundError:
+             print("Cannot import '{}' from '{}.{}'".format(fn_name,
+                                                            mod_name,
+                                                            smod_name))
+
+     results = {imputation_name: [] for imputation_name, _ in imputed}
+
+     for imputation_name, data in imputed:
+         X, y = data
+         X_train, X_test, y_train, y_test = train_test_split(X, y,
+                                                             test_size=0.33,
+                                                             random_state=0)
+         print("Imputation {} =========".format(imputation_name))
+         for clf_name, clf in clfs:
+             clf = clf()
+             clf.fit(X_train, y_train)
+             y_pred = clf.predict(X_test)
+             accuracy = accuracy_score(y_test, y_pred)
+             results[imputation_name].append((clf_name, accuracy))
+             print("...{}".format(clf_name))
+
+     # If not None, write to path
+     if log_path:
+         with open(log_path, 'w') as f:
+             f.write(str(results))
+         print("Results saved to {}".format(log_path))
+
+     return results
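The docstring fixes the shape of `imputed` as (name, (X, y)) pairs and the classifier strings as 'MODULE.SUBMODULE.FUNCTION'. A hedged usage sketch under that reading; the synthetic dataset, the hand-rolled column-mean imputation, and the particular classifiers are illustrative and not part of impyute:

```python
import numpy as np
from sklearn.datasets import load_iris
from impyute.contrib import compare  # exported by the bundled impyute.contrib.__init__

# Small labelled dataset with roughly 10% of the cells knocked out at random.
X, y = load_iris(return_X_y=True)
rng = np.random.default_rng(0)
X_missing = X.copy()
X_missing[rng.random(X.shape) < 0.1] = np.nan

# Plain column-mean imputation, done by hand for illustration.
col_means = np.nanmean(X_missing, axis=0)
X_mean = np.where(np.isnan(X_missing), col_means, X_missing)

# `imputed` follows the (name, (X, y)) shape documented above; the classifier
# strings follow the 'MODULE.SUBMODULE.FUNCTION' convention.
results = compare(
    [("column_mean", (X_mean, y))],
    classifiers=["sklearn.svm.SVC", "sklearn.tree.DecisionTreeClassifier"],
)
print(results)  # e.g. {'column_mean': [('SVC', ...), ('DecisionTreeClassifier', ...)]}
```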
impyute/contrib/count_missing.py ADDED
@@ -0,0 +1,30 @@
+ """ impyute.contrib.count_missing.py """
+ import numpy as np
+ from impyute.ops import matrix
+
+ def count_missing(data):
+     """ Calculate the total percentage of missing values and also the
+     percentage in each column.
+
+     Parameters
+     ----------
+     data: np.array
+         Data to impute.
+
+     Returns
+     -------
+     dict
+         Percentage of missing values in total and in each column.
+
+     """
+     size = len(data.flatten())
+     nan_xy = matrix.nan_indices(data)
+     np.unique(nan_xy)
+     counter = {y: 0. for y in np.unique(nan_xy.T[1])}
+     change_in_percentage = 1./size
+     for _, y in nan_xy:
+         counter[y] += change_in_percentage
+     total_missing = len(nan_xy)/size
+     counter["total"] = total_missing
+
+     return counter
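A small usage sketch of count_missing (illustrative, not from the package), assuming the bundled matrix.nan_indices returns the (row, column) coordinates of the NaN cells, which is how the function above indexes its result:

```python
import numpy as np
from impyute.contrib import count_missing

data = np.array([
    [1.0, np.nan, 3.0],
    [4.0, 5.0, np.nan],
    [7.0, 8.0, np.nan],
])

fractions = count_missing(data)
# Per the code above, keys are column indices plus "total", and values are
# fractions of ALL cells: column 1 -> 1/9, column 2 -> 2/9, "total" -> 3/9.
print(fractions)
```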
impyute/contrib/describe.py ADDED
@@ -0,0 +1,63 @@
+ """ impyute.contrib.describe """
+ from impyute.ops import matrix
+
+ def describe(data): # verbose=True):
+     """ Print input/output multiple times
+
+     Eventually will be used instead of matrix.nan_indices everywhere
+
+     Parameters
+     ----------
+     data: numpy.nd.array
+         The data you want to get a description from
+     verbose: boolean(optional)
+         Decides whether the description is short or long form
+
+     Returns
+     -------
+     dict
+         missingness: list
+             Confidence interval of data being MCAR, MAR or MNAR - in that order
+         nan_xy: list of tuples
+             Indices of all null points
+         nan_n: list
+             Total number of null values for each column
+         pmissing_n: float
+             Percentage of missing values in dataset
+         nan_rows: list
+             Indices of all rows that are completely null
+         nan_cols: list
+             Indices of all columns that are completely null
+         mean_rows: list
+             Mean value of each row
+         mean_cols: list
+             Mean value of each column
+         std_dev: list
+             std dev for each row/column
+         min_max: list
+             Finds the minimum and maximum for each row
+
+     """
+     # missingness = [0.33, 0.33, 0.33] # find_missingness(data)
+     nan_xy = matrix.nan_indices(data)
+     nan_n = len(nan_xy)
+     pmissing_n = float(nan_n/len(data.flatten()))
+     # pmissing_rows = ""
+     # pmissing_cols = ""
+     # nan_rows = ""
+     # nan_cols = ""
+     # mean_rows = ""
+     # mean_cols = ""
+     # std_dev = ""
+     # "missingness": missingness,
+     description = {"nan_xy": nan_xy,
+                    "nan_n": nan_n,
+                    "pmissing_n": pmissing_n}
+     # "pmissing_rows": pmissing_rows,
+     # "pmissing_cols": pmissing_cols,
+     # "nan_rows": nan_rows,
+     # "nan_cols": nan_cols,
+     # "mean_rows": mean_rows,
+     # "mean_cols": mean_cols,
+     # "std_dev": std_dev}
+     return description
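With most fields still commented out, describe currently returns only three entries. A usage sketch (not from the package), again assuming matrix.nan_indices yields the NaN coordinates:

```python
import numpy as np
from impyute.contrib import describe

data = np.array([
    [1.0, np.nan],
    [3.0, 4.0],
])

summary = describe(data)
# Expected keys per the code above:
#   "nan_xy"     -> coordinates of the single NaN, e.g. [[0, 1]]
#   "nan_n"      -> 1
#   "pmissing_n" -> 0.25  (one of four cells is missing)
print(summary)
```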
impyute/cs/__init__.py ADDED
@@ -0,0 +1,11 @@
+ """ Imputations for cross-sectional data. """
+
+ from .random import random_impute
+ from .central_tendency import mean
+ from .central_tendency import mode
+ from .central_tendency import median
+ from .buck_iterative import buck_iterative
+ from .em import em
+ from .fast_knn import fast_knn
+
+ __all__ = ["random_impute", "mean", "mode", "median", "buck_iterative", "em", "fast_knn"]
impyute/cs/buck_iterative.py ADDED
@@ -0,0 +1,82 @@
+ import numpy as np
+ from sklearn.linear_model import LinearRegression
+ from impyute.ops import matrix
+ from impyute.ops import wrapper
+ # pylint: disable=too-many-locals
+
+ @wrapper.wrappers
+ @wrapper.checks
+ def buck_iterative(data):
+     """ Iterative variant of buck's method
+
+     - Variable to regress on is chosen at random.
+     - EM type infinite regression loop stops after change in prediction from
+       previous prediction < 10% for all columns with missing values
+
+     A Method of Estimation of Missing Values in Multivariate Data Suitable for
+     use with an Electronic Computer S. F. Buck Journal of the Royal Statistical
+     Society. Series B (Methodological) Vol. 22, No. 2 (1960), pp. 302-306
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+         Data to impute.
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     """
+     nan_xy = matrix.nan_indices(data)
+
+     # Add a column of zeros to the index values
+     nan_xyz = np.append(nan_xy, np.zeros((np.shape(nan_xy)[0], 1)), axis=1)
+
+     nan_xyz = [[int(x), int(y), v] for x, y, v in nan_xyz]
+     temp = []
+     cols_missing = {y for _, y, _ in nan_xyz}
+
+     # Step 1: Simple Imputation, these are just placeholders
+     for x_i, y_i, value in nan_xyz:
+         # Column containing nan value without the nan value
+         col = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
+
+         new_value = np.mean(col)
+         data[x_i][y_i] = new_value
+         temp.append([x_i, y_i, new_value])
+     nan_xyz = temp
+
+     # Step 5: Repeat step 2 - 4 until convergence (the 100 is arbitrary)
+
+     converged = [False] * len(nan_xyz)
+     while not all(converged):
+         # Step 2: Placeholders are set back to missing for one variable/column
+         dependent_col = int(np.random.choice(list(cols_missing)))
+         missing_xs = [int(x) for x, y, value in nan_xyz if y == dependent_col]
+
+         # Step 3: Perform linear regression using the other variables
+         x_train, y_train = [], []
+         for x_i in (x_i for x_i in range(len(data)) if x_i not in missing_xs):
+             x_train.append(np.delete(data[x_i], dependent_col))
+             y_train.append(data[x_i][dependent_col])
+         model = LinearRegression()
+         model.fit(x_train, y_train)
+
+         # Step 4: Missing values for the missing variable/column are replaced
+         # with predictions from our new linear regression model
+         # For null indices with the dependent column that was randomly chosen
+         for i, z in enumerate(nan_xyz):
+             x_i = z[0]
+             y_i = z[1]
+             value = data[x_i, y_i]
+             if y_i == dependent_col:
+                 # Row 'x' without the nan value
+                 new_value = model.predict([np.delete(data[x_i], dependent_col)])
+                 data[x_i][y_i] = new_value.reshape(1, -1)
+                 if value == 0.0:
+                     delta = (new_value-value)/0.01
+                 else:
+                     delta = (new_value-value)/value
+                 converged[i] = abs(delta) < 0.1
+     return data
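Per the docstring, buck_iterative seeds each missing cell with its column mean and then re-estimates it by regressing the randomly chosen column on the others until every prediction changes by less than 10%. A minimal usage sketch (illustrative, not from the package), assuming the wrapper decorators above only validate and pass the array through; a float dtype matters because the imputed values are written back in place:

```python
import numpy as np
from impyute.cs import buck_iterative  # exported by the bundled impyute.cs.__init__

rng = np.random.default_rng(42)

# Correlated toy data: column 1 is roughly 2x column 0, column 2 is independent noise.
n = 200
x0 = rng.normal(size=n)
data = np.column_stack([
    x0,
    2 * x0 + rng.normal(scale=0.1, size=n),
    rng.normal(size=n),
]).astype(float)

# Knock out about 5% of the cells.
data[rng.random(data.shape) < 0.05] = np.nan

imputed = buck_iterative(data)  # same shape, NaNs replaced by regression estimates
assert not np.isnan(imputed).any()
```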