cpgtools 2.0.5 (cpgtools-2.0.5-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. cpgmodule/BED.py +441 -0
  2. cpgmodule/MI.py +193 -0
  3. cpgmodule/__init__.py +0 -0
  4. cpgmodule/_version.py +1 -0
  5. cpgmodule/cgID.py +866897 -0
  6. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  7. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  8. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  9. cpgmodule/data/GA_Bohlin.pkl +0 -0
  10. cpgmodule/data/GA_Haftorn.pkl +0 -0
  11. cpgmodule/data/GA_Knight.pkl +0 -0
  12. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  13. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  14. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  15. cpgmodule/data/GA_Mayne.pkl +0 -0
  16. cpgmodule/data/Hannum.pkl +0 -0
  17. cpgmodule/data/Horvath_2013.pkl +0 -0
  18. cpgmodule/data/Horvath_2018.pkl +0 -0
  19. cpgmodule/data/Levine.pkl +0 -0
  20. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  21. cpgmodule/data/Ped_McEwen.pkl +0 -0
  22. cpgmodule/data/Ped_Wu.pkl +0 -0
  23. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  24. cpgmodule/data/Zhang_EN.pkl +0 -0
  25. cpgmodule/data/__init__.py +0 -0
  26. cpgmodule/extend_bed.py +147 -0
  27. cpgmodule/imotif.py +348 -0
  28. cpgmodule/ireader.py +28 -0
  29. cpgmodule/methylClock.py +53 -0
  30. cpgmodule/padjust.py +58 -0
  31. cpgmodule/region2gene.py +170 -0
  32. cpgmodule/utils.py +642 -0
  33. cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
  34. cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
  35. cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
  36. cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
  37. cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
  38. cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
  39. cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
  40. cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
  41. cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
  42. cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
  43. cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
  44. cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
  45. cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
  46. cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
  47. cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
  48. cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
  49. cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
  50. cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
  51. cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
  52. cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
  53. cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
  54. cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
  55. cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
  56. cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
  57. cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
  58. cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
  59. cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
  60. cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
  61. cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
  62. cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
  63. cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
  64. cpgtools-2.0.5.dist-info/METADATA +59 -0
  65. cpgtools-2.0.5.dist-info/RECORD +104 -0
  66. cpgtools-2.0.5.dist-info/WHEEL +5 -0
  67. cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
  68. cpgtools-2.0.5.dist-info/top_level.txt +5 -0
  69. impyute/__init__.py +3 -0
  70. impyute/contrib/__init__.py +7 -0
  71. impyute/contrib/compare.py +69 -0
  72. impyute/contrib/count_missing.py +30 -0
  73. impyute/contrib/describe.py +63 -0
  74. impyute/cs/__init__.py +11 -0
  75. impyute/cs/buck_iterative.py +82 -0
  76. impyute/cs/central_tendency.py +84 -0
  77. impyute/cs/em.py +52 -0
  78. impyute/cs/fast_knn.py +130 -0
  79. impyute/cs/random.py +27 -0
  80. impyute/dataset/__init__.py +6 -0
  81. impyute/dataset/base.py +137 -0
  82. impyute/dataset/corrupt.py +55 -0
  83. impyute/deletion/__init__.py +5 -0
  84. impyute/deletion/complete_case.py +21 -0
  85. impyute/ops/__init__.py +12 -0
  86. impyute/ops/error.py +9 -0
  87. impyute/ops/inverse_distance_weighting.py +31 -0
  88. impyute/ops/matrix.py +47 -0
  89. impyute/ops/testing.py +20 -0
  90. impyute/ops/util.py +96 -0
  91. impyute/ops/wrapper.py +179 -0
  92. impyute/ts/__init__.py +6 -0
  93. impyute/ts/locf.py +57 -0
  94. impyute/ts/moving_window.py +128 -0
  95. impyutelib.py +890 -0
  96. missingpy/__init__.py +4 -0
  97. missingpy/knnimpute.py +328 -0
  98. missingpy/missforest.py +556 -0
  99. missingpy/pairwise_external.py +315 -0
  100. missingpy/tests/__init__.py +0 -0
  101. missingpy/tests/test_knnimpute.py +605 -0
  102. missingpy/tests/test_missforest.py +409 -0
  103. missingpy/utils.py +124 -0
  104. misspylib.py +565 -0
cpgtools-2.0.5.data/scripts/predict_sex.py ADDED
@@ -0,0 +1,126 @@
+ #!python
+
+ """
+ #==============================================================================
+ Predict sex based on the semi-methylation (also known as genomic imprinting)
+ ratio. This method leverages the fact that, due to X chromosome inactivation,
+ females have a higher proportion of semi-methylated CpGs on their X chromosomes.
+ A log2(ratio) greater than 0 indicates a female, while a log2(ratio) less than
+ 0 indicates a male.
+
+ Example of input data file
+ ---------------------------
+ CpG_ID Sample_01 Sample_02 Sample_03 Sample_04
+ cg_001 0.831035 0.878022 0.794427 0.880911
+ cg_002 0.249544 0.209949 0.234294 0.236680
+ cg_003 0.845065 0.843957 0.840184 0.824286
+
+ Example of output file
+ ----------------------
+ Sample_ID log2_SM_ratio Predicted_sex
+ Sample_01 -2.249628052954919 Male
+ Sample_02 -2.2671726671830674 Male
+ Sample_03 1.4530581933290616 Female
+ Sample_04 1.4808015115356654 Female
+
+ ...
+
+ """
+ import sys
+ import numpy as np
+ from optparse import OptionParser
+ from cpgmodule.utils import printlog
+ from cpgmodule import ireader
+ import pandas as pd
+ from cpgmodule._version import __version__
+
+ __author__ = "Liguo Wang"
+ __copyright__ = "Copyleft"
+ __credits__ = []
+ __license__ = "GPL"
+ __maintainer__ = "Liguo Wang"
+ __email__ = "wang.liguo@mayo.edu"
+ __status__ = "Development"
+
+
+ def main():
+
+     usage="%prog [options]" + "\n"
+     parser = OptionParser(usage,version="%prog " + __version__)
+     parser.add_option("-i","--input_file",action="store", type="string",dest="input_file", help="Tab-separated data frame file containing beta values with the 1st row containing sample IDs and the 1st column containing CpG IDs.")
+     parser.add_option("-x","--xprobe",action="store", type="string",dest="xprobe_file", help="File with CpG IDs mapped to the X chromosome, with one probe listed per row.")
+     parser.add_option("-c","--cut",action="store", type='float', dest="cutoff", default=0.0, help="The cutoff of log2(SM ratio) to determine the sex prediction. Log2(SM ratio) greater than this cutoff indicates a female, while a log2(ratio) less than this cutoff indicates a male. default=%default")
+     parser.add_option("-o","--output",action="store", type='string', dest="out_file", help="The prefix of the output file.")
+     (options,args)=parser.parse_args()
+
+     print ()
+     if not (options.input_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(101)
+     if not (options.xprobe_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(102)
+     if not (options.out_file):
+         print (__doc__)
+         parser.print_help()
+         sys.exit(103)
+
+     printlog("Reading X probes from: \"%s\"" % (options.xprobe_file))
+     x_cpgs = set()
+     for l in ireader.reader(options.xprobe_file):
+         l = l.strip()
+         if l.startswith('#'):
+             continue
+         x_cpgs.add(l)
+     printlog("Total %d X probes loaded." % len(x_cpgs))
+
+     printlog("Reading input file: \"%s\"" % (options.input_file))
+     df1 = pd.read_csv(options.input_file, index_col = 0, sep="\t")
+     #print (df1)
+
+     #remove any rows with NAs
+     df2 = df1.dropna(axis=0, how='any')
+     printlog("%d CpGs with missing values were removed." % (len(df1) - len(df2)))
+     #print (df2)
+
+     sample_cpg_ids = df2.index
+     sample_names = df2.columns
+     found_x_cpgs = list(x_cpgs & set(sample_cpg_ids))
+     printlog("Found %d CpGs located on the chrX from file: %s" % (len(found_x_cpgs), options.input_file))
+
+     # only X probes in df3
+     df3 = df2.loc[list(found_x_cpgs)]
+     #pd.DataFrame.to_csv(df3, options.out_file + '.tmp.tsv', sep="\t", index_label="sample")
+
+     low_beta_range = [0, 0.2]
+     mid_beta_range = [0.3, 0.7]
+     high_beta_range = [0.8, 1.0]
+
+     output = {}
+     for s in sample_names:
+         output[s] = {}
+         low_beta_count = pd.cut(df3[s], low_beta_range).count()
+         mid_beta_count = pd.cut(df3[s], mid_beta_range).count()
+         high_beta_count = pd.cut(df3[s], high_beta_range).count()
+         try:
+             ratio = np.log2(mid_beta_count/(low_beta_count + high_beta_count))
+         except:
+             ratio = np.nan
+         output[s]['log2_SM_ratio'] = ratio
+
+         if ratio > options.cutoff:
+             output[s]['Predicted_sex'] = 'Female'
+         elif ratio < options.cutoff:
+             output[s]['Predicted_sex'] = 'Male'
+         else:
+             output[s]['Predicted_sex'] = 'Unknown'
+     df_out = pd.DataFrame(output).T
+
+     outfile = options.out_file + '.predicted_sex.tsv'
+     printlog("Writing to file: \"%s\"" % outfile)
+     pd.DataFrame.to_csv(df_out, outfile, sep="\t", index_label="Sample_ID")
+
+ if __name__=='__main__':
+     main()
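For readers skimming the diff, the heart of predict_sex.py is the log2 semi-methylation (SM) ratio: for each sample it counts chrX probes whose beta value falls into the mid-methylation band (0.3-0.7) versus the low (0-0.2) and high (0.8-1.0) bands, then takes log2(mid / (low + high)). The sketch below reproduces that calculation on a toy beta matrix. It is illustrative only: the helper name `log2_sm_ratio`, the sample data, and the use of `Series.between` (the released script bins with `pd.cut`, so boundary handling differs slightly) are not part of the package.

```python
import numpy as np
import pandas as pd

# Toy beta values for chrX probes only (rows = probes, columns = samples); made up.
betas = pd.DataFrame(
    {"Sample_A": [0.05, 0.45, 0.52, 0.91, 0.48],
     "Sample_B": [0.04, 0.08, 0.12, 0.95, 0.90]},
    index=["cg_%03d" % i for i in range(5)])

def log2_sm_ratio(beta_col, low=(0.0, 0.2), mid=(0.3, 0.7), high=(0.8, 1.0)):
    """log2(#semi-methylated / (#unmethylated + #methylated)) for one sample."""
    n_low = beta_col.between(*low).sum()
    n_mid = beta_col.between(*mid).sum()
    n_high = beta_col.between(*high).sum()
    with np.errstate(divide="ignore"):  # a sample with no mid-band probes gives -inf
        return np.log2(n_mid / (n_low + n_high))

for sample in betas.columns:
    ratio = log2_sm_ratio(betas[sample])
    print(sample, ratio, "Female" if ratio > 0 else "Male")
```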
cpgtools-2.0.5.dist-info/METADATA ADDED
@@ -0,0 +1,59 @@
+ Metadata-Version: 2.4
+ Name: cpgtools
+ Version: 2.0.5
+ Summary: Tools to analyze and visualize DNA methylation data
+ Author-email: Liguo Wang <wangliguo78@gmail.com>
+ Maintainer-email: Liguo Wang <wangliguo78@gmail.com>
+ License-Expression: MIT
+ Project-URL: Documentation, https://cpgtools.readthedocs.io/en/latest/index.html
+ Project-URL: Repository, https://github.com/liguowang/cpgtools.git
+ Keywords: DNA methylation,EPIC,450K,850K,935K,RRBS,WGBS
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Environment :: Console
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Operating System :: MacOS :: MacOS X
+ Classifier: Operating System :: POSIX
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+ Requires-Python: >=3.5
+ Description-Content-Type: text/markdown
+ License-File: LICENSE.txt
+ Requires-Dist: numpy
+ Requires-Dist: scipy
+ Requires-Dist: scikit-learn
+ Requires-Dist: weblogo
+ Requires-Dist: bx-python
+ Requires-Dist: pandas
+ Requires-Dist: umap-learn
+ Requires-Dist: fancyimpute
+ Requires-Dist: pycombat
+ Requires-Dist: matplotlib
+ Dynamic: license-file
+
+ ## Install CpGtools using [pip](https://pip.pypa.io/en/stable/)
+
+ 1. (Optional) Create Virtual Environments (Note: `venv` is available in Python 3.3 and later. You can also use [virtualenv](https://packaging.python.org/en/latest/key_projects/#virtualenv))
+
+ `$ python3 -m venv my_env` (will create a directory called my_env)
+
+ `$ source my_env/bin/activate`
+
+ 2. Install CpGtools
+
+ `$ pip install cpgtools`
+
+ or
+
+ `$ pip install git+https://github.com/liguowang/cpgtools.git`
+
+ 3. Upgrade
+
+ `$ pip install cpgtools --upgrade`
+
+ 4. Uninstall
+
+ `pip -y uninstall cpgtools`
+
+ ## Documentation
+
+ https://cpgtools.readthedocs.io/en/latest/
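A quick post-install sanity check is to import the version string the bundled scripts themselves use (predict_sex.py above does `from cpgmodule._version import __version__`); note that the importable top-level package is `cpgmodule`, not `cpgtools`. Assuming the install step above succeeded:

```python
# Minimal check that the wheel is importable; mirrors the import used by the bundled scripts.
from cpgmodule._version import __version__

print(__version__)  # expected to print 2.0.5 for this release
```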
cpgtools-2.0.5.dist-info/RECORD ADDED
@@ -0,0 +1,104 @@
+ impyutelib.py,sha256=cEHb_sKSapIOKubN2estkV5NcAPFENeYwzgeykT24a0,30719
+ misspylib.py,sha256=3r-UN1bgKz123KMDrya1nPSEYzrI1Ef0MvyAe4WoqPU,24864
+ cpgmodule/BED.py,sha256=ZNAhGZPFMFD4UpVwGTDOM0hMLnfXF52zerdMb21sH6s,12675
+ cpgmodule/MI.py,sha256=pkUWzKXYeyRuGsGKZdFVxHzFWikkuJavdQ_4fsnN3U4,8649
+ cpgmodule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ cpgmodule/_version.py,sha256=xEb7Z4b8xalXXExBg42XPAhbJKniHzcsEPjp-6S3ppg,22
+ cpgmodule/cgID.py,sha256=DLG5Djm6JP0d6MiFvP_keUXv_rYxESWrISuVClTs8kw,16017914
+ cpgmodule/extend_bed.py,sha256=sjRxbRJFRumjYHwd6_AMTAYBG29yXQPYp1lDjTv2Q_Q,4648
+ cpgmodule/imotif.py,sha256=j7_MQFnowGsVTGCRQHHLUnEzSD0HehZPEHKKZ68Jv24,9227
+ cpgmodule/ireader.py,sha256=NRTdFnQvd1AEkYchfgvTVTixBVwUjn8xNyswc0OWU1Q,720
+ cpgmodule/methylClock.py,sha256=HyMk3vpVwR3yrkLCPv9fVJs6JCeSchbJEBe7i69jdSs,1202
+ cpgmodule/padjust.py,sha256=mvbQ9_crn_S39yvWrv_JPoU5OxZGHaw7cV3deBxTPdk,2389
+ cpgmodule/region2gene.py,sha256=iuSYa2-ki-qbL1TqqAlTXGT7g-j1uNIA1y7hzMwVM2U,5347
+ cpgmodule/utils.py,sha256=NgKT3aJlRT5pQKKO3_e0WB2_u93pY13F-k-r0pvgKno,16095
+ cpgmodule/data/AltumAge_cpg.pkl,sha256=T0pfBWrzQO00-z85QNE2CP0ntU0mtF2WkZeEPiArqTw,264691
+ cpgmodule/data/AltumAge_multi_platform_cpgs.pkl,sha256=Bo7ZG6AuxXUmLH6dCFfrlYkiTkeLjyqRgcD7yw4mNZ0,264816
+ cpgmodule/data/AltumAge_scaler.pkl,sha256=aDMcC4l0wZJGDi6_MeSpdNoykndfgsUtVKEd6mKf9T0,325509
+ cpgmodule/data/GA_Bohlin.pkl,sha256=W72ra6APUZDNPmVJ-kRuaVzpiKGgI_K7nDqSFx3nf3w,2667
+ cpgmodule/data/GA_Haftorn.pkl,sha256=ZFCJ2h0DWEaa-pHKdsSuWbQGL1JMwDXBtDyhx9DlZLc,4622
+ cpgmodule/data/GA_Knight.pkl,sha256=dsKeIt7PxCcw_tcVObtVKB3KqtF7bPH-vQqbClWI5Tc,25322
+ cpgmodule/data/GA_Lee_CPC.pkl,sha256=WfeI1bGnIS5_AIX7qnspyGYUgzGOAJ9JJ9CGQjLgkFo,25373
+ cpgmodule/data/GA_Lee_RPC.pkl,sha256=rLhr-sboWqIZ6Y08zODh7bgwXBa4SNkE4ZZaIyQTaYc,25382
+ cpgmodule/data/GA_Lee_refined_RPC.pkl,sha256=bXiZIgFSoX7Sr0ai1zHzGXv-EMGBglvx8z5vr3IpccA,25348
+ cpgmodule/data/GA_Mayne.pkl,sha256=CUfwiVIywJZwTb0PhRuW-6TQ5H1kO7tR6KHepyzBULg,1899
+ cpgmodule/data/Hannum.pkl,sha256=gXJxy6S8lBCUdr_T7ZM3R6hqKI22yev5-2giCPCCdLc,2049
+ cpgmodule/data/Horvath_2013.pkl,sha256=qwP614VYmmaESKanQ680sYgaHSWJrP73im_f5duzKgU,8313
+ cpgmodule/data/Horvath_2018.pkl,sha256=IQcFNIg-z-Mrq8vEoHI509c64Dt1wtyHInU9YwsYP-w,9225
+ cpgmodule/data/Levine.pkl,sha256=OhasOuEAvbDblZfGGFmu82kDqdo5CQ2DjZ8hfCofhvI,11751
+ cpgmodule/data/Lu_DNAmTL.pkl,sha256=SdjT9x8A0GbnKoMlDhgslmmINJ8e5Asyk9KQlw_EtJo,3548
+ cpgmodule/data/Ped_McEwen.pkl,sha256=-hxOAqFUxORmPLV_FKYZxjOFtabxPzbbzfY6JTitq8k,2654
+ cpgmodule/data/Ped_Wu.pkl,sha256=gjXtpdpQ8QVJs1I6UM-VPAvQjNd-JDoy5DzDL-gP-Cg,2924
+ cpgmodule/data/Zhang_BLUP.pkl,sha256=dpH0L3qTxMAJ50XtTeNMmqx1khZkZyrH204E37fE37E,7038226
+ cpgmodule/data/Zhang_EN.pkl,sha256=RFEOmBQGXZvIwQ7tOYT8itOPU87V2Y02vJA2PfY6TnI,11844
+ cpgmodule/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ cpgtools-2.0.5.data/scripts/CpG_aggregation.py,sha256=iSSUanCPlTkT3f6Df_f_b6zEP2Ixzse9zFwJLL5z2Qw,7278
+ cpgtools-2.0.5.data/scripts/CpG_anno_position.py,sha256=j8l4v7SpGFaJ3pBVE5qK8coMkxe5h4p1bwmhcPnhxMk,4643
+ cpgtools-2.0.5.data/scripts/CpG_anno_probe.py,sha256=bACac7x9zX2E1QJnUUmNKS6YNKg_f4K-jxwl_v93--4,3171
+ cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py,sha256=JM2PrvHCnGAvymBbbIBr30AfXwu69WgJkJNy6PCmCrg,5266
+ cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py,sha256=q3xZ1AH4JEif1DQRG6UXiiUb4jZyt2khzTZld1jaXnA,6207
+ cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py,sha256=tLfalDRzklBcNfZxhABjhprcBlk4HgUyxXxznOGVTN8,7736
+ cpgtools-2.0.5.data/scripts/CpG_distrb_region.py,sha256=nJGgZLhZe2kYCPnxuW--EemxOcoReu2hdkIAgl-7UAE,5391
+ cpgtools-2.0.5.data/scripts/CpG_logo.py,sha256=U8RxYPmakKii1xmEgG22tKfdZBPpjhZbAfKxgF_Z-O8,4597
+ cpgtools-2.0.5.data/scripts/CpG_to_gene.py,sha256=fciNtI5N6fO-jZX2eyKgiAhDIsus4SljYSCbEQMeTHI,6114
+ cpgtools-2.0.5.data/scripts/beta_PCA.py,sha256=-Cp9vhrNr5nKs6xPLRrQ2NJUj3eTm-kFNvRMv7mKaT4,8487
+ cpgtools-2.0.5.data/scripts/beta_UMAP.py,sha256=SJfxtCsM1NTK5rYnZjtXiDj_x1kJiYZvR1NGHKo0IRI,8079
+ cpgtools-2.0.5.data/scripts/beta_combat.py,sha256=iX40a_usa_BRPANGvRi8XIIRz_gX8h9-wn3Q7__sT9c,6710
+ cpgtools-2.0.5.data/scripts/beta_jitter_plot.py,sha256=Knja1n1rpD0qe3FHkNPax0p6BYsmAtEQBFB1wBdpHyY,4389
+ cpgtools-2.0.5.data/scripts/beta_m_conversion.py,sha256=992tJI0IzkphUaEr_9_CveSRPjKdnyYkCTO10D_xkbg,2848
+ cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py,sha256=Jb9mm8y-1cKH_EepRFhUH5mfBdoAt0zpN_VXETRnQek,7312
+ cpgtools-2.0.5.data/scripts/beta_profile_region.py,sha256=GHDPyUFR9XJm2CK0_9wCKossMTWWvn4VwYCCSA8kn2E,5971
+ cpgtools-2.0.5.data/scripts/beta_selectNBest.py,sha256=Uu_MvQUm6Zc8MGKuGUEooJ-IL7C1eg_LNRqQsNdLaWs,4638
+ cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py,sha256=2fcypD_BE4XqK-vl8hHsgyWqvp3I9oLvg8tF2dd5QZ0,3415
+ cpgtools-2.0.5.data/scripts/beta_stats.py,sha256=WBh3Aquk_AHeUcteLPtt2Q2yKpqu5aBA38zaFz_7mVI,3021
+ cpgtools-2.0.5.data/scripts/beta_tSNE.py,sha256=0UTxNxyaDb-iNqbeIfFKj9DRURTQcHeJdltr-iY72XM,8450
+ cpgtools-2.0.5.data/scripts/beta_topN.py,sha256=gmbmJZoGJt3QljlRHUA8LaKSIurdCNgtn2J9LAIQNCo,3914
+ cpgtools-2.0.5.data/scripts/beta_trichotmize.py,sha256=T594UfSCDJHRFyGvov7qetLQs10WU73PGVVCDNrwNPg,7143
+ cpgtools-2.0.5.data/scripts/dmc_Bayes.py,sha256=imqKv5x8s_plXQPL3_d9OpqSI-imrSfUj-k39Zf1XqA,13735
+ cpgtools-2.0.5.data/scripts/dmc_bb.py,sha256=pOEEHT-hT--r6IswZwQFyGri77OJqbW1y9VVG8pzzpg,8407
+ cpgtools-2.0.5.data/scripts/dmc_fisher.py,sha256=t18smNJAtipmDm3xgUg62ccKsFQWcdKBo4Tm2cxx43s,5163
+ cpgtools-2.0.5.data/scripts/dmc_glm.py,sha256=_vJiZhbGA-Rv5v5oBU9N9lGND8MTEVAFHHFqW5hgfSw,6516
+ cpgtools-2.0.5.data/scripts/dmc_logit.py,sha256=V35wyBgcdiUbsR_NDnEDfs6F2yPyz11ryZvYkkV6Ess,8488
+ cpgtools-2.0.5.data/scripts/dmc_nonparametric.py,sha256=eFyUy7jXl8bPyySKZLEi3LVxYkDOKp9S4XBXevYvbVw,4945
+ cpgtools-2.0.5.data/scripts/dmc_ttest.py,sha256=DDmWi1udo1gqOAQvFTeRxhIJuAgEmakSNU7P4LKTT7U,6786
+ cpgtools-2.0.5.data/scripts/predict_missing.py,sha256=pNSfOD9i7LNLG9BGa80W2-bsCi6qcUlIwoiLnBLbPlo,32465
+ cpgtools-2.0.5.data/scripts/predict_sex.py,sha256=ojoMJ6XwBsE1kGpxzKGZ4TZPs5JcUwI_C_9ieEQbB9c,4755
+ cpgtools-2.0.5.dist-info/licenses/LICENSE.txt,sha256=NAIE1kmjlmRNJ1BwR9m9i0jXHmQqqujTnEyIBuIxvwM,1074
+ impyute/__init__.py,sha256=Q07nw2fDjLm_c3EQWMpSo8nZCfC-fvs2lpRG5uha-gg,87
+ impyute/contrib/__init__.py,sha256=DgGgN0iBoMfHjzr9edz-ZgtJ2KeyIDlyV1t8hxhcM2c,209
+ impyute/contrib/compare.py,sha256=psWygE7PD4Cky4MoTJLAgjaEkTXwqemv3NTKkKVaapo,2852
+ impyute/contrib/count_missing.py,sha256=KkCpYeuDP5jRxCcURJbne_k-7Zq1k31oq2aFHpqygx8,742
+ impyute/contrib/describe.py,sha256=LI6Xage19-DEEDZVNqaaOjemRAVqVkSlH4mxjjubw-A,2033
+ impyute/cs/__init__.py,sha256=ArkuAVHm0z4jl3_UK_FUBF8i2vZUvAZpfVFyryUTm18,373
+ impyute/cs/buck_iterative.py,sha256=ceoQ2ij64pXHy7NszErTTRb9h2wctF0CMBexhc6YZSA,3009
+ impyute/cs/central_tendency.py,sha256=xHwTVeDL-N-O5refvP536FiVEDODR7s-cgX2Ao9LsfI,2127
+ impyute/cs/em.py,sha256=SwvVNMYK3llATR2r1baVCA0hy_fwJIkGD110du9Woos,1631
+ impyute/cs/fast_knn.py,sha256=CI4RKMcrbXPi7izUtWo-lnq1BuLYwl8bvbx62dj8Qn0,6246
+ impyute/cs/random.py,sha256=-Ti4BwMwUGdsciKTp8OVAgYK6TzGP-R4Nw62UY_xqAs,594
+ impyute/dataset/__init__.py,sha256=GK-ktaf86PLZOUlO_oOhvc-55SX1YFLcpjcpgf7a87c,192
+ impyute/dataset/base.py,sha256=bNYszhYP62fSeDe8w_Ij4yhEf-dv6NiNGcCdC22OkuY,4445
+ impyute/dataset/corrupt.py,sha256=uGCnpofJWfWZYBucQMrwT9yOSpEQAaZSG010Jv6ks1I,1491
+ impyute/deletion/__init__.py,sha256=ztemVB8Q6LRaxSv8DDdFzZlGUVUQbsVRA7gvcs_vMhM,124
+ impyute/deletion/complete_case.py,sha256=2xBhqNOSmEeW-sBQvl3TVJCcf2Ybmkr8HXNk9xrQ2J0,395
+ impyute/ops/__init__.py,sha256=ZP_rQQ0qATGEibUA9I8D1JfD1kRT2Xqw8SsOMPIddog,275
+ impyute/ops/error.py,sha256=lyXJcIDyfdrj_JJbsEI18z-t4UKi1mpLTyt3iS2YvTs,226
+ impyute/ops/inverse_distance_weighting.py,sha256=88hmyc-dWltP66MbjrW-VHBcVgPOI0V0BkrV30B-Lyc,1017
+ impyute/ops/matrix.py,sha256=-eniwqvbvwXNrYw8PmIMD95uMcPMq6HE6OBOyjzR-Vk,837
+ impyute/ops/testing.py,sha256=3ZRVL1sc5IIQLPW2GrD6-lF4_nVnSLo7SBW4rfaOMLA,415
+ impyute/ops/util.py,sha256=167Xpwib86kT09-pZnJg-VhyVD9r-z7Gkd_L42A2-EY,2838
+ impyute/ops/wrapper.py,sha256=bQj3r9WsZxqNw51y3OobyyXPPxCeISUv2jb-Ga2C_-M,6022
+ impyute/ts/__init__.py,sha256=veePCYSBfo1t5Ahh3wrVCNveizcniXZYhjyS7ahkIhI,145
+ impyute/ts/locf.py,sha256=qusLWfBrepKzFIkeGxIS0S5KnqRgZde2CV6mcF5nf4c,1876
+ impyute/ts/moving_window.py,sha256=G3Z4ohhqZXAVPflptuOhktwucbW3tg8FySb7gTtBraE,4991
+ missingpy/__init__.py,sha256=H_yX0CtfoG2UaJPRcoZKv0AFhi4n-1hHmfIwfqQXoPk,96
+ missingpy/knnimpute.py,sha256=WxniKg6-3pvwJa9mipiCZBqkRL_GGHUcxDu-7b57x3k,13456
+ missingpy/missforest.py,sha256=UYRf2YANPwxvRPruN3P6xlhJ5JPZB1x8o4xVhYfIp74,24696
+ missingpy/pairwise_external.py,sha256=XU_vW-rMCPD3aH9k0UrP_p90QUHDZqe3QZPoOQmu2vo,13221
+ missingpy/utils.py,sha256=sMU4OGHPvRgReT8e_HqGCYvjAJZkE_qlAPDzLo-_M6U,4360
+ missingpy/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ missingpy/tests/test_knnimpute.py,sha256=rR1OPZ4-IzcIKgWmSEN7UEPvVSRwJe0KU_wTxPr46k0,17527
+ missingpy/tests/test_missforest.py,sha256=5YfKZf_xdy9RcXmnlFz7cJDqdnspJDzzrxLnVVWJi3A,13725
+ cpgtools-2.0.5.dist-info/METADATA,sha256=96OWgsyt7bxriebSZX-_6ITD7eaKHcQy0n0p25zS-is,1756
+ cpgtools-2.0.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ cpgtools-2.0.5.dist-info/top_level.txt,sha256=6kK5IWs15OSnu-SKn2Y6tWI924eoNXNZRIu2xGFLj94,49
+ cpgtools-2.0.5.dist-info/RECORD,,
cpgtools-2.0.5.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
cpgtools-2.0.5.dist-info/licenses/LICENSE.txt ADDED
@@ -0,0 +1,19 @@
+ Copyright (c) 2024 The Python Packaging Authority
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
cpgtools-2.0.5.dist-info/top_level.txt ADDED
@@ -0,0 +1,5 @@
+ cpgmodule
+ impyute
+ impyutelib
+ missingpy
+ misspylib
impyute/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """ Imputations for cross-sectional and time-series data. """
+
+ __all__ = ["cs", "ts"]
impyute/contrib/__init__.py ADDED
@@ -0,0 +1,7 @@
+ """ Volatile code. Expect stuff in this to change. """
+
+ from .describe import describe
+ from .count_missing import count_missing
+ from .compare import compare
+
+ __all__ = ["describe", "count_missing", "compare"]
impyute/contrib/compare.py ADDED
@@ -0,0 +1,69 @@
+ """impyute.contrib.compare.py"""
+ import importlib
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import accuracy_score
+ # pylint: disable=too-many-locals, dangerous-default-value
+
+ def compare(imputed, classifiers=["sklearn.svm.SVC"], log_path=None):
+     """
+     Given an imputed dataset with labels and a list of supervised machine
+     learning model, find accuracy score of all model/imputation pairs.
+
+     Parameters
+     ----------
+     imputed: [(str, np.ndarray), (str, np.ndarray)...]
+         List of tuples containing (imputation_name, imputed_data) where
+         `imputation_name` is a string and `imputed_data` is a tuple where
+         `imputed_data`[0] is the data, X and `imputed_data`[1] is the label, y
+     classifiers: [str, str...str] (optional)
+         Provide a list of classifiers to run imputed data sets on. Right now,
+         it ONLY works with sklearn, the format should be like so:
+         `sklearn.SUBMODULE.FUNCTION`. More generally its
+         'MODULE.SUBMODULE.FUNCTION'. If providing a custom classifier, make
+         sure to add the file location to sys.path first and the classifier
+         should also be structured like sklearn (with a `fit` and `predict`
+         method).
+     log_path: str (optional)
+         To write results to a file, provide a relative path
+
+     Returns
+     -------
+     results.txt
+         Classification results on imputed data
+
+     """
+     clfs = []
+     for clf_name in classifiers:
+         mod_name, smod_name, fn_name = clf_name.split(".")
+         try:
+             mod = importlib.import_module("{}.{}".format(mod_name, smod_name))
+             fn = getattr(mod, fn_name)
+             clfs.append([fn_name, fn])
+         except ModuleNotFoundError:
+             print("Cannot import '{}' from '{}.{}'".format(fn_name,
+                                                            mod_name,
+                                                            smod_name))
+
+     results = {imputation_name: [] for imputation_name, _ in imputed}
+
+     for imputation_name, data in imputed:
+         X, y = data
+         X_train, X_test, y_train, y_test = train_test_split(X, y,
+                                                             test_size=0.33,
+                                                             random_state=0)
+         print("Imputation {} =========".format(imputation_name))
+         for clf_name, clf in clfs:
+             clf = clf()
+             clf.fit(X_train, y_train)
+             y_pred = clf.predict(X_test)
+             accuracy = accuracy_score(y_test, y_pred)
+             results[imputation_name].append((clf_name, accuracy))
+             print("...{}".format(clf_name))
+
+     # If not None, write to path
+     if log_path:
+         with open(log_path, 'w') as f:
+             f.write(str(results))
+         print("Results saved to {}".format(log_path))
+
+     return results
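The docstring above spells out the expected shapes: `imputed` is a list of `(name, (X, y))` tuples and each classifier is named as a `'module.submodule.Class'` string. A minimal, self-contained call might look like the sketch below; the dataset, the imputation labels, and the use of identical copies of X are invented purely to keep the example short, and it assumes the bundled `impyute` copy (listed in top_level.txt) is importable.

```python
from sklearn.datasets import load_iris
from impyute.contrib import compare

X, y = load_iris(return_X_y=True)

# Pretend two different imputation strategies produced these matrices;
# both are plain copies here just to exercise the API.
imputed = [
    ("mean_imputed", (X.copy(), y)),
    ("knn_imputed", (X.copy(), y)),
]

results = compare(imputed, classifiers=["sklearn.svm.SVC"])
print(results)  # {'mean_imputed': [('SVC', ...)], 'knn_imputed': [('SVC', ...)]}
```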
impyute/contrib/count_missing.py ADDED
@@ -0,0 +1,30 @@
+ """ impyute.contrib.count_missing.py """
+ import numpy as np
+ from impyute.ops import matrix
+
+ def count_missing(data):
+     """ Calculate the total percentage of missing values and also the
+     percentage in each column.
+
+     Parameters
+     ----------
+     data: np.array
+         Data to impute.
+
+     Returns
+     -------
+     dict
+         Percentage of missing values in total and in each column.
+
+     """
+     size = len(data.flatten())
+     nan_xy = matrix.nan_indices(data)
+     np.unique(nan_xy)
+     counter = {y: 0. for y in np.unique(nan_xy.T[1])}
+     change_in_percentage = 1./size
+     for _, y in nan_xy:
+         counter[y] += change_in_percentage
+     total_missing = len(nan_xy)/size
+     counter["total"] = total_missing
+
+     return counter
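To make the return value concrete, the sketch below (a hypothetical example, not part of the package) calls `count_missing` on a 2x3 array with two NaNs; the numeric keys are the column indices that contain NaNs and `'total'` is the overall missing fraction.

```python
import numpy as np
from impyute.contrib import count_missing

# Two missing cells out of six: one in column 1, one in column 2.
data = np.array([[1.0, np.nan, 3.0],
                 [4.0, 5.0, np.nan]])

print(count_missing(data))
# roughly {1: 0.167, 2: 0.167, 'total': 0.333} given the per-cell fractions above
```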
impyute/contrib/describe.py ADDED
@@ -0,0 +1,63 @@
+ """ impyute.contrib.describe """
+ from impyute.ops import matrix
+
+ def describe(data): # verbose=True):
+     """ Print input/output multiple times
+
+     Eventually will be used instead of matrix.nan_indices everywhere
+
+     Parameters
+     ----------
+     data: numpy.nd.array
+         The data you want to get a description from
+     verbose: boolean(optional)
+         Decides whether the description is short or long form
+
+     Returns
+     -------
+     dict
+         missingness: list
+             Confidence interval of data being MCAR, MAR or MNAR - in that order
+         nan_xy: list of tuples
+             Indices of all null points
+         nan_n: list
+             Total number of null values for each column
+         pmissing_n: float
+             Percentage of missing values in dataset
+         nan_rows: list
+             Indices of all rows that are completely null
+         nan_cols: list
+             Indices of all columns that are completely null
+         mean_rows: list
+             Mean value of each row
+         mean_cols: list
+             Mean value of each column
+         std_dev: list
+             std dev for each row/column
+         min_max: list
+             Finds the minimum and maximum for each row
+
+     """
+     # missingness = [0.33, 0.33, 0.33] # find_missingness(data)
+     nan_xy = matrix.nan_indices(data)
+     nan_n = len(nan_xy)
+     pmissing_n = float(nan_n/len(data.flatten))
+     # pmissing_rows = ""
+     # pmissing_cols = ""
+     # nan_rows = ""
+     # nan_cols = ""
+     # mean_rows = ""
+     # mean_cols = ""
+     # std_dev = ""
+     # "missingness": missingness,
+     description = {"nan_xy": nan_xy,
+                    "nan_n": nan_n,
+                    "pmissing_n": pmissing_n}
+     # "pmissing_rows": pmissing_rows,
+     # "pmissing_cols": pmissing_cols,
+     # "nan_rows": nan_rows,
+     # "nan_cols": nan_cols,
+     # "mean_rows": mean_rows,
+     # "mean_cols": mean_cols,
+     # "std_dev": std_dev}
+     return description
impyute/cs/__init__.py ADDED
@@ -0,0 +1,11 @@
+ """ Imputations for cross-sectional data. """
+
+ from .random import random_impute
+ from .central_tendency import mean
+ from .central_tendency import mode
+ from .central_tendency import median
+ from .buck_iterative import buck_iterative
+ from .em import em
+ from .fast_knn import fast_knn
+
+ __all__ = ["random_impute", "mean", "mode", "median", "buck_iterative", "em", "fast_knn"]
impyute/cs/buck_iterative.py ADDED
@@ -0,0 +1,82 @@
+ import numpy as np
+ from sklearn.linear_model import LinearRegression
+ from impyute.ops import matrix
+ from impyute.ops import wrapper
+ # pylint: disable=too-many-locals
+
+ @wrapper.wrappers
+ @wrapper.checks
+ def buck_iterative(data):
+     """ Iterative variant of buck's method
+
+     - Variable to regress on is chosen at random.
+     - EM type infinite regression loop stops after change in prediction from
+       previous prediction < 10% for all columns with missing values
+
+     A Method of Estimation of Missing Values in Multivariate Data Suitable for
+     use with an Electronic Computer S. F. Buck Journal of the Royal Statistical
+     Society. Series B (Methodological) Vol. 22, No. 2 (1960), pp. 302-306
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+         Data to impute.
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     """
+     nan_xy = matrix.nan_indices(data)
+
+     # Add a column of zeros to the index values
+     nan_xyz = np.append(nan_xy, np.zeros((np.shape(nan_xy)[0], 1)), axis=1)
+
+     nan_xyz = [[int(x), int(y), v] for x, y, v in nan_xyz]
+     temp = []
+     cols_missing = {y for _, y, _ in nan_xyz}
+
+     # Step 1: Simple Imputation, these are just placeholders
+     for x_i, y_i, value in nan_xyz:
+         # Column containing nan value without the nan value
+         col = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
+
+         new_value = np.mean(col)
+         data[x_i][y_i] = new_value
+         temp.append([x_i, y_i, new_value])
+     nan_xyz = temp
+
+     # Step 5: Repeat step 2 - 4 until convergence (the 100 is arbitrary)
+
+     converged = [False] * len(nan_xyz)
+     while not all(converged):
+         # Step 2: Placeholders are set back to missing for one variable/column
+         dependent_col = int(np.random.choice(list(cols_missing)))
+         missing_xs = [int(x) for x, y, value in nan_xyz if y == dependent_col]
+
+         # Step 3: Perform linear regression using the other variables
+         x_train, y_train = [], []
+         for x_i in (x_i for x_i in range(len(data)) if x_i not in missing_xs):
+             x_train.append(np.delete(data[x_i], dependent_col))
+             y_train.append(data[x_i][dependent_col])
+         model = LinearRegression()
+         model.fit(x_train, y_train)
+
+         # Step 4: Missing values for the missing variable/column are replaced
+         # with predictions from our new linear regression model
+         # For null indices with the dependent column that was randomly chosen
+         for i, z in enumerate(nan_xyz):
+             x_i = z[0]
+             y_i = z[1]
+             value = data[x_i, y_i]
+             if y_i == dependent_col:
+                 # Row 'x' without the nan value
+                 new_value = model.predict([np.delete(data[x_i], dependent_col)])
+                 data[x_i][y_i] = new_value.reshape(1, -1)
+                 if value == 0.0:
+                     delta = (new_value-value)/0.01
+                 else:
+                     delta = (new_value-value)/value
+                 converged[i] = abs(delta) < 0.1
+     return data
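buck_iterative is exported through `impyute.cs` (see its `__init__.py` above) and, per the docstring, expects a numeric matrix with NaNs marking the missing cells. The call below is a rough usage sketch, not a documented example: the random data is invented, and the `wrapper.wrappers` / `wrapper.checks` decorators (whose source is not shown in this diff) may validate or copy the input before the body above runs.

```python
import numpy as np
from impyute.cs import buck_iterative

rng = np.random.default_rng(0)
data = rng.normal(size=(50, 4))

# Knock out a few cells so there is something to impute.
data[3, 1] = np.nan
data[10, 2] = np.nan
data[25, 0] = np.nan

filled = buck_iterative(data)
print(np.isnan(filled).sum())  # expected: 0 once the regression loop converges
```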