cpgtools 2.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cpgmodule/BED.py +441 -0
- cpgmodule/MI.py +193 -0
- cpgmodule/__init__.py +0 -0
- cpgmodule/_version.py +1 -0
- cpgmodule/cgID.py +866897 -0
- cpgmodule/data/AltumAge_cpg.pkl +0 -0
- cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
- cpgmodule/data/AltumAge_scaler.pkl +0 -0
- cpgmodule/data/GA_Bohlin.pkl +0 -0
- cpgmodule/data/GA_Haftorn.pkl +0 -0
- cpgmodule/data/GA_Knight.pkl +0 -0
- cpgmodule/data/GA_Lee_CPC.pkl +0 -0
- cpgmodule/data/GA_Lee_RPC.pkl +0 -0
- cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
- cpgmodule/data/GA_Mayne.pkl +0 -0
- cpgmodule/data/Hannum.pkl +0 -0
- cpgmodule/data/Horvath_2013.pkl +0 -0
- cpgmodule/data/Horvath_2018.pkl +0 -0
- cpgmodule/data/Levine.pkl +0 -0
- cpgmodule/data/Lu_DNAmTL.pkl +0 -0
- cpgmodule/data/Ped_McEwen.pkl +0 -0
- cpgmodule/data/Ped_Wu.pkl +0 -0
- cpgmodule/data/Zhang_BLUP.pkl +0 -0
- cpgmodule/data/Zhang_EN.pkl +0 -0
- cpgmodule/data/__init__.py +0 -0
- cpgmodule/extend_bed.py +147 -0
- cpgmodule/imotif.py +348 -0
- cpgmodule/ireader.py +28 -0
- cpgmodule/methylClock.py +53 -0
- cpgmodule/padjust.py +58 -0
- cpgmodule/region2gene.py +170 -0
- cpgmodule/utils.py +642 -0
- cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
- cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
- cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
- cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
- cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
- cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
- cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
- cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
- cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
- cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
- cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
- cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
- cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
- cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
- cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
- cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
- cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
- cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
- cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
- cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
- cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
- cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
- cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
- cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
- cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
- cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
- cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
- cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.5.dist-info/METADATA +59 -0
- cpgtools-2.0.5.dist-info/RECORD +104 -0
- cpgtools-2.0.5.dist-info/WHEEL +5 -0
- cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
- cpgtools-2.0.5.dist-info/top_level.txt +5 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +96 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- impyutelib.py +890 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- misspylib.py +565 -0
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
#==============================================================================
|
|
5
|
+
Predict sex based on the semi-methylation (also known as genomic imprinting)
|
|
6
|
+
ratio. This method leverages the fact that, due to X chromosome inactivation,
|
|
7
|
+
females have a higher proportion of semi-methylated CpGs on their X chromosomes.
|
|
8
|
+
A log2(ratio) greater than 0 indicates a female, while a log2(ratio) less than
|
|
9
|
+
0 indicates a male.
|
|
10
|
+
|
|
11
|
+
Example of input data file
|
|
12
|
+
---------------------------
|
|
13
|
+
CpG_ID Sample_01 Sample_02 Sample_03 Sample_04
|
|
14
|
+
cg_001 0.831035 0.878022 0.794427 0.880911
|
|
15
|
+
cg_002 0.249544 0.209949 0.234294 0.236680
|
|
16
|
+
cg_003 0.845065 0.843957 0.840184 0.824286
|
|
17
|
+
|
|
18
|
+
Example of output file
|
|
19
|
+
----------------------
|
|
20
|
+
Sample_ID log2_SM_ratio Predicted_sex
|
|
21
|
+
Sample_01 -2.249628052954919 Male
|
|
22
|
+
Sample_02 -2.2671726671830674 Male
|
|
23
|
+
Sample_03 1.4530581933290616 Female
|
|
24
|
+
Sample_04 1.4808015115356654 Female
|
|
25
|
+
|
|
26
|
+
...
|
|
27
|
+
|
|
28
|
+
"""
|
|
29
|
+
import sys
|
|
30
|
+
import numpy as np
|
|
31
|
+
from optparse import OptionParser
|
|
32
|
+
from cpgmodule.utils import printlog
|
|
33
|
+
from cpgmodule import ireader
|
|
34
|
+
import pandas as pd
|
|
35
|
+
from cpgmodule._version import __version__
|
|
36
|
+
|
|
37
|
+
__author__ = "Liguo Wang"
|
|
38
|
+
__copyright__ = "Copyleft"
|
|
39
|
+
__credits__ = []
|
|
40
|
+
__license__ = "GPL"
|
|
41
|
+
__maintainer__ = "Liguo Wang"
|
|
42
|
+
__email__ = "wang.liguo@mayo.edu"
|
|
43
|
+
__status__ = "Development"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def main():
    """Command-line entry point: predict each sample's sex from the
    semi-methylation (SM) ratio of its chrX CpG probes.

    Reads a beta-value matrix (CpGs x samples) and a file of chrX probe IDs,
    computes log2(#mid-beta / (#low-beta + #high-beta)) over the chrX probes
    for every sample, and writes '<prefix>.predicted_sex.tsv' with the ratio
    and the predicted sex.  Exits with code 101/102/103 when a required
    option (-i/-x/-o respectively) is missing.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i","--input_file",action="store", type="string",dest="input_file", help="Tab-separated data frame file containing beta values with the 1st row containing sample IDs and the 1st column containing CpG IDs.")
    parser.add_option("-x","--xprobe",action="store", type="string",dest="xprobe_file", help="File with CpG IDs mapped to the X chromosome, with one probe listed per row.")
    parser.add_option("-c","--cut",action="store", type='float', dest="cutoff", default=0.0, help="The cutoff of log2(SM ratio) to determine the sex prediction. Log2(SM ratio) greater than this cutoff indicates a female, while a log2(ratio) less than this cutoff indicates a male. default=%default")
    parser.add_option("-o","--output",action="store", type='string', dest="out_file", help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print ()
    # Each required option gets a distinct exit code so callers can tell
    # which one was missing.
    if not (options.input_file):
        print (__doc__)
        parser.print_help()
        sys.exit(101)
    if not (options.xprobe_file):
        print (__doc__)
        parser.print_help()
        sys.exit(102)
    if not (options.out_file):
        print (__doc__)
        parser.print_help()
        sys.exit(103)

    # Load chrX probe IDs, one per line; '#'-prefixed lines are comments.
    printlog("Reading X probes from: \"%s\"" % (options.xprobe_file))
    x_cpgs = set()
    for l in ireader.reader(options.xprobe_file):
        l = l.strip()
        if l.startswith('#'):
            continue
        x_cpgs.add(l)
    printlog("Total %d X probes loaded." % len(x_cpgs))

    printlog("Reading input file: \"%s\"" % (options.input_file))
    df1 = pd.read_csv(options.input_file, index_col = 0, sep="\t")
    #print (df1)

    # Remove any rows (CpGs) with missing beta values.
    df2 = df1.dropna(axis=0, how='any')
    printlog("%d CpGs with missing values were removed." % (len(df1) - len(df2)))
    #print (df2)

    sample_cpg_ids = df2.index
    sample_names = df2.columns
    # Intersect the matrix's CpG IDs with the chrX probe list.
    found_x_cpgs = list(x_cpgs & set(sample_cpg_ids))
    printlog("Found %d CpGs located on the chrX from file: %s" % (len(found_x_cpgs), options.input_file))

    # only X probes in df3
    df3 = df2.loc[list(found_x_cpgs)]
    #pd.DataFrame.to_csv(df3, options.out_file + '.tmp.tsv', sep="\t", index_label="sample")

    # Beta-value bins classifying probes as un-, semi-, or fully methylated.
    # NOTE(review): pd.cut bins are left-open/right-closed by default, so a
    # beta of exactly 0 is excluded from the low bin, and values in
    # (0.2, 0.3) or (0.7, 0.8) fall in no bin — presumably intentional
    # buffer zones between categories; confirm.
    low_beta_range = [0, 0.2]
    mid_beta_range = [0.3, 0.7]
    high_beta_range = [0.8, 1.0]

    output = {}
    for s in sample_names:
        output[s] = {}
        # .count() on the pd.cut result counts the probes that landed inside
        # the single (lo, hi] interval (values outside become NaN).
        low_beta_count = pd.cut(df3[s], low_beta_range).count()
        mid_beta_count = pd.cut(df3[s], mid_beta_range).count()
        high_beta_count = pd.cut(df3[s], high_beta_range).count()
        try:
            # log2(semi-methylated / (un- + fully-methylated)); females are
            # expected to have more semi-methylated chrX probes.
            ratio = np.log2(mid_beta_count/(low_beta_count + high_beta_count))
        except:
            # e.g. ZeroDivisionError when no low/high probes were counted.
            ratio = np.nan
        output[s]['log2_SM_ratio'] = ratio

        if ratio > options.cutoff:
            output[s]['Predicted_sex'] = 'Female'
        elif ratio < options.cutoff:
            output[s]['Predicted_sex'] = 'Male'
        else:
            # ratio == cutoff, or ratio is NaN (NaN comparisons are False).
            output[s]['Predicted_sex'] = 'Unknown'
    # Transpose so samples become rows and the two metrics become columns.
    df_out = pd.DataFrame(output).T

    outfile = options.out_file + '.predicted_sex.tsv'
    printlog("Writing to file: \"%s\"" % outfile)
    pd.DataFrame.to_csv(df_out, outfile, sep="\t", index_label="Sample_ID")
|
|
124
|
+
|
|
125
|
+
# Run only when executed as a script, not when imported as a module.
if __name__=='__main__':
    main()
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cpgtools
|
|
3
|
+
Version: 2.0.5
|
|
4
|
+
Summary: Tools to analyze and visualize DNA methylation data
|
|
5
|
+
Author-email: Liguo Wang <wangliguo78@gmail.com>
|
|
6
|
+
Maintainer-email: Liguo Wang <wangliguo78@gmail.com>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Project-URL: Documentation, https://cpgtools.readthedocs.io/en/latest/index.html
|
|
9
|
+
Project-URL: Repository, https://github.com/liguowang/cpgtools.git
|
|
10
|
+
Keywords: DNA methylation,EPIC,450K,850K,935K,RRBS,WGBS
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
|
16
|
+
Classifier: Operating System :: POSIX
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
18
|
+
Requires-Python: >=3.5
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE.txt
|
|
21
|
+
Requires-Dist: numpy
|
|
22
|
+
Requires-Dist: scipy
|
|
23
|
+
Requires-Dist: scikit-learn
|
|
24
|
+
Requires-Dist: weblogo
|
|
25
|
+
Requires-Dist: bx-python
|
|
26
|
+
Requires-Dist: pandas
|
|
27
|
+
Requires-Dist: umap-learn
|
|
28
|
+
Requires-Dist: fancyimpute
|
|
29
|
+
Requires-Dist: pycombat
|
|
30
|
+
Requires-Dist: matplotlib
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
## Install CpGtools using [pip](https://pip.pypa.io/en/stable/)
|
|
34
|
+
|
|
35
|
+
1. (Optional) Create Virtual Environments (Note: `venv` is available in Python 3.3 and later. You can also use [virtualenv](https://packaging.python.org/en/latest/key_projects/#virtualenv))
|
|
36
|
+
|
|
37
|
+
`$ python3 -m venv my_env` (will create a directory called my_env)
|
|
38
|
+
|
|
39
|
+
`$ source my_env/bin/activate`
|
|
40
|
+
|
|
41
|
+
2. Install CpGtools
|
|
42
|
+
|
|
43
|
+
`$ pip install cpgtools`
|
|
44
|
+
|
|
45
|
+
or
|
|
46
|
+
|
|
47
|
+
`$ pip install git+https://github.com/liguowang/cpgtools.git`
|
|
48
|
+
|
|
49
|
+
3. Upgrade
|
|
50
|
+
|
|
51
|
+
`$ pip install cpgtools --upgrade`
|
|
52
|
+
|
|
53
|
+
4. Uninstall
|
|
54
|
+
|
|
55
|
+
`pip uninstall -y cpgtools`
|
|
56
|
+
|
|
57
|
+
## Documentation
|
|
58
|
+
|
|
59
|
+
https://cpgtools.readthedocs.io/en/latest/
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
impyutelib.py,sha256=cEHb_sKSapIOKubN2estkV5NcAPFENeYwzgeykT24a0,30719
|
|
2
|
+
misspylib.py,sha256=3r-UN1bgKz123KMDrya1nPSEYzrI1Ef0MvyAe4WoqPU,24864
|
|
3
|
+
cpgmodule/BED.py,sha256=ZNAhGZPFMFD4UpVwGTDOM0hMLnfXF52zerdMb21sH6s,12675
|
|
4
|
+
cpgmodule/MI.py,sha256=pkUWzKXYeyRuGsGKZdFVxHzFWikkuJavdQ_4fsnN3U4,8649
|
|
5
|
+
cpgmodule/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
cpgmodule/_version.py,sha256=xEb7Z4b8xalXXExBg42XPAhbJKniHzcsEPjp-6S3ppg,22
|
|
7
|
+
cpgmodule/cgID.py,sha256=DLG5Djm6JP0d6MiFvP_keUXv_rYxESWrISuVClTs8kw,16017914
|
|
8
|
+
cpgmodule/extend_bed.py,sha256=sjRxbRJFRumjYHwd6_AMTAYBG29yXQPYp1lDjTv2Q_Q,4648
|
|
9
|
+
cpgmodule/imotif.py,sha256=j7_MQFnowGsVTGCRQHHLUnEzSD0HehZPEHKKZ68Jv24,9227
|
|
10
|
+
cpgmodule/ireader.py,sha256=NRTdFnQvd1AEkYchfgvTVTixBVwUjn8xNyswc0OWU1Q,720
|
|
11
|
+
cpgmodule/methylClock.py,sha256=HyMk3vpVwR3yrkLCPv9fVJs6JCeSchbJEBe7i69jdSs,1202
|
|
12
|
+
cpgmodule/padjust.py,sha256=mvbQ9_crn_S39yvWrv_JPoU5OxZGHaw7cV3deBxTPdk,2389
|
|
13
|
+
cpgmodule/region2gene.py,sha256=iuSYa2-ki-qbL1TqqAlTXGT7g-j1uNIA1y7hzMwVM2U,5347
|
|
14
|
+
cpgmodule/utils.py,sha256=NgKT3aJlRT5pQKKO3_e0WB2_u93pY13F-k-r0pvgKno,16095
|
|
15
|
+
cpgmodule/data/AltumAge_cpg.pkl,sha256=T0pfBWrzQO00-z85QNE2CP0ntU0mtF2WkZeEPiArqTw,264691
|
|
16
|
+
cpgmodule/data/AltumAge_multi_platform_cpgs.pkl,sha256=Bo7ZG6AuxXUmLH6dCFfrlYkiTkeLjyqRgcD7yw4mNZ0,264816
|
|
17
|
+
cpgmodule/data/AltumAge_scaler.pkl,sha256=aDMcC4l0wZJGDi6_MeSpdNoykndfgsUtVKEd6mKf9T0,325509
|
|
18
|
+
cpgmodule/data/GA_Bohlin.pkl,sha256=W72ra6APUZDNPmVJ-kRuaVzpiKGgI_K7nDqSFx3nf3w,2667
|
|
19
|
+
cpgmodule/data/GA_Haftorn.pkl,sha256=ZFCJ2h0DWEaa-pHKdsSuWbQGL1JMwDXBtDyhx9DlZLc,4622
|
|
20
|
+
cpgmodule/data/GA_Knight.pkl,sha256=dsKeIt7PxCcw_tcVObtVKB3KqtF7bPH-vQqbClWI5Tc,25322
|
|
21
|
+
cpgmodule/data/GA_Lee_CPC.pkl,sha256=WfeI1bGnIS5_AIX7qnspyGYUgzGOAJ9JJ9CGQjLgkFo,25373
|
|
22
|
+
cpgmodule/data/GA_Lee_RPC.pkl,sha256=rLhr-sboWqIZ6Y08zODh7bgwXBa4SNkE4ZZaIyQTaYc,25382
|
|
23
|
+
cpgmodule/data/GA_Lee_refined_RPC.pkl,sha256=bXiZIgFSoX7Sr0ai1zHzGXv-EMGBglvx8z5vr3IpccA,25348
|
|
24
|
+
cpgmodule/data/GA_Mayne.pkl,sha256=CUfwiVIywJZwTb0PhRuW-6TQ5H1kO7tR6KHepyzBULg,1899
|
|
25
|
+
cpgmodule/data/Hannum.pkl,sha256=gXJxy6S8lBCUdr_T7ZM3R6hqKI22yev5-2giCPCCdLc,2049
|
|
26
|
+
cpgmodule/data/Horvath_2013.pkl,sha256=qwP614VYmmaESKanQ680sYgaHSWJrP73im_f5duzKgU,8313
|
|
27
|
+
cpgmodule/data/Horvath_2018.pkl,sha256=IQcFNIg-z-Mrq8vEoHI509c64Dt1wtyHInU9YwsYP-w,9225
|
|
28
|
+
cpgmodule/data/Levine.pkl,sha256=OhasOuEAvbDblZfGGFmu82kDqdo5CQ2DjZ8hfCofhvI,11751
|
|
29
|
+
cpgmodule/data/Lu_DNAmTL.pkl,sha256=SdjT9x8A0GbnKoMlDhgslmmINJ8e5Asyk9KQlw_EtJo,3548
|
|
30
|
+
cpgmodule/data/Ped_McEwen.pkl,sha256=-hxOAqFUxORmPLV_FKYZxjOFtabxPzbbzfY6JTitq8k,2654
|
|
31
|
+
cpgmodule/data/Ped_Wu.pkl,sha256=gjXtpdpQ8QVJs1I6UM-VPAvQjNd-JDoy5DzDL-gP-Cg,2924
|
|
32
|
+
cpgmodule/data/Zhang_BLUP.pkl,sha256=dpH0L3qTxMAJ50XtTeNMmqx1khZkZyrH204E37fE37E,7038226
|
|
33
|
+
cpgmodule/data/Zhang_EN.pkl,sha256=RFEOmBQGXZvIwQ7tOYT8itOPU87V2Y02vJA2PfY6TnI,11844
|
|
34
|
+
cpgmodule/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
35
|
+
cpgtools-2.0.5.data/scripts/CpG_aggregation.py,sha256=iSSUanCPlTkT3f6Df_f_b6zEP2Ixzse9zFwJLL5z2Qw,7278
|
|
36
|
+
cpgtools-2.0.5.data/scripts/CpG_anno_position.py,sha256=j8l4v7SpGFaJ3pBVE5qK8coMkxe5h4p1bwmhcPnhxMk,4643
|
|
37
|
+
cpgtools-2.0.5.data/scripts/CpG_anno_probe.py,sha256=bACac7x9zX2E1QJnUUmNKS6YNKg_f4K-jxwl_v93--4,3171
|
|
38
|
+
cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py,sha256=JM2PrvHCnGAvymBbbIBr30AfXwu69WgJkJNy6PCmCrg,5266
|
|
39
|
+
cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py,sha256=q3xZ1AH4JEif1DQRG6UXiiUb4jZyt2khzTZld1jaXnA,6207
|
|
40
|
+
cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py,sha256=tLfalDRzklBcNfZxhABjhprcBlk4HgUyxXxznOGVTN8,7736
|
|
41
|
+
cpgtools-2.0.5.data/scripts/CpG_distrb_region.py,sha256=nJGgZLhZe2kYCPnxuW--EemxOcoReu2hdkIAgl-7UAE,5391
|
|
42
|
+
cpgtools-2.0.5.data/scripts/CpG_logo.py,sha256=U8RxYPmakKii1xmEgG22tKfdZBPpjhZbAfKxgF_Z-O8,4597
|
|
43
|
+
cpgtools-2.0.5.data/scripts/CpG_to_gene.py,sha256=fciNtI5N6fO-jZX2eyKgiAhDIsus4SljYSCbEQMeTHI,6114
|
|
44
|
+
cpgtools-2.0.5.data/scripts/beta_PCA.py,sha256=-Cp9vhrNr5nKs6xPLRrQ2NJUj3eTm-kFNvRMv7mKaT4,8487
|
|
45
|
+
cpgtools-2.0.5.data/scripts/beta_UMAP.py,sha256=SJfxtCsM1NTK5rYnZjtXiDj_x1kJiYZvR1NGHKo0IRI,8079
|
|
46
|
+
cpgtools-2.0.5.data/scripts/beta_combat.py,sha256=iX40a_usa_BRPANGvRi8XIIRz_gX8h9-wn3Q7__sT9c,6710
|
|
47
|
+
cpgtools-2.0.5.data/scripts/beta_jitter_plot.py,sha256=Knja1n1rpD0qe3FHkNPax0p6BYsmAtEQBFB1wBdpHyY,4389
|
|
48
|
+
cpgtools-2.0.5.data/scripts/beta_m_conversion.py,sha256=992tJI0IzkphUaEr_9_CveSRPjKdnyYkCTO10D_xkbg,2848
|
|
49
|
+
cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py,sha256=Jb9mm8y-1cKH_EepRFhUH5mfBdoAt0zpN_VXETRnQek,7312
|
|
50
|
+
cpgtools-2.0.5.data/scripts/beta_profile_region.py,sha256=GHDPyUFR9XJm2CK0_9wCKossMTWWvn4VwYCCSA8kn2E,5971
|
|
51
|
+
cpgtools-2.0.5.data/scripts/beta_selectNBest.py,sha256=Uu_MvQUm6Zc8MGKuGUEooJ-IL7C1eg_LNRqQsNdLaWs,4638
|
|
52
|
+
cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py,sha256=2fcypD_BE4XqK-vl8hHsgyWqvp3I9oLvg8tF2dd5QZ0,3415
|
|
53
|
+
cpgtools-2.0.5.data/scripts/beta_stats.py,sha256=WBh3Aquk_AHeUcteLPtt2Q2yKpqu5aBA38zaFz_7mVI,3021
|
|
54
|
+
cpgtools-2.0.5.data/scripts/beta_tSNE.py,sha256=0UTxNxyaDb-iNqbeIfFKj9DRURTQcHeJdltr-iY72XM,8450
|
|
55
|
+
cpgtools-2.0.5.data/scripts/beta_topN.py,sha256=gmbmJZoGJt3QljlRHUA8LaKSIurdCNgtn2J9LAIQNCo,3914
|
|
56
|
+
cpgtools-2.0.5.data/scripts/beta_trichotmize.py,sha256=T594UfSCDJHRFyGvov7qetLQs10WU73PGVVCDNrwNPg,7143
|
|
57
|
+
cpgtools-2.0.5.data/scripts/dmc_Bayes.py,sha256=imqKv5x8s_plXQPL3_d9OpqSI-imrSfUj-k39Zf1XqA,13735
|
|
58
|
+
cpgtools-2.0.5.data/scripts/dmc_bb.py,sha256=pOEEHT-hT--r6IswZwQFyGri77OJqbW1y9VVG8pzzpg,8407
|
|
59
|
+
cpgtools-2.0.5.data/scripts/dmc_fisher.py,sha256=t18smNJAtipmDm3xgUg62ccKsFQWcdKBo4Tm2cxx43s,5163
|
|
60
|
+
cpgtools-2.0.5.data/scripts/dmc_glm.py,sha256=_vJiZhbGA-Rv5v5oBU9N9lGND8MTEVAFHHFqW5hgfSw,6516
|
|
61
|
+
cpgtools-2.0.5.data/scripts/dmc_logit.py,sha256=V35wyBgcdiUbsR_NDnEDfs6F2yPyz11ryZvYkkV6Ess,8488
|
|
62
|
+
cpgtools-2.0.5.data/scripts/dmc_nonparametric.py,sha256=eFyUy7jXl8bPyySKZLEi3LVxYkDOKp9S4XBXevYvbVw,4945
|
|
63
|
+
cpgtools-2.0.5.data/scripts/dmc_ttest.py,sha256=DDmWi1udo1gqOAQvFTeRxhIJuAgEmakSNU7P4LKTT7U,6786
|
|
64
|
+
cpgtools-2.0.5.data/scripts/predict_missing.py,sha256=pNSfOD9i7LNLG9BGa80W2-bsCi6qcUlIwoiLnBLbPlo,32465
|
|
65
|
+
cpgtools-2.0.5.data/scripts/predict_sex.py,sha256=ojoMJ6XwBsE1kGpxzKGZ4TZPs5JcUwI_C_9ieEQbB9c,4755
|
|
66
|
+
cpgtools-2.0.5.dist-info/licenses/LICENSE.txt,sha256=NAIE1kmjlmRNJ1BwR9m9i0jXHmQqqujTnEyIBuIxvwM,1074
|
|
67
|
+
impyute/__init__.py,sha256=Q07nw2fDjLm_c3EQWMpSo8nZCfC-fvs2lpRG5uha-gg,87
|
|
68
|
+
impyute/contrib/__init__.py,sha256=DgGgN0iBoMfHjzr9edz-ZgtJ2KeyIDlyV1t8hxhcM2c,209
|
|
69
|
+
impyute/contrib/compare.py,sha256=psWygE7PD4Cky4MoTJLAgjaEkTXwqemv3NTKkKVaapo,2852
|
|
70
|
+
impyute/contrib/count_missing.py,sha256=KkCpYeuDP5jRxCcURJbne_k-7Zq1k31oq2aFHpqygx8,742
|
|
71
|
+
impyute/contrib/describe.py,sha256=LI6Xage19-DEEDZVNqaaOjemRAVqVkSlH4mxjjubw-A,2033
|
|
72
|
+
impyute/cs/__init__.py,sha256=ArkuAVHm0z4jl3_UK_FUBF8i2vZUvAZpfVFyryUTm18,373
|
|
73
|
+
impyute/cs/buck_iterative.py,sha256=ceoQ2ij64pXHy7NszErTTRb9h2wctF0CMBexhc6YZSA,3009
|
|
74
|
+
impyute/cs/central_tendency.py,sha256=xHwTVeDL-N-O5refvP536FiVEDODR7s-cgX2Ao9LsfI,2127
|
|
75
|
+
impyute/cs/em.py,sha256=SwvVNMYK3llATR2r1baVCA0hy_fwJIkGD110du9Woos,1631
|
|
76
|
+
impyute/cs/fast_knn.py,sha256=CI4RKMcrbXPi7izUtWo-lnq1BuLYwl8bvbx62dj8Qn0,6246
|
|
77
|
+
impyute/cs/random.py,sha256=-Ti4BwMwUGdsciKTp8OVAgYK6TzGP-R4Nw62UY_xqAs,594
|
|
78
|
+
impyute/dataset/__init__.py,sha256=GK-ktaf86PLZOUlO_oOhvc-55SX1YFLcpjcpgf7a87c,192
|
|
79
|
+
impyute/dataset/base.py,sha256=bNYszhYP62fSeDe8w_Ij4yhEf-dv6NiNGcCdC22OkuY,4445
|
|
80
|
+
impyute/dataset/corrupt.py,sha256=uGCnpofJWfWZYBucQMrwT9yOSpEQAaZSG010Jv6ks1I,1491
|
|
81
|
+
impyute/deletion/__init__.py,sha256=ztemVB8Q6LRaxSv8DDdFzZlGUVUQbsVRA7gvcs_vMhM,124
|
|
82
|
+
impyute/deletion/complete_case.py,sha256=2xBhqNOSmEeW-sBQvl3TVJCcf2Ybmkr8HXNk9xrQ2J0,395
|
|
83
|
+
impyute/ops/__init__.py,sha256=ZP_rQQ0qATGEibUA9I8D1JfD1kRT2Xqw8SsOMPIddog,275
|
|
84
|
+
impyute/ops/error.py,sha256=lyXJcIDyfdrj_JJbsEI18z-t4UKi1mpLTyt3iS2YvTs,226
|
|
85
|
+
impyute/ops/inverse_distance_weighting.py,sha256=88hmyc-dWltP66MbjrW-VHBcVgPOI0V0BkrV30B-Lyc,1017
|
|
86
|
+
impyute/ops/matrix.py,sha256=-eniwqvbvwXNrYw8PmIMD95uMcPMq6HE6OBOyjzR-Vk,837
|
|
87
|
+
impyute/ops/testing.py,sha256=3ZRVL1sc5IIQLPW2GrD6-lF4_nVnSLo7SBW4rfaOMLA,415
|
|
88
|
+
impyute/ops/util.py,sha256=167Xpwib86kT09-pZnJg-VhyVD9r-z7Gkd_L42A2-EY,2838
|
|
89
|
+
impyute/ops/wrapper.py,sha256=bQj3r9WsZxqNw51y3OobyyXPPxCeISUv2jb-Ga2C_-M,6022
|
|
90
|
+
impyute/ts/__init__.py,sha256=veePCYSBfo1t5Ahh3wrVCNveizcniXZYhjyS7ahkIhI,145
|
|
91
|
+
impyute/ts/locf.py,sha256=qusLWfBrepKzFIkeGxIS0S5KnqRgZde2CV6mcF5nf4c,1876
|
|
92
|
+
impyute/ts/moving_window.py,sha256=G3Z4ohhqZXAVPflptuOhktwucbW3tg8FySb7gTtBraE,4991
|
|
93
|
+
missingpy/__init__.py,sha256=H_yX0CtfoG2UaJPRcoZKv0AFhi4n-1hHmfIwfqQXoPk,96
|
|
94
|
+
missingpy/knnimpute.py,sha256=WxniKg6-3pvwJa9mipiCZBqkRL_GGHUcxDu-7b57x3k,13456
|
|
95
|
+
missingpy/missforest.py,sha256=UYRf2YANPwxvRPruN3P6xlhJ5JPZB1x8o4xVhYfIp74,24696
|
|
96
|
+
missingpy/pairwise_external.py,sha256=XU_vW-rMCPD3aH9k0UrP_p90QUHDZqe3QZPoOQmu2vo,13221
|
|
97
|
+
missingpy/utils.py,sha256=sMU4OGHPvRgReT8e_HqGCYvjAJZkE_qlAPDzLo-_M6U,4360
|
|
98
|
+
missingpy/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
99
|
+
missingpy/tests/test_knnimpute.py,sha256=rR1OPZ4-IzcIKgWmSEN7UEPvVSRwJe0KU_wTxPr46k0,17527
|
|
100
|
+
missingpy/tests/test_missforest.py,sha256=5YfKZf_xdy9RcXmnlFz7cJDqdnspJDzzrxLnVVWJi3A,13725
|
|
101
|
+
cpgtools-2.0.5.dist-info/METADATA,sha256=96OWgsyt7bxriebSZX-_6ITD7eaKHcQy0n0p25zS-is,1756
|
|
102
|
+
cpgtools-2.0.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
103
|
+
cpgtools-2.0.5.dist-info/top_level.txt,sha256=6kK5IWs15OSnu-SKn2Y6tWI924eoNXNZRIu2xGFLj94,49
|
|
104
|
+
cpgtools-2.0.5.dist-info/RECORD,,
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Copyright (c) 2024 The Python Packaging Authority
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
5
|
+
in the Software without restriction, including without limitation the rights
|
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
8
|
+
furnished to do so, subject to the following conditions:
|
|
9
|
+
|
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
|
11
|
+
copies or substantial portions of the Software.
|
|
12
|
+
|
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
19
|
+
SOFTWARE.
|
impyute/__init__.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""impyute.contrib.compare.py"""
|
|
2
|
+
import importlib
|
|
3
|
+
from sklearn.model_selection import train_test_split
|
|
4
|
+
from sklearn.metrics import accuracy_score
|
|
5
|
+
# pylint: disable=too-many-locals, dangerous-default-value
|
|
6
|
+
|
|
7
|
+
def compare(imputed, classifiers=["sklearn.svm.SVC"], log_path=None):
    """
    Given an imputed dataset with labels and a list of supervised machine
    learning models, find the accuracy score of all model/imputation pairs.

    Parameters
    ----------
    imputed: [(str, np.ndarray), (str, np.ndarray)...]
        List of tuples containing (imputation_name, imputed_data) where
        `imputation_name` is a string and `imputed_data` is a tuple where
        `imputed_data`[0] is the data, X and `imputed_data`[1] is the label, y
    classifiers: [str, str...str] (optional)
        Dotted paths ('MODULE.SUBMODULE.FUNCTION') of classifiers to run the
        imputed data sets on.  Right now it ONLY works with sklearn-shaped
        classifiers (objects with a `fit` and `predict` method).  If
        providing a custom classifier, make sure to add the file location
        to sys.path first.
    log_path: str (optional)
        To write results to a file, provide a relative path

    Returns
    -------
    dict
        Maps each imputation name to a list of (classifier_name, accuracy)
        tuples, one per successfully loaded classifier.

    """
    # Resolve each dotted path to (name, class); unresolvable entries are
    # reported and skipped rather than aborting the whole comparison.
    clfs = []
    for clf_name in classifiers:
        mod_name, smod_name, fn_name = clf_name.split(".")
        try:
            mod = importlib.import_module("{}.{}".format(mod_name, smod_name))
            fn = getattr(mod, fn_name)
            clfs.append([fn_name, fn])
        except (ModuleNotFoundError, AttributeError):
            # Bug fix: a valid module with a bad attribute name used to
            # raise an uncaught AttributeError from getattr(); it is now
            # reported like a missing module, as intended.
            print("Cannot import '{}' from '{}.{}'".format(fn_name,
                                                           mod_name,
                                                           smod_name))

    results = {imputation_name: [] for imputation_name, _ in imputed}

    for imputation_name, data in imputed:
        X, y = data
        # Fixed split and seed so every imputation is scored on identical folds.
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=0.33,
                                                            random_state=0)
        print("Imputation {} =========".format(imputation_name))
        for clf_name, clf in clfs:
            clf = clf()
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            results[imputation_name].append((clf_name, accuracy))
            print("...{}".format(clf_name))

    # If not None, write to path
    if log_path:
        with open(log_path, 'w') as f:
            f.write(str(results))
        print("Results saved to {}".format(log_path))

    return results
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
""" impyute.contrib.count_missing.py """
|
|
2
|
+
import numpy as np
|
|
3
|
+
from impyute.ops import matrix
|
|
4
|
+
|
|
5
|
+
def count_missing(data):
    """ Calculate the total fraction of missing values and also the
    fraction attributable to each column.

    Parameters
    ----------
    data: np.array
        Data to impute.

    Returns
    -------
    dict
        Maps each column index that contains NaNs to the fraction of the
        WHOLE array's elements that are NaN in that column, plus a "total"
        key with the overall missing fraction.  Fractions are in [0, 1].

    """
    # data.size is the flattened element count without copying the array.
    size = data.size
    # (row, col) index pairs of every NaN value.
    nan_xy = matrix.nan_indices(data)
    # One accumulator per column that has at least one NaN.
    # (Removed a dead `np.unique(nan_xy)` call whose result was discarded.)
    counter = {y: 0. for y in np.unique(nan_xy.T[1])}
    # Each NaN contributes 1/size to its column's share; the denominator is
    # the full array size, not the column length.
    change_in_percentage = 1. / size
    for _, y in nan_xy:
        counter[y] += change_in_percentage
    counter["total"] = len(nan_xy) / size

    return counter
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
""" impyute.contrib.describe """
|
|
2
|
+
from impyute.ops import matrix
|
|
3
|
+
|
|
4
|
+
def describe(data):  # verbose=True):
    """ Summarize the missingness of a dataset.

    Eventually will be used instead of matrix.nan_indices everywhere.

    Parameters
    ----------
    data: numpy.ndarray
        The data you want to get a description from

    Returns
    -------
    dict
        nan_xy: list of tuples
            Indices of all null points
        nan_n: int
            Total number of null values
        pmissing_n: float
            Fraction of missing values in the dataset

    """
    nan_xy = matrix.nan_indices(data)
    nan_n = len(nan_xy)
    # Bug fix: the original computed `len(data.flatten)` -- `flatten` without
    # parentheses is a bound method object, so len() raised TypeError at
    # runtime.  `data.size` is the intended flattened element count.
    pmissing_n = float(nan_n) / data.size
    # TODO: extend with per-row/per-column stats (pmissing_rows, nan_cols,
    # mean_rows, std_dev, ...) as originally sketched.
    description = {"nan_xy": nan_xy,
                   "nan_n": nan_n,
                   "pmissing_n": pmissing_n}
    return description
|
impyute/cs/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
""" Imputations for cross-sectional data. """
|
|
2
|
+
|
|
3
|
+
from .random import random_impute
|
|
4
|
+
from .central_tendency import mean
|
|
5
|
+
from .central_tendency import mode
|
|
6
|
+
from .central_tendency import median
|
|
7
|
+
from .buck_iterative import buck_iterative
|
|
8
|
+
from .em import em
|
|
9
|
+
from .fast_knn import fast_knn
|
|
10
|
+
|
|
11
|
+
__all__ = ["random_impute", "mean", "mode", "median", "buck_iterative", "em", "fast_knn"]
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from sklearn.linear_model import LinearRegression
|
|
3
|
+
from impyute.ops import matrix
|
|
4
|
+
from impyute.ops import wrapper
|
|
5
|
+
# pylint: disable=too-many-locals
|
|
6
|
+
|
|
7
|
+
@wrapper.wrappers
@wrapper.checks
def buck_iterative(data):
    """ Iterative variant of buck's method

    - Variable to regress on is chosen at random.
    - EM type infinite regression loop stops after change in prediction from
      previous prediction < 10% for all columns with missing values

    A Method of Estimation of Missing Values in Multivariate Data Suitable for
    use with an Electronic Computer S. F. Buck Journal of the Royal Statistical
    Society. Series B (Methodological) Vol. 22, No. 2 (1960), pp. 302-306

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.

    NOTE(review): the body writes imputed values back into `data`
    (`data[x_i][y_i] = ...`), so the input array is presumably mutated in
    place unless the `wrapper` decorators copy it first — confirm.
    """
    # (row, col) index pairs of every NaN in the input.
    nan_xy = matrix.nan_indices(data)

    # Add a column of zeros to the index values, acting as a placeholder
    # for each missing value's current estimate.
    nan_xyz = np.append(nan_xy, np.zeros((np.shape(nan_xy)[0], 1)), axis=1)

    nan_xyz = [[int(x), int(y), v] for x, y, v in nan_xyz]
    temp = []
    # Columns that contain at least one missing value.
    cols_missing = {y for _, y, _ in nan_xyz}

    # Step 1: Simple Imputation, these are just placeholders
    for x_i, y_i, value in nan_xyz:
        # Column containing nan value without the nan value
        col = data[:, [y_i]][~np.isnan(data[:, [y_i]])]

        # Column mean as the initial estimate.
        new_value = np.mean(col)
        data[x_i][y_i] = new_value
        temp.append([x_i, y_i, new_value])
    nan_xyz = temp

    # Step 5: Repeat steps 2 - 4 until convergence (all missing entries have
    # a relative change below 10% since the last time their column was drawn).

    converged = [False] * len(nan_xyz)
    while not all(converged):
        # Step 2: Placeholders are set back to missing for one variable/column
        dependent_col = int(np.random.choice(list(cols_missing)))
        missing_xs = [int(x) for x, y, value in nan_xyz if y == dependent_col]

        # Step 3: Perform linear regression using the other variables,
        # training only on rows whose dependent-column value is not missing.
        x_train, y_train = [], []
        for x_i in (x_i for x_i in range(len(data)) if x_i not in missing_xs):
            x_train.append(np.delete(data[x_i], dependent_col))
            y_train.append(data[x_i][dependent_col])
        model = LinearRegression()
        model.fit(x_train, y_train)

        # Step 4: Missing values for the missing variable/column are replaced
        # with predictions from our new linear regression model
        # For null indices with the dependent column that was randomly chosen
        for i, z in enumerate(nan_xyz):
            x_i = z[0]
            y_i = z[1]
            value = data[x_i, y_i]
            if y_i == dependent_col:
                # Row 'x' without the nan value
                new_value = model.predict([np.delete(data[x_i], dependent_col)])
                data[x_i][y_i] = new_value.reshape(1, -1)
                # Relative change vs. the previous estimate; a small epsilon
                # denominator stands in when the old value was exactly 0.
                if value == 0.0:
                    delta = (new_value-value)/0.01
                else:
                    delta = (new_value-value)/value
                # Only entries in the drawn column are updated this pass;
                # other entries keep their previous convergence flag.
                converged[i] = abs(delta) < 0.1
    return data
|