cpgtools 2.0.0__py3-none-any.whl → 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cpgtools might be problematic.
- cpgmodule/_version.py +1 -0
- cpgmodule/utils.py +35 -0
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_aggregation.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_anno_position.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_anno_probe.py +1 -2
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_density_gene_centered.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_chrom.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_gene_centered.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_region.py +1 -3
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_logo.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/CpG_to_gene.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_PCA.py +31 -23
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_UMAP.py +29 -22
- cpgtools-2.0.2.data/scripts/beta_imputation.py +604 -0
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_jitter_plot.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_m_conversion.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_profile_gene_centered.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_profile_region.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_selectNBest.py +9 -6
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_stacked_barplot.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_stats.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_tSNE.py +31 -24
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_topN.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/beta_trichotmize.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_Bayes.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_bb.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_fisher.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_glm.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_logit.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_nonparametric.py +1 -1
- {cpgtools-2.0.0.data → cpgtools-2.0.2.data}/scripts/dmc_ttest.py +1 -1
- cpgtools-2.0.2.data/scripts/predict_sex.py +126 -0
- cpgtools-2.0.2.dist-info/LICENSE +19 -0
- cpgtools-2.0.2.dist-info/METADATA +76 -0
- cpgtools-2.0.2.dist-info/RECORD +82 -0
- {cpgtools-2.0.0.dist-info → cpgtools-2.0.2.dist-info}/WHEEL +1 -1
- cpgtools-2.0.2.dist-info/top_level.txt +3 -0
- impyute/__init__.py +3 -0
- impyute/contrib/__init__.py +7 -0
- impyute/contrib/compare.py +69 -0
- impyute/contrib/count_missing.py +30 -0
- impyute/contrib/describe.py +63 -0
- impyute/cs/__init__.py +11 -0
- impyute/cs/buck_iterative.py +82 -0
- impyute/cs/central_tendency.py +84 -0
- impyute/cs/em.py +52 -0
- impyute/cs/fast_knn.py +130 -0
- impyute/cs/random.py +27 -0
- impyute/dataset/__init__.py +6 -0
- impyute/dataset/base.py +137 -0
- impyute/dataset/corrupt.py +55 -0
- impyute/deletion/__init__.py +5 -0
- impyute/deletion/complete_case.py +21 -0
- impyute/ops/__init__.py +12 -0
- impyute/ops/error.py +9 -0
- impyute/ops/inverse_distance_weighting.py +31 -0
- impyute/ops/matrix.py +47 -0
- impyute/ops/testing.py +20 -0
- impyute/ops/util.py +76 -0
- impyute/ops/wrapper.py +179 -0
- impyute/ts/__init__.py +6 -0
- impyute/ts/locf.py +57 -0
- impyute/ts/moving_window.py +128 -0
- missingpy/__init__.py +4 -0
- missingpy/knnimpute.py +328 -0
- missingpy/missforest.py +556 -0
- missingpy/pairwise_external.py +315 -0
- missingpy/tests/__init__.py +0 -0
- missingpy/tests/test_knnimpute.py +605 -0
- missingpy/tests/test_missforest.py +409 -0
- missingpy/utils.py +124 -0
- cpgmodule/data/AltumAge_cpg.pkl +0 -0
- cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
- cpgmodule/data/AltumAge_scaler.pkl +0 -0
- cpgmodule/data/GA_Bohlin.pkl +0 -0
- cpgmodule/data/GA_Haftorn.pkl +0 -0
- cpgmodule/data/GA_Knight.pkl +0 -0
- cpgmodule/data/GA_Lee_CPC.pkl +0 -0
- cpgmodule/data/GA_Lee_RPC.pkl +0 -0
- cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
- cpgmodule/data/GA_Mayne.pkl +0 -0
- cpgmodule/data/Hannum.pkl +0 -0
- cpgmodule/data/Horvath_2013.pkl +0 -0
- cpgmodule/data/Horvath_2018.pkl +0 -0
- cpgmodule/data/Levine.pkl +0 -0
- cpgmodule/data/Lu_DNAmTL.pkl +0 -0
- cpgmodule/data/Ped_McEwen.pkl +0 -0
- cpgmodule/data/Ped_Wu.pkl +0 -0
- cpgmodule/data/Zhang_BLUP.pkl +0 -0
- cpgmodule/data/Zhang_EN.pkl +0 -0
- cpgtools-2.0.0.dist-info/LICENSE.txt +0 -674
- cpgtools-2.0.0.dist-info/METADATA +0 -28
- cpgtools-2.0.0.dist-info/RECORD +0 -64
- cpgtools-2.0.0.dist-info/top_level.txt +0 -2
impyute/contrib/compare.py
ADDED
@@ -0,0 +1,69 @@
+"""impyute.contrib.compare.py"""
+import importlib
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score
+# pylint: disable=too-many-locals, dangerous-default-value
+
+def compare(imputed, classifiers=["sklearn.svm.SVC"], log_path=None):
+    """
+    Given an imputed dataset with labels and a list of supervised machine
+    learning models, find the accuracy score of all model/imputation pairs.
+
+    Parameters
+    ----------
+    imputed: [(str, np.ndarray), (str, np.ndarray)...]
+        List of tuples containing (imputation_name, imputed_data) where
+        `imputation_name` is a string and `imputed_data` is a tuple where
+        `imputed_data[0]` is the data, X, and `imputed_data[1]` is the label, y
+    classifiers: [str, str...str] (optional)
+        Provide a list of classifiers to run imputed data sets on. Right now,
+        it ONLY works with sklearn, the format should be like so:
+        `sklearn.SUBMODULE.FUNCTION`. More generally it's
+        'MODULE.SUBMODULE.FUNCTION'. If providing a custom classifier, make
+        sure to add the file location to sys.path first and the classifier
+        should also be structured like sklearn (with a `fit` and `predict`
+        method).
+    log_path: str (optional)
+        To write results to a file, provide a relative path
+
+    Returns
+    -------
+    dict
+        Classification results on imputed data
+
+    """
+    clfs = []
+    for clf_name in classifiers:
+        mod_name, smod_name, fn_name = clf_name.split(".")
+        try:
+            mod = importlib.import_module("{}.{}".format(mod_name, smod_name))
+            fn = getattr(mod, fn_name)
+            clfs.append([fn_name, fn])
+        except ModuleNotFoundError:
+            print("Cannot import '{}' from '{}.{}'".format(fn_name,
+                                                           mod_name,
+                                                           smod_name))
+
+    results = {imputation_name: [] for imputation_name, _ in imputed}
+
+    for imputation_name, data in imputed:
+        X, y = data
+        X_train, X_test, y_train, y_test = train_test_split(X, y,
+                                                            test_size=0.33,
+                                                            random_state=0)
+        print("Imputation {} =========".format(imputation_name))
+        for clf_name, clf in clfs:
+            clf = clf()
+            clf.fit(X_train, y_train)
+            y_pred = clf.predict(X_test)
+            accuracy = accuracy_score(y_test, y_pred)
+            results[imputation_name].append((clf_name, accuracy))
+            print("...{}".format(clf_name))
+
+    # If not None, write to path
+    if log_path:
+        with open(log_path, 'w') as f:
+            f.write(str(results))
+        print("Results saved to {}".format(log_path))
+
+    return results
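A minimal sketch of how compare() might be driven, inferred only from the signature and docstring above. The arrays here are synthetic stand-ins rather than the output of a real imputation run, and the classifier string follows the 'MODULE.SUBMODULE.FUNCTION' convention the docstring describes:

import numpy as np
from impyute.contrib.compare import compare

# Fabricated example data; in practice X would come from an imputation run.
X = np.random.rand(100, 5)
y = (X[:, 0] > 0.5).astype(int)

imputed = [
    ("mean_imputed", (X, y)),   # (name, (X, y)) pairs, per the docstring
    ("knn_imputed", (X, y)),
]
results = compare(imputed, classifiers=["sklearn.svm.SVC"], log_path="results.txt")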
impyute/contrib/count_missing.py
ADDED
@@ -0,0 +1,30 @@
+""" impyute.contrib.count_missing.py """
+import numpy as np
+from impyute.ops import matrix
+
+def count_missing(data):
+    """ Calculate the total percentage of missing values and also the
+    percentage in each column.
+
+    Parameters
+    ----------
+    data: np.array
+        Data to impute.
+
+    Returns
+    -------
+    dict
+        Percentage of missing values in total and in each column.
+
+    """
+    size = len(data.flatten())
+    nan_xy = matrix.nan_indices(data)
+    np.unique(nan_xy)
+    counter = {y: 0. for y in np.unique(nan_xy.T[1])}
+    change_in_percentage = 1./size
+    for _, y in nan_xy:
+        counter[y] += change_in_percentage
+    total_missing = len(nan_xy)/size
+    counter["total"] = total_missing
+
+    return counter
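A short usage sketch for count_missing(). It assumes matrix.nan_indices() returns (row, column) index pairs for every NaN, which matches how it is used above (impyute/ops/matrix.py is added in this release but its source is not shown here):

import numpy as np
from impyute.contrib.count_missing import count_missing

data = np.arange(25, dtype=float).reshape(5, 5)
data[0, 2] = np.nan
data[3, 2] = np.nan
# Two of 25 cells are missing, both in column 2,
# so the result is roughly {2: 0.08, 'total': 0.08}
print(count_missing(data))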
impyute/contrib/describe.py
ADDED
@@ -0,0 +1,63 @@
+""" impyute.contrib.describe """
+from impyute.ops import matrix
+
+def describe(data):  # verbose=True):
+    """ Print input/output multiple times
+
+    Eventually will be used instead of matrix.nan_indices everywhere
+
+    Parameters
+    ----------
+    data: numpy.nd.array
+        The data you want to get a description from
+    verbose: boolean (optional)
+        Decides whether the description is short or long form
+
+    Returns
+    -------
+    dict
+        missingness: list
+            Confidence interval of data being MCAR, MAR or MNAR - in that order
+        nan_xy: list of tuples
+            Indices of all null points
+        nan_n: list
+            Total number of null values for each column
+        pmissing_n: float
+            Percentage of missing values in dataset
+        nan_rows: list
+            Indices of all rows that are completely null
+        nan_cols: list
+            Indices of all columns that are completely null
+        mean_rows: list
+            Mean value of each row
+        mean_cols: list
+            Mean value of each column
+        std_dev: list
+            std dev for each row/column
+        min_max: list
+            Finds the minimum and maximum for each row
+
+    """
+    # missingness = [0.33, 0.33, 0.33]  # find_missingness(data)
+    nan_xy = matrix.nan_indices(data)
+    nan_n = len(nan_xy)
+    pmissing_n = float(nan_n/len(data.flatten()))
+    # pmissing_rows = ""
+    # pmissing_cols = ""
+    # nan_rows = ""
+    # nan_cols = ""
+    # mean_rows = ""
+    # mean_cols = ""
+    # std_dev = ""
+    # "missingness": missingness,
+    description = {"nan_xy": nan_xy,
+                   "nan_n": nan_n,
+                   "pmissing_n": pmissing_n}
+    # "pmissing_rows": pmissing_rows,
+    # "pmissing_cols": pmissing_cols,
+    # "nan_rows": nan_rows,
+    # "nan_cols": nan_cols,
+    # "mean_rows": mean_rows,
+    # "mean_cols": mean_cols,
+    # "std_dev": std_dev}
+    return description
impyute/cs/__init__.py
ADDED
@@ -0,0 +1,11 @@
+""" Imputations for cross-sectional data. """
+
+from .random import random_impute
+from .central_tendency import mean
+from .central_tendency import mode
+from .central_tendency import median
+from .buck_iterative import buck_iterative
+from .em import em
+from .fast_knn import fast_knn
+
+__all__ = ["random_impute", "mean", "mode", "median", "buck_iterative", "em", "fast_knn"]
impyute/cs/buck_iterative.py
ADDED
@@ -0,0 +1,82 @@
+import numpy as np
+from sklearn.linear_model import LinearRegression
+from impyute.ops import matrix
+from impyute.ops import wrapper
+# pylint: disable=too-many-locals
+
+@wrapper.wrappers
+@wrapper.checks
+def buck_iterative(data):
+    """ Iterative variant of Buck's method
+
+    - Variable to regress on is chosen at random.
+    - EM type infinite regression loop stops after change in prediction from
+      previous prediction < 10% for all columns with missing values
+
+    A Method of Estimation of Missing Values in Multivariate Data Suitable for
+    use with an Electronic Computer. S. F. Buck, Journal of the Royal Statistical
+    Society, Series B (Methodological), Vol. 22, No. 2 (1960), pp. 302-306
+
+    Parameters
+    ----------
+    data: numpy.ndarray
+        Data to impute.
+
+    Returns
+    -------
+    numpy.ndarray
+        Imputed data.
+
+    """
+    nan_xy = matrix.nan_indices(data)
+
+    # Add a column of zeros to the index values
+    nan_xyz = np.append(nan_xy, np.zeros((np.shape(nan_xy)[0], 1)), axis=1)
+
+    nan_xyz = [[int(x), int(y), v] for x, y, v in nan_xyz]
+    temp = []
+    cols_missing = {y for _, y, _ in nan_xyz}
+
+    # Step 1: Simple Imputation, these are just placeholders
+    for x_i, y_i, value in nan_xyz:
+        # Column containing nan value without the nan value
+        col = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
+
+        new_value = np.mean(col)
+        data[x_i][y_i] = new_value
+        temp.append([x_i, y_i, new_value])
+    nan_xyz = temp
+
+    # Step 5: Repeat steps 2 - 4 until convergence
+
+    converged = [False] * len(nan_xyz)
+    while not all(converged):
+        # Step 2: Placeholders are set back to missing for one variable/column
+        dependent_col = int(np.random.choice(list(cols_missing)))
+        missing_xs = [int(x) for x, y, value in nan_xyz if y == dependent_col]
+
+        # Step 3: Perform linear regression using the other variables
+        x_train, y_train = [], []
+        for x_i in (x_i for x_i in range(len(data)) if x_i not in missing_xs):
+            x_train.append(np.delete(data[x_i], dependent_col))
+            y_train.append(data[x_i][dependent_col])
+        model = LinearRegression()
+        model.fit(x_train, y_train)
+
+        # Step 4: Missing values for the missing variable/column are replaced
+        # with predictions from our new linear regression model
+        # For null indices with the dependent column that was randomly chosen
+        for i, z in enumerate(nan_xyz):
+            x_i = z[0]
+            y_i = z[1]
+            value = data[x_i, y_i]
+            if y_i == dependent_col:
+                # Row 'x' without the nan value
+                new_value = model.predict([np.delete(data[x_i], dependent_col)])
+                data[x_i][y_i] = new_value.reshape(1, -1)
+                if value == 0.0:
+                    delta = (new_value-value)/0.01
+                else:
+                    delta = (new_value-value)/value
+                converged[i] = abs(delta) < 0.1
+    return data
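A hedged usage sketch for buck_iterative(). The function mutates and returns its input array, and the regression order is randomized, so the fill varies between runs; it also assumes the wrapper decorators (impyute/ops/wrapper.py, added in this release but not shown here) accept a plain 2-D float ndarray:

import numpy as np
from impyute.cs.buck_iterative import buck_iterative

rng = np.random.default_rng(0)
data = rng.normal(size=(50, 4))
data[5, 1] = np.nan
data[12, 3] = np.nan
imputed = buck_iterative(data)  # regression-based fill; stochastic, varies per run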
impyute/cs/central_tendency.py
ADDED
@@ -0,0 +1,84 @@
+import numpy as np
+from impyute.ops import matrix
+from impyute.ops import wrapper
+
+@wrapper.wrappers
+@wrapper.checks
+def mean(data):
+    """ Substitute missing values with the mean of that column.
+
+    Parameters
+    ----------
+    data: numpy.ndarray
+        Data to impute.
+
+    Returns
+    -------
+    numpy.ndarray
+        Imputed data.
+
+    """
+    nan_xy = matrix.nan_indices(data)
+    for x_i, y_i in nan_xy:
+        row_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
+        new_value = np.mean(row_wo_nan)
+        data[x_i][y_i] = new_value
+    return data
+
+@wrapper.wrappers
+@wrapper.checks
+def median(data):
+    """ Substitute missing values with the median of that column (middle).
+
+    Parameters
+    ----------
+    data: numpy.ndarray
+        Data to impute.
+
+    Returns
+    -------
+    numpy.ndarray
+        Imputed data.
+
+    """
+    nan_xy = matrix.nan_indices(data)
+    cols_missing = set(nan_xy.T[1])
+    medians = {}
+    for y_i in cols_missing:
+        cols_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
+        median_y = np.median(cols_wo_nan)
+        medians[str(y_i)] = median_y
+    for x_i, y_i in nan_xy:
+        data[x_i][y_i] = medians[str(y_i)]
+    return data
+
+@wrapper.wrappers
+@wrapper.checks
+def mode(data):
+    """ Substitute missing values with the mode of that column (most frequent).
+
+    In the case that there is a tie (there are multiple, most frequent values)
+    for a column randomly pick one of them.
+
+    Parameters
+    ----------
+    data: numpy.ndarray
+        Data to impute.
+
+    Returns
+    -------
+    numpy.ndarray
+        Imputed data.
+
+    """
+    nan_xy = matrix.nan_indices(data)
+    modes = []
+    for y_i in range(np.shape(data)[1]):
+        unique_counts = np.unique(data[:, [y_i]], return_counts=True)
+        max_count = np.max(unique_counts[1])
+        mode_y = [unique for unique, count in np.transpose(unique_counts)
+                  if count == max_count and not np.isnan(unique)]
+        modes.append(mode_y)  # Appends index of column and column modes
+    for x_i, y_i in nan_xy:
+        data[x_i][y_i] = np.random.choice(modes[y_i])
+    return data
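A quick sketch of the three central-tendency imputers above. Since each mutates its argument in place (and the wrapper decorators from impyute/ops/wrapper.py are not shown here), the copies below keep the comparisons independent:

import numpy as np
from impyute.cs.central_tendency import mean, median, mode

data = np.array([[1.0, 2.0],
                 [np.nan, 4.0],
                 [3.0, np.nan]])
print(mean(data.copy()))    # column 0 nan -> 2.0, column 1 nan -> 3.0
print(median(data.copy()))  # same values here: each column has two observed entries
print(mode(data.copy()))    # all counts tie, so a random observed value is chosen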
impyute/cs/em.py
ADDED
@@ -0,0 +1,52 @@
+import numpy as np
+from impyute.ops import matrix
+from impyute.ops import wrapper
+
+@wrapper.wrappers
+@wrapper.checks
+def em(data, eps=0.1):
+    """ Imputes given data using expectation maximization.
+
+    E-step: Calculates the expected complete data log likelihood ratio.
+    M-step: Finds the parameters that maximize the log likelihood of the
+    complete data.
+
+    Parameters
+    ----------
+    data: numpy.nd.array
+        Data to impute.
+    eps: float
+        Minimum change between iterations required to break; if relative change < eps, converge.
+        relative change = abs(current - previous) / previous
+    inplace: boolean
+        If True, operate on the numpy array reference
+
+    Returns
+    -------
+    numpy.nd.array
+        Imputed data.
+
+    """
+    nan_xy = matrix.nan_indices(data)
+    for x_i, y_i in nan_xy:
+        col = data[:, int(y_i)]
+        mu = col[~np.isnan(col)].mean()
+        std = col[~np.isnan(col)].std()
+        col[x_i] = np.random.normal(loc=mu, scale=std)
+        previous, i = 1, 1
+        while True:
+            i += 1
+            # Expectation
+            mu = col[~np.isnan(col)].mean()
+            std = col[~np.isnan(col)].std()
+            # Maximization
+            col[x_i] = np.random.normal(loc=mu, scale=std)
+            # Break out of loop if likelihood doesn't change at least 10%
+            # and has run at least 5 times
+            delta = np.abs(col[x_i]-previous)/previous
+            if i > 5 and delta < eps:
+                data[x_i][y_i] = col[x_i]
+                break
+            data[x_i][y_i] = col[x_i]
+            previous = col[x_i]
+    return data
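A usage sketch for em(). Each missing cell is repeatedly redrawn from a normal distribution fitted to the observed values of its column until the draw stabilizes, so the fill is stochastic:

import numpy as np
from impyute.cs.em import em

rng = np.random.default_rng(1)
data = rng.normal(loc=10.0, scale=2.0, size=(200, 3))
data[7, 0] = np.nan
filled = em(data, eps=0.1)  # nan redrawn from N(mu, std) of its column until stable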
impyute/cs/fast_knn.py
ADDED
@@ -0,0 +1,130 @@
+import numpy as np
+import pandas as pd
+from scipy.spatial import KDTree
+from impyute.ops import matrix
+from impyute.ops import wrapper
+from impyute.ops import inverse_distance_weighting as idw
+
+from . import mean
+# pylint: disable=too-many-arguments
+
+@wrapper.wrappers
+@wrapper.checks
+def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10,
+             idw_fn=idw.shepards, init_impute_fn=mean):
+    """ Impute using a variant of the nearest neighbours approach
+
+    Basic idea: Impute array with a passed in initial impute fn (mean impute)
+    and then use the resulting complete array to construct a KDTree. Use this
+    KDTree to compute nearest neighbours. After finding `k` nearest
+    neighbours, take the weighted average of them. Basically, find the nearest
+    row in terms of distance
+
+    This approach is much, much faster than the other implementation (fit+transform
+    for each subset) which is almost prohibitively expensive.
+
+    Parameters
+    ----------
+    data: ndarray
+        2D matrix to impute.
+
+    k: int, optional
+        Parameter used for method querying the KDTree class object. Number of
+        neighbours used in the KNN query. Refer to the docs for
+        [`scipy.spatial.KDTree.query`]
+        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+
+    eps: nonnegative float, optional
+        Parameter used for method querying the KDTree class object. From the
+        SciPy docs: "Return approximate nearest neighbors; the kth returned
+        value is guaranteed to be no further than (1+eps) times the distance to
+        the real kth nearest neighbor". Refer to the docs for
+        [`scipy.spatial.KDTree.query`]
+        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+
+    p : float, 1<=p<=infinity, optional
+        Parameter used for method querying the KDTree class object. Straight from the
+        SciPy docs: "Which Minkowski p-norm to use. 1 is the
+        sum-of-absolute-values Manhattan distance 2 is the usual Euclidean
+        distance infinity is the maximum-coordinate-difference distance". Refer to
+        the docs for
+        [`scipy.spatial.KDTree.query`]
+        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+
+    distance_upper_bound : nonnegative float, optional
+        Parameter used for method querying the KDTree class object. Straight
+        from the SciPy docs: "Return only neighbors within this distance. This
+        is used to prune tree searches, so if you are doing a series of
+        nearest-neighbor queries, it may help to supply the distance to the
+        nearest neighbor of the most recent point." Refer to the docs for
+        [`scipy.spatial.KDTree.query`]
+        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+
+    leafsize: int, optional
+        Parameter used for construction of the `KDTree` class object. Straight from
+        the SciPy docs: "The number of points at which the algorithm switches
+        over to brute-force. Has to be positive". Refer to the docs for
+        [`scipy.spatial.KDTree`](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.KDTree.html)
+        for more information.
+
+    idw_fn: fn, optional
+        Function that takes one argument, a list of distances, and returns weighted percentages. You can define a custom
+        one or bootstrap from functions defined in `impy.util.inverse_distance_weighting`, adapted using
+        functools.partial, for example: `functools.partial(impy.util.inverse_distance_weighting.shepards, power=1)`
+
+    init_impute_fn: fn, optional
+
+    Returns
+    -------
+    numpy.ndarray
+        Imputed data.
+
+    Examples
+    --------
+
+    >>> data = np.arange(25).reshape((5, 5)).astype(float)
+    >>> data[0][2] = np.nan
+    >>> data
+    array([[ 0.,  1., nan,  3.,  4.],
+           [ 5.,  6.,  7.,  8.,  9.],
+           [10., 11., 12., 13., 14.],
+           [15., 16., 17., 18., 19.],
+           [20., 21., 22., 23., 24.]])
+    >> fast_knn(data, k=1)  # Weighted average (by distance) of nearest 1 neighbour
+    array([[ 0.,  1.,  7.,  3.,  4.],
+           [ 5.,  6.,  7.,  8.,  9.],
+           [10., 11., 12., 13., 14.],
+           [15., 16., 17., 18., 19.],
+           [20., 21., 22., 23., 24.]])
+    >> fast_knn(data, k=2)  # Weighted average of nearest 2 neighbours
+    array([[ 0.        ,  1.        , 10.08608891,  3.        ,  4.        ],
+           [ 5.        ,  6.        ,  7.        ,  8.        ,  9.        ],
+           [10.        , 11.        , 12.        , 13.        , 14.        ],
+           [15.        , 16.        , 17.        , 18.        , 19.        ],
+           [20.        , 21.        , 22.        , 23.        , 24.        ]])
+    >> fast_knn(data, k=3)
+    array([[ 0.        ,  1.        , 13.40249283,  3.        ,  4.        ],
+           [ 5.        ,  6.        ,  7.        ,  8.        ,  9.        ],
+           [10.        , 11.        , 12.        , 13.        , 14.        ],
+           [15.        , 16.        , 17.        , 18.        , 19.        ],
+           [20.        , 21.        , 22.        , 23.        , 24.        ]])
+    >> fast_knn(data, k=5)  # There are at most only 4 neighbours. Raises error
+    ...
+    IndexError: index 5 is out of bounds for axis 0 with size 5
+
+    """
+    nan_xy = matrix.nan_indices(data)
+    data_c = init_impute_fn(data)
+    kdtree = KDTree(data_c, leafsize=leafsize)
+
+    for x_i, y_i in nan_xy:
+        distances, indices = kdtree.query(data_c[x_i], k=k+1, eps=eps,
+                                          p=p, distance_upper_bound=distance_upper_bound)
+        # Will always return itself in the first index. Delete it.
+        distances, indices = distances[1:], indices[1:]
+        # Add small constant to distances to avoid division by 0
+        distances += 1e-3
+        weights = idw_fn(distances)
+        # Assign missing value the weighted average of `k` nearest neighbours
+        data[x_i][y_i] = np.dot(weights, [data_c[ind][y_i] for ind in indices])
+    return data
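The docstring above suggests binding a different power into the inverse-distance weighting via functools.partial. A sketch of that, assuming shepards in impyute/ops/inverse_distance_weighting.py (added in this release but not shown here) takes a power keyword as the docstring implies:

import functools
import numpy as np
from impyute.cs.fast_knn import fast_knn
from impyute.ops import inverse_distance_weighting as idw

data = np.arange(25, dtype=float).reshape(5, 5)
data[0, 2] = np.nan

# `power=1` is assumed from the docstring; the idw source is not in this diff.
linear_idw = functools.partial(idw.shepards, power=1)
print(fast_knn(data, k=2, idw_fn=linear_idw))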
impyute/cs/random.py
ADDED
@@ -0,0 +1,27 @@
+import numpy as np
+from impyute.ops import matrix
+from impyute.ops import wrapper
+
+@wrapper.wrappers
+@wrapper.checks
+def random_impute(data):
+    """ Fill missing values in with a randomly selected value from the same
+    column.
+
+    Parameters
+    ----------
+    data: numpy.ndarray
+        Data to impute.
+
+    Returns
+    -------
+    numpy.ndarray
+        Imputed data.
+
+    """
+    nan_xy = matrix.nan_indices(data)
+    for x, y in nan_xy:
+        uniques = np.unique(data[:, y])
+        uniques = uniques[~np.isnan(uniques)]
+        data[x][y] = np.random.choice(uniques)
+    return data
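Finally, a usage sketch for random_impute(); the fill is drawn uniformly from the observed unique values of the same column:

import numpy as np
from impyute.cs.random import random_impute

data = np.array([[1.0, 2.0],
                 [np.nan, 4.0],
                 [3.0, 6.0]])
print(random_impute(data))  # the nan becomes 1.0 or 3.0, chosen at random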