cpgtools 1.12.0__py3-none-any.whl → 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cpgtools might be problematic.

Files changed (77)
  1. cpgmodule/_version.py +1 -0
  2. cpgmodule/data/__init__.py +0 -0
  3. cpgmodule/methylClock.py +53 -0
  4. cpgmodule/utils.py +38 -1
  5. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_aggregation.py +1 -1
  6. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_anno_position.py +1 -1
  7. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_anno_probe.py +6 -4
  8. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_density_gene_centered.py +1 -1
  9. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_chrom.py +1 -1
  10. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_gene_centered.py +1 -1
  11. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_region.py +1 -3
  12. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_logo.py +1 -1
  13. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_to_gene.py +1 -1
  14. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_PCA.py +31 -23
  15. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_UMAP.py +29 -22
  16. cpgtools-2.0.2.data/scripts/beta_imputation.py +604 -0
  17. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_jitter_plot.py +1 -1
  18. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_m_conversion.py +1 -1
  19. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_profile_gene_centered.py +1 -1
  20. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_profile_region.py +1 -1
  21. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_selectNBest.py +9 -6
  22. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_stacked_barplot.py +1 -1
  23. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_stats.py +1 -1
  24. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_tSNE.py +31 -24
  25. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_topN.py +1 -1
  26. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_trichotmize.py +1 -1
  27. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_Bayes.py +1 -1
  28. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_bb.py +1 -1
  29. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_fisher.py +1 -1
  30. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_glm.py +1 -1
  31. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_logit.py +1 -1
  32. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_nonparametric.py +1 -1
  33. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_ttest.py +3 -3
  34. cpgtools-2.0.2.data/scripts/predict_sex.py +126 -0
  35. cpgtools-2.0.2.dist-info/LICENSE +19 -0
  36. cpgtools-2.0.2.dist-info/METADATA +76 -0
  37. cpgtools-2.0.2.dist-info/RECORD +82 -0
  38. {cpgtools-1.12.0.dist-info → cpgtools-2.0.2.dist-info}/WHEEL +1 -1
  39. cpgtools-2.0.2.dist-info/top_level.txt +3 -0
  40. impyute/__init__.py +3 -0
  41. impyute/contrib/__init__.py +7 -0
  42. impyute/contrib/compare.py +69 -0
  43. impyute/contrib/count_missing.py +30 -0
  44. impyute/contrib/describe.py +63 -0
  45. impyute/cs/__init__.py +11 -0
  46. impyute/cs/buck_iterative.py +82 -0
  47. impyute/cs/central_tendency.py +84 -0
  48. impyute/cs/em.py +52 -0
  49. impyute/cs/fast_knn.py +130 -0
  50. impyute/cs/random.py +27 -0
  51. impyute/dataset/__init__.py +6 -0
  52. impyute/dataset/base.py +137 -0
  53. impyute/dataset/corrupt.py +55 -0
  54. impyute/deletion/__init__.py +5 -0
  55. impyute/deletion/complete_case.py +21 -0
  56. impyute/ops/__init__.py +12 -0
  57. impyute/ops/error.py +9 -0
  58. impyute/ops/inverse_distance_weighting.py +31 -0
  59. impyute/ops/matrix.py +47 -0
  60. impyute/ops/testing.py +20 -0
  61. impyute/ops/util.py +76 -0
  62. impyute/ops/wrapper.py +179 -0
  63. impyute/ts/__init__.py +6 -0
  64. impyute/ts/locf.py +57 -0
  65. impyute/ts/moving_window.py +128 -0
  66. missingpy/__init__.py +4 -0
  67. missingpy/knnimpute.py +328 -0
  68. missingpy/missforest.py +556 -0
  69. missingpy/pairwise_external.py +315 -0
  70. missingpy/tests/__init__.py +0 -0
  71. missingpy/tests/test_knnimpute.py +605 -0
  72. missingpy/tests/test_missforest.py +409 -0
  73. missingpy/utils.py +124 -0
  74. cpgtools-1.12.0.dist-info/LICENSE.txt +0 -674
  75. cpgtools-1.12.0.dist-info/METADATA +0 -30
  76. cpgtools-1.12.0.dist-info/RECORD +0 -43
  77. cpgtools-1.12.0.dist-info/top_level.txt +0 -2
{cpgtools-1.12.0.dist-info → cpgtools-2.0.2.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.34.2)
+ Generator: setuptools (75.1.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

cpgtools-2.0.2.dist-info/top_level.txt ADDED
@@ -0,0 +1,3 @@
+ cpgmodule
+ impyute
+ missingpy
impyute/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """ Imputations for cross-sectional and time-series data. """
+
+ __all__ = ["cs", "ts"]
impyute/contrib/__init__.py ADDED
@@ -0,0 +1,7 @@
+ """ Volatile code. Expect stuff in this to change. """
+
+ from .describe import describe
+ from .count_missing import count_missing
+ from .compare import compare
+
+ __all__ = ["describe", "count_missing", "compare"]
impyute/contrib/compare.py ADDED
@@ -0,0 +1,69 @@
+ """impyute.contrib.compare.py"""
+ import importlib
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import accuracy_score
+ # pylint: disable=too-many-locals, dangerous-default-value
+
+ def compare(imputed, classifiers=["sklearn.svm.SVC"], log_path=None):
+     """
+     Given an imputed dataset with labels and a list of supervised machine
+     learning models, find the accuracy score of all model/imputation pairs.
+
+     Parameters
+     ----------
+     imputed: [(str, np.ndarray), (str, np.ndarray)...]
+         List of tuples containing (imputation_name, imputed_data) where
+         `imputation_name` is a string and `imputed_data` is a tuple where
+         `imputed_data[0]` is the data, X, and `imputed_data[1]` is the label, y
+     classifiers: [str, str...str] (optional)
+         Provide a list of classifiers to run imputed data sets on. Right now,
+         it ONLY works with sklearn; the format should be like so:
+         `sklearn.SUBMODULE.FUNCTION`. More generally it is
+         'MODULE.SUBMODULE.FUNCTION'. If providing a custom classifier, make
+         sure to add the file location to sys.path first, and the classifier
+         should also be structured like sklearn (with a `fit` and `predict`
+         method).
+     log_path: str (optional)
+         To write results to a file, provide a relative path
+
+     Returns
+     -------
+     results.txt
+         Classification results on imputed data
+
+     """
+     clfs = []
+     for clf_name in classifiers:
+         mod_name, smod_name, fn_name = clf_name.split(".")
+         try:
+             mod = importlib.import_module("{}.{}".format(mod_name, smod_name))
+             fn = getattr(mod, fn_name)
+             clfs.append([fn_name, fn])
+         except ModuleNotFoundError:
+             print("Cannot import '{}' from '{}.{}'".format(fn_name,
+                                                            mod_name,
+                                                            smod_name))
+
+     results = {imputation_name: [] for imputation_name, _ in imputed}
+
+     for imputation_name, data in imputed:
+         X, y = data
+         X_train, X_test, y_train, y_test = train_test_split(X, y,
+                                                             test_size=0.33,
+                                                             random_state=0)
+         print("Imputation {} =========".format(imputation_name))
+         for clf_name, clf in clfs:
+             clf = clf()
+             clf.fit(X_train, y_train)
+             y_pred = clf.predict(X_test)
+             accuracy = accuracy_score(y_test, y_pred)
+             results[imputation_name].append((clf_name, accuracy))
+             print("...{}".format(clf_name))
+
+     # If not None, write to path
+     if log_path:
+         with open(log_path, 'w') as f:
+             f.write(str(results))
+         print("Results saved to {}".format(log_path))
+
+     return results
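
For reference, a minimal usage sketch of the new `compare` helper on hypothetical toy data, assuming the bundled `impyute` package and scikit-learn are importable and that the `impyute.cs` imputers fill NaNs in the array they are given, as the code above suggests:

    import numpy as np
    from impyute.cs import mean, fast_knn
    from impyute.contrib import compare

    # Hypothetical toy data: 100 samples, 4 features, ~10% of cells missing
    rng = np.random.RandomState(0)
    X = rng.rand(100, 4)
    X[rng.rand(100, 4) < 0.1] = np.nan
    y = (rng.rand(100) > 0.5).astype(int)

    # Impute the same matrix two ways; copies are passed since the
    # imputers write into their argument
    imputed = [
        ("mean", (mean(X.copy()), y)),
        ("fast_knn", (fast_knn(X.copy()), y)),
    ]

    # Accuracy of sklearn.svm.SVC on each imputed variant
    results = compare(imputed, classifiers=["sklearn.svm.SVC"])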
impyute/contrib/count_missing.py ADDED
@@ -0,0 +1,30 @@
+ """ impyute.contrib.count_missing.py """
+ import numpy as np
+ from impyute.ops import matrix
+
+ def count_missing(data):
+     """ Calculate the total percentage of missing values and also the
+     percentage in each column.
+
+     Parameters
+     ----------
+     data: np.array
+         Data to impute.
+
+     Returns
+     -------
+     dict
+         Percentage of missing values in total and in each column.
+
+     """
+     size = len(data.flatten())
+     nan_xy = matrix.nan_indices(data)
+     np.unique(nan_xy)
+     counter = {y: 0. for y in np.unique(nan_xy.T[1])}
+     change_in_percentage = 1./size
+     for _, y in nan_xy:
+         counter[y] += change_in_percentage
+     total_missing = len(nan_xy)/size
+     counter["total"] = total_missing
+
+     return counter
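
A small sketch of `count_missing` on a toy array (assuming the bundled `impyute.contrib` imports as shown): the returned dict maps each column index that has missing cells to its share of all cells, plus a "total" key.

    import numpy as np
    from impyute.contrib import count_missing

    data = np.array([[1.0, np.nan, 3.0],
                     [4.0, 5.0, np.nan],
                     [7.0, 8.0, 9.0]])
    # 2 of 9 cells are NaN: one in column 1, one in column 2
    print(count_missing(data))
    # -> roughly {1: 0.111, 2: 0.111, 'total': 0.222}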
impyute/contrib/describe.py ADDED
@@ -0,0 +1,63 @@
+ """ impyute.contrib.describe """
+ from impyute.ops import matrix
+
+ def describe(data):  # verbose=True):
+     """ Print input/output multiple times
+
+     Eventually will be used instead of matrix.nan_indices everywhere
+
+     Parameters
+     ----------
+     data: numpy.nd.array
+         The data you want to get a description from
+     verbose: boolean (optional)
+         Decides whether the description is short or long form
+
+     Returns
+     -------
+     dict
+         missingness: list
+             Confidence interval of data being MCAR, MAR or MNAR - in that order
+         nan_xy: list of tuples
+             Indices of all null points
+         nan_n: list
+             Total number of null values for each column
+         pmissing_n: float
+             Percentage of missing values in dataset
+         nan_rows: list
+             Indices of all rows that are completely null
+         nan_cols: list
+             Indices of all columns that are completely null
+         mean_rows: list
+             Mean value of each row
+         mean_cols: list
+             Mean value of each column
+         std_dev: list
+             std dev for each row/column
+         min_max: list
+             Finds the minimum and maximum for each row
+
+     """
+     # missingness = [0.33, 0.33, 0.33]  # find_missingness(data)
+     nan_xy = matrix.nan_indices(data)
+     nan_n = len(nan_xy)
+     pmissing_n = float(nan_n/len(data.flatten()))
+     # pmissing_rows = ""
+     # pmissing_cols = ""
+     # nan_rows = ""
+     # nan_cols = ""
+     # mean_rows = ""
+     # mean_cols = ""
+     # std_dev = ""
+     # "missingness": missingness,
+     description = {"nan_xy": nan_xy,
+                    "nan_n": nan_n,
+                    "pmissing_n": pmissing_n}
+     # "pmissing_rows": pmissing_rows,
+     # "pmissing_cols": pmissing_cols,
+     # "nan_rows": nan_rows,
+     # "nan_cols": nan_cols,
+     # "mean_rows": mean_rows,
+     # "mean_cols": mean_cols,
+     # "std_dev": std_dev}
+     return description
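
Given the keys actually populated above (`nan_xy`, `nan_n`, `pmissing_n`; the rest are still commented out), a usage sketch might look like:

    import numpy as np
    from impyute.contrib import describe

    data = np.array([[1.0, np.nan],
                     [3.0, 4.0]])
    info = describe(data)
    print(info["nan_n"])       # 1 missing cell
    print(info["pmissing_n"])  # 0.25 -> one of four cells is NaN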
impyute/cs/__init__.py ADDED
@@ -0,0 +1,11 @@
+ """ Imputations for cross-sectional data. """
+
+ from .random import random_impute
+ from .central_tendency import mean
+ from .central_tendency import mode
+ from .central_tendency import median
+ from .buck_iterative import buck_iterative
+ from .em import em
+ from .fast_knn import fast_knn
+
+ __all__ = ["random_impute", "mean", "mode", "median", "buck_iterative", "em", "fast_knn"]
impyute/cs/buck_iterative.py ADDED
@@ -0,0 +1,82 @@
+ import numpy as np
+ from sklearn.linear_model import LinearRegression
+ from impyute.ops import matrix
+ from impyute.ops import wrapper
+ # pylint: disable=too-many-locals
+
+ @wrapper.wrappers
+ @wrapper.checks
+ def buck_iterative(data):
+     """ Iterative variant of Buck's method
+
+     - Variable to regress on is chosen at random.
+     - EM-type infinite regression loop stops after change in prediction from
+       previous prediction < 10% for all columns with missing values
+
+     A Method of Estimation of Missing Values in Multivariate Data Suitable for
+     use with an Electronic Computer. S. F. Buck, Journal of the Royal Statistical
+     Society, Series B (Methodological), Vol. 22, No. 2 (1960), pp. 302-306
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+         Data to impute.
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     """
+     nan_xy = matrix.nan_indices(data)
+
+     # Add a column of zeros to the index values
+     nan_xyz = np.append(nan_xy, np.zeros((np.shape(nan_xy)[0], 1)), axis=1)
+
+     nan_xyz = [[int(x), int(y), v] for x, y, v in nan_xyz]
+     temp = []
+     cols_missing = {y for _, y, _ in nan_xyz}
+
+     # Step 1: Simple Imputation, these are just placeholders
+     for x_i, y_i, value in nan_xyz:
+         # Column containing nan value without the nan value
+         col = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
+
+         new_value = np.mean(col)
+         data[x_i][y_i] = new_value
+         temp.append([x_i, y_i, new_value])
+     nan_xyz = temp
+
+     # Step 5: Repeat steps 2 - 4 until convergence
+
+     converged = [False] * len(nan_xyz)
+     while not all(converged):
+         # Step 2: Placeholders are set back to missing for one variable/column
+         dependent_col = int(np.random.choice(list(cols_missing)))
+         missing_xs = [int(x) for x, y, value in nan_xyz if y == dependent_col]
+
+         # Step 3: Perform linear regression using the other variables
+         x_train, y_train = [], []
+         for x_i in (x_i for x_i in range(len(data)) if x_i not in missing_xs):
+             x_train.append(np.delete(data[x_i], dependent_col))
+             y_train.append(data[x_i][dependent_col])
+         model = LinearRegression()
+         model.fit(x_train, y_train)
+
+         # Step 4: Missing values for the missing variable/column are replaced
+         # with predictions from our new linear regression model
+         # For null indices with the dependent column that was randomly chosen
+         for i, z in enumerate(nan_xyz):
+             x_i = z[0]
+             y_i = z[1]
+             value = data[x_i, y_i]
+             if y_i == dependent_col:
+                 # Row 'x' without the nan value
+                 new_value = model.predict([np.delete(data[x_i], dependent_col)])
+                 data[x_i][y_i] = new_value.reshape(1, -1)
+                 if value == 0.0:
+                     delta = (new_value-value)/0.01
+                 else:
+                     delta = (new_value-value)/value
+                 converged[i] = abs(delta) < 0.1
+     return data
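
A usage sketch for `buck_iterative` (hypothetical data; assumes every column keeps some observed values, so each regression has training rows):

    import numpy as np
    from impyute.cs import buck_iterative

    rng = np.random.RandomState(42)
    data = rng.rand(50, 4)
    data[rng.rand(50, 4) < 0.1] = np.nan  # knock out ~10% of cells

    # Mean-impute placeholders, then re-predict each missing column by
    # linear regression until every prediction changes by < 10%
    completed = buck_iterative(data)
    assert not np.isnan(completed).any()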
impyute/cs/central_tendency.py ADDED
@@ -0,0 +1,84 @@
+ import numpy as np
+ from impyute.ops import matrix
+ from impyute.ops import wrapper
+
+ @wrapper.wrappers
+ @wrapper.checks
+ def mean(data):
+     """ Substitute missing values with the mean of that column.
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+         Data to impute.
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     """
+     nan_xy = matrix.nan_indices(data)
+     for x_i, y_i in nan_xy:
+         row_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
+         new_value = np.mean(row_wo_nan)
+         data[x_i][y_i] = new_value
+     return data
+
+ @wrapper.wrappers
+ @wrapper.checks
+ def median(data):
+     """ Substitute missing values with the median of that column (middle).
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+         Data to impute.
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     """
+     nan_xy = matrix.nan_indices(data)
+     cols_missing = set(nan_xy.T[1])
+     medians = {}
+     for y_i in cols_missing:
+         cols_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
+         median_y = np.median(cols_wo_nan)
+         medians[str(y_i)] = median_y
+     for x_i, y_i in nan_xy:
+         data[x_i][y_i] = medians[str(y_i)]
+     return data
+
+ @wrapper.wrappers
+ @wrapper.checks
+ def mode(data):
+     """ Substitute missing values with the mode of that column (most frequent).
+
+     In the case that there is a tie (there are multiple, most frequent values)
+     for a column, randomly pick one of them.
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+         Data to impute.
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     """
+     nan_xy = matrix.nan_indices(data)
+     modes = []
+     for y_i in range(np.shape(data)[1]):
+         unique_counts = np.unique(data[:, [y_i]], return_counts=True)
+         max_count = np.max(unique_counts[1])
+         mode_y = [unique for unique, count in np.transpose(unique_counts)
+                   if count == max_count and not np.isnan(unique)]
+         modes.append(mode_y)  # Appends index of column and column modes
+     for x_i, y_i in nan_xy:
+         data[x_i][y_i] = np.random.choice(modes[y_i])
+     return data
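
The three imputers share one pattern: locate NaN cells via `matrix.nan_indices`, then substitute a per-column statistic. A toy comparison of `mean` and `median` (copies passed, since the functions write into their argument):

    import numpy as np
    from impyute.cs import mean, median

    data = np.array([[1.0, 2.0],
                     [np.nan, 4.0],
                     [7.0, np.nan]])
    print(mean(data.copy()))    # column 0 NaN -> (1+7)/2 = 4.0; column 1 NaN -> (2+4)/2 = 3.0
    print(median(data.copy()))  # same values here: each column has only two observed entries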
impyute/cs/em.py ADDED
@@ -0,0 +1,52 @@
+ import numpy as np
+ from impyute.ops import matrix
+ from impyute.ops import wrapper
+
+ @wrapper.wrappers
+ @wrapper.checks
+ def em(data, eps=0.1):
+     """ Imputes given data using expectation maximization.
+
+     E-step: Calculates the expected complete data log likelihood ratio.
+     M-step: Finds the parameters that maximize the log likelihood of the
+     complete data.
+
+     Parameters
+     ----------
+     data: numpy.nd.array
+         Data to impute.
+     eps: float
+         The amount of minimum change between iterations to break; if relative change < eps, converge.
+         relative change = abs(current - previous) / previous
+     inplace: boolean
+         If True, operate on the numpy array reference
+
+     Returns
+     -------
+     numpy.nd.array
+         Imputed data.
+
+     """
+     nan_xy = matrix.nan_indices(data)
+     for x_i, y_i in nan_xy:
+         col = data[:, int(y_i)]
+         mu = col[~np.isnan(col)].mean()
+         std = col[~np.isnan(col)].std()
+         col[x_i] = np.random.normal(loc=mu, scale=std)
+         previous, i = 1, 1
+         while True:
+             i += 1
+             # Expectation
+             mu = col[~np.isnan(col)].mean()
+             std = col[~np.isnan(col)].std()
+             # Maximization
+             col[x_i] = np.random.normal(loc=mu, scale=std)
+             # Break out of loop if likelihood doesn't change at least 10%
+             # and has run at least 5 times
+             delta = np.abs(col[x_i]-previous)/previous
+             if i > 5 and delta < eps:
+                 data[x_i][y_i] = col[x_i]
+                 break
+             data[x_i][y_i] = col[x_i]
+             previous = col[x_i]
+     return data
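
A sketch of `em` on column-wise Gaussian data (hypothetical; per the loop above, each missing cell is redrawn from N(mu, std) of its column until the draw stabilizes to within `eps` relative change, after at least five iterations):

    import numpy as np
    from impyute.cs import em

    rng = np.random.RandomState(0)
    data = rng.normal(loc=5.0, scale=2.0, size=(200, 3))
    data[rng.rand(200, 3) < 0.05] = np.nan  # ~5% missing

    completed = em(data, eps=0.1)
    assert not np.isnan(completed).any()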
impyute/cs/fast_knn.py ADDED
@@ -0,0 +1,130 @@
+ import numpy as np
+ import pandas as pd
+ from scipy.spatial import KDTree
+ from impyute.ops import matrix
+ from impyute.ops import wrapper
+ from impyute.ops import inverse_distance_weighting as idw
+
+ from . import mean
+ # pylint: disable=too-many-arguments
+
+ @wrapper.wrappers
+ @wrapper.checks
+ def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10,
+              idw_fn=idw.shepards, init_impute_fn=mean):
+     """ Impute using a variant of the nearest neighbours approach
+
+     Basic idea: Impute array with a passed in initial impute fn (mean impute)
+     and then use the resulting complete array to construct a KDTree. Use this
+     KDTree to compute nearest neighbours. After finding `k` nearest
+     neighbours, take the weighted average of them. Basically, find the nearest
+     row in terms of distance
+
+     This approach is much, much faster than the other implementation (fit+transform
+     for each subset) which is almost prohibitively expensive.
+
+     Parameters
+     ----------
+     data: ndarray
+         2D matrix to impute.
+
+     k: int, optional
+         Parameter used for method querying the KDTree class object. Number of
+         neighbours used in the KNN query. Refer to the docs for
+         [`scipy.spatial.KDTree.query`]
+         (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+
+     eps: nonnegative float, optional
+         Parameter used for method querying the KDTree class object. From the
+         SciPy docs: "Return approximate nearest neighbors; the kth returned
+         value is guaranteed to be no further than (1+eps) times the distance to
+         the real kth nearest neighbor". Refer to the docs for
+         [`scipy.spatial.KDTree.query`]
+         (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+
+     p : float, 1<=p<=infinity, optional
+         Parameter used for method querying the KDTree class object. Straight from the
+         SciPy docs: "Which Minkowski p-norm to use. 1 is the
+         sum-of-absolute-values Manhattan distance, 2 is the usual Euclidean
+         distance, infinity is the maximum-coordinate-difference distance". Refer to
+         the docs for
+         [`scipy.spatial.KDTree.query`]
+         (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+
+     distance_upper_bound : nonnegative float, optional
+         Parameter used for method querying the KDTree class object. Straight
+         from the SciPy docs: "Return only neighbors within this distance. This
+         is used to prune tree searches, so if you are doing a series of
+         nearest-neighbor queries, it may help to supply the distance to the
+         nearest neighbor of the most recent point." Refer to the docs for
+         [`scipy.spatial.KDTree.query`]
+         (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+
+     leafsize: int, optional
+         Parameter used for construction of the `KDTree` class object. Straight from
+         the SciPy docs: "The number of points at which the algorithm switches
+         over to brute-force. Has to be positive". Refer to the docs for
+         [`scipy.spatial.KDTree`](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.KDTree.html)
+         for more information.
+
+     idw_fn: fn, optional
+         Function that takes one argument, a list of distances, and returns weighted percentages. You can define a custom
+         one or bootstrap from functions defined in `impy.util.inverse_distance_weighting` using
+         functools.partial, for example: `functools.partial(impy.util.inverse_distance_weighting.shepards, power=1)`
+
+     init_impute_fn: fn, optional
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     Examples
+     --------
+
+     >>> data = np.arange(25).reshape((5, 5)).astype(np.float)
+     >>> data[0][2] = np.nan
+     >>> data
+     array([[ 0.,  1., nan,  3.,  4.],
+            [ 5.,  6.,  7.,  8.,  9.],
+            [10., 11., 12., 13., 14.],
+            [15., 16., 17., 18., 19.],
+            [20., 21., 22., 23., 24.]])
+     >> fast_knn(data, k=1)  # Weighted average (by distance) of nearest 1 neighbour
+     array([[ 0.,  1.,  7.,  3.,  4.],
+            [ 5.,  6.,  7.,  8.,  9.],
+            [10., 11., 12., 13., 14.],
+            [15., 16., 17., 18., 19.],
+            [20., 21., 22., 23., 24.]])
+     >> fast_knn(data, k=2)  # Weighted average of nearest 2 neighbours
+     array([[ 0.        ,  1.        , 10.08608891,  3.        ,  4.        ],
+            [ 5.        ,  6.        ,  7.        ,  8.        ,  9.        ],
+            [10.        , 11.        , 12.        , 13.        , 14.        ],
+            [15.        , 16.        , 17.        , 18.        , 19.        ],
+            [20.        , 21.        , 22.        , 23.        , 24.        ]])
+     >> fast_knn(data, k=3)
+     array([[ 0.        ,  1.        , 13.40249283,  3.        ,  4.        ],
+            [ 5.        ,  6.        ,  7.        ,  8.        ,  9.        ],
+            [10.        , 11.        , 12.        , 13.        , 14.        ],
+            [15.        , 16.        , 17.        , 18.        , 19.        ],
+            [20.        , 21.        , 22.        , 23.        , 24.        ]])
+     >> fast_knn(data, k=5)  # There are at most only 4 neighbours. Raises error
+     ...
+     IndexError: index 5 is out of bounds for axis 0 with size 5
+
+     """
+     nan_xy = matrix.nan_indices(data)
+     data_c = init_impute_fn(data)
+     kdtree = KDTree(data_c, leafsize=leafsize)
+
+     for x_i, y_i in nan_xy:
+         distances, indices = kdtree.query(data_c[x_i], k=k+1, eps=eps,
+                                           p=p, distance_upper_bound=distance_upper_bound)
+         # Will always return itself in the first index. Delete it.
+         distances, indices = distances[1:], indices[1:]
+         # Add small constant to distances to avoid division by 0
+         distances += 1e-3
+         weights = idw_fn(distances)
+         # Assign missing value the weighted average of `k` nearest neighbours
+         data[x_i][y_i] = np.dot(weights, [data_c[ind][y_i] for ind in indices])
+     return data
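
Beyond the docstring examples, the `idw_fn` hook can be rebound with `functools.partial`, as the docstring itself suggests (a sketch, assuming the bundled `impyute.ops.inverse_distance_weighting.shepards` accepts a `power` argument as described above):

    import functools
    import numpy as np
    from impyute.cs import fast_knn
    from impyute.ops import inverse_distance_weighting as idw

    data = np.arange(25).reshape((5, 5)).astype(float)
    data[0][2] = np.nan

    # Nearest 2 rows, weighted by Shepard's method with power=1
    completed = fast_knn(data, k=2, idw_fn=functools.partial(idw.shepards, power=1))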
impyute/cs/random.py ADDED
@@ -0,0 +1,27 @@
+ import numpy as np
+ from impyute.ops import matrix
+ from impyute.ops import wrapper
+
+ @wrapper.wrappers
+ @wrapper.checks
+ def random_impute(data):
+     """ Fill missing values in with a randomly selected value from the same
+     column.
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+         Data to impute.
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     """
+     nan_xy = matrix.nan_indices(data)
+     for x, y in nan_xy:
+         uniques = np.unique(data[:, y])
+         uniques = uniques[~np.isnan(uniques)]
+         data[x][y] = np.random.choice(uniques)
+     return data
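
A toy sketch of `random_impute`: each NaN is replaced by a draw from the distinct observed values of its own column.

    import numpy as np
    from impyute.cs import random_impute

    np.random.seed(0)
    data = np.array([[1.0, 2.0],
                     [np.nan, 2.0],
                     [3.0, np.nan]])
    # Column 0 NaN -> drawn from {1.0, 3.0}; column 1 NaN -> drawn from {2.0}
    print(random_impute(data))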
impyute/dataset/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """ Real-world/mock datasets and missingness corruptors to experiment with. """
+ from .base import randu
+ from .base import randn
+ from .base import mnist
+
+ __all__ = ["randu", "randn", "mnist"]