cpgtools 1.12.0__py3-none-any.whl → 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cpgtools might be problematic. Click here for more details.

Files changed (77) hide show
  1. cpgmodule/_version.py +1 -0
  2. cpgmodule/data/__init__.py +0 -0
  3. cpgmodule/methylClock.py +53 -0
  4. cpgmodule/utils.py +38 -1
  5. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_aggregation.py +1 -1
  6. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_anno_position.py +1 -1
  7. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_anno_probe.py +6 -4
  8. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_density_gene_centered.py +1 -1
  9. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_chrom.py +1 -1
  10. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_gene_centered.py +1 -1
  11. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_distrb_region.py +1 -3
  12. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_logo.py +1 -1
  13. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/CpG_to_gene.py +1 -1
  14. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_PCA.py +31 -23
  15. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_UMAP.py +29 -22
  16. cpgtools-2.0.2.data/scripts/beta_imputation.py +604 -0
  17. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_jitter_plot.py +1 -1
  18. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_m_conversion.py +1 -1
  19. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_profile_gene_centered.py +1 -1
  20. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_profile_region.py +1 -1
  21. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_selectNBest.py +9 -6
  22. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_stacked_barplot.py +1 -1
  23. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_stats.py +1 -1
  24. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_tSNE.py +31 -24
  25. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_topN.py +1 -1
  26. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/beta_trichotmize.py +1 -1
  27. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_Bayes.py +1 -1
  28. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_bb.py +1 -1
  29. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_fisher.py +1 -1
  30. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_glm.py +1 -1
  31. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_logit.py +1 -1
  32. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_nonparametric.py +1 -1
  33. {cpgtools-1.12.0.data → cpgtools-2.0.2.data}/scripts/dmc_ttest.py +3 -3
  34. cpgtools-2.0.2.data/scripts/predict_sex.py +126 -0
  35. cpgtools-2.0.2.dist-info/LICENSE +19 -0
  36. cpgtools-2.0.2.dist-info/METADATA +76 -0
  37. cpgtools-2.0.2.dist-info/RECORD +82 -0
  38. {cpgtools-1.12.0.dist-info → cpgtools-2.0.2.dist-info}/WHEEL +1 -1
  39. cpgtools-2.0.2.dist-info/top_level.txt +3 -0
  40. impyute/__init__.py +3 -0
  41. impyute/contrib/__init__.py +7 -0
  42. impyute/contrib/compare.py +69 -0
  43. impyute/contrib/count_missing.py +30 -0
  44. impyute/contrib/describe.py +63 -0
  45. impyute/cs/__init__.py +11 -0
  46. impyute/cs/buck_iterative.py +82 -0
  47. impyute/cs/central_tendency.py +84 -0
  48. impyute/cs/em.py +52 -0
  49. impyute/cs/fast_knn.py +130 -0
  50. impyute/cs/random.py +27 -0
  51. impyute/dataset/__init__.py +6 -0
  52. impyute/dataset/base.py +137 -0
  53. impyute/dataset/corrupt.py +55 -0
  54. impyute/deletion/__init__.py +5 -0
  55. impyute/deletion/complete_case.py +21 -0
  56. impyute/ops/__init__.py +12 -0
  57. impyute/ops/error.py +9 -0
  58. impyute/ops/inverse_distance_weighting.py +31 -0
  59. impyute/ops/matrix.py +47 -0
  60. impyute/ops/testing.py +20 -0
  61. impyute/ops/util.py +76 -0
  62. impyute/ops/wrapper.py +179 -0
  63. impyute/ts/__init__.py +6 -0
  64. impyute/ts/locf.py +57 -0
  65. impyute/ts/moving_window.py +128 -0
  66. missingpy/__init__.py +4 -0
  67. missingpy/knnimpute.py +328 -0
  68. missingpy/missforest.py +556 -0
  69. missingpy/pairwise_external.py +315 -0
  70. missingpy/tests/__init__.py +0 -0
  71. missingpy/tests/test_knnimpute.py +605 -0
  72. missingpy/tests/test_missforest.py +409 -0
  73. missingpy/utils.py +124 -0
  74. cpgtools-1.12.0.dist-info/LICENSE.txt +0 -674
  75. cpgtools-1.12.0.dist-info/METADATA +0 -30
  76. cpgtools-1.12.0.dist-info/RECORD +0 -43
  77. cpgtools-1.12.0.dist-info/top_level.txt +0 -2
@@ -0,0 +1,137 @@
1
+ """ Shared functions to load/generate data """
2
+ import itertools
3
+ import math
4
+ import random
5
+ import string
6
+ import numpy as np
7
+ from impyute.dataset.corrupt import Corruptor
8
+ from impyute.ops import error
9
+
10
def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int"):
    """ Return a randomly generated dataset of uniformly distributed numbers
    between bound[0] and bound[1], with missing values injected.

    Parameters
    ----------
    bound: tuple (start, stop)
        Determines the range of values in the matrix. Index 0 for start
        value and index 1 for stop value. Start is inclusive, stop is
        exclusive.
    shape: tuple (optional)
        Size of the randomly generated data
    missingness: ('mcar', 'mar', 'mnar')
        Name of the `Corruptor` method used to inject the missing values
    thr: float between [0,1]
        Percentage of missing data in generated data
    dtype: ('int', 'float')
        Type of data. Integer samples are still stored as floats.

    Returns
    -------
    numpy.ndarray
        Whatever the selected `Corruptor` method returns.

    Raises
    ------
    impyute.ops.error.BadInputError
        If `dtype` is neither 'int' nor 'float'.
    """
    if dtype == "int":
        # drawn as integers, then cast to float before corruption
        data = np.random.randint(bound[0], bound[1], size=shape).astype(float)
    elif dtype == "float":
        data = np.random.uniform(bound[0], bound[1], size=shape)
    else:
        # previously fell through and failed later with UnboundLocalError
        raise error.BadInputError(
            "dtype must be either 'int' or 'float', got {!r}".format(dtype))
    corruptor = Corruptor(data, thr=thr)
    return getattr(corruptor, missingness)()
40
+
41
+
42
def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"):
    """ Return a randomly generated dataset of normally distributed numbers
    with the given mean and sigma, with missing values injected.

    Parameters
    ----------
    theta: tuple (mu, sigma)
        Mean and standard deviation of the normal distribution
    shape: tuple (optional)
        Size of the randomly generated data
    missingness: ('mcar', 'mar', 'mnar')
        Name of the `Corruptor` method used to inject the missing values
    thr: float between [0,1]
        Percentage of missing data in generated data
    dtype: ('int', 'float')
        'int' rounds the samples to whole numbers; any other value leaves
        them untouched.

    Returns
    -------
    numpy.ndarray
        Whatever the selected `Corruptor` method returns.
    """
    mu, spread = theta
    samples = np.random.normal(mu, spread, size=shape)
    if dtype == "int":
        samples = np.round(samples)
    inject = getattr(Corruptor(samples, thr=thr), missingness)
    return inject()
72
+
73
def randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2):
    """ Return a randomly generated dataset of uniformly distributed
    categorical data (lowercase alphabetic strings), with missing values
    injected.

    Parameters
    ----------
    nlevels: int
        Number of different categories in the dataset (at least 1)
    shape: tuple (optional)
        Size of the randomly generated data
    missingness: string in ('mcar', 'mar', 'mnar')
        Name of the `Corruptor` method used to inject the missing values
    thr: float between [0,1]
        Percentage of missing data in generated data

    Returns
    -------
    numpy.ndarray
        Whatever the selected `Corruptor` method returns.

    Raises
    ------
    impyute.ops.error.BadInputError
        If `nlevels` is smaller than 1 or larger than the number of cells.
    """
    if nlevels < 1:
        # math.log below is undefined for values <= 0
        raise error.BadInputError("nlevels must be a positive integer")
    if shape[0]*shape[1] < nlevels:
        raise error.BadInputError("nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape")

    letters = string.ascii_lowercase
    n_fold = int(math.floor(math.log(nlevels, len(letters))))
    cat_pool = list(letters)

    # when nlevel > 26, the alphabetical character is used up, need to
    # generate extra (multi-letter) strings as categorical data
    if n_fold > 0:
        for width in range(2, n_fold+2):
            cat_pool.extend(
                ''.join(combo) for combo in itertools.product(letters, repeat=width))
            if len(cat_pool) > nlevels:
                break

    cat = random.sample(cat_pool, nlevels)
    data = np.random.choice(cat, shape, replace=True)

    # redraw until every one of the nlevels categories appears at least once
    while len(np.unique(data)) != nlevels:
        data = np.random.choice(cat, shape, replace=True)

    # builtin `str` replaces the `np.str` alias that NumPy 1.24 removed
    corruptor = Corruptor(data, thr=thr, dtype=str)
    return getattr(corruptor, missingness)()
116
+
117
+
118
+
119
def mnist(missingness="mcar", thr=0.2):
    """ Loads corrupted MNIST

    Parameters
    ----------
    missingness: ('mcar', 'mar', 'mnar')
        Name of the `Corruptor` method used to inject the missing values
    thr: float between [0,1]
        Percentage of missing data in generated data

    Returns
    -------
    dict
        {"X": corrupted data, "Y": labels}
    """
    # `fetch_mldata` was removed from scikit-learn 0.22; prefer
    # `fetch_openml` and only fall back on installations that lack it.
    try:
        from sklearn.datasets import fetch_openml
    except ImportError:
        from sklearn.datasets import fetch_mldata
        dataset = fetch_mldata('MNIST original')
    else:
        dataset = fetch_openml('mnist_784', version=1, as_frame=False)
    corruptor = Corruptor(dataset.data, thr=thr)
    data = getattr(corruptor, missingness)()
    # NOTE(review): OpenML is believed to serve the labels as digit strings;
    # cast so "Y" stays numeric as with the old loader - confirm.
    target = np.asarray(dataset.target).astype(float)
    return {"X": data, "Y": target}
@@ -0,0 +1,55 @@
1
+ """ impyute.dataset.corrupt """
2
+ import numpy as np
3
+
4
+
5
class Corruptor:
    """ Adds missing values to a complete dataset.

    Attributes
    ----------
    data: np.ndarray
        Matrix of values with no NaN's that you want to add NaN's to,
        stored after casting to `dtype`.
    thr: float (optional)
        The percentage of null values you want in your dataset, a number
        between 0 and 1.
    dtype: numpy.dtype
        dtype of the array as it was passed in (before casting).
    shape: tuple
        Shape of the array that was passed in.

    Methods
    -------
    mcar()
        Overwrite values with MCAR placed NaN's.
    mar()
        Placeholder, not implemented (returns None).
    mnar()
        Placeholder, not implemented (returns None).
    complete()
        Return the data unchanged.

    """
    # The default used to be `np.float`, an alias removed in NumPy 1.24; it
    # is evaluated when the class body runs, so importing the module failed.
    def __init__(self, data, thr=0.2, dtype=float):
        self.dtype = data.dtype
        self.shape = np.shape(data)
        self.data = data.astype(dtype)
        self.thr = thr

    def mcar(self):
        """ Overwrites values with MCAR placed NaN's

        Returns a new array of the original shape in which
        int(thr * size) distinct, randomly chosen cells are set to NaN.
        """
        data_1d = self.data.flatten()  # flatten() copies, self.data is untouched
        n_total = len(data_1d)
        nan_x = np.random.choice(n_total,
                                 size=int(self.thr*n_total),
                                 replace=False)
        data_1d[nan_x] = np.nan
        return data_1d.reshape(self.shape)

    def mar(self):
        """ Overwrites values with MAR placed NaN's (not implemented, returns None) """
        pass

    def mnar(self):
        """ Overwrites values with MNAR placed NaN's (not implemented, returns None) """
        pass

    def complete(self):
        """ Do nothing to the data """
        return self.data
@@ -0,0 +1,5 @@
1
+ """ Missing data approaches that delete values. """
2
+
3
+ from .complete_case import complete_case
4
+
5
+ __all__ = ["complete_case"]
@@ -0,0 +1,21 @@
1
+ """ impyute.deletion.complete_case """
2
+ import numpy as np
3
+ from impyute.ops import wrapper
4
+
5
@wrapper.wrappers
@wrapper.checks
def complete_case(data):
    """ Drop every row that contains at least one NaN.

    Parameters
    ----------
    data: numpy.ndarray
        Data with missing values.

    Returns
    -------
    numpy.ndarray
        Only the rows of `data` in which all columns are present.

    """
    row_has_nan = np.isnan(data).any(axis=1)
    keep = np.logical_not(row_has_nan)
    return data[keep]
@@ -0,0 +1,12 @@
1
+ """ Unorganized set of utility functions """
2
+
3
+ from . import error
4
+ from . import inverse_distance_weighting
5
+ from . import matrix
6
+ from . import util
7
+ from . import wrapper
8
+
9
+ __all__ = [
10
+ "error", "inverse_distance_weighting", "matrix",
11
+ "util", "wrapper"
12
+ ]
impyute/ops/error.py ADDED
@@ -0,0 +1,9 @@
1
+ """ Impyute specific error messages """
2
+
3
class BadInputError(Exception):
    """ Error thrown when input args don't match spec """
6
+
7
class BadOutputError(Exception):
    """ Error thrown when outputs don't match spec """
@@ -0,0 +1,31 @@
1
+ """ Assign weights to distances in a way such that farther values are weighed less """
2
+ import numpy as np
3
+
4
def shepards(distances, power=2):
    """ Basic inverse distance weighting function

    Parameters
    ----------
    distances: list/numpy.ndarray
        1D list of numbers (ex. distance results from call to KDTree.query)

    power: int
        Default of 2 used since the referenced paper stated an exponent of 2 "gives seemingly
        satisfactory results"

    Returns
    -------
    numpy.ndarray
        1D list of numbers that sum to 1, represents weights of provided distances, in order.
        If one or more distances are exactly 0, the weight is shared equally
        between those entries and every other entry gets weight 0.

    References
    ----------

    Shepard, Donald (1968). "A two-dimensional interpolation function for irregularly-spaced data".
    Proceedings of the 1968 ACM National Conference. pp. 517-524. doi:10.1145/800186.810616
    """
    dist = np.asarray(distances, dtype=float)
    exact = dist == 0
    if exact.any():
        # 1/0 would give inf and inf/inf would turn the weights into NaN
        return to_percentage(exact.astype(float))
    return to_percentage(1/np.power(dist, power))

def to_percentage(vec):
    """ Converts list of real numbers into a list of percentages

    Each entry is divided by the sum of all entries.
    """
    return vec/np.sum(vec)
impyute/ops/matrix.py ADDED
@@ -0,0 +1,47 @@
1
+ """ Common operations on matrices
2
+
3
+ *Look into whether it's worth writing these in raw c*
4
+ """
5
+ import numpy as np
6
+
7
def nan_indices(data):
    """ Locate every missing value in `data`.

    Parameters
    ----------
    data: numpy.ndarray

    Returns
    -------
    numpy.ndarray
        One row per NaN entry holding its indices; (i, j) for 2D input.
    """
    missing_mask = np.isnan(data)
    return np.argwhere(missing_mask)
20
+
21
def map_nd(fn, arr):
    """ Apply a single-value function to every element of an n-dim array

    Parameters
    ----------
    fn: callable
        Function taking one value
    arr: numpy.ndarray

    Returns
    -------
    numpy.ndarray

    """
    elementwise = np.vectorize(fn)
    return elementwise(arr)
34
+
35
def every_nd(fn, arr):
    """ Returns bool, true if fn is true for all elements of arr

    Parameters
    ----------
    fn: callable
        Predicate taking one value
    arr: numpy.ndarray

    Returns
    -------
    bool

    """
    return all(fn(value) for value in arr.flatten())
impyute/ops/testing.py ADDED
@@ -0,0 +1,20 @@
1
+ """ Utilities used for unit tests """
2
+ import numpy as np
3
+
4
+
5
def return_na_check(data):
    """Test helper: assert that `data` is a numpy array and that it
    contains no NaN's.

    Parameters
    ----------
    data: numpy.ndarray
        Imputed data to verify.

    Returns
    -------
    None

    """
    is_array = isinstance(data, np.ndarray)
    assert is_array
    has_nan = np.isnan(data).any()
    assert not has_nan
impyute/ops/util.py ADDED
@@ -0,0 +1,76 @@
1
+ """ Random utility functions """
2
+ from functools import wraps
3
+ import numpy as np
4
+ import pandas as pd
5
+
6
+ # Things that get exposed from * import
7
+ __all__ = [
8
+ "constantly", "complement", "identity", "thread",
9
+ "execute_fn_with_args_and_or_kwargs", "toy_df",
10
+ "insert_na",
11
+ ]
12
+
13
def thread(arg, *fns):
    """ Pipe `arg` through each function of `fns` in order and return the
    final result; with no functions `arg` is returned unchanged.
    """
    value = arg
    for step in fns:
        value = step(value)
    return value
18
+
19
def identity(x):
    """ Return the argument unchanged """
    return x
21
+
22
def constantly(x):
    """ Build a function that ignores whatever it is called with and
    always returns x
    """
    def func(*_args, **_kwargs):
        return x
    return func
27
+
28
def complement(fn):
    """ Build a predicate that returns the logical negation of whatever
    the input function returns for the same arguments
    """
    @wraps(fn)
    def wrapper(*args, **kwargs):
        outcome = fn(*args, **kwargs)
        return not outcome
    return wrapper
36
+
37
def execute_fn_with_args_and_or_kwargs(fn, args, kwargs):
    """ Call fn with args and kwargs; if its signature does not accept
    them, call it with args only.

    The decision is made by binding the arguments against fn's signature
    instead of catching TypeError around the call, so a TypeError raised
    *inside* fn propagates and fn is never executed twice.

    Parameters
    ----------
    fn: callable
    args: sequence
        Positional arguments
    kwargs: dict
        Keyword arguments; all of them are dropped if any is not accepted

    Returns
    -------
    Whatever fn returns
    """
    import inspect

    try:
        # follow_wrapped=False: functools.wraps sets __wrapped__, and the
        # signature of a decorated function must be the decorator's own
        # (*args, **kwargs), not the innermost function's.
        signature = inspect.signature(fn, follow_wrapped=False)
    except (TypeError, ValueError):
        # no introspectable signature: keep the call-and-retry strategy
        try:
            return fn(*args, **kwargs)
        except TypeError:
            return fn(*args)

    try:
        signature.bind(*args, **kwargs)
    except TypeError:
        return fn(*args)
    return fn(*args, **kwargs)
43
+
44
def toy_df(nrow, ncol, n_miss, sample_prefix, seed):
    """
    Generate a toy dataframe (nrow x ncol) for testing purposes: random
    values between 0 and 1 with some cells set to NaN.

    The global numpy random state is seeded with `seed`. Row and column
    positions of the missing cells are drawn independently with
    replacement, so fewer than n_miss distinct cells may end up NaN.
    Columns are named '<sample_prefix>_<i>'.
    """
    np.random.seed(seed)
    values = np.random.rand(nrow*ncol).reshape((nrow, ncol)).astype(float)
    rows = np.random.choice(nrow, n_miss)
    cols = np.random.choice(ncol, n_miss)
    values[rows, cols] = np.nan
    labels = []
    for idx in range(ncol):
        labels.append(sample_prefix + '_' + str(idx))
    return pd.DataFrame(values, columns=labels)
58
+
59
def insert_na(df, n_miss, seed):
    """
    Return a copy of `df` in which `n_miss` additional, randomly chosen
    non-NaN cells are set to NaN. The input dataframe is left untouched.

    Parameters
    ----------
    df: pandas.DataFrame
        Numeric data
    n_miss: int
        Number of cells to blank. If it is not smaller than the number of
        non-NaN cells, every cell of the result is NaN.
    seed: int
        Seed for the global numpy random state

    Returns
    -------
    pandas.DataFrame
        Float dataframe with the index and columns of `df`
    """
    np.random.seed(seed)
    nrow, ncol = df.shape
    # explicit float copy: a bare to_numpy() can return a view, which made
    # the loop below write NaN's into the caller's dataframe
    values = df.to_numpy(dtype=float, copy=True)
    available = int(np.count_nonzero(~np.isnan(values)))
    if n_miss >= available:
        # nothing would be left; also avoids sampling forever when fewer
        # than n_miss non-NaN cells exist
        values[:] = np.nan
    else:
        na_count = 0
        while na_count < n_miss:
            x_ind = np.random.choice(nrow)
            y_ind = np.random.choice(ncol)
            # only count cells that were not already missing
            if not np.isnan(values[x_ind, y_ind]):
                values[x_ind, y_ind] = np.nan
                na_count += 1
    return pd.DataFrame(values, index=df.index, columns=df.columns)
impyute/ops/wrapper.py ADDED
@@ -0,0 +1,179 @@
1
+ """ Decorator functions to wrap around entry and exit
2
+
3
+ ... to easily apply to a function, functions that check/process inputs
4
+ and outputs
5
+ """
6
+ from functools import wraps
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ from . import error
11
+ from . import matrix
12
+ from . import util as u
13
+
14
## Hacky way to handle python2 not having `ModuleNotFoundError`:
## probe for the builtin and define a stand-in only when it is missing
# pylint: disable=redefined-builtin, missing-docstring
try:
    ModuleNotFoundError
except NameError:
    class ModuleNotFoundError(Exception):
        pass
# pylint: enable=redefined-builtin, missing-docstring
24
+
25
+
26
def handle_df(fn):
    """ Decorator that lets an array function accept a pandas Dataframe

    The first positional arg is copied unless `inplace` is truthy. If it
    is a pandas dataframe it is converted to a numpy array before the
    wrapped function runs, and the result is cast back to a Dataframe
    with the original index and columns afterwards. Anything else is
    passed through unchanged.
    """
    @wraps(fn)
    def wrapper(*args, **kwargs):
        ## work on a list so the first argument can be swapped out
        call_args = list(args)
        payload = call_args[0]
        ## Either make a copy or keep pointing at the original
        if not kwargs.get('inplace'):
            payload = payload.copy()

        ## Remember the labels of a dataframe and hand a bare array on
        frame_labels = None
        if isinstance(payload, pd.DataFrame):
            frame_labels = (payload.index, payload.columns)
            payload = payload.to_numpy()
        call_args[0] = payload

        ## function invocation
        output = u.execute_fn_with_args_and_or_kwargs(fn, call_args, kwargs)

        ## cast the output back to a DataFrame if that is what came in
        if frame_labels is None:
            return output
        row_labels, col_labels = frame_labels
        return pd.DataFrame(output, index=row_labels, columns=col_labels)
    return wrapper
60
+
61
def add_inplace_option(fn):
    """ Decorator for inplace option

    Functions wrapped by this can be given an `inplace` kwarg: when it is
    truthy the first positional arg is passed on as-is (a reference),
    otherwise a copy of it is passed on. """
    @wraps(fn)
    def wrapper(*args, **kwargs):
        ## work on a list so the first argument can be swapped out
        call_args = list(args)
        first = call_args[0]
        if not kwargs.get('inplace'):
            first = first.copy()
        call_args[0] = first

        ## function invocation
        return u.execute_fn_with_args_and_or_kwargs(fn, call_args, kwargs)
    return wrapper
80
+
81
def conform_output(fn):
    """ Decorator to handle impossible values

    Adds two optional kwargs, `coerce_fn` and `valid_fn`.

    `valid_fn` function stub

        def my_valid_fn(some_literal) -> boolean

    `coerce_fn` function stub

        def my_coerce_fn(arr, x_i, y_i) -> some_literal

    The valid function is run on each element of the result; it
    indicates whether that value is valid or not. By default every
    value is valid.

    The coerce function takes three arguments, the result matrix and the
    two indices of the invalid value, x_i and y_i. It is run on every
    invalid value and its return value replaces that value. By default
    it raises BadOutputError.
    """
    @wraps(fn)
    def wrapper(*args, **kwargs):
        def _reject(arr, row, col):
            raise error.BadOutputError("{} does not conform".format(arr[row, col]))

        # predicate deciding whether a value is acceptable
        is_valid = kwargs.get("valid_fn", u.constantly(True))
        # replacement strategy for values that are not
        fix_value = kwargs.get("coerce_fn", _reject)

        ## function invocation
        output = u.execute_fn_with_args_and_or_kwargs(fn, list(args), kwargs)

        # flag every element the predicate turns down
        flagged = matrix.map_nd(u.complement(is_valid), output)
        # and coerce each of them in place
        for row, col in np.argwhere(flagged):
            output[row, col] = fix_value(output, row, col)

        return output
    return wrapper
126
+
127
def wrappers(fn):
    """ Helper decorator applying all wrapper functions that modify input
    (matrix with missing values) and output (matrix with imputed values)

    NOTE: `handle_df` has to be the outermost layer (first entry point)
    since every other wrapper assumes it is getting an np.array as input
    """
    with_inplace = add_inplace_option(fn)      # allow choosing reference/copy
    with_conform = conform_output(with_inplace)  # allow enforcing a spec on outputs
    return handle_df(with_conform)             # df -> np.array on in, back on out
141
+
142
+ def _shape_2d(data):
143
+ """ True if array is 2D"""
144
+ return len(np.shape(data)) == 2
145
+
146
+ def _shape_3d(data):
147
+ """ True if array is 3D"""
148
+ return len(np.shape(data)) == 3
149
+
150
+ def _is_ndarray(data):
151
+ """ True if the array is an instance of numpy's ndarray"""
152
+ return isinstance(data, np.ndarray)
153
+
154
+ def _dtype_float(data):
155
+ """ True if the values in the array are floating point"""
156
+ return data.dtype == float
157
+
158
def _nan_exists(data):
    """ True if there is at least one np.nan in the array"""
    return len(matrix.nan_indices(data)) > 0
162
+
163
def checks(fn):
    """ Decorator validating the first positional argument before `fn` runs

    Raises
    ------
    error.BadInputError
        If the data is not 2D, not a np.ndarray, not of float dtype or
        contains no NaN's. The checks run in that order.
    """
    @wraps(fn)
    def wrapper(*args, **kwargs):
        data = args[0]
        # The former separate "Not a 2D array." branch repeated this exact
        # test and could never be reached, so it has been dropped.
        if not _shape_2d(data):
            raise error.BadInputError("No support for arrays that aren't 2D yet.")
        if not _is_ndarray(data):
            raise error.BadInputError("Not a np.ndarray.")
        if not _dtype_float(data):
            raise error.BadInputError("Data is not float.")
        if not _nan_exists(data):
            raise error.BadInputError("No NaN's in given data")
        return u.execute_fn_with_args_and_or_kwargs(fn, args, kwargs)
    return wrapper
impyute/ts/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """ Imputations for time-series data. """
2
+
3
+ from .locf import locf
4
+ from .moving_window import moving_window
5
+
6
+ __all__ = ["locf", "moving_window"]
impyute/ts/locf.py ADDED
@@ -0,0 +1,57 @@
1
+ import numpy as np
2
+ from impyute.ops import matrix
3
+ from impyute.ops import wrapper
4
+ from impyute.ops import error
5
+
6
@wrapper.wrappers
@wrapper.checks
def locf(data, axis=0):
    """ Last Observation Carried Forward

    For each missing value, use the value of one row before (same
    column). In the case that the missing value is in the first row, look
    ahead instead: the first non-NaN value found further down the column
    fills every row before it.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.
    axis: boolean (optional)
        0: the array is transposed before filling and transposed back
           before it is returned.
        1: the array is filled as given.

    Returns
    -------
    numpy.ndarray
        Imputed data, in the same orientation as the input.

    Raises
    ------
    error.BadInputError
        If `axis` is neither 0 nor 1, or if a whole column is NaN.

    """
    if axis == 0:
        data = np.transpose(data)
    elif axis == 1:
        pass
    else:
        raise error.BadInputError("Error: Axis value is invalid, please use either 0 (row format) or 1 (column format)")

    n_rows = np.shape(data)[0]
    # Indices come back ordered by row, so by the time a row is handled the
    # row above it has already been filled.
    for x_i, y_i in matrix.nan_indices(data):
        # Simplest scenario, look one row back
        if x_i > 0:
            data[x_i, y_i] = data[x_i-1, y_i]
            continue
        # First row: look forward, up to and including the last row (the
        # previous range stopped one row short and reported an all-NaN
        # column when only the last row held a value)
        for offset in range(1, n_rows):
            if not np.isnan(data[offset, y_i]):
                data[:offset, y_i] = data[offset, y_i]
                break
        else:
            # BadInputError subclasses Exception, so existing handlers still match
            raise error.BadInputError("Error: Entire Column is NaN")

    # Undo the transpose so the output has the same shape as the input;
    # previously axis=0 returned the transposed array.
    return np.transpose(data) if axis == 0 else data