cpgtools-2.0.5-py3-none-any.whl

Files changed (104)
  1. cpgmodule/BED.py +441 -0
  2. cpgmodule/MI.py +193 -0
  3. cpgmodule/__init__.py +0 -0
  4. cpgmodule/_version.py +1 -0
  5. cpgmodule/cgID.py +866897 -0
  6. cpgmodule/data/AltumAge_cpg.pkl +0 -0
  7. cpgmodule/data/AltumAge_multi_platform_cpgs.pkl +0 -0
  8. cpgmodule/data/AltumAge_scaler.pkl +0 -0
  9. cpgmodule/data/GA_Bohlin.pkl +0 -0
  10. cpgmodule/data/GA_Haftorn.pkl +0 -0
  11. cpgmodule/data/GA_Knight.pkl +0 -0
  12. cpgmodule/data/GA_Lee_CPC.pkl +0 -0
  13. cpgmodule/data/GA_Lee_RPC.pkl +0 -0
  14. cpgmodule/data/GA_Lee_refined_RPC.pkl +0 -0
  15. cpgmodule/data/GA_Mayne.pkl +0 -0
  16. cpgmodule/data/Hannum.pkl +0 -0
  17. cpgmodule/data/Horvath_2013.pkl +0 -0
  18. cpgmodule/data/Horvath_2018.pkl +0 -0
  19. cpgmodule/data/Levine.pkl +0 -0
  20. cpgmodule/data/Lu_DNAmTL.pkl +0 -0
  21. cpgmodule/data/Ped_McEwen.pkl +0 -0
  22. cpgmodule/data/Ped_Wu.pkl +0 -0
  23. cpgmodule/data/Zhang_BLUP.pkl +0 -0
  24. cpgmodule/data/Zhang_EN.pkl +0 -0
  25. cpgmodule/data/__init__.py +0 -0
  26. cpgmodule/extend_bed.py +147 -0
  27. cpgmodule/imotif.py +348 -0
  28. cpgmodule/ireader.py +28 -0
  29. cpgmodule/methylClock.py +53 -0
  30. cpgmodule/padjust.py +58 -0
  31. cpgmodule/region2gene.py +170 -0
  32. cpgmodule/utils.py +642 -0
  33. cpgtools-2.0.5.data/scripts/CpG_aggregation.py +238 -0
  34. cpgtools-2.0.5.data/scripts/CpG_anno_position.py +156 -0
  35. cpgtools-2.0.5.data/scripts/CpG_anno_probe.py +112 -0
  36. cpgtools-2.0.5.data/scripts/CpG_density_gene_centered.py +107 -0
  37. cpgtools-2.0.5.data/scripts/CpG_distrb_chrom.py +154 -0
  38. cpgtools-2.0.5.data/scripts/CpG_distrb_gene_centered.py +193 -0
  39. cpgtools-2.0.5.data/scripts/CpG_distrb_region.py +146 -0
  40. cpgtools-2.0.5.data/scripts/CpG_logo.py +134 -0
  41. cpgtools-2.0.5.data/scripts/CpG_to_gene.py +141 -0
  42. cpgtools-2.0.5.data/scripts/beta_PCA.py +188 -0
  43. cpgtools-2.0.5.data/scripts/beta_UMAP.py +181 -0
  44. cpgtools-2.0.5.data/scripts/beta_combat.py +174 -0
  45. cpgtools-2.0.5.data/scripts/beta_jitter_plot.py +107 -0
  46. cpgtools-2.0.5.data/scripts/beta_m_conversion.py +105 -0
  47. cpgtools-2.0.5.data/scripts/beta_profile_gene_centered.py +165 -0
  48. cpgtools-2.0.5.data/scripts/beta_profile_region.py +152 -0
  49. cpgtools-2.0.5.data/scripts/beta_selectNBest.py +116 -0
  50. cpgtools-2.0.5.data/scripts/beta_stacked_barplot.py +119 -0
  51. cpgtools-2.0.5.data/scripts/beta_stats.py +101 -0
  52. cpgtools-2.0.5.data/scripts/beta_tSNE.py +179 -0
  53. cpgtools-2.0.5.data/scripts/beta_topN.py +99 -0
  54. cpgtools-2.0.5.data/scripts/beta_trichotmize.py +190 -0
  55. cpgtools-2.0.5.data/scripts/dmc_Bayes.py +442 -0
  56. cpgtools-2.0.5.data/scripts/dmc_bb.py +221 -0
  57. cpgtools-2.0.5.data/scripts/dmc_fisher.py +161 -0
  58. cpgtools-2.0.5.data/scripts/dmc_glm.py +191 -0
  59. cpgtools-2.0.5.data/scripts/dmc_logit.py +226 -0
  60. cpgtools-2.0.5.data/scripts/dmc_nonparametric.py +176 -0
  61. cpgtools-2.0.5.data/scripts/dmc_ttest.py +222 -0
  62. cpgtools-2.0.5.data/scripts/predict_missing.py +673 -0
  63. cpgtools-2.0.5.data/scripts/predict_sex.py +126 -0
  64. cpgtools-2.0.5.dist-info/METADATA +59 -0
  65. cpgtools-2.0.5.dist-info/RECORD +104 -0
  66. cpgtools-2.0.5.dist-info/WHEEL +5 -0
  67. cpgtools-2.0.5.dist-info/licenses/LICENSE.txt +19 -0
  68. cpgtools-2.0.5.dist-info/top_level.txt +5 -0
  69. impyute/__init__.py +3 -0
  70. impyute/contrib/__init__.py +7 -0
  71. impyute/contrib/compare.py +69 -0
  72. impyute/contrib/count_missing.py +30 -0
  73. impyute/contrib/describe.py +63 -0
  74. impyute/cs/__init__.py +11 -0
  75. impyute/cs/buck_iterative.py +82 -0
  76. impyute/cs/central_tendency.py +84 -0
  77. impyute/cs/em.py +52 -0
  78. impyute/cs/fast_knn.py +130 -0
  79. impyute/cs/random.py +27 -0
  80. impyute/dataset/__init__.py +6 -0
  81. impyute/dataset/base.py +137 -0
  82. impyute/dataset/corrupt.py +55 -0
  83. impyute/deletion/__init__.py +5 -0
  84. impyute/deletion/complete_case.py +21 -0
  85. impyute/ops/__init__.py +12 -0
  86. impyute/ops/error.py +9 -0
  87. impyute/ops/inverse_distance_weighting.py +31 -0
  88. impyute/ops/matrix.py +47 -0
  89. impyute/ops/testing.py +20 -0
  90. impyute/ops/util.py +96 -0
  91. impyute/ops/wrapper.py +179 -0
  92. impyute/ts/__init__.py +6 -0
  93. impyute/ts/locf.py +57 -0
  94. impyute/ts/moving_window.py +128 -0
  95. impyutelib.py +890 -0
  96. missingpy/__init__.py +4 -0
  97. missingpy/knnimpute.py +328 -0
  98. missingpy/missforest.py +556 -0
  99. missingpy/pairwise_external.py +315 -0
  100. missingpy/tests/__init__.py +0 -0
  101. missingpy/tests/test_knnimpute.py +605 -0
  102. missingpy/tests/test_missforest.py +409 -0
  103. missingpy/utils.py +124 -0
  104. misspylib.py +565 -0
impyutelib.py ADDED
@@ -0,0 +1,890 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Created on Tue Oct 8 12:01:45 2024
+ Adapted and modified from impyute.
+ """
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.linear_model import LinearRegression
+ from scipy.spatial import KDTree
+ from functools import wraps
+
+ ## Common operations on matrices
+
+ def nan_indices(data):
+     """ Finds the indices of all missing values.
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+
+     Returns
+     -------
+     numpy.ndarray
+         One (i, j) row per missing value.
+     """
+     return np.argwhere(np.isnan(data))
+
+ def map_nd(fn, arr):
+     """ Maps a scalar function over every element of an n-dimensional array.
+
+     Parameters
+     ----------
+     fn: callable
+     arr: numpy.ndarray
+
+     Returns
+     -------
+     numpy.ndarray
+     """
+     return np.vectorize(fn)(arr)
+
+ def every_nd(fn, arr):
+     """ Returns True if fn is true for all elements of arr.
+
+     Parameters
+     ----------
+     fn: callable
+     arr: numpy.ndarray
+
+     Returns
+     -------
+     bool
+     """
+     return all(map(fn, arr.flatten()))
+
+
+ ## Util
+
+ def thread(arg, *fns):
+     """ Threads arg through the given functions, left to right. """
+     if len(fns) > 0:
+         return thread(fns[0](arg), *fns[1:])
+     else:
+         return arg
+
+ def identity(x):
+     return x
+
+ def constantly(x):
+     """ Returns a function that takes any args and returns x """
+     def func(*args, **kwargs):
+         return x
+     return func
+
+ def complement(fn):
+     """ Returns a function that outputs the opposite truth values of the
+     input function
+     """
+     @wraps(fn)
+     def wrapper(*args, **kwargs):
+         return not fn(*args, **kwargs)
+     return wrapper
+
+ def execute_fn_with_args_and_or_kwargs(fn, args, kwargs):
+     """ If args + kwargs aren't accepted, only args are passed in """
+     try:
+         return fn(*args, **kwargs)
+     except TypeError:
+         return fn(*args)
+
+ def toy_df(n_rows=20, n_cols=5, missingness=0.2, min_val=0, max_val=1,
+            missing_value=np.nan, rand_seed=1234, sample_prefix=None):
+     """Generate an array (or, with sample_prefix, a DataFrame) with NaNs."""
+     np.random.seed(rand_seed)
+     X = np.random.uniform(
+         low=min_val, high=max_val,
+         size=n_rows * n_cols).reshape(n_rows, n_cols).astype(float)
+     # check missingness
+     if missingness > 0:
+         # If missingness >= 1, use it as an approximate (see below) count
+         if missingness >= 1:
+             n_missing = int(missingness)
+         else:
+             n_missing = int(missingness * n_rows * n_cols)
+
+         # Introduce NaNs until n_missing NAs are inserted. Because the random
+         # (row, column) draws can collide, the final count is approximate.
+         missing_count = 0
+         for i, j in zip(np.random.choice(n_rows, n_missing), np.random.choice(n_cols, n_missing)):
+             if np.isnan(X[i][j]):
+                 continue
+             else:
+                 X[i][j] = missing_value
+                 missing_count += 1
+             if missing_count >= n_missing:
+                 break
+
+     # check sample_prefix
+     if sample_prefix is None:
+         return X
+     else:
+         colNames = [sample_prefix + '_' + str(i) for i in range(0, n_cols)]
+         return pd.DataFrame(X, columns=colNames)
+
+ def insert_na(df, n_miss, seed):
+     """ Randomly sets n_miss cells of df to NaN and returns the result as a
+     DataFrame. """
+     np.random.seed(seed)
+     nrow, ncol = df.shape
+     na_count = 0
+     if n_miss >= nrow * ncol:
+         out_df = df.replace(df.values, np.nan)
+     else:
+         tmp = df.to_numpy()
+         while True:
+             if na_count >= n_miss:
+                 break
+             x_ind = np.random.choice(nrow)
+             y_ind = np.random.choice(ncol)
+             if not np.isnan(tmp[x_ind][y_ind]):
+                 tmp[x_ind][y_ind] = np.nan
+                 na_count += 1
+         out_df = pd.DataFrame(tmp, index=df.index, columns=df.columns)
+     return out_df
+
+ def apply_method(df, method_name, **kwargs):
+     """Applies a pandas method to a DataFrame.
+
+     Args:
+         df (pd.DataFrame): The DataFrame to apply the method to.
+         method_name (str): The name of the method to apply.
+         **kwargs: Additional keyword arguments to pass to the method.
+
+     Returns:
+         pd.DataFrame: The transformed DataFrame.
+     """
+     method = getattr(df, method_name)
+     return method(**kwargs)
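The two helpers above make it easy to build test fixtures for the imputers. A minimal usage sketch (hypothetical session; assumes the wheel's top-level impyutelib module is importable):

    import numpy as np
    from impyutelib import toy_df, insert_na

    # 10x4 uniform matrix with ~25% of cells knocked out, returned as a DataFrame
    df = toy_df(n_rows=10, n_cols=4, missingness=0.25, sample_prefix="sample")
    print(df.isna().sum().sum())            # at most 10 (random collisions may reduce it)
    df2 = insert_na(df, n_miss=5, seed=42)  # knock out 5 additional cells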
+
+ def shepards(distances, power=2):
+     """ Basic inverse distance weighting function
+
+     Parameters
+     ----------
+     distances: list/numpy.ndarray
+         1D list of numbers (e.g. distance results from a call to KDTree.query)
+
+     power: int
+         Default of 2 used since the referenced paper stated an exponent of 2
+         "gives seemingly satisfactory results"
+
+     Returns
+     -------
+     numpy.ndarray
+         1D list of numbers that sum to 1, representing the weights of the
+         provided distances, in order.
+
+     References
+     ----------
+     Shepard, Donald (1968). "A two-dimensional interpolation function for
+     irregularly-spaced data". Proceedings of the 1968 ACM National
+     Conference. pp. 517-524. doi:10.1145/800186.810616
+     """
+     return to_percentage(1 / np.power(distances, power))
+
+ def to_percentage(vec):
+     """ Converts a list of real numbers into a list of fractions that sum to 1. """
+     return vec / np.sum(vec)
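Shepard weights fall off with the square of the distance and are normalized to sum to one. A quick check (hypothetical session):

    import numpy as np
    from impyutelib import shepards

    d = np.array([1.0, 2.0, 4.0])
    w = shepards(d)        # 1/d**2, normalized: approx. [0.762, 0.190, 0.048]
    print(w.sum())         # 1.0 -- ready to use as averaging weights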
+
+
+ ## Wrapper
+
+ def handle_df(fn):
+     """ Decorator to handle a pandas DataFrame as input
+
+     If the first arg is a pandas DataFrame, convert it to a numpy array;
+     otherwise don't do anything. Cast back to a pandas DataFrame after
+     the imputation function has run.
+     """
+     @wraps(fn)
+     def wrapper(*args, **kwargs):
+         is_df = False
+         ## convert tuple to list so args can be modified
+         args = list(args)
+         ## Either make a copy or use a pointer to the original
+         if kwargs.get('inplace'):
+             args[0] = args[0]
+         else:
+             args[0] = args[0].copy()
+
+         ## If the input data is a DataFrame, cast it to an np.array
+         ## and set an indicator flag before continuing
+         if isinstance(args[0], pd.DataFrame):
+             is_df = True
+             in_ind = args[0].index
+             in_columns = args[0].columns
+             args[0] = args[0].to_numpy()
+
+         ## function invocation
+         results = execute_fn_with_args_and_or_kwargs(fn, args, kwargs)
+
+         ## cast the output back to a DataFrame.
+         if is_df:
+             results = pd.DataFrame(results, index=in_ind, columns=in_columns)
+         return results
+     return wrapper
+
+ def add_inplace_option(fn):
+     """ Decorator for the inplace option
+
+     Functions wrapped by this can take an `inplace` kwarg to use either a
+     copy of the data or a reference. """
+     @wraps(fn)
+     def wrapper(*args, **kwargs):
+         """ Run input checks """
+         ## convert tuple to list so args can be modified
+         args = list(args)
+         ## Either make a copy or use a pointer to the original
+         if kwargs.get('inplace'):
+             args[0] = args[0]
+         else:
+             args[0] = args[0].copy()
+
+         ## function invocation
+         return execute_fn_with_args_and_or_kwargs(fn, args, kwargs)
+     return wrapper
+
+ def conform_output(fn):
+     """ Decorator to handle impossible values
+
+     Adds two optional kwargs, `coerce_fn` and `valid_fn`.
+
+     `valid_fn` function stub:
+
+         def my_valid_fn(some_literal) -> boolean
+
+     `coerce_fn` function stub:
+
+         def my_coerce_fn(arr, x_i, y_i) -> some_literal
+
+     The valid function is run on each element of the output; it indicates
+     whether the value is valid or not.
+
+     The coerce function takes three arguments: the original matrix and the
+     two indices of the invalid value, x_i and y_i. It is run on every
+     invalid value.
+     """
+     @wraps(fn)
+     def wrapper(*args, **kwargs):
+         def raise_error(arr, x_i, y_i):
+             raise Exception("{} does not conform".format(arr[x_i, y_i]))
+         ## convert tuple to list so args can be modified
+         args = list(args)
+         # function that checks if the value is valid
+         valid_fn = kwargs.get("valid_fn", constantly(True))
+         # function that modifies the invalid value to something valid
+         coerce_fn = kwargs.get("coerce_fn", raise_error)
+
+         ## function invocation
+         results = execute_fn_with_args_and_or_kwargs(fn, args, kwargs)
+
+         # check each value to see if it's valid
+         bool_arr = map_nd(complement(valid_fn), results)
+         # get indices of invalid values
+         invalid_indices = np.argwhere(bool_arr)
+         # run the coerce fn on each invalid index
+         for x_i, y_i in invalid_indices:
+             results[x_i, y_i] = coerce_fn(results, x_i, y_i)
+
+         return results
+     return wrapper
+
+ def wrappers(fn):
+     """ Helper decorator; applies all wrapper functions that modify the input
+     (matrix with missing values) and output (matrix with imputed values).
+
+     NOTE: `handle_df` has to be last as it needs to be in the outer loop
+     (first entry point), since every other function assumes you're getting
+     an np.array as input.
+     """
+     return thread(
+         fn,                  # function that's getting wrapped
+         add_inplace_option,  # allow choosing reference/copy
+         conform_output,      # allow enforcing of some spec on returned outputs
+         handle_df,           # if df type, cast to np.array on the way in and back to df on the way out
+     )
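Taken together, the wrappers mean every @wrappers-decorated imputer transparently accepts a DataFrame and gains `inplace`, `valid_fn` and `coerce_fn` keyword hooks. A minimal sketch of the conform_output hooks (hypothetical data; clamping to [0, 1] is what one would want for methylation beta values):

    import numpy as np
    import pandas as pd
    from impyutelib import mean   # defined below; decorated with @wrappers

    df = pd.DataFrame({"cg01": [0.10, np.nan, 0.30],
                       "cg02": [0.80, 0.90, np.nan]})
    out = mean(df,
               valid_fn=lambda v: 0.0 <= v <= 1.0,                        # flag impossible values
               coerce_fn=lambda arr, i, j: np.clip(arr[i, j], 0.0, 1.0))  # repair them
    print(type(out).__name__)  # 'DataFrame' -- handle_df casts back on the way out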
+
+ ## Central tendency
+ @wrappers
+ def mean(data):
+     """ Substitute missing values with the mean of that column.
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+         Data to impute.
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     """
+     nan_xy = nan_indices(data)
+     for x_i, y_i in nan_xy:
+         col_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
+         new_value = np.mean(col_wo_nan)
+         data[x_i][y_i] = new_value
+     return data
+
+ @wrappers
+ def median(data):
+     """ Substitute missing values with the median (middle value) of that column.
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+         Data to impute.
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     """
+     nan_xy = nan_indices(data)
+     cols_missing = set(nan_xy.T[1])
+     medians = {}
+     for y_i in cols_missing:
+         col_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
+         median_y = np.median(col_wo_nan)
+         medians[str(y_i)] = median_y
+     for x_i, y_i in nan_xy:
+         data[x_i][y_i] = medians[str(y_i)]
+     return data
+
+ @wrappers
+ def mode(data):
+     """ Substitute missing values with the mode (most frequent value) of that
+     column.
+
+     In the case of a tie (multiple equally frequent values in a column),
+     randomly pick one of them.
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+         Data to impute.
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     """
+     nan_xy = nan_indices(data)
+     modes = []
+     for y_i in range(np.shape(data)[1]):
+         unique_counts = np.unique(data[:, [y_i]], return_counts=True)
+         max_count = np.max(unique_counts[1])
+         mode_y = [unique for unique, count in np.transpose(unique_counts)
+                   if count == max_count and not np.isnan(unique)]
+         modes.append(mode_y)  # collect the modes of column y_i
+     for x_i, y_i in nan_xy:
+         data[x_i][y_i] = np.random.choice(modes[y_i])
+     return data
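All three central-tendency imputers share this calling convention and, by default, work on a copy. For instance (hypothetical values):

    import numpy as np
    from impyutelib import median

    X = np.array([[1.0, 2.0],
                  [np.nan, 2.0],
                  [3.0, np.nan],
                  [5.0, 8.0]])
    print(median(X))           # NaNs become the column medians, 3.0 and 2.0
    print(np.isnan(X).sum())   # 2 -- the input array is untouched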
+
+
+ #################
+ ## random impute
+ #################
+ @wrappers
+ def random_impute(data):
+     """ Fill in missing values with a randomly selected value from the same
+     column.
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+         Data to impute.
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     """
+     nan_xy = nan_indices(data)
+     for x, y in nan_xy:
+         uniques = np.unique(data[:, y])
+         uniques = uniques[~np.isnan(uniques)]
+         data[x][y] = np.random.choice(uniques)
+     return data
+
+
+ ########################
+ ## moving window impute
+ ########################
+ @wrappers
+ def moving_window(data, nindex=None, wsize=5, errors="coerce", func=np.mean,
+                   inplace=False):
+     """ Interpolate the missing values based on nearby values.
+
+     For example, with an array like this:
+
+         array([[-1.24940, -1.38673, -0.03214945,  0.08255145, -0.007415],
+                [ 2.14662,  0.32758, -0.82601414,  1.78124027,  0.873998],
+                [-0.41400, -0.977629,        nan, -1.39255344,  1.680435],
+                [ 0.40975,  1.067599,  0.29152388, -1.70160145, -0.565226],
+                [-0.54592, -1.126187,  2.04004377,  0.16664863, -0.010677]])
+
+     using a window size of 3, the one missing value would be set to
+     -1.18509122 (the mean of its two horizontal neighbours). The window
+     operates on the horizontal axis.
+
+     Usage
+     -----
+     The defaults make this a moving mean. You may want to change the default
+     window size:
+
+         moving_window(data, wsize=10)
+
+     To only look at past data (null value is at the rightmost index in the
+     window):
+
+         moving_window(data, nindex=-1)
+
+     To use a custom function:
+
+         moving_window(data, func=np.median)
+
+     You can also do something like take 1.5x the max of previous values in
+     the window:
+
+         moving_window(data, func=lambda arr: max(arr) * 1.50, nindex=-1)
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+         2D matrix to impute.
+     nindex: int
+         Null index. Index of the null value inside the moving average window.
+         Use case: say you want to skew the value toward the left or right
+         side. 0 would only take the average of values from the right, and -1
+         would only take the average of values from the left.
+     wsize: int
+         Window size. Size of the moving average window/area of values being
+         used for each local imputation. This number includes the missing
+         value.
+     errors: {"raise", "coerce", "ignore"}
+         Errors can occur with the indexing of the windows - for example, if
+         there is a nan at data[x][0] and `nindex` is set to -1, or there is
+         a nan at data[x][-1] and `nindex` is set to 0. `"raise"` will raise
+         an error, `"coerce"` will try again with `nindex` set to the middle,
+         and `"ignore"` will just leave it as a nan.
+     inplace: {True, False}
+         Whether to return a copy or run on the passed-in array.
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     """
+     if errors == "ignore":
+         raise Exception("`errors` value `ignore` not implemented yet. Sorry!")
+
+     if not inplace:
+         data = data.copy()
+
+     if nindex is None:  # If using equal window side lengths
+         assert wsize % 2 == 1, "The parameter `wsize` should not be even "\
+             "if the value `nindex` is not set, since it defaults to the midpoint "\
+             "and an even `wsize` makes the midpoint ambiguous"
+         wside_left = wsize // 2
+         wside_right = wsize // 2
+     else:  # If using custom window side lengths
+         assert nindex < wsize, "The null index must be smaller than the window size"
+         if nindex == -1:
+             wside_left = wsize - 1
+             wside_right = 0
+         else:
+             wside_left = nindex
+             wside_right = wsize - nindex - 1
+
+     while True:
+         nan_xy = nan_indices(data)
+         n_nan_prev = len(nan_xy)
+         for x_i, y_i in nan_xy:
+             left_i = max(0, y_i - wside_left)
+             right_i = min(data.shape[1], y_i + wside_right + 1)  # clamp to the number of columns
+             window = data[x_i, left_i: right_i]
+             window_not_null = window[~np.isnan(window)]
+
+             if len(window_not_null) > 0:
+                 try:
+                     data[x_i][y_i] = func(window_not_null)
+                     continue
+                 except Exception as e:
+                     if errors == "raise":
+                         raise e
+
+             if errors == "coerce":
+                 # If either the window has a length of 0 or the aggregate
+                 # function fails somehow, fall back to treating the missing
+                 # value as the window midpoint and recalculate. Use temporary
+                 # wside_left/wside_right for this specific problematic value
+                 # only.
+                 wside_left_tmp = wsize // 2
+                 wside_right_tmp = wside_left_tmp
+
+                 left_i_tmp = max(0, y_i - wside_left_tmp)
+                 right_i_tmp = min(data.shape[1], y_i + wside_right_tmp + 1)
+
+                 window = data[x_i, left_i_tmp:right_i_tmp]
+                 window_not_null = window[~np.isnan(window)]
+                 try:
+                     data[x_i][y_i] = func(window_not_null)
+                 except Exception as e:
+                     print("Exception:", e)
+         if n_nan_prev == len(nan_indices(data)):
+             break
+     return data
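A minimal sketch of the two window modes (hypothetical session; assumes impyutelib is importable):

    import numpy as np
    from impyutelib import moving_window

    row = np.array([[1.0, 2.0, np.nan, 4.0, 5.0]])
    print(moving_window(row, wsize=3))             # centered: nan -> mean(2.0, 4.0) = 3.0
    print(moving_window(row, nindex=-1, wsize=3))  # past-only: nan -> mean(1.0, 2.0) = 1.5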
+
+
+ ########################
+ ## fKNN
+ ########################
+ @wrappers
+ def fKNN(data, na_locations, k=3, eps=0, p=2, distance_upper_bound=np.inf,
+          leafsize=10, idw_fn=shepards, init_impute_fn=mean):
+     """ Impute using a variant of the nearest neighbours approach
+
+     Basic idea: impute the array with an initial impute function (mean
+     impute), then use the resulting complete array to construct a KDTree.
+     Use this KDTree to compute nearest neighbours. After finding the `k`
+     nearest neighbours, take their weighted average. Basically, find the
+     nearest rows in terms of distance.
+
+     This approach is much, much faster than the other implementation
+     (fit+transform for each subset), which is almost prohibitively expensive.
+
+     Parameters
+     ----------
+     data: ndarray
+         2D matrix to impute.
+
+     na_locations: tuple
+         Pre-calculated (x, y) locations of missing values.
+
+     k: int, optional
+         Parameter used for querying the KDTree class object. Number of
+         neighbours used in the KNN query. Refer to the docs for
+         [`scipy.spatial.KDTree.query`]
+         (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+
+     eps: nonnegative float, optional
+         Parameter used for querying the KDTree class object. From the
+         SciPy docs: "Return approximate nearest neighbors; the kth returned
+         value is guaranteed to be no further than (1+eps) times the distance
+         to the real kth nearest neighbor". Refer to the docs for
+         [`scipy.spatial.KDTree.query`]
+         (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+
+     p: float, 1<=p<=infinity, optional
+         Parameter used for querying the KDTree class object. Straight from
+         the SciPy docs: "Which Minkowski p-norm to use. 1 is the
+         sum-of-absolute-values Manhattan distance, 2 is the usual Euclidean
+         distance, infinity is the maximum-coordinate-difference distance".
+         Refer to the docs for
+         [`scipy.spatial.KDTree.query`]
+         (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+
+     distance_upper_bound: nonnegative float, optional
+         Parameter used for querying the KDTree class object. Straight
+         from the SciPy docs: "Return only neighbors within this distance.
+         This is used to prune tree searches, so if you are doing a series of
+         nearest-neighbor queries, it may help to supply the distance to the
+         nearest neighbor of the most recent point." Refer to the docs for
+         [`scipy.spatial.KDTree.query`]
+         (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
+
+     leafsize: int, optional
+         Parameter used for construction of the `KDTree` class object.
+         Straight from the SciPy docs: "The number of points at which the
+         algorithm switches over to brute-force. Has to be positive". Refer
+         to the docs for
+         [`scipy.spatial.KDTree`](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.KDTree.html)
+         for more information.
+
+     idw_fn: fn, optional
+         Function that takes one argument, a list of distances, and returns
+         weighted percentages. You can define a custom one or build one from
+         `shepards` using functools.partial, for example:
+         `functools.partial(shepards, power=1)`.
+
+     init_impute_fn: fn, optional
+         Initial imputation function (default: mean). Kept for API
+         compatibility; in this adaptation `data` is expected to be
+         pre-imputed by the caller.
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     Examples
+     --------
+     (Carried over from the upstream `fast_knn`; this adaptation also takes
+     `na_locations` and expects `data` to be pre-imputed.)
+
+     >>> data = np.arange(25).reshape((5, 5)).astype(float)
+     >>> data[0][2] = np.nan
+     >>> data
+     array([[ 0.,  1., nan,  3.,  4.],
+            [ 5.,  6.,  7.,  8.,  9.],
+            [10., 11., 12., 13., 14.],
+            [15., 16., 17., 18., 19.],
+            [20., 21., 22., 23., 24.]])
+     >>> fast_knn(data, k=1)  # Weighted average (by distance) of the nearest 1 neighbour
+     array([[ 0.,  1.,  7.,  3.,  4.],
+            [ 5.,  6.,  7.,  8.,  9.],
+            [10., 11., 12., 13., 14.],
+            [15., 16., 17., 18., 19.],
+            [20., 21., 22., 23., 24.]])
+     >>> fast_knn(data, k=2)  # Weighted average of the nearest 2 neighbours
+     array([[ 0.        ,  1.        , 10.08608891,  3.        ,  4.        ],
+            [ 5.        ,  6.        ,  7.        ,  8.        ,  9.        ],
+            [10.        , 11.        , 12.        , 13.        , 14.        ],
+            [15.        , 16.        , 17.        , 18.        , 19.        ],
+            [20.        , 21.        , 22.        , 23.        , 24.        ]])
+     >>> fast_knn(data, k=3)
+     array([[ 0.        ,  1.        , 13.40249283,  3.        ,  4.        ],
+            [ 5.        ,  6.        ,  7.        ,  8.        ,  9.        ],
+            [10.        , 11.        , 12.        , 13.        , 14.        ],
+            [15.        , 16.        , 17.        , 18.        , 19.        ],
+            [20.        , 21.        , 22.        , 23.        , 24.        ]])
+     >>> fast_knn(data, k=5)  # There are at most only 4 neighbours. Raises an error
+     ...
+     IndexError: index 5 is out of bounds for axis 0 with size 5
+
+     """
+     nan_xy = na_locations  # pre-calculated NaN locations
+     data_c = data          # pre-imputed data (the KDTree cannot handle NaNs)
+     kdtree = KDTree(data_c, leafsize=leafsize)
+
+     for x_i, y_i in nan_xy:
+         distances, indices = kdtree.query(data_c[x_i], k=k+1, eps=eps, p=p,
+                                           distance_upper_bound=distance_upper_bound)
+         # The query always returns the point itself at the first index; drop it.
+         distances, indices = distances[1:], indices[1:]
+         # Add a small constant to the distances to avoid division by 0
+         distances += 1e-3
+         weights = idw_fn(distances)
+         # Assign the missing value the weighted average of the `k` nearest neighbours
+         data[x_i][y_i] = np.dot(weights, [data_c[ind][y_i] for ind in indices])
+     return data
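End to end, the intended call pattern appears to be: record the NaN locations, pre-impute, then refine with fKNN. A sketch under that assumption:

    import numpy as np
    from impyutelib import nan_indices, mean, fKNN

    X = np.arange(25, dtype=float).reshape(5, 5)
    X[0, 2] = np.nan
    na_xy = nan_indices(X)         # remember the holes before pre-imputing
    X_pre = mean(X)                # complete matrix, safe to feed to the KDTree
    out = fKNN(X_pre, na_xy, k=1)  # refine each hole from its nearest row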
+
+
+ def external_ref(data, na_locations, ref_data, k=3, eps=0, p=2,
+                  distance_upper_bound=np.inf, leafsize=10,
+                  idw_fn=shepards):
+     """ Impute using a variant of the nearest neighbours approach, against
+     an external reference.
+
+     Basic idea: construct a KDTree from a complete external reference
+     matrix. For each row containing missing values, find its `k` nearest
+     reference rows and impute each missing entry with the weighted average
+     of those rows.
+
+     This approach is much, much faster than the other implementation
+     (fit+transform for each subset), which is almost prohibitively expensive.
+
+     Parameters
+     ----------
+     data: ndarray
+         2D matrix with missing values.
+
+     na_locations: tuple
+         Pre-calculated (x, y) locations of missing values.
+
+     ref_data: ndarray
+         2D matrix used as external reference data. The k nearest neighbours
+         will be identified from this data.
+
+     k: int, optional
+         Parameter used for querying the KDTree class object. Number of
+         neighbours used in the KNN query.
+
+     eps: nonnegative float, optional
+         Parameter used for querying the KDTree class object. From the
+         SciPy docs: "Return approximate nearest neighbors; the kth returned
+         value is guaranteed to be no further than (1+eps) times the distance
+         to the real kth nearest neighbor".
+
+     p: float, 1<=p<=infinity, optional
+         Parameter used for querying the KDTree class object. Straight
+         from the SciPy docs: "Which Minkowski p-norm to use. 1 is the
+         sum-of-absolute-values Manhattan distance, 2 is the usual Euclidean
+         distance, infinity is the maximum-coordinate-difference distance".
+
+     distance_upper_bound: nonnegative float, optional
+         Parameter used for querying the KDTree class object. Straight
+         from the SciPy docs: "Return only neighbors within this distance.
+         This is used to prune tree searches, so if you are doing a series of
+         nearest-neighbor queries, it may help to supply the distance to the
+         nearest neighbor of the most recent point."
+
+     leafsize: int, optional
+         Parameter used for construction of the `KDTree` class object.
+         Straight from the SciPy docs: "The number of points at which the
+         algorithm switches over to brute-force. Has to be positive".
+
+     idw_fn: fn, optional
+         Function that takes one argument, a list of distances, and returns
+         weighted percentages. You can define a custom one or build one from
+         `shepards` using functools.partial, for example:
+         `functools.partial(shepards, power=1)`.
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     """
+     nan_xy = na_locations  # pre-calculated NaN locations
+     kdtree = KDTree(ref_data, leafsize=leafsize)
+
+     for x_i, y_i in nan_xy:
+         distances, indices = kdtree.query(data[x_i], k=k, eps=eps, p=p,
+                                           distance_upper_bound=distance_upper_bound)
+         # Add a small constant to the distances to avoid division by 0
+         distances += 1e-3
+         weights = idw_fn(distances)
+         # Assign the missing value the weighted average of the `k` nearest neighbours
+         data[x_i][y_i] = np.dot(weights, [ref_data[ind][y_i] for ind in indices])
+     return data
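For example, imputing a query matrix against a complete reference panel (hypothetical data; the query rows are pre-filled with `mean` so the KDTree query points contain no NaNs, and note that `external_ref`, unlike `fKNN`, is not @wrappers-decorated and fills its first argument in place):

    import numpy as np
    from impyutelib import nan_indices, mean, external_ref

    ref = np.arange(25, dtype=float).reshape(5, 5)  # complete reference panel
    X = ref[:2].copy()
    X[0, 2] = np.nan
    na_xy = nan_indices(X)
    out = external_ref(mean(X), na_xy, ref, k=2)    # IDW average over 2 reference rows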
+
+
+ ############################
+ ## Expectation–maximization
+ ############################
+ @wrappers
+ def em(data, eps=0.1):
+     """ Imputes the given data using expectation maximization.
+
+     E-step: calculate the expected complete-data log-likelihood.
+     M-step: find the parameters that maximize the log-likelihood of the
+     complete data.
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+         Data to impute.
+     eps: float
+         The minimum change between iterations needed to break; if the
+         relative change < eps, the value has converged.
+         relative change = abs(current - previous) / previous
+     inplace: boolean
+         If True, operate on the numpy array reference.
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     """
+     nan_xy = nan_indices(data)
+     for x_i, y_i in nan_xy:
+         col = data[:, int(y_i)]
+         mu = col[~np.isnan(col)].mean()
+         std = col[~np.isnan(col)].std()
+         col[x_i] = np.random.normal(loc=mu, scale=std)
+         previous, i = 1, 1
+         while True:
+             i += 1
+             # Expectation: re-estimate the column mean and std
+             mu = col[~np.isnan(col)].mean()
+             std = col[~np.isnan(col)].std()
+             # Maximization: redraw the imputed value from N(mu, std)
+             col[x_i] = np.random.normal(loc=mu, scale=std)
+             # Break out of the loop once the imputed value changes by less
+             # than eps (10% by default) and we have run at least 5 times
+             delta = np.abs(col[x_i] - previous) / previous
+             if i > 5 and delta < eps:
+                 data[x_i][y_i] = col[x_i]
+                 break
+             data[x_i][y_i] = col[x_i]
+             previous = col[x_i]
+     return data
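In effect, each hole is repeatedly re-drawn from a normal distribution fitted to its column until the draw stabilizes. A quick sketch (hypothetical data; being stochastic, the run time and result vary):

    import numpy as np
    from impyutelib import em

    np.random.seed(0)
    X = np.random.normal(loc=5.0, size=(100, 3))
    X[7, 1] = np.nan
    out = em(X)        # X[7, 1] is replaced by a draw near the column mean
    print(out[7, 1])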
+
+
+ #######################
+ ## Buck's method
+ #######################
+ @wrappers
+ def buck_iterative(data, eps=0.1):
+     """ Iterative variant of Buck's method
+
+     - The variable to regress on is chosen at random.
+     - The EM-style regression loop stops once, for every missing value, the
+       change between the current and previous prediction is < eps.
+
+     "A Method of Estimation of Missing Values in Multivariate Data Suitable
+     for use with an Electronic Computer", S. F. Buck, Journal of the Royal
+     Statistical Society, Series B (Methodological), Vol. 22, No. 2 (1960),
+     pp. 302-306.
+
+     Parameters
+     ----------
+     data: numpy.ndarray
+         Data to impute.
+     eps: float
+         The minimum change between iterations needed to break; if the
+         relative change < eps, the value has converged.
+         relative change = abs(current - previous) / previous
+
+     Returns
+     -------
+     numpy.ndarray
+         Imputed data.
+
+     """
+     nan_xy = nan_indices(data)
+
+     # Add a column of zeros to the index values
+     nan_xyz = np.append(nan_xy, np.zeros((np.shape(nan_xy)[0], 1)), axis=1)
+
+     nan_xyz = [[int(x), int(y), v] for x, y, v in nan_xyz]
+     temp = []
+     cols_missing = {y for _, y, _ in nan_xyz}
+
+     # Step 1: simple imputation; these are just placeholders
+     for x_i, y_i, value in nan_xyz:
+         # Column containing the nan value, without the nan value
+         col = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
+
+         new_value = np.mean(col)
+         data[x_i][y_i] = new_value
+         temp.append([x_i, y_i, new_value])
+     nan_xyz = temp
+
+     # Step 5: repeat steps 2 - 4 until convergence
+
+     converged = [False] * len(nan_xyz)
+     while not all(converged):
+         # Step 2: placeholders are set back to missing for one variable/column
+         dependent_col = int(np.random.choice(list(cols_missing)))
+         missing_xs = [int(x) for x, y, value in nan_xyz if y == dependent_col]
+
+         # Step 3: perform linear regression using the other variables
+         x_train, y_train = [], []
+         for x_i in (x_i for x_i in range(len(data)) if x_i not in missing_xs):
+             x_train.append(np.delete(data[x_i], dependent_col))
+             y_train.append(data[x_i][dependent_col])
+         model = LinearRegression()
+         model.fit(x_train, y_train)
+
+         # Step 4: missing values in the chosen variable/column are replaced
+         # with predictions from the new linear regression model
+         for i, z in enumerate(nan_xyz):
+             x_i = z[0]
+             y_i = z[1]
+             value = data[x_i, y_i]
+             if y_i == dependent_col:
+                 # Predict from row 'x' without the nan value
+                 new_value = float(model.predict([np.delete(data[x_i], dependent_col)])[0])
+                 data[x_i][y_i] = new_value
+                 if value == 0.0:
+                     delta = (new_value - value) / 0.01
+                 else:
+                     delta = (new_value - value) / value
+                 converged[i] = abs(delta) < eps
+     return data
+
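A closing sketch tying the helpers together: generate toy data and impute it with Buck's method (hypothetical session):

    import numpy as np
    from impyutelib import toy_df, buck_iterative

    X = toy_df(n_rows=50, n_cols=4, missingness=0.1)  # ndarray with ~20 NaNs
    out = buck_iterative(X)
    print(np.isnan(out).sum())                        # 0 -- every hole regressed and filled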