tigramite-fast 5.2.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. tigramite/__init__.py +0 -0
  2. tigramite/causal_effects.py +1525 -0
  3. tigramite/causal_mediation.py +1592 -0
  4. tigramite/data_processing.py +1574 -0
  5. tigramite/graphs.py +1509 -0
  6. tigramite/independence_tests/LBFGS.py +1114 -0
  7. tigramite/independence_tests/__init__.py +0 -0
  8. tigramite/independence_tests/cmiknn.py +661 -0
  9. tigramite/independence_tests/cmiknn_mixed.py +1397 -0
  10. tigramite/independence_tests/cmisymb.py +286 -0
  11. tigramite/independence_tests/gpdc.py +664 -0
  12. tigramite/independence_tests/gpdc_torch.py +820 -0
  13. tigramite/independence_tests/gsquared.py +190 -0
  14. tigramite/independence_tests/independence_tests_base.py +1310 -0
  15. tigramite/independence_tests/oracle_conditional_independence.py +1582 -0
  16. tigramite/independence_tests/pairwise_CI.py +383 -0
  17. tigramite/independence_tests/parcorr.py +369 -0
  18. tigramite/independence_tests/parcorr_mult.py +485 -0
  19. tigramite/independence_tests/parcorr_wls.py +451 -0
  20. tigramite/independence_tests/regressionCI.py +403 -0
  21. tigramite/independence_tests/robust_parcorr.py +403 -0
  22. tigramite/jpcmciplus.py +966 -0
  23. tigramite/lpcmci.py +3649 -0
  24. tigramite/models.py +2257 -0
  25. tigramite/pcmci.py +3935 -0
  26. tigramite/pcmci_base.py +1218 -0
  27. tigramite/plotting.py +4735 -0
  28. tigramite/rpcmci.py +467 -0
  29. tigramite/toymodels/__init__.py +0 -0
  30. tigramite/toymodels/context_model.py +261 -0
  31. tigramite/toymodels/non_additive.py +1231 -0
  32. tigramite/toymodels/structural_causal_processes.py +1201 -0
  33. tigramite/toymodels/surrogate_generator.py +319 -0
  34. tigramite_fast-5.2.10.1.dist-info/METADATA +182 -0
  35. tigramite_fast-5.2.10.1.dist-info/RECORD +38 -0
  36. tigramite_fast-5.2.10.1.dist-info/WHEEL +5 -0
  37. tigramite_fast-5.2.10.1.dist-info/licenses/license.txt +621 -0
  38. tigramite_fast-5.2.10.1.dist-info/top_level.txt +1 -0
tigramite/independence_tests/cmiknn_mixed.py
@@ -0,0 +1,1397 @@
+ from __future__ import print_function
+ from scipy import special, spatial
+ from sklearn.neighbors import BallTree, NearestNeighbors
+ from sklearn import metrics
+ from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
+ from sklearn.utils.extmath import cartesian
+ import numpy as np
+ import math
+ import warnings
+
+ from tigramite.independence_tests.independence_tests_base import CondIndTest
+
+ class CMIknnMixed(CondIndTest):
+     r"""Conditional mutual information test based on nearest-neighbor estimator.
+
+     Conditional mutual information is a general dependency measure coming
+     from an information-theoretic framework. It makes almost no assumptions
+     about the parametric form of the dependencies by directly estimating the
+     underlying joint density. The tests here are based on entropy estimation
+     using k-nearest neighbors. We implement three different approaches:
+
+     (1) Mesner & Shalizi [1]
+     (2) Conditional variant [2]
+     (3) Our variant [3]
+
+     These approaches differ in how the distance metrics are defined when
+     searching for neighbors:
+
+     (1) The distance on the discrete dimensions for unequal values is 1,
+         otherwise 0.
+     (2) This approach splits the space into clusters for which discrete
+         values are all equal, then computes distances between those points
+         (which now have only continuous values).
+     (3) This approach uses the approach from [1], but defines the distance
+         for points with unequal discrete values as infinite, and ignores all
+         neighbors that have infinite distances.
+
+     The tests can be combined with a shuffle test to generate the
+     distribution under the null hypothesis of independence, described in
+     [4]. The knn-estimator is suitable for heterogeneous variables
+     (mixed-type, multivariate with discrete and continuous dimensions). For
+     mixture-type variables, use only (1) or (3).
+
+     For continuous variables, use the CMIknn class. For discrete variables,
+     use the CMIsymb or Gsquared class.
+
+     Notes
+     -----
+     These estimators have as a parameter the number of nearest-neighbors
+     :math:`k` which determines the size of hyper-cubes around each
+     (high-dimensional) sample point.
+
+     For variants (2) and (3), k is used locally, meaning that it defines
+     how many neighbors from a respective subsample should be considered.
+
+     :math:`k` can be viewed as a density smoothing parameter (although it is
+     data-adaptive unlike fixed-bandwidth estimators). For large :math:`k`,
+     the underlying dependencies are more smoothed and CMI has a larger bias,
+     but lower variance, which is more important for significance testing.
+     Note that the estimated CMI values can be slightly negative while CMI is
+     a non-negative quantity.
+
+     This method requires the scipy package.
+
+     References
+     ----------
+     .. [1] Mesner, O.C., & Shalizi, C.R. (2019): Conditional Mutual
+            Information Estimation for Mixed Discrete and Continuous
+            Variables with Nearest Neighbors. arXiv: Statistics Theory.
+            https://arxiv.org/abs/1912.03387
+
+     .. [2] Zan, L., Meynaoui, A., Assaad, C.K., Devijver, E., & Gaussier,
+            É. (2022): A Conditional Mutual Information Estimator for
+            Mixed Data and an Associated Conditional Independence Test.
+            Entropy, 24.
+            https://www.mdpi.com/1099-4300/24/9/1234/html
+
+     .. [3] Oana-Iuliana Popescu, Andreas Gerhardus, Martin Rabel,
+            Jakob Runge (2024), accepted at CLeaR.
+            https://arxiv.org/abs/2310.11132
+
+     .. [4] J. Runge (2018): Conditional Independence Testing Based on a
+            Nearest-Neighbor Estimator of Conditional Mutual Information.
+            In Proceedings of the 21st International Conference on
+            Artificial Intelligence and Statistics.
+            http://proceedings.mlr.press/v84/runge18a.html
+
+     Parameters
+     ----------
+     knn : int or float, optional (default: 0.2)
+         Number of nearest-neighbors which determines the size of hyper-cubes
+         around each (high-dimensional) sample point. If smaller than 1, this
+         is computed as a fraction of T, hence knn=knn*T. For knn larger or
+         equal to 1, this is the absolute number.
+
+     knn_type : string, optional (default: 'global')
+         Sets the type of heuristic for the MSinf estimator (see paper). Can
+         be 'local', 'global', or 'cluster_size'. Use 'global' for the most
+         computationally efficient variant of the estimator.
+
+     estimator : string, optional (default: 'MSinf')
+         The type of estimator to be used. Three options are available:
+         'MS' for approach (1) (Mesner and Shalizi (2021) [1]),
+         'ZMADG' for approach (2) (Zan et al. (2022) [2]), and
+         'MSinf' for approach (3) (approach (1) with infinite distance for
+         points from different categories).
+
+     shuffle_neighbors : int, optional (default: 5)
+         Number of nearest-neighbors within Z for the shuffle surrogates
+         which determines the size of hyper-cubes around each
+         (high-dimensional) sample point.
+
+     transform : {'ranks', 'standardize', 'scale', 'none'}, optional
+         (default: 'ranks')
+         Whether to transform the array beforehand by transforming to ranks,
+         standardizing, or scaling to the range (a, b).
+
+     scale_range : tuple, optional (default: (0, 1))
+         The range (a, b) to use if transform is set to 'scale'.
+
+     max_with_0 : bool, optional (default: False)
+         Whether to clip negative CMI estimates to zero.
+
+     perc : float, optional (default: None)
+         The value to be used as percentage of the cluster size for the
+         realization of a discrete value when using the 'MSinf' method. If
+         set to None, it is the same as the knn value.
+
+     workers : int, optional (default: -1)
+         Number of workers to use for parallel processing. If -1 is given,
+         all processors are used.
+
+     rho : list of float, optional (default: [np.inf])
+         Hyperparameters used for weighting the discrete variable distances.
+         If not initialized, the distance will be set to np.inf, such that
+         discrete variables with different values will never be considered
+         neighbors. Otherwise the rho
+         ...
+
+     significance : str, optional (default: 'shuffle_test')
+         Type of significance test to use. For CMIknnMixed only
+         'fixed_thres' and 'shuffle_test' are available.
+
+     **kwargs :
+         Arguments passed on to parent class CondIndTest.
+     """
+     @property
+     def measure(self):
+         """
+         Concrete property to return the measure of the independence test.
+         """
+         return self._measure
+
+     def __init__(self,
+                  knn=0.2,
+                  knn_type='global',
+                  estimator='MSinf',
+                  shuffle_neighbors=5,
+                  significance='shuffle_test',
+                  transform='ranks',
+                  scale_range=(0, 1),
+                  max_with_0=False,
+                  workers=-1,
+                  **kwargs):
+         # Set the member variables
+         self.knn = knn
+         self.knn_type = knn_type
+         self.estimator = estimator
+         self.shuffle_neighbors = shuffle_neighbors
+         self.transform = transform
+         self.max_with_0 = max_with_0
+         self.scale_range = scale_range
+         self._measure = 'cmi_knn_mixed'
+         self.two_sided = False
+         self.residual_based = False
+         self.recycle_residuals = False
+         self.workers = workers
+         self.eps = 1e-5
+
+         # Call the parent constructor
+         CondIndTest.__init__(self, significance=significance, **kwargs)
+         # Print some information about construction
+         if self.verbosity > 0:
+             if self.knn < 1:
+                 print("knn/T = %s" % self.knn)
+             else:
+                 print("knn = %s" % self.knn)
+             print("shuffle_neighbors = %d\n" % self.shuffle_neighbors)
+
+     def _standardize_array(self, array, dim):
+         """Standardizes a given array with dimensions dim.
+
+         Parameters
+         ----------
+         array : array-like
+             data array with X, Y, Z in rows and observations in columns
+
+         dim : int
+             number of dimensions of the data
+
+         Returns
+         -------
+         array : array-like
+             The standardized array.
+         """
+         array = array.astype(np.float64)
+         array -= array.mean(axis=1).reshape(dim, 1)
+         std = array.std(axis=1)
+         for i in range(dim):
+             if std[i] != 0.:
+                 array[i] /= std[i]
+         # FIXME: if the time series is constant, return nan rather than
+         # raising an exception
+         if np.any(std == 0.):
+             warnings.warn("Possibly constant array!")
+         return array
+
+     def _scale_array(self, array, minmax=(0, 1)):
+         """Scales a given array to range minmax dimension-wise.
+
+         Parameters
+         ----------
+         array : array-like
+             data array with X, Y, Z in rows and observations in columns
+
+         minmax : tuple (a, b)
+             the min and max values (a, b) for the scaling
+
+         Returns
+         -------
+         array : array-like
+             The scaled array.
+         """
+         scaler = MinMaxScaler(minmax)
+         return scaler.fit_transform(array.T).T
+
+     def _rank_array(self, array):
+         """Transforms a given array to ranks.
+
+         Parameters
+         ----------
+         array : array-like
+             data array with X, Y, Z in rows and observations in columns
+
+         Returns
+         -------
+         array : array-like
+             The rank-transformed array.
+         """
+         return array.argsort(axis=1).argsort(axis=1).astype(np.float64)
+
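+     # Note on the rank transform above (illustrative sketch, not part of
+     # the original file): argsort applied twice maps each value to its
+     # rank within its row, e.g. the row [0.3, -1.2, 7.1] becomes
+     # [1., 0., 2.]. Ties are effectively broken by the small noise added
+     # in _transform_mixed_data below.
+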
+     def _transform_mixed_data(self, array, data_type=None, add_noise=True):
+         """Applies data transformations to the continuous dimensions of the
+         given data.
+
+         Parameters
+         ----------
+         array : array-like
+             data array with X, Y, Z in rows and observations in columns
+
+         add_noise : bool (default: True)
+             Defines whether to add small normal noise to the continuous
+             data (to break ties).
+
+         data_type : array-like
+             data array of same shape as array which describes whether
+             variables are continuous or discrete: 0s for continuous
+             variables and 1s for discrete variables
+
+         Returns
+         -------
+         array : array-like
+             The array with the continuous data transformed.
+         """
+         continuous_idxs = np.where(np.all(data_type == 0, axis=1))[0]
+         cont_dim = len(continuous_idxs)
+
+         if add_noise:
+             # Add small noise to destroy ties
+             array[continuous_idxs, :] += (
+                 1E-16
+                 * array[continuous_idxs, :].std(axis=1).reshape(cont_dim, 1)
+                 * self.random_state.random(array[continuous_idxs, :].shape))
+
+         if self.transform == 'standardize':
+             array[continuous_idxs, :] = self._standardize_array(
+                 array[continuous_idxs, :], cont_dim)
+         elif self.transform == 'scale':
+             array[continuous_idxs, :] = self._scale_array(
+                 array[continuous_idxs, :], minmax=self.scale_range)
+         elif self.transform == 'ranks':
+             array[continuous_idxs, :] = self._rank_array(
+                 array[continuous_idxs, :])
+         elif self.transform == 'none':
+             pass
+         else:
+             warnings.warn('Unknown transform')
+
+         return array
+
+     def _transform_to_one_hot_mixed(self, array, xyz, data_type,
+                                     zero_inf=False):
+         """Applies one-hot encoding to the discrete dimensions of the array.
+
+         Parameters
+         ----------
+         array : array-like
+             data array with X, Y, Z in rows and observations in columns
+
+         xyz : list
+             List that indicates which dimensions belong to which variable,
+             e.g. for one-dimensional X, Y, Z: xyz = [0, 1, 2]
+
+         data_type : array-like
+             data array of same shape as array which describes whether
+             variables are continuous or discrete: 0s for continuous
+             variables and 1s for discrete variables
+
+         zero_inf : bool, optional (default: False)
+             defines whether to set (effectively) infinite distances between
+             points with different values for the discrete dimensions
+
+         Returns
+         -------
+         narray, nxyz, ndata_type, ndiscrete_idx_list : tuple
+             The array with the discrete dimensions one-hot encoded, together
+             with the correspondingly expanded xyz identifiers, data types,
+             and the indicator list of discrete dimensions.
+         """
+         discrete_idx_list = np.where(np.all(data_type == 1, axis=0), 1, 0)
+         mixed_idx_list = np.where(np.any(data_type == 1, axis=0), 1, 0)
+
+         narray = np.copy(array)
+         nxyz = np.copy(xyz)
+         ndata_type = np.copy(data_type)
+
+         appended_columns = 0
+         for i in range(len(discrete_idx_list)):
+             if discrete_idx_list[i] == 1:
+                 encoder = OneHotEncoder(handle_unknown='ignore')
+                 i += appended_columns
+                 data = narray[:, i]
+                 encoder_df = encoder.fit_transform(data.reshape(-1, 1)).toarray()
+                 if zero_inf:
+                     encoder_df = np.where(encoder_df == 1, 9999999, 0)
+
+                 xyz_val = [nxyz[i]] * encoder_df.shape[-1]
+                 narray = np.concatenate([narray[:, :i], encoder_df,
+                                          narray[:, i+1:]], axis=-1)
+                 nxyz = np.concatenate([nxyz[:i], xyz_val, nxyz[i+1:]])
+                 ndata_type = np.concatenate([ndata_type[:, :i],
+                                              np.ones(encoder_df.shape),
+                                              ndata_type[:, i+1:]],
+                                             axis=-1)
+                 appended_columns += encoder_df.shape[-1] - 1
+
+             elif mixed_idx_list[i] == 1 and zero_inf:
+                 i += appended_columns
+                 data = narray[:, i]
+
+                 # find the discrete categories (0. marks continuous samples)
+                 categories = np.unique(narray[:, i] * ndata_type[:, i])
+                 categories = np.delete(categories, categories == 0.)
+
+                 encoder = OneHotEncoder(categories=[categories],
+                                         handle_unknown='ignore')
+                 encoder_df = encoder.fit_transform(data.reshape(-1, 1)).toarray()
+                 encoder_df = np.where(encoder_df == 1, 9999999, 0)
+
+                 xyz_val = [nxyz[i]] * (encoder_df.shape[-1] + 1)
+                 cont_column = np.expand_dims(narray[:, i] * (1 - ndata_type[:, i]), -1)
+                 narray = np.concatenate([narray[:, :i], cont_column,
+                                          encoder_df, narray[:, i+1:]],
+                                         axis=-1)
+                 nxyz = np.concatenate([nxyz[:i], xyz_val, nxyz[i+1:]])
+                 ndata_type = np.concatenate([ndata_type[:, :i],
+                                              np.zeros(cont_column.shape),
+                                              np.ones(encoder_df.shape),
+                                              ndata_type[:, i+1:]],
+                                             axis=-1)
+                 appended_columns += encoder_df.shape[-1]
+
+         ndiscrete_idx_list = np.where(np.any(ndata_type == 1, axis=0), 1, 0)
+
+         return narray, nxyz, ndata_type, ndiscrete_idx_list
+
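+     # Illustration of the encoding above (sketch, not part of the original
+     # file): with zero_inf=True, a discrete column with values [1., 2., 1.]
+     # becomes the one-hot columns [[9999999, 0], [0, 9999999],
+     # [9999999, 0]]. Under the max-norm, two samples from the same category
+     # then have distance 0 on these columns, while samples from different
+     # categories are 9999999 apart, i.e. effectively at infinite distance
+     # given the distance_upper_bound used in the tree queries below.
+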
+     def get_smallest_cluster_size(self, array, data_type=None):
+         """Computes the smallest number of samples for each realization of
+         the discrete variables. Used for computation of the "local" knn.
+
+         Parameters
+         ----------
+         array : array-like
+             data array with X, Y, Z in rows and observations in columns
+
+         data_type : array-like
+             data array of same shape as array which describes whether
+             variables are continuous or discrete: 0s for continuous
+             variables and 1s for discrete variables
+
+         Returns
+         -------
+         min_nc : int
+             The smallest number of samples in a cluster.
+         """
+         discrete_idx_list = np.where(np.any(data_type == 1, axis=0), 1, 0)
+         discrete_xyz_idx = np.where(np.asarray(discrete_idx_list) == 1)[0]
+
+         num_xyz_classes = [np.unique(array[:, index])
+                            for index in range(len(discrete_idx_list))
+                            if discrete_idx_list[index] == 1]
+
+         xyz_cartesian_product = []
+
+         if len(num_xyz_classes) > 1:
+             xyz_cartesian_product = cartesian(num_xyz_classes)
+         elif len(num_xyz_classes) > 0:
+             xyz_cartesian_product = num_xyz_classes[0]
+
+         min_nc = array.shape[0]
+
+         if len(xyz_cartesian_product) > 0:
+             for entry in xyz_cartesian_product:
+                 current_array = array[np.sum(array[:, discrete_xyz_idx] == entry,
+                                              axis=-1) == len(discrete_xyz_idx)]
+                 if 0 < current_array.shape[0] < min_nc:
+                     min_nc = current_array.shape[0]
+
+         return min_nc
+
+     # @jit(forceobj=True)
+     def _get_nearest_neighbors_zeroinf_onehot(self, array, xyz, knn,
+                                               data_type=None):
+         """Returns nearest-neighbor counts according to [1] with an altered
+         distance metric: the 0-inf metric, which attributes infinite
+         distance to points where the values for the discrete dimensions do
+         not coincide.
+
+         Retrieves the distances eps to the k-th nearest neighbors for every
+         sample in joint space XYZ and returns the numbers of nearest
+         neighbors within eps in subspaces Z, XZ, YZ. Uses the 0-inf metric
+         for discrete variables.
+
+         Parameters
+         ----------
+         array : array-like
+             data array with X, Y, Z in rows and observations in columns
+
+         xyz : array of ints
+             XYZ identifier array of shape (dim,).
+
+         knn : int or float
+             Number of nearest-neighbors which determines the size of
+             hyper-cubes around each (high-dimensional) sample point. If
+             smaller than 1, this is computed as a fraction of T, hence
+             knn=knn*T. For knn larger or equal to 1, this is the absolute
+             number.
+
+         data_type : array-like
+             data array of same shape as array which describes whether
+             variables are continuous or discrete: 0s for continuous
+             variables and 1s for discrete variables
+
+         Returns
+         -------
+         k_tilde, k_xz, k_yz, k_z : tuple of arrays of shape (T,)
+             Nearest neighbors in XYZ, XZ, YZ, and Z subspaces.
+         """
+         dim, T = array.shape
+
+         array = array.astype(np.float64)
+         xyz = xyz.astype(np.int32)
+
+         array = self._transform_mixed_data(array, data_type)
+
+         array = array.T
+         data_type = data_type.T
+
+         narray, nxyz, ndata_type, discrete_idx_list = \
+             self._transform_to_one_hot_mixed(array, xyz, data_type,
+                                              zero_inf=True)
+
+         # Subsample indices
+         x_indices = np.where(nxyz == 0)[0]
+         y_indices = np.where(nxyz == 1)[0]
+         z_indices = np.where(nxyz == 2)[0]
+         xz_indices = np.concatenate([x_indices, z_indices])
+         yz_indices = np.concatenate([y_indices, z_indices])
+
+         # Fit trees
+         tree_xyz = spatial.cKDTree(narray)
+         neighbors = tree_xyz.query(narray, k=knn+1, p=np.inf,
+                                    workers=self.workers,
+                                    distance_upper_bound=9999999)
+         n, k = neighbors[0].shape
+
+         epsarray = np.zeros(n)
+         for i in range(n):
+             if neighbors[0][i, knn] == np.inf:
+                 # index of the last non-inf neighbor
+                 replacement_idx = np.where(neighbors[0][i] != np.inf)[0][-1]
+                 if self.knn_type == 'global':
+                     # use at least one neighbor
+                     r = max(int(replacement_idx * self.perc), 1)
+                 elif self.knn_type == 'cluster_size' or self.knn_type == 'local':
+                     r = replacement_idx
+                 epsarray[i] = neighbors[0][i, r]
+             else:
+                 epsarray[i] = neighbors[0][i, knn]
+
+         neighbors_radius_xyz = tree_xyz.query_ball_point(narray, epsarray,
+                                                          p=np.inf,
+                                                          workers=self.workers)
+
+         k_tilde = [len(neighbors_radius_xyz[i]) - 1
+                    if len(neighbors_radius_xyz[i]) > 1
+                    else len(neighbors_radius_xyz[i])
+                    for i in range(len(neighbors_radius_xyz))]
+
+         # compute nearest neighbors in subspaces
+         xz = narray[:, xz_indices]
+         tree_xz = spatial.cKDTree(xz)
+         k_xz = tree_xz.query_ball_point(xz, r=epsarray, p=np.inf,
+                                         workers=self.workers,
+                                         return_length=True)
+
+         yz = narray[:, yz_indices]
+         tree_yz = spatial.cKDTree(yz)
+         k_yz = tree_yz.query_ball_point(yz, r=epsarray, p=np.inf,
+                                         workers=self.workers,
+                                         return_length=True)
+
+         if len(z_indices) > 0:
+             z = narray[:, z_indices]
+             tree_z = spatial.cKDTree(z)
+             k_z = tree_z.query_ball_point(z, r=epsarray, p=np.inf,
+                                           workers=self.workers,
+                                           return_length=True)
+         else:
+             # Number of neighbors is T when z is empty.
+             k_z = np.full(T, T, dtype='float')
+
+         k_xz = np.asarray([i - 1 if i > 1 else i for i in k_xz])
+         k_yz = np.asarray([i - 1 if i > 1 else i for i in k_yz])
+         k_z = np.asarray([i - 1 if i > 1 else i for i in k_z])
+
+         return k_tilde, k_xz, k_yz, k_z
+
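+     # Background on the counts above (noted for reference, not part of the
+     # original file): with k-th neighbor distances eps_i taken in the joint
+     # XYZ space, the Mesner-Shalizi-type estimator averages
+     #     psi(k_tilde_i) - psi(k_xz_i) - psi(k_yz_i) + psi(k_z_i)
+     # over samples i, where psi is the digamma function; see
+     # get_dependence_measure_MSinf below.
+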
+     def get_dependence_measure_MSinf(self, array, xyz,
+                                      data_type=None):
+         """Returns the CMI estimate according to Frenzel and Pompe with an
+         altered distance metric: the 0-inf metric, which attributes
+         infinite distance to points where the values for the discrete
+         dimensions do not coincide.
+
+         Parameters
+         ----------
+         array : array-like
+             data array with X, Y, Z in rows and observations in columns
+
+         xyz : array of ints
+             XYZ identifier array of shape (dim,).
+
+         data_type : array-like
+             data array of same shape as array which describes whether
+             variables are continuous or discrete: 0s for continuous
+             variables and 1s for discrete variables
+
+         Returns
+         -------
+         val : float
+             Conditional mutual information estimate.
+         """
+         dim, T = array.shape
+
+         # compute knn according to knn type
+         if self.knn < 1:
+             if self.knn_type == 'global':
+                 knn = max(1, int(self.knn*T))
+                 self.perc = self.knn
+             elif self.knn_type == 'cluster_size':
+                 knn = max(1, int(self.knn*T))
+             elif self.knn_type == 'local':
+                 min_nc = self.get_smallest_cluster_size(array.T, data_type.T)
+                 knn = max(1, int(self.knn*min_nc))
+         else:
+             if self.knn_type == 'global':
+                 knn = max(1, int(self.knn))
+                 self.perc = self.knn
+             elif self.knn_type == 'cluster_size':
+                 knn = max(1, int(self.knn))
+             else:
+                 raise ValueError("MSinf with knn_type == 'local' needs knn "
+                                  "value as percentage (value < 1), not "
+                                  "number of neighbors!")
+
+         knn_tilde, k_xz, k_yz, k_z = \
+             self._get_nearest_neighbors_zeroinf_onehot(array=array,
+                                                        xyz=xyz,
+                                                        knn=knn,
+                                                        data_type=data_type)
+
+         val = (special.digamma(knn_tilde) - special.digamma(k_xz) -
+                special.digamma(k_yz) + special.digamma(k_z))
+
+         val = val[np.isfinite(val)].mean()
+
+         if self.max_with_0 and val < 0.:
+             val = 0.
+
+         return val
+
+     # @jit(forceobj=True)
+     def _get_nearest_neighbors_MS_one_hot(self, array, xyz,
+                                           knn, data_type=None):
+         """Returns nearest neighbors according to [1].
+
+         Retrieves the distances eps to the k-th nearest neighbors for every
+         sample in joint space XYZ and returns the numbers of nearest
+         neighbors within eps in subspaces Z, XZ, YZ. Uses a custom-defined
+         metric for discrete variables.
+
+         Parameters
+         ----------
+         array : array-like
+             data array with X, Y, Z in rows and observations in columns
+
+         xyz : array of ints
+             XYZ identifier array of shape (dim,).
+
+         knn : int or float
+             Number of nearest-neighbors which determines the size of
+             hyper-cubes around each (high-dimensional) sample point. If
+             smaller than 1, this is computed as a fraction of T, hence
+             knn=knn*T. For knn larger or equal to 1, this is the absolute
+             number.
+
+         data_type : array-like
+             data array of same shape as array which describes whether
+             variables are continuous or discrete: 0s for continuous
+             variables and 1s for discrete variables
+
+         Returns
+         -------
+         k_tilde, k_xz, k_yz, k_z : tuple of arrays of shape (T,)
+             Nearest neighbors in XYZ, XZ, YZ, and Z subspaces.
+         """
+         dim, T = array.shape
+
+         array = array.astype(np.float64)
+         xyz = xyz.astype(np.int32)
+
+         array = self._transform_mixed_data(array, data_type)
+
+         array = array.T
+         data_type = data_type.T
+
+         narray, nxyz, ndata_type, discrete_idx_list = \
+             self._transform_to_one_hot_mixed(array, xyz, data_type)
+
+         # Subsample indices
+         x_indices = np.where(nxyz == 0)[0]
+         y_indices = np.where(nxyz == 1)[0]
+         z_indices = np.where(nxyz == 2)[0]
+
+         xz_indices = np.concatenate([x_indices, z_indices])
+         yz_indices = np.concatenate([y_indices, z_indices])
+
+         # Fit trees
+         tree_xyz = spatial.cKDTree(narray)
+         neighbors = tree_xyz.query(narray, k=knn+1, p=np.inf,
+                                    workers=self.workers)
+
+         epsarray = neighbors[0][:, -1].astype(np.float64)
+
+         # search again for neighbors within the radius to find all of them;
+         # in the discrete case k_tilde can be larger than the given knn
+         neighbors_radius_xyz = tree_xyz.query_ball_point(narray, epsarray,
+                                                          p=np.inf,
+                                                          workers=self.workers)
+         k_tilde = np.asarray([len(neighbors_radius_xyz[i]) - 1
+                               if len(neighbors_radius_xyz[i]) > 1
+                               else len(neighbors_radius_xyz[i])
+                               for i in range(len(neighbors_radius_xyz))])
+
+         # compute nearest neighbors in subspaces
+         xz = narray[:, xz_indices]
+         tree_xz = spatial.cKDTree(xz)
+         k_xz = tree_xz.query_ball_point(xz, r=epsarray, p=np.inf,
+                                         workers=self.workers,
+                                         return_length=True)
+
+         yz = narray[:, yz_indices]
+         tree_yz = spatial.cKDTree(yz)
+         k_yz = tree_yz.query_ball_point(yz, r=epsarray, p=np.inf,
+                                         workers=self.workers,
+                                         return_length=True)
+
+         if len(z_indices) > 0:
+             z = narray[:, z_indices]
+             tree_z = spatial.cKDTree(z)
+             k_z = tree_z.query_ball_point(z, r=epsarray, p=np.inf,
+                                           workers=self.workers,
+                                           return_length=True)
+         else:
+             # Number of neighbors is T when z is empty.
+             k_z = np.full(T, T, dtype='float')
+
+         k_xz = np.asarray([i - 1 if i > 1 else i for i in k_xz])
+         k_yz = np.asarray([i - 1 if i > 1 else i for i in k_yz])
+         k_z = np.asarray([i - 1 if i > 1 else i for i in k_z])
+
+         return k_tilde, k_xz, k_yz, k_z
+
+     def get_dependence_measure_MS(self, array, xyz,
+                                   data_type=None):
+         """Returns the CMI estimate as described in Mesner and Shalizi
+         (2021).
+
+         Parameters
+         ----------
+         array : array-like
+             data array with X, Y, Z in rows and observations in columns
+
+         xyz : array of ints
+             XYZ identifier array of shape (dim,).
+
+         data_type : array-like
+             data array of same shape as array which describes whether
+             variables are continuous or discrete: 0s for continuous
+             variables and 1s for discrete variables
+
+         Returns
+         -------
+         val : float
+             Conditional mutual information estimate.
+         """
+         dim, T = array.shape
+
+         # compute knn
+         if self.knn < 1:
+             knn = max(1, int(self.knn*T))
+         else:
+             knn = max(1, self.knn)
+
+         knn_tilde, k_xz, k_yz, k_z = \
+             self._get_nearest_neighbors_MS_one_hot(array=array,
+                                                    xyz=xyz,
+                                                    knn=knn,
+                                                    data_type=data_type)
+
+         val = (special.digamma(knn_tilde) - special.digamma(k_xz) -
+                special.digamma(k_yz) + special.digamma(k_z))
+
+         val = val[np.isfinite(val)].mean()
+
+         if self.max_with_0 and val < 0.:
+             val = 0.
+
+         return val
+
+     def _compute_entropies_for_discrete_entry(self, array,
+                                               discrete_values,
+                                               discrete_idxs,
+                                               continuous_idxs,
+                                               total_num_samples,
+                                               knn):
+         # select the samples for which the discrete dimensions take the
+         # given values
+         current_array = array[np.sum(array[:, discrete_idxs] == discrete_values,
+                                      axis=-1) == len(discrete_idxs)]
+
+         # if we do not have samples, we cannot estimate the CMI
+         if np.size(current_array) == 0:
+             return 0., 0.
+
+         T, dim = current_array.shape
+
+         # if we have more samples than knns and the samples are not purely
+         # discrete, we can compute the continuous entropy
+         if len(continuous_idxs) > 0 and T > knn:
+             val_continuous_entropy = self._compute_continuous_entropy(
+                 current_array[:, continuous_idxs], knn)
+         else:
+             val_continuous_entropy = 0.
+
+         prob = float(T) / total_num_samples
+
+         # weight by the probability of occurrence
+         val_continuous_entropy *= prob
+         # compute the discrete entropy contribution for this occurrence
+         val_discrete_entropy = prob * np.log(prob)
+
+         return val_continuous_entropy, val_discrete_entropy
+
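+     # Rationale (noted for reference, not part of the original file): for a
+     # mixed variable W = (W_cont, W_disc) the entropy decomposes as
+     #     H(W) = sum_d p(d) * H(W_cont | W_disc = d) - sum_d p(d) * log p(d),
+     # which is why the helper above returns a probability-weighted
+     # continuous entropy and a p*log(p) term per discrete realization.
+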
+     def _compute_continuous_entropy(self, array, knn):
+         T, dim = array.shape
+         if T == 1:
+             return 0.
+
+         if knn < 1:
+             knn = int(max(np.rint(knn * T), 1))
+
+         tree = spatial.cKDTree(array)
+         epsarray = tree.query(array, k=[knn+1], p=np.inf,
+                               workers=self.workers,
+                               eps=0.)[0][:, 0].astype(np.float64)
+         epsarray = epsarray[epsarray != 0]
+         num_non_zero = len(epsarray)
+         if num_non_zero == 0:
+             cmi_hat = 0.
+         else:
+             avg_dist = float(array.shape[-1]) / float(num_non_zero) * \
+                 np.sum(np.log(2 * epsarray))
+             cmi_hat = special.digamma(num_non_zero) - \
+                 special.digamma(knn) + avg_dist
+
+         return cmi_hat
+
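+     # Note (sketch, not part of the original file): the helper above is a
+     # Kozachenko-Leonenko-type differential entropy estimator under the
+     # max-norm,
+     #     H_hat = psi(N) - psi(k) + (dim/N) * sum_i log(2 * eps_i),
+     # where eps_i is the distance to the k-th neighbor of sample i and
+     # samples with eps_i = 0 (exact duplicates) are excluded.
+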
+     def get_dependence_measure_ZMADG(self, array, xyz,
+                                      data_type=None):
+         """Returns the CMI estimate as described in [2].
+
+         Parameters
+         ----------
+         array : array-like
+             data array with X, Y, Z in rows and observations in columns
+
+         xyz : array of ints
+             XYZ identifier array of shape (dim,).
+
+         data_type : array-like
+             data array of same shape as array which describes whether
+             variables are continuous or discrete: 0s for continuous
+             variables and 1s for discrete variables
+
+         Returns
+         -------
+         val : float
+             Conditional mutual information estimate.
+         """
+         dim, T = array.shape
+
+         if self.knn > 1:
+             raise ValueError("ZMADG needs knn value as percentage "
+                              "(value < 1), not number of neighbors!")
+         else:
+             knn = self.knn
+
+         array = array.astype(np.float64)
+         xyz = xyz.astype(np.int32)
+
+         array = self._transform_mixed_data(array, data_type)
+
+         array = array.T
+         data_type = data_type.T
+
+         discrete_idx_list = np.where(np.any(data_type == 1, axis=0), 1, 0)
+
+         if np.sum(discrete_idx_list) == 0:
+             raise ValueError("Variables are continuous, cannot use "
+                              "CMIknnMixed ZMADG!")
+
+         # Subsample indices
+         x_indices = np.where(xyz == 0)[0]
+         y_indices = np.where(xyz == 1)[0]
+         z_indices = np.where(xyz == 2)[0]
+         xz_indices = np.concatenate([x_indices, z_indices])
+         yz_indices = np.concatenate([y_indices, z_indices])
+
+         discrete_xz_indices = discrete_idx_list[xz_indices]
+         discrete_yz_indices = discrete_idx_list[yz_indices]
+         discrete_z_indices = discrete_idx_list[z_indices]
+
+         discrete_xyz_idx = np.where(np.asarray(discrete_idx_list) == 1)[0]
+         discrete_xz_idx = np.where(np.asarray(discrete_xz_indices) == 1)[0]
+         discrete_yz_idx = np.where(np.asarray(discrete_yz_indices) == 1)[0]
+         discrete_z_idx = np.where(np.asarray(discrete_z_indices) == 1)[0]
+
+         continuous_xyz_idx = np.where(np.asarray(discrete_idx_list) == 0)[0]
+         continuous_xz_idx = np.where(np.asarray(discrete_xz_indices) == 0)[0]
+         continuous_yz_idx = np.where(np.asarray(discrete_yz_indices) == 0)[0]
+         continuous_z_idx = np.where(np.asarray(discrete_z_indices) == 0)[0]
+
+         # get the unique values for each discrete dimension
+         num_xz_classes = [np.unique(array[:, xz_indices][:, index])
+                           for index in range(len(discrete_xz_indices))
+                           if discrete_xz_indices[index] == 1]
+         num_yz_classes = [np.unique(array[:, yz_indices][:, index])
+                           for index in range(len(discrete_yz_indices))
+                           if discrete_yz_indices[index] == 1]
+         num_z_classes = [np.unique(array[:, z_indices][:, index])
+                          for index in range(len(discrete_z_indices))
+                          if discrete_z_indices[index] == 1]
+         num_xyz_classes = [np.unique(array[:, index])
+                            for index in range(len(discrete_idx_list))
+                            if discrete_idx_list[index] == 1]
+
+         xyz_cartesian_product = []
+         xz_cartesian_product = []
+         yz_cartesian_product = []
+         z_cartesian_product = []
+
+         if len(num_xyz_classes) > 1:
+             xyz_cartesian_product = cartesian(num_xyz_classes)
+         elif len(num_xyz_classes) > 0:
+             xyz_cartesian_product = num_xyz_classes[0]
+
+         if len(num_xz_classes) > 1:
+             xz_cartesian_product = cartesian(num_xz_classes)
+         elif len(num_xz_classes) > 0:
+             xz_cartesian_product = num_xz_classes[0]
+
+         if len(num_yz_classes) > 1:
+             yz_cartesian_product = cartesian(num_yz_classes)
+         elif len(num_yz_classes) > 0:
+             yz_cartesian_product = num_yz_classes[0]
+
+         if len(num_z_classes) > 1:
+             z_cartesian_product = cartesian(num_z_classes)
+         elif len(num_z_classes) > 0:
+             z_cartesian_product = num_z_classes[0]
+
+         # start computing entropies
+
+         # compute entropies in the XYZ subspace
+         if len(xyz_cartesian_product) > 0:
+             xyz_cmi = 0.
+             xyz_entropy = 0.
+
+             for entry in xyz_cartesian_product:
+                 xyz_cont_entropy, xyz_disc_entropy = \
+                     self._compute_entropies_for_discrete_entry(
+                         array, entry, discrete_xyz_idx,
+                         continuous_xyz_idx, T, knn)
+                 xyz_cmi += xyz_cont_entropy
+                 xyz_entropy -= xyz_disc_entropy
+         else:
+             xyz_cmi = self._compute_continuous_entropy(array, knn)
+             xyz_entropy = 0.
+
+         h_xyz = xyz_cmi + xyz_entropy
+
+         # compute entropies in the XZ subspace
+         if len(xz_cartesian_product) > 0:
+             xz_cmi = 0.
+             xz_entropy = 0.
+
+             for entry in xz_cartesian_product:
+                 xz_cont_entropy, xz_disc_entropy = \
+                     self._compute_entropies_for_discrete_entry(
+                         array[:, xz_indices], entry, discrete_xz_idx,
+                         continuous_xz_idx, T, knn)
+                 xz_cmi += xz_cont_entropy
+                 xz_entropy -= xz_disc_entropy
+         else:
+             xz_cmi = self._compute_continuous_entropy(array[:, xz_indices],
+                                                       knn)
+             xz_entropy = 0.
+
+         h_xz = xz_cmi + xz_entropy
+
+         # compute entropies in the YZ subspace
+         if len(yz_cartesian_product) > 0:
+             yz_cmi = 0.
+             yz_entropy = 0.
+
+             for entry in yz_cartesian_product:
+                 yz_cont_entropy, yz_disc_entropy = \
+                     self._compute_entropies_for_discrete_entry(
+                         array[:, yz_indices], entry, discrete_yz_idx,
+                         continuous_yz_idx, T, knn)
+                 yz_cmi += yz_cont_entropy
+                 yz_entropy -= yz_disc_entropy
+         else:
+             yz_cmi = self._compute_continuous_entropy(array[:, yz_indices],
+                                                       knn)
+             yz_entropy = 0.
+
+         h_yz = yz_cmi + yz_entropy
+
+         # compute entropies in the Z subspace
+         if len(z_cartesian_product) > 0:
+             z_cmi = 0.
+             z_entropy = 0.
+
+             for entry in z_cartesian_product:
+                 z_cont_entropy, z_disc_entropy = \
+                     self._compute_entropies_for_discrete_entry(
+                         array[:, z_indices], entry, discrete_z_idx,
+                         continuous_z_idx, T, knn)
+                 z_cmi += z_cont_entropy
+                 z_entropy -= z_disc_entropy
+         else:
+             z_cmi = self._compute_continuous_entropy(array[:, z_indices],
+                                                      knn)
+             z_entropy = 0.
+
+         h_z = z_cmi + z_entropy
+
+         # put it all together for the CMI estimation
+         val = h_xz + h_yz - h_xyz - h_z
+
+         if self.max_with_0 and val < 0.:
+             val = 0.
+
+         return val
+
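+     # The combination h_xz + h_yz - h_xyz - h_z above is the standard
+     # entropy decomposition of CMI (noted for reference, not part of the
+     # original file):
+     #     I(X; Y | Z) = H(X, Z) + H(Y, Z) - H(X, Y, Z) - H(Z).
+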
+     def _get_p_value(self, val, array, xyz, T, dim,
+                      data_type=None,
+                      sig_override=None):
+         """Returns the p-value from whichever significance function is
+         specified for this test. If an override is used, then it will call
+         a different function than the one specified by self.significance.
+
+         Parameters
+         ----------
+         val : float
+             Test statistic value.
+
+         array : array-like
+             data array with X, Y, Z in rows and observations in columns
+
+         xyz : array of ints
+             XYZ identifier array of shape (dim,).
+
+         T : int
+             Sample length.
+
+         dim : int
+             Dimensionality, i.e., number of features.
+
+         data_type : array-like
+             Binary data array of same shape as array which describes
+             whether individual samples in a variable (or all samples) are
+             continuous or discrete: 0s for continuous variables and 1s for
+             discrete variables.
+
+         sig_override : string
+             Must be in 'analytic', 'shuffle_test', 'fixed_thres'.
+
+         Returns
+         -------
+         pval : float or numpy.nan
+             P-value.
+         """
+         # Defaults to the self.significance member value
+         use_sig = self.significance
+         if sig_override is not None:
+             use_sig = sig_override
+         # Check if we are using the analytic significance
+         if use_sig == 'analytic':
+             pval = self.get_analytic_significance(value=val, T=T, dim=dim,
+                                                   xyz=xyz)
+         # Check if we are using the shuffle significance
+         elif use_sig == 'shuffle_test':
+             pval = self.get_shuffle_significance(array=array,
+                                                  xyz=xyz,
+                                                  value=val,
+                                                  data_type=data_type)
+         # Check if we are using the fixed_thres significance
+         elif use_sig == 'fixed_thres':
+             # Determined outside then
+             pval = None
+         else:
+             raise ValueError("%s not known." % self.significance)
+         return pval
+
+     def get_dependence_measure(self, array, xyz,
+                                data_type=None):
+         """Calls the appropriate function to estimate CMI.
+
+         Parameters
+         ----------
+         array : array-like
+             data array with X, Y, Z in rows and observations in columns
+
+         xyz : array of ints
+             XYZ identifier array of shape (dim,).
+
+         data_type : array-like
+             data array of same shape as array which describes whether
+             variables are continuous or discrete: 0s for continuous
+             variables and 1s for discrete variables
+
+         Returns
+         -------
+         val : float
+             Conditional mutual information estimate.
+         """
+         # check that the type mask is given and valid
+         if data_type is None:
+             raise ValueError("Type mask cannot be None for CMIknnMixed!")
+         if not np.all(np.isin(data_type, [0, 1])):
+             raise ValueError("Type mask contains other values than 0 and 1!")
+
+         if self.estimator == 'MS':
+             return self.get_dependence_measure_MS(array, xyz, data_type)
+         elif self.estimator == 'ZMADG':
+             return self.get_dependence_measure_ZMADG(array, xyz, data_type)
+         elif self.estimator == 'MSinf':
+             return self.get_dependence_measure_MSinf(array, xyz, data_type)
+         else:
+             raise ValueError('No such estimator available!')
+
+     # @jit(forceobj=True)
+     def get_restricted_permutation(self, T, shuffle_neighbors, neighbors,
+                                    order):
+
+         restricted_permutation = np.zeros(T, dtype=np.int32)
+         used = np.array([], dtype=np.int32)
+
+         for sample_index in order:
+             neighbors_to_use = neighbors[sample_index]
+             m = 0
+             use = neighbors_to_use[m]
+             # guard against neighbor lists shortened by np.unique
+             while (use in used) and \
+                     (m < min(shuffle_neighbors, len(neighbors_to_use)) - 1):
+                 m += 1
+                 use = neighbors_to_use[m]
+
+             restricted_permutation[sample_index] = use
+             used = np.append(used, use)
+
+         return restricted_permutation
+
+     # @jit(forceobj=True)
+     def _generate_random_permutation(self, array, neighbors, x_indices,
+                                      data_type):
+
+         T, dim = array.shape
+         # Generate random order in which to go through the indices loop in
+         # the next step
+         order = self.random_state.permutation(T).astype(np.int32)
+
+         n = np.empty(neighbors.shape[0], dtype=object)
+
+         for i in range(neighbors.shape[0]):
+             v = np.unique(neighbors[i])
+             # Shuffle neighbor indices for each sample index
+             self.random_state.shuffle(v)
+             n[i] = v
+
+         # Select a series of neighbor indices that contains as few
+         # duplicates as possible
+         restricted_permutation = self.get_restricted_permutation(
+             T=T,
+             shuffle_neighbors=self.shuffle_neighbors,
+             neighbors=n,
+             order=order)
+
+         array_shuffled = np.copy(array)
+         data_type_shuffled = np.copy(data_type)
+
+         for i in x_indices:
+             array_shuffled[:, i] = array[restricted_permutation, i]
+             data_type_shuffled[:, i] = data_type[restricted_permutation, i]
+
+         return array_shuffled, data_type_shuffled
+
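+     # How the shuffle works (sketch, not part of the original file): each
+     # sample i of X is replaced by the X-value of a sample j whose Z-value
+     # is among the shuffle_neighbors nearest neighbors of z_i, so the
+     # surrogate preserves the dependence of X on Z while breaking any
+     # dependence between X and Y beyond Z; see compute_perm_null_dist and
+     # get_shuffle_significance below.
+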
+     # @jit(forceobj=True)
+     def compute_perm_null_dist(self, array, xyz,
+                                data_type=None):
+         array = self._transform_mixed_data(array.T, data_type.T).T
+
+         # compute valid neighbors
+         narray, nxyz, ndata_type, discrete_idx_list = \
+             self._transform_to_one_hot_mixed(array, xyz, data_type,
+                                              zero_inf=True)
+         x_indices = np.where(nxyz == 0)[0]
+         z_indices = np.where(nxyz == 2)[0]
+
+         if self.verbosity > 2:
+             print("            nearest-neighbor shuffle significance "
+                   "test with n = %d and %d surrogates" % (
+                       self.shuffle_neighbors, self.sig_samples))
+
+         # Get nearest neighbors around each sample point in Z
+         z_array = np.array(narray[:, z_indices])
+
+         tree_xyz = spatial.cKDTree(z_array)
+         neighbors = tree_xyz.query(z_array,
+                                    k=self.shuffle_neighbors + 1,
+                                    p=np.inf,
+                                    workers=self.workers,
+                                    distance_upper_bound=9999999,
+                                    eps=0.)
+
+         # remove all neighbors at infinite distance, i.e. from another
+         # discrete class
+         valid_neighbors = np.ones(neighbors[1].shape)
+         # fill valid neighbors with the point itself -> if the distance is
+         # infinite, the neighbor will be the point itself
+         valid_neighbors = np.multiply(
+             valid_neighbors,
+             np.expand_dims(np.arange(valid_neighbors.shape[0]), axis=-1))
+
+         valid_neighbors[neighbors[0] != np.inf] = \
+             neighbors[1][neighbors[0] != np.inf]
+
+         null_dist = np.zeros(self.sig_samples)
+
+         for sam in range(self.sig_samples):
+             # permute the un-encoded array using the valid neighbors list
+             array_shuffled, data_type_shuffled = \
+                 self._generate_random_permutation(
+                     array,
+                     valid_neighbors,
+                     x_indices=np.where(xyz == 0)[0],
+                     data_type=data_type)
+
+             # use array instead of narray to avoid double encoding
+             null_dist[sam] = self.get_dependence_measure(
+                 array_shuffled.T,
+                 xyz,
+                 data_type=data_type_shuffled.T)
+
+         return null_dist
+
+     # @jit(forceobj=True)
+     def get_shuffle_significance(self, array, xyz, value,
+                                  return_null_dist=False,
+                                  data_type=None):
+         """Returns p-value for nearest-neighbor shuffle significance test.
+
+         For non-empty Z, overwrites get_shuffle_significance from the
+         parent class, which is a block shuffle test that does not preserve
+         dependencies of X and Y with Z. Here the parameter
+         shuffle_neighbors is used to permute only those values :math:`x_i`
+         and :math:`x_j` for which :math:`z_j` is among the nearest
+         neighbors of :math:`z_i`. If Z is empty, the block-shuffle test is
+         used.
+
+         Parameters
+         ----------
+         array : array-like
+             data array with X, Y, Z in rows and observations in columns
+
+         xyz : array of ints
+             XYZ identifier array of shape (dim,).
+
+         value : number
+             Value of test statistic for unshuffled estimate.
+
+         data_type : array-like
+             data array of same shape as array which describes whether
+             variables are continuous or discrete: 0s for continuous
+             variables and 1s for discrete variables
+
+         Returns
+         -------
+         pval : float
+             p-value
+         """
+         dim, T = array.shape
+         array = array.T
+         data_type = data_type.T
+
+         z_indices = np.where(xyz == 2)[0]
+
+         if len(z_indices) > 0 and self.shuffle_neighbors < T:
+             null_dist = self.compute_perm_null_dist(array, xyz, data_type)
+         else:
+             null_dist = \
+                 self._get_shuffle_dist(array.T, xyz,
+                                        sig_samples=self.sig_samples,
+                                        sig_blocklength=self.sig_blocklength,
+                                        data_type=data_type.T,
+                                        verbosity=self.verbosity)
+
+         pval = float(np.sum(null_dist >= value) + 1) / (self.sig_samples + 1)
+
+         if return_null_dist:
+             # Sort
+             null_dist.sort()
+             return pval, null_dist
+         return pval
+
+     def _get_shuffle_dist(self, array, xyz,
+                           sig_samples, sig_blocklength=None,
+                           data_type=None,
+                           verbosity=0):
+         """Returns shuffle distribution of test statistic.
+
+         The rows in array corresponding to the X-variable are shuffled
+         using a block-shuffle approach.
+
+         Parameters
+         ----------
+         array : array-like
+             data array with X, Y, Z in rows and observations in columns
+
+         xyz : array of ints
+             XYZ identifier array of shape (dim,).
+
+         sig_samples : int, optional (default: 100)
+             Number of samples for shuffle significance test.
+
+         sig_blocklength : int, optional (default: None)
+             Block length for block-shuffle significance test. If None, the
+             block length is determined from the decay of the autocovariance
+             as explained in [4].
+
+         data_type : array-like
+             data array of same shape as array which describes whether
+             variables are continuous or discrete: 0s for continuous
+             variables and 1s for discrete variables
+
+         verbosity : int, optional (default: 0)
+             Level of verbosity.
+
+         Returns
+         -------
+         null_dist : array of shape (sig_samples,)
+             Contains the test statistic values estimated from the shuffled
+             arrays.
+         """
+         dim, T = array.shape
+
+         x_indices = np.where(xyz == 0)[0]
+         dim_x = len(x_indices)
+
+         if sig_blocklength is None:
+             sig_blocklength = self._get_block_length(array, xyz,
+                                                      mode='significance')
+
+         n_blks = int(math.floor(float(T)/sig_blocklength))
+         if verbosity > 2:
+             print("            Significance test with block-length = %d "
+                   "..." % (sig_blocklength))
+
+         array_shuffled = np.copy(array)
+         data_type_shuffled = np.copy(data_type)
+         block_starts = np.arange(0, T - sig_blocklength + 1, sig_blocklength)
+
+         # Dividing the array up into n_blks of length sig_blocklength may
+         # leave a tail. This tail is later randomly inserted
+         tail = array[x_indices, n_blks*sig_blocklength:]
+         tail_type = data_type_shuffled[x_indices, n_blks*sig_blocklength:]
+
+         null_dist = np.zeros(sig_samples)
+         for sam in range(sig_samples):
+
+             blk_starts = self.random_state.permutation(block_starts)[:n_blks]
+
+             x_shuffled = np.zeros((dim_x, n_blks*sig_blocklength),
+                                   dtype=array.dtype)
+             type_x_shuffled = np.zeros((dim_x, n_blks*sig_blocklength),
+                                        dtype=array.dtype)
+
+             for i, index in enumerate(x_indices):
+                 for blk in range(sig_blocklength):
+                     x_shuffled[i, blk::sig_blocklength] = \
+                         array[index, blk_starts + blk]
+                     type_x_shuffled[i, blk::sig_blocklength] = \
+                         data_type[index, blk_starts + blk]
+
+             # Insert tail randomly somewhere
+             if tail.shape[1] > 0:
+                 insert_tail_at = self.random_state.choice(block_starts)
+                 x_shuffled = np.insert(x_shuffled, insert_tail_at,
+                                        tail.T, axis=1)
+                 type_x_shuffled = np.insert(type_x_shuffled, insert_tail_at,
+                                             tail_type.T, axis=1)
+
+             for i, index in enumerate(x_indices):
+                 array_shuffled[index] = x_shuffled[i]
+                 data_type_shuffled[index] = type_x_shuffled[i]
+
+             null_dist[sam] = self.get_dependence_measure(
+                 array=array_shuffled,
+                 xyz=xyz,
+                 data_type=data_type_shuffled)
+
+         return null_dist
+
+
+ if __name__ == '__main__':
+
+     import tigramite
+     from tigramite.data_processing import DataFrame
+     import tigramite.data_processing as pp
+     import numpy as np
+
+     from tigramite.independence_tests.cmiknn import CMIknn
+
+     random_state = np.random.default_rng(seed=None)
+     cmi = CMIknnMixed(seed=None)
+
+     T = 500
+     dimz = 1
+
+     # Continuous data
+     z = random_state.standard_normal((T, dimz))
+     x = random_state.standard_normal(T).reshape(T, 1)
+     y = (5.*z[:, 0] + 0.*x[:, 0]
+          + random_state.standard_normal(T)).reshape(T, 1)
+
+     print(cmi.get_dependence_measure_raw(
+         x=x, y=y, z=z,
+         x_type=np.zeros(x.shape),
+         y_type=np.zeros(y.shape),
+         z_type=np.zeros(z.shape)))
+
+     print(cmi.run_test_raw(
+         x=x, y=y, z=z,
+         x_type=np.zeros(x.shape),
+         y_type=np.zeros(y.shape),
+         z_type=np.zeros(z.shape)))
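+
+     # Mixed continuous/discrete example (illustrative sketch, not part of
+     # the original file; assumes run_test_raw accepts z=None as in the
+     # continuous call above). The data_type arrays mark discrete samples
+     # with 1 and continuous samples with 0.
+     x_disc = random_state.integers(0, 2, size=(T, 1)).astype(float)
+     y_mix = (x_disc[:, 0] + random_state.standard_normal(T)).reshape(T, 1)
+
+     print(cmi.run_test_raw(
+         x=x_disc, y=y_mix, z=None,
+         x_type=np.ones(x_disc.shape),
+         y_type=np.zeros(y_mix.shape)))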