tigramite-fast 5.2.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. tigramite/__init__.py +0 -0
  2. tigramite/causal_effects.py +1525 -0
  3. tigramite/causal_mediation.py +1592 -0
  4. tigramite/data_processing.py +1574 -0
  5. tigramite/graphs.py +1509 -0
  6. tigramite/independence_tests/LBFGS.py +1114 -0
  7. tigramite/independence_tests/__init__.py +0 -0
  8. tigramite/independence_tests/cmiknn.py +661 -0
  9. tigramite/independence_tests/cmiknn_mixed.py +1397 -0
  10. tigramite/independence_tests/cmisymb.py +286 -0
  11. tigramite/independence_tests/gpdc.py +664 -0
  12. tigramite/independence_tests/gpdc_torch.py +820 -0
  13. tigramite/independence_tests/gsquared.py +190 -0
  14. tigramite/independence_tests/independence_tests_base.py +1310 -0
  15. tigramite/independence_tests/oracle_conditional_independence.py +1582 -0
  16. tigramite/independence_tests/pairwise_CI.py +383 -0
  17. tigramite/independence_tests/parcorr.py +369 -0
  18. tigramite/independence_tests/parcorr_mult.py +485 -0
  19. tigramite/independence_tests/parcorr_wls.py +451 -0
  20. tigramite/independence_tests/regressionCI.py +403 -0
  21. tigramite/independence_tests/robust_parcorr.py +403 -0
  22. tigramite/jpcmciplus.py +966 -0
  23. tigramite/lpcmci.py +3649 -0
  24. tigramite/models.py +2257 -0
  25. tigramite/pcmci.py +3935 -0
  26. tigramite/pcmci_base.py +1218 -0
  27. tigramite/plotting.py +4735 -0
  28. tigramite/rpcmci.py +467 -0
  29. tigramite/toymodels/__init__.py +0 -0
  30. tigramite/toymodels/context_model.py +261 -0
  31. tigramite/toymodels/non_additive.py +1231 -0
  32. tigramite/toymodels/structural_causal_processes.py +1201 -0
  33. tigramite/toymodels/surrogate_generator.py +319 -0
  34. tigramite_fast-5.2.10.1.dist-info/METADATA +182 -0
  35. tigramite_fast-5.2.10.1.dist-info/RECORD +38 -0
  36. tigramite_fast-5.2.10.1.dist-info/WHEEL +5 -0
  37. tigramite_fast-5.2.10.1.dist-info/licenses/license.txt +621 -0
  38. tigramite_fast-5.2.10.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,664 @@
1
+ """Tigramite causal discovery for time series."""
2
+
3
+ # Author: Jakob Runge <jakob@jakob-runge.com>
4
+ #
5
+ # License: GNU General Public License v3.0
6
+
7
+ from __future__ import print_function
8
+ import json, warnings, os, pathlib
9
+ import numpy as np
10
+ import dcor
11
+ from sklearn import gaussian_process
12
+ from .independence_tests_base import CondIndTest
13
+
14
class GaussProcReg():
    r"""Gaussian processes abstract base class.

    GP is estimated with scikit-learn and allows to flexibly specify kernels and
    hyperparameters or let them be optimized automatically. The kernel specifies
    the covariance function of the GP. Parameters can be passed on to
    ``GaussianProcessRegressor`` using the gp_params dictionary. If None is
    passed, the kernel '1.0 * RBF(1.0) + WhiteKernel()' is used with alpha=0 as
    default. Note that the kernel's hyperparameters are optimized during
    fitting.

    When the null distribution is not analytically available, but can be
    precomputed with the function generate_and_save_nulldists(...) which saves
    a \*.npz file containing the null distribution for different sample sizes.
    This file can then be supplied as null_dist_filename.

    Assumes one-dimensional X, Y. But can be combined with PairwiseMultCI to
    obtain a test for multivariate X, Y.

    Parameters
    ----------
    null_samples : int
        Number of null samples to use.

    cond_ind_test : CondIndTest
        Conditional independence test that this Gaussian Process Regressor will
        calculate the null distribution for. This is used to grab the
        get_dependence_measure function.

    gp_params : dictionary, optional (default: None)
        Dictionary with parameters for ``GaussianProcessRegressor``.

    null_dist_filename : str, optional (default: None)
        Path to file containing null distribution.

    verbosity : int, optional (default: 0)
        Level of verbosity.
    """
    def __init__(self,
                 null_samples,
                 cond_ind_test,
                 gp_params=None,
                 null_dist_filename=None,
                 verbosity=0):
        # The conditional independence test supplies get_dependence_measure
        # and the random_state used when sampling null distributions.
        self.cond_ind_test = cond_ind_test
        # Set member variables
        self.gp_params = gp_params
        self.verbosity = verbosity
        # Null-distribution defaults: null_dists maps sample size -> sorted
        # array of null test-statistic values.
        self.null_samples = null_samples
        self.null_dists = {}
        self.null_dist_filename = null_dist_filename
        # If a cache file is given, load it now; this overwrites both the
        # null_dists dict and null_samples with the file's contents.
        if self.null_dist_filename is not None:
            self.null_dists, self.null_samples = \
                self._load_nulldist(self.null_dist_filename)

    def _load_nulldist(self, filename):
        r"""
        Load a precomputed null distribution from a \*.npz file. This
        distribution can be calculated using generate_and_save_nulldists(...).

        Parameters
        ----------
        filename : string
            Path to the \*.npz file.

        Returns
        -------
        null_dists, null_samples : dict, int
            The null distribution as a dictionary of distributions keyed by
            sample size, the number of null samples in total.
        """
        null_dist_file = np.load(filename)
        # The file stores parallel arrays: 'T' (sample sizes) and
        # 'exact_dist' (one null distribution per sample size).
        null_dists = dict(zip(null_dist_file['T'],
                              null_dist_file['exact_dist']))
        null_samples = len(null_dist_file['exact_dist'][0])
        return null_dists, null_samples

    def _generate_nulldist(self, df,
                           add_to_null_dists=True):
        """Generates null distribution for pairwise independence tests.

        Generates the null distribution for sample size df. Assumes pairwise
        samples transformed to uniform marginals. Uses get_dependence_measure
        available in class and generates self.sig_samples random samples. Adds
        the null distributions to self.null_dists.

        Parameters
        ----------
        df : int
            Degrees of freedom / sample size to generate null distribution for.
        add_to_null_dists : bool, optional (default: True)
            Whether to add the null dist to the dictionary of null dists or
            just return it.

        Returns
        -------
        null_dist : array of shape [df,]
            Only returned, if add_to_null_dists is False.
        """

        if self.verbosity > 0:
            print("Generating null distribution for df = %d. " % df)
            if add_to_null_dists:
                print("For faster computations, run function "
                      "generate_and_save_nulldists(...) to "
                      "precompute null distribution and load *.npz file with "
                      "argument null_dist_filename")

        # Identifier array for a pairwise (X, Y) test.
        xyz = np.array([0,1])

        # Sample the test statistic under independence: two independent
        # uniform rows of length df per draw.
        null_dist = np.zeros(self.null_samples)
        for i in range(self.null_samples):
            array = self.cond_ind_test.random_state.random((2, df))
            null_dist[i] = self.cond_ind_test.get_dependence_measure(array, xyz)

        # Sorted in place so quantiles/p-values can be read off directly.
        null_dist.sort()
        if add_to_null_dists:
            self.null_dists[df] = null_dist
        return null_dist

    def _generate_and_save_nulldists(self, sample_sizes, null_dist_filename):
        """Generates and saves null distribution for pairwise independence
        tests.

        Generates the null distribution for different sample sizes. Calls
        generate_nulldist. Null dists are saved to disk as
        self.null_dist_filename.npz. Also adds the null distributions to
        self.null_dists.

        Parameters
        ----------
        sample_sizes : list
            List of sample sizes.

        null_dist_filename : str
            Name to save file containing null distributions.
        """

        self.null_dist_filename = null_dist_filename

        null_dists = np.zeros((len(sample_sizes), self.null_samples))

        for iT, T in enumerate(sample_sizes):
            null_dists[iT] = self._generate_nulldist(T, add_to_null_dists=False)
            self.null_dists[T] = null_dists[iT]

        # Layout matches what _load_nulldist expects: 'exact_dist' rows
        # parallel to the 'T' sample-size array.
        np.savez("%s" % null_dist_filename,
                 exact_dist=null_dists,
                 T=np.array(sample_sizes))

    def _get_single_residuals(self, array, target_var,
                              return_means=False,
                              standardize=True,
                              return_likelihood=False):
        """Returns residuals of Gaussian process regression.

        Performs a GP regression of the variable indexed by target_var on the
        conditions Z. Here array is assumed to contain X and Y as the first two
        rows with the remaining rows (if present) containing the conditions Z.
        Optionally returns the estimated mean and the likelihood.

        NOTE: when standardize is True, `array` is modified in place.

        Parameters
        ----------
        array : array-like
            data array with X, Y, Z in rows and observations in columns

        target_var : {0, 1}
            Variable to regress out conditions from.

        standardize : bool, optional (default: True)
            Whether to standardize the array beforehand.

        return_means : bool, optional (default: False)
            Whether to return the estimated regression line.

        return_likelihood : bool, optional (default: False)
            Whether to return the log_marginal_likelihood of the fitted GP.

        Returns
        -------
        resid [, mean, likelihood] : array-like
            The residual of the regression and optionally the estimated mean
            and/or the likelihood.
        """
        dim, T = array.shape

        if self.gp_params is None:
            self.gp_params = {}

        # No conditions Z present: nothing to regress out, return the raw
        # series (with -inf likelihood since no GP was fitted).
        if dim <= 2:
            if return_likelihood:
                return array[target_var, :], -np.inf
            return array[target_var, :]

        # Standardize (in place); rows with zero std are left uncentered-scaled
        # to avoid division by zero.
        if standardize:
            array -= array.mean(axis=1).reshape(dim, 1)
            std = array.std(axis=1)
            for i in range(dim):
                if std[i] != 0.:
                    array[i] /= std[i]
            if np.any(std == 0.) and self.verbosity > 0:
                warnings.warn("Possibly constant array!")
            # array /= array.std(axis=1).reshape(dim, 1)
            # if np.isnan(array).sum() != 0:
            #     raise ValueError("nans after standardizing, "
            #                      "possibly constant array!")

        target_series = array[target_var, :]
        # Conditions as (samples, features) for sklearn; copy so the GP fit
        # does not alias the (possibly standardized) input array.
        z = array[2:].T.copy()
        if np.ndim(z) == 1:
            z = z.reshape(-1, 1)


        # Overwrite default kernel and alpha values; remaining gp_params are
        # passed through to GaussianProcessRegressor.
        params = self.gp_params.copy()
        if 'kernel' not in list(self.gp_params):
            kernel = gaussian_process.kernels.RBF() +\
                     gaussian_process.kernels.WhiteKernel()
        else:
            kernel = self.gp_params['kernel']
            del params['kernel']

        if 'alpha' not in list(self.gp_params):
            alpha = 0.
        else:
            alpha = self.gp_params['alpha']
            del params['alpha']

        gp = gaussian_process.GaussianProcessRegressor(kernel=kernel,
                                                       alpha=alpha,
                                                       **params)

        gp.fit(z, target_series.reshape(-1, 1))

        if self.verbosity > 3:
            print(kernel, alpha, gp.kernel_, gp.alpha)

        if return_likelihood:
            likelihood = gp.log_marginal_likelihood()

        mean = gp.predict(z).squeeze()

        resid = target_series - mean

        if return_means and not return_likelihood:
            return (resid, mean)
        elif return_likelihood and not return_means:
            return (resid, likelihood)
        elif return_means and return_likelihood:
            return resid, mean, likelihood
        return resid

    def _get_model_selection_criterion(self, j, parents, tau_max=0):
        """Returns log marginal likelihood for GP regression.

        Fits a GP model of the parents to variable j and returns the negative
        log marginal likelihood as a model selection score. Is used to determine
        optimal hyperparameters in PCMCI, in particular the pc_alpha value.

        Parameters
        ----------
        j : int
            Index of target variable in data array.

        parents : list
            List of form [(0, -1), (3, -2), ...] containing parents.

        tau_max : int, optional (default: 0)
            Maximum time lag. This may be used to make sure that estimates for
            different lags in X, Z, all have the same sample size.

        Returns:
        score : float
            Model score (negative log marginal likelihood; lower is better).
        """

        Y = [(j, 0)]
        X = [(j, 0)]  # dummy variable here; only Y and Z=parents matter
        Z = parents
        array, xyz, _ = \
            self.cond_ind_test.dataframe.construct_array(
                X=X, Y=Y, Z=Z,
                tau_max=tau_max,
                mask_type=self.cond_ind_test.mask_type,
                return_cleaned_xyz=False,
                do_checks=True,
                verbosity=self.verbosity)

        dim, T = array.shape

        # target_var=1 regresses Y (row 1) on the parents Z (rows 2:).
        _, logli = self._get_single_residuals(array,
                                              target_var=1,
                                              return_likelihood=True)

        score = -logli
        return score
314
+
315
class GPDC(CondIndTest):
    r"""GPDC conditional independence test based on Gaussian processes and distance correlation.

    To test :math:`X \perp Y | Z`, the conditions :math:`Z` are first
    regressed out of :math:`X` and :math:`Y` by a Gaussian process (GP)
    regression (estimated with scikit-learn), assuming the model

    .. math:: X & = f_X(Z) + \epsilon_{X} \\
        Y & = f_Y(Z) + \epsilon_{Y} \\
        \epsilon_{X,Y} &\sim \mathcal{N}(0, \sigma^2)

    where :math:`\sigma^2` and the kernel bandwidth are optimized by
    ``sklearn``. Kernels and hyperparameters can be specified via gp_params
    or optimized automatically. The residuals are then transformed to
    uniform marginals, yielding :math:`r_X, r_Y`, and their dependence is
    measured by the distance correlation
    :math:`\mathcal{R}\left(r_X, r_Y\right)` of [2]_.

    The null distribution of the distance correlation is not analytically
    available. It should be precomputed with
    generate_and_save_nulldists(...), which writes a \*.npz file containing
    null distributions for different sample sizes; that file can then be
    supplied as null_dist_filename. Otherwise the null distribution is
    generated at runtime.

    References
    ----------
    .. [2] Gabor J. Szekely, Maria L. Rizzo, and Nail K. Bakirov: Measuring and
           testing dependence by correlation of distances,
           https://arxiv.org/abs/0803.4101

    Parameters
    ----------
    null_dist_filename : str, optional (default: None)
        Path to file containing null distribution.

    gp_params : dictionary, optional (default: None)
        Dictionary with parameters for ``GaussianProcessRegressor``.

    **kwargs :
        Arguments passed on to parent class CondIndTest.
    """

    @property
    def measure(self):
        """Name of this independence-test measure."""
        return self._measure

    def __init__(self,
                 null_dist_filename=None,
                 gp_params=None,
                 **kwargs):
        # Test characteristics consumed by the CondIndTest machinery.
        self._measure = 'gp_dc'
        self.two_sided = False
        self.residual_based = True

        CondIndTest.__init__(self, **kwargs)

        # The regressor object carries out the GP fits and caches/loads
        # the null distributions.
        self.gauss_pr = GaussProcReg(self.sig_samples,
                                     self,
                                     gp_params=gp_params,
                                     null_dist_filename=null_dist_filename,
                                     verbosity=self.verbosity)

        if self.verbosity > 0:
            print("null_dist_filename = %s" % self.gauss_pr.null_dist_filename)
            if self.gauss_pr.gp_params is not None:
                for key in list(self.gauss_pr.gp_params):
                    print("%s = %s" % (key, self.gauss_pr.gp_params[key]))
            print("")

    def _load_nulldist(self, filename):
        r"""Load a precomputed null distribution from a \*.npz file.

        Such a file can be created with generate_and_save_nulldists(...).

        Parameters
        ----------
        filename : string
            Path to the \*.npz file.

        Returns
        -------
        null_dists, null_samples : dict, int
            Null distributions keyed by sample size, and the total number of
            null samples.
        """
        return self.gauss_pr._load_nulldist(filename)

    def generate_nulldist(self, df, add_to_null_dists=True):
        """Generate the null distribution for sample size df.

        Assumes pairwise samples transformed to uniform marginals and draws
        self.sig_samples random samples using get_dependence_measure. The
        result is cached in self.gauss_pr.null_dists when add_to_null_dists
        is True.

        Parameters
        ----------
        df : int
            Degrees of freedom / sample size to generate null distribution for.

        add_to_null_dists : bool, optional (default: True)
            Whether to add the null dist to the dictionary of null dists or
            just return it.

        Returns
        -------
        null_dist : array of shape [df,]
            Only returned, if add_to_null_dists is False.
        """
        return self.gauss_pr._generate_nulldist(df, add_to_null_dists)

    def generate_and_save_nulldists(self, sample_sizes, null_dist_filename):
        """Generate null distributions for several sample sizes and save them.

        Calls generate_nulldist per sample size, stores the results in
        self.gauss_pr.null_dists, and writes them to disk as
        null_dist_filename (\\*.npz).

        Parameters
        ----------
        sample_sizes : list
            List of sample sizes.

        null_dist_filename : str
            Name to save file containing null distributions.
        """
        self.gauss_pr._generate_and_save_nulldists(sample_sizes,
                                                   null_dist_filename)

    def _get_single_residuals(self, array, target_var,
                              return_means=False,
                              standardize=True,
                              return_likelihood=False):
        """Return residuals of a GP regression of target_var on Z.

        Thin wrapper around GaussProcReg._get_single_residuals; array is
        assumed to hold X and Y in its first two rows, with any remaining
        rows containing the conditions Z.

        Parameters
        ----------
        array : array-like
            data array with X, Y, Z in rows and observations in columns

        target_var : {0, 1}
            Variable to regress out conditions from.

        return_means : bool, optional (default: False)
            Whether to return the estimated regression line.

        standardize : bool, optional (default: True)
            Whether to standardize the array beforehand.

        return_likelihood : bool, optional (default: False)
            Whether to return the log_marginal_likelihood of the fitted GP.

        Returns
        -------
        resid [, mean, likelihood] : array-like
            The residual of the regression and optionally the estimated mean
            and/or the likelihood.
        """
        return self.gauss_pr._get_single_residuals(
            array, target_var,
            return_means,
            standardize,
            return_likelihood)

    def get_model_selection_criterion(self, j, parents, tau_max=0):
        """Return the negative log marginal likelihood of a GP fit.

        Fits a GP model of the parents to variable j; used to select
        hyperparameters in PCMCI, in particular pc_alpha.

        Parameters
        ----------
        j : int
            Index of target variable in data array.

        parents : list
            List of form [(0, -1), (3, -2), ...] containing parents.

        tau_max : int, optional (default: 0)
            Maximum time lag. This may be used to make sure that estimates for
            different lags in X, Z, all have the same sample size.

        Returns:
        score : float
            Model score.
        """
        return self.gauss_pr._get_model_selection_criterion(j, parents, tau_max)

    def get_dependence_measure(self, array, xyz, data_type=None):
        """Return the GPDC test statistic.

        Computed as the distance correlation of the residuals after GP
        regression of X and Y on Z.

        Parameters
        ----------
        array : array-like
            data array with X, Y, Z in rows and observations in columns

        xyz : array of ints
            XYZ identifier array of shape (dim,).

        Returns
        -------
        val : float
            GPDC test statistic.
        """
        resid_x = self._get_single_residuals(array, target_var=0)
        resid_y = self._get_single_residuals(array, target_var=1)
        return self._get_dcorr(np.array([resid_x, resid_y]))

    def _get_dcorr(self, array_resid):
        r"""Return the distance correlation coefficient of the residuals.

        Both variables are first transformed to uniform marginals via the
        empirical cumulative distribution function. The null distribution of
        this statistic is not analytically available; precompute it with
        generate_and_save_nulldists(...) and supply the resulting *.npz file
        as null_dist_filename.

        Parameters
        ----------
        array_resid : array-like
            data array must be of shape (2, T)

        Returns
        -------
        val : float
            Distance correlation coefficient.
        """
        uniform_x, uniform_y = self._trafo2uniform(array_resid)
        return dcor.distance_correlation(uniform_x, uniform_y, method='AVL')

    def get_shuffle_significance(self, array, xyz, value,
                                 return_null_dist=False,
                                 data_type=None):
        """Return the p-value of a shuffle significance test.

        Since the statistic is residual-based, only the residuals are
        shuffled.

        Parameters
        ----------
        array : array-like
            data array with X, Y, Z in rows and observations in columns

        xyz : array of ints
            XYZ identifier array of shape (dim,).

        value : number
            Value of test statistic for unshuffled estimate.

        Returns
        -------
        pval : float
            p-value
        """
        # Residuals of X (row 0) and Y (row 1) after regressing out Z.
        residuals = np.array(
            [self._get_single_residuals(array, target_var=k) for k in (0, 1)])
        xyz_resid = np.array([0, 1])

        null_dist = self._get_shuffle_dist(residuals, xyz_resid,
                                           self.get_dependence_measure,
                                           sig_samples=self.sig_samples,
                                           sig_blocklength=self.sig_blocklength,
                                           verbosity=self.verbosity)

        # Add-one correction keeps the p-value strictly positive.
        pval = float(np.sum(null_dist >= value) + 1) / (self.sig_samples + 1)

        if return_null_dist:
            return pval, null_dist
        return pval

    def get_analytic_significance(self, value, T, dim, xyz):
        """Return the p-value for the distance correlation coefficient.

        Looks up the null distribution for the required degrees of freedom,
        generating it on the fly if no precomputed one is available (it is
        recommended to precompute a wide range of sample sizes with
        generate_and_save_nulldists(...)). The test is one-sided. Returns
        numpy.nan if the degrees of freedom are less than 1.

        Parameters
        ----------
        value : float
            Test statistic value.

        T : int
            Sample length

        dim : int
            Dimensionality, ie, number of features.

        xyz : array of ints
            XYZ identifier array of shape (dim,).

        Returns
        -------
        pval : float or numpy.nan
            p-value.
        """
        # GP regression approximately doesn't cost degrees of freedom.
        df = T
        if df < 1:
            return np.nan

        if int(df) not in list(self.gauss_pr.null_dists):
            if self.verbosity > 0:
                print("Null distribution for GPDC not available "
                      "for deg. of freed. = %d." % df)
            self.generate_nulldist(df)

        null_dist_here = self.gauss_pr.null_dists[int(df)]
        return np.mean(null_dist_here > np.abs(value))
664
+