tigramite-fast 5.2.10.1 (tigramite_fast-5.2.10.1-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. tigramite/__init__.py +0 -0
  2. tigramite/causal_effects.py +1525 -0
  3. tigramite/causal_mediation.py +1592 -0
  4. tigramite/data_processing.py +1574 -0
  5. tigramite/graphs.py +1509 -0
  6. tigramite/independence_tests/LBFGS.py +1114 -0
  7. tigramite/independence_tests/__init__.py +0 -0
  8. tigramite/independence_tests/cmiknn.py +661 -0
  9. tigramite/independence_tests/cmiknn_mixed.py +1397 -0
  10. tigramite/independence_tests/cmisymb.py +286 -0
  11. tigramite/independence_tests/gpdc.py +664 -0
  12. tigramite/independence_tests/gpdc_torch.py +820 -0
  13. tigramite/independence_tests/gsquared.py +190 -0
  14. tigramite/independence_tests/independence_tests_base.py +1310 -0
  15. tigramite/independence_tests/oracle_conditional_independence.py +1582 -0
  16. tigramite/independence_tests/pairwise_CI.py +383 -0
  17. tigramite/independence_tests/parcorr.py +369 -0
  18. tigramite/independence_tests/parcorr_mult.py +485 -0
  19. tigramite/independence_tests/parcorr_wls.py +451 -0
  20. tigramite/independence_tests/regressionCI.py +403 -0
  21. tigramite/independence_tests/robust_parcorr.py +403 -0
  22. tigramite/jpcmciplus.py +966 -0
  23. tigramite/lpcmci.py +3649 -0
  24. tigramite/models.py +2257 -0
  25. tigramite/pcmci.py +3935 -0
  26. tigramite/pcmci_base.py +1218 -0
  27. tigramite/plotting.py +4735 -0
  28. tigramite/rpcmci.py +467 -0
  29. tigramite/toymodels/__init__.py +0 -0
  30. tigramite/toymodels/context_model.py +261 -0
  31. tigramite/toymodels/non_additive.py +1231 -0
  32. tigramite/toymodels/structural_causal_processes.py +1201 -0
  33. tigramite/toymodels/surrogate_generator.py +319 -0
  34. tigramite_fast-5.2.10.1.dist-info/METADATA +182 -0
  35. tigramite_fast-5.2.10.1.dist-info/RECORD +38 -0
  36. tigramite_fast-5.2.10.1.dist-info/WHEEL +5 -0
  37. tigramite_fast-5.2.10.1.dist-info/licenses/license.txt +621 -0
  38. tigramite_fast-5.2.10.1.dist-info/top_level.txt +1 -0
tigramite/independence_tests/gpdc_torch.py
@@ -0,0 +1,820 @@
+ """Tigramite causal discovery for time series."""
+
+ # Author: Jakob Runge <jakob@jakob-runge.com>
+ #
+ # License: GNU General Public License v3.0
+
+ from __future__ import print_function
+ import json, warnings, os, pathlib
+ import numpy as np
+ import gc
+ import dcor
+ import torch
+ import gpytorch
+ from .LBFGS import FullBatchLBFGS
+ from .independence_tests_base import CondIndTest
+
+ class GaussProcRegTorch():
+     r"""Gaussian processes abstract base class.
+
+     GP is estimated with gpytorch. Note that the kernel's hyperparameters are
+     optimized during fitting.
+
+     The null distribution is not analytically available, but can be
+     precomputed with the function generate_and_save_nulldists(...), which
+     saves a \*.npz file containing the null distribution for different sample
+     sizes. This file can then be supplied as null_dist_filename.
+
+     Assumes one-dimensional X, Y, but can be combined with PairwiseMultCI to
+     obtain a test for multivariate X, Y.
+
+     Parameters
+     ----------
+     null_samples : int
+         Number of null samples to use.
+
+     cond_ind_test : CondIndTest
+         Conditional independence test that this Gaussian Process Regressor
+         will calculate the null distribution for. This is used to grab the
+         get_dependence_measure function.
+
+     null_dist_filename : str, optional (default: None)
+         Path to file containing null distribution.
+
+     verbosity : int, optional (default: 0)
+         Level of verbosity.
+     """
+
+     def __init__(self,
+                  null_samples,
+                  cond_ind_test,
+                  null_dist_filename=None,
+                  checkpoint_size=None,
+                  verbosity=0):
+         # Set the dependence measure function
+         self.cond_ind_test = cond_ind_test
+         # Set member variables
+         self.verbosity = verbosity
+         # Set the null distribution defaults
+         self.null_samples = null_samples
+         self.null_dists = {}
+         self.null_dist_filename = null_dist_filename
+         # Check if we are loading a null distribution from a cached file
+         if self.null_dist_filename is not None:
+             self.null_dists, self.null_samples = \
+                 self._load_nulldist(self.null_dist_filename)
+         # Size for batching
+         self.checkpoint_size = checkpoint_size
+
+     def _load_nulldist(self, filename):
+         r"""
+         Load a precomputed null distribution from a \*.npz file. This
+         distribution can be calculated using generate_and_save_nulldists(...).
+
+         Parameters
+         ----------
+         filename : string
+             Path to the \*.npz file.
+
+         Returns
+         -------
+         null_dists, null_samples : dict, int
+             The null distribution as a dictionary of distributions keyed by
+             sample size, and the number of null samples in total.
+         """
+         null_dist_file = np.load(filename)
+         null_dists = dict(zip(null_dist_file['T'],
+                               null_dist_file['exact_dist']))
+         null_samples = len(null_dist_file['exact_dist'][0])
+         return null_dists, null_samples
+
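For orientation, the \*.npz file loaded here is expected to hold two arrays, 'T' (the sample sizes) and 'exact_dist' (one sorted null distribution per sample size), exactly as written by _generate_and_save_nulldists below. A minimal inspection sketch; the filename is illustrative only:

    import numpy as np

    null_dist_file = np.load("gpdc_nulldists.npz")  # hypothetical file from generate_and_save_nulldists
    print(null_dist_file['T'])                      # sample sizes, e.g. [100 250 500]
    print(null_dist_file['exact_dist'].shape)       # (number of sample sizes, null_samples)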
+     def _generate_nulldist(self, df,
+                            add_to_null_dists=True):
+         """Generates null distribution for pairwise independence tests.
+
+         Generates the null distribution for sample size df. Assumes pairwise
+         samples transformed to uniform marginals. Uses get_dependence_measure
+         available in class and generates self.sig_samples random samples. Adds
+         the null distributions to self.null_dists.
+
+         Parameters
+         ----------
+         df : int
+             Degrees of freedom / sample size to generate null distribution for.
+         add_to_null_dists : bool, optional (default: True)
+             Whether to add the null dist to the dictionary of null dists or
+             just return it.
+
+         Returns
+         -------
+         null_dist : array of shape [null_samples,]
+             Only returned if add_to_null_dists is False.
+         """
+
+         if self.verbosity > 0:
+             print("Generating null distribution for df = %d. " % df)
+             if add_to_null_dists:
+                 print("For faster computations, run function "
+                       "generate_and_save_nulldists(...) to "
+                       "precompute null distribution and load *.npz file with "
+                       "argument null_dist_filename")
+
+         xyz = np.array([0, 1])
+
+         null_dist = np.zeros(self.null_samples)
+         for i in range(self.null_samples):
+             array = self.cond_ind_test.random_state.random((2, df))
+             null_dist[i] = self.cond_ind_test.get_dependence_measure(
+                 array, xyz)
+
+         null_dist.sort()
+         if add_to_null_dists:
+             self.null_dists[df] = null_dist
+         return null_dist
+
+     def _generate_and_save_nulldists(self, sample_sizes, null_dist_filename):
+         """Generates and saves null distribution for pairwise independence
+         tests.
+
+         Generates the null distribution for different sample sizes. Calls
+         generate_nulldist. Null dists are saved to disk as
+         self.null_dist_filename.npz. Also adds the null distributions to
+         self.null_dists.
+
+         Parameters
+         ----------
+         sample_sizes : list
+             List of sample sizes.
+
+         null_dist_filename : str
+             Name of the file to save the null distributions to.
+         """
+
+         self.null_dist_filename = null_dist_filename
+
+         null_dists = np.zeros((len(sample_sizes), self.null_samples))
+
+         for iT, T in enumerate(sample_sizes):
+             null_dists[iT] = self._generate_nulldist(
+                 T, add_to_null_dists=False)
+             self.null_dists[T] = null_dists[iT]
+
+         np.savez("%s" % null_dist_filename,
+                  exact_dist=null_dists,
+                  T=np.array(sample_sizes))
+
+
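The precompute-and-reuse workflow these two methods support looks roughly as follows; a minimal sketch using the public GPDCtorch wrapper defined later in this file (sample sizes and filename are illustrative):

    from tigramite.independence_tests.gpdc_torch import GPDCtorch

    # One-off: generate sorted null distributions for the sample sizes that
    # will occur in the analysis and save them to an .npz file.
    gpdc = GPDCtorch(significance='analytic')
    gpdc.generate_and_save_nulldists(sample_sizes=[100, 250, 500],
                                     null_dist_filename='gpdc_nulldists.npz')

    # Later runs load the cached distributions instead of regenerating them.
    gpdc = GPDCtorch(significance='analytic',
                     null_dist_filename='gpdc_nulldists.npz')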
+     def _get_single_residuals(self, array, target_var,
+                               return_means=False,
+                               standardize=True,
+                               return_likelihood=False,
+                               training_iter=50,
+                               lr=0.1):
+         """Returns residuals of Gaussian process regression.
+
+         Performs a GP regression of the variable indexed by target_var on the
+         conditions Z. Here array is assumed to contain X and Y as the first two
+         rows with the remaining rows (if present) containing the conditions Z.
+         Optionally returns the estimated mean and the likelihood.
+
+         Parameters
+         ----------
+         array : array-like
+             data array with X, Y, Z in rows and observations in columns
+
+         target_var : {0, 1}
+             Variable to regress out conditions from.
+
+         standardize : bool, optional (default: True)
+             Whether to standardize the array beforehand.
+
+         return_means : bool, optional (default: False)
+             Whether to return the estimated regression line.
+
+         return_likelihood : bool, optional (default: False)
+             Whether to return the log_marginal_likelihood of the fitted GP.
+
+         training_iter : int, optional (default: 50)
+             Number of training iterations.
+
+         lr : float, optional (default: 0.1)
+             Learning rate.
+
+         Returns
+         -------
+         resid [, mean, likelihood] : array-like
+             The residual of the regression and optionally the estimated mean
+             and/or the likelihood.
+         """
+
+         dim, T = array.shape
+
+         if dim <= 2:
+             if return_likelihood:
+                 return array[target_var, :], -np.inf
+             return array[target_var, :]
+
+         # Implement using PyTorch
+         # Standardize
+         if standardize:
+             array -= array.mean(axis=1).reshape(dim, 1)
+             std = array.std(axis=1)
+             for i in range(dim):
+                 if std[i] != 0.:
+                     array[i] /= std[i]
+             if np.any(std == 0.) and self.verbosity > 0:
+                 warnings.warn("Possibly constant array!")
+             # array /= array.std(axis=1).reshape(dim, 1)
+             # if np.isnan(array).any():
+             #     raise ValueError("Nans after standardizing, "
+             #                      "possibly constant array!")
+
+         target_series = array[target_var, :]
+         z = array[2:].T.copy()
+         if np.ndim(z) == 1:
+             z = z.reshape(-1, 1)
+
+         train_x = torch.tensor(z).float()
+         train_y = torch.tensor(target_series).float()
+
+         device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
+         output_device = torch.device(device_type)
+         train_x, train_y = train_x.to(output_device), train_y.to(output_device)
+
+         if device_type == 'cuda':
+             # If GPU is available, use MultiGPU with Kernel Partitioning
+             n_devices = torch.cuda.device_count()
+             class mExactGPModel(gpytorch.models.ExactGP):
+                 def __init__(self, train_x, train_y, likelihood, n_devices):
+                     super(mExactGPModel, self).__init__(train_x, train_y, likelihood)
+                     self.mean_module = gpytorch.means.ConstantMean()
+                     base_covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())
+
+                     self.covar_module = gpytorch.kernels.MultiDeviceKernel(
+                         base_covar_module, device_ids=range(n_devices),
+                         output_device=output_device
+                     )
+
+                 def forward(self, x):
+                     mean_x = self.mean_module(x)
+                     covar_x = self.covar_module(x)
+                     return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
+
+             def mtrain(train_x,
+                        train_y,
+                        n_devices,
+                        output_device,
+                        checkpoint_size,
+                        preconditioner_size,
+                        n_training_iter,
+                        ):
+                 likelihood = gpytorch.likelihoods.GaussianLikelihood().to(output_device)
+                 model = mExactGPModel(train_x, train_y, likelihood, n_devices).to(output_device)
+                 model.train()
+                 likelihood.train()
+
+                 optimizer = FullBatchLBFGS(model.parameters(), lr=lr)
+                 # "Loss" for GPs - the marginal log likelihood
+                 mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
+
+                 with gpytorch.beta_features.checkpoint_kernel(checkpoint_size), \
+                         gpytorch.settings.max_preconditioner_size(preconditioner_size):
+
+                     def closure():
+                         optimizer.zero_grad()
+                         output = model(train_x)
+                         loss = -mll(output, train_y)
+                         return loss
+
+                     loss = closure()
+                     loss.backward()
+
+                     for i in range(n_training_iter):
+                         options = {'closure': closure, 'current_loss': loss, 'max_ls': 10}
+                         loss, _, _, _, _, _, _, fail = optimizer.step(options)
+
+                         '''print('Iter %d/%d - Loss: %.3f lengthscale: %.3f noise: %.3f' % (
+                             i + 1, n_training_iter, loss.item(),
+                             model.covar_module.module.base_kernel.lengthscale.item(),
+                             model.likelihood.noise.item()
+                         ))'''
+
+                         if fail:
+                             # print('Convergence reached!')
+                             break
+
+                 return model, likelihood, mll
+
+             def find_best_gpu_setting(train_x,
+                                       train_y,
+                                       n_devices,
+                                       output_device,
+                                       preconditioner_size
+                                       ):
+                 N = train_x.size(0)
+
+                 # Find the optimum partition/checkpoint size by decreasing in powers of 2
+                 # Start with no partitioning (size = 0)
+                 settings = [0] + [int(n) for n in np.ceil(N / 2 ** np.arange(1, np.floor(np.log2(N))))]
+
+                 for checkpoint_size in settings:
+                     print('Number of devices: {} -- Kernel partition size: {}'.format(n_devices, checkpoint_size))
+                     try:
+                         # Try a full forward and backward pass with this setting to check memory usage
+                         _, _, _ = mtrain(train_x, train_y,
+                                          n_devices=n_devices, output_device=output_device,
+                                          checkpoint_size=checkpoint_size,
+                                          preconditioner_size=preconditioner_size, n_training_iter=1)
+
+                         # when successful, break out of for-loop and jump to finally block
+                         break
+                     except RuntimeError as e:
+                         pass
+                     except AttributeError as e:
+                         pass
+                     finally:
+                         # handle CUDA OOM error
+                         gc.collect()
+                         torch.cuda.empty_cache()
+                 return checkpoint_size
+
+             # Set a large enough preconditioner size to reduce the number of CG iterations run
+             preconditioner_size = 100
+             if self.checkpoint_size is None:
+                 self.checkpoint_size = find_best_gpu_setting(train_x, train_y,
+                                                              n_devices=n_devices,
+                                                              output_device=output_device,
+                                                              preconditioner_size=preconditioner_size)
+
+             model, likelihood, mll = mtrain(train_x, train_y,
+                                             n_devices=n_devices, output_device=output_device,
+                                             checkpoint_size=self.checkpoint_size,
+                                             preconditioner_size=100,
+                                             n_training_iter=training_iter)
+
+             # Get into evaluation (predictive posterior) mode
+             model.eval()
+             likelihood.eval()
+
+             # Make predictions by feeding model through likelihood
+             with torch.no_grad(), gpytorch.settings.fast_pred_var(), gpytorch.beta_features.checkpoint_kernel(1000):
+                 mean = model(train_x).loc.detach()
+                 loglik = mll(model(train_x), train_y) * T
+
+             resid = (train_y - mean).detach().cpu().numpy()
+             mean = mean.detach().cpu().numpy()
+
+         else:
+             # If only CPU is available, use the simplest form of GP model: exact inference
+             class ExactGPModel(gpytorch.models.ExactGP):
+                 def __init__(self, train_x, train_y, likelihood):
+                     super(ExactGPModel, self).__init__(
+                         train_x, train_y, likelihood)
+                     self.mean_module = gpytorch.means.ConstantMean()
+
+                     # We only use the RBF kernel here; the WhiteNoiseKernel is
+                     # deprecated and its feature is integrated into the
+                     # likelihood module.
+                     self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())
+
+                 def forward(self, x):
+                     mean_x = self.mean_module(x)
+                     covar_x = self.covar_module(x)
+                     return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
+
+             # initialize likelihood and model
+             likelihood = gpytorch.likelihoods.GaussianLikelihood()
+             model = ExactGPModel(train_x, train_y, likelihood)
+
+             # Find optimal model hyperparameters
+             model.train()
+             likelihood.train()
+
+             # Use the Adam optimizer
+             # Includes GaussianLikelihood parameters
+             optimizer = torch.optim.Adam(model.parameters(), lr=lr)
+
+             # "Loss" for GPs - the marginal log likelihood
+             mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
+
+             for i in range(training_iter):
+                 # Zero gradients from previous iteration
+                 optimizer.zero_grad()
+                 # Output from model
+                 output = model(train_x)
+
+                 # Calc loss and backprop gradients
+                 loss = -mll(output, train_y)
+                 loss.backward()
+                 optimizer.step()
+
+             # Get into evaluation (predictive posterior) mode
+             model.eval()
+             likelihood.eval()
+
+             # Make predictions by feeding model through likelihood
+             with torch.no_grad(), gpytorch.settings.fast_pred_var():
+                 mean = model(train_x).loc.detach()
+                 loglik = mll(model(train_x), train_y) * T
+
+             resid = (train_y - mean).detach().numpy()
+             mean = mean.detach().numpy()
+
+         if return_means and not return_likelihood:
+             return resid, mean
+         elif return_likelihood and not return_means:
+             return resid, loglik
+         elif return_means and return_likelihood:
+             return resid, mean, loglik
+         return resid
+
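For reference, find_best_gpu_setting above probes checkpoint (kernel partition) sizes starting with no partitioning and then roughly halving the data size until a one-iteration training pass fits into GPU memory. A small worked example of the candidate settings it would try, with an illustrative sample size:

    import numpy as np

    N = 1000  # illustrative number of samples
    settings = [0] + [int(n) for n in np.ceil(N / 2 ** np.arange(1, np.floor(np.log2(N))))]
    print(settings)  # [0, 500, 250, 125, 63, 32, 16, 8, 4]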
+     def _get_model_selection_criterion(self, j, parents, tau_max=0):
+         """Returns log marginal likelihood for GP regression.
+
+         Fits a GP model of the parents to variable j and returns the negative
+         log marginal likelihood as a model selection score. It is used to
+         determine optimal hyperparameters in PCMCI, in particular the pc_alpha
+         value.
+
+         Parameters
+         ----------
+         j : int
+             Index of target variable in data array.
+
+         parents : list
+             List of form [(0, -1), (3, -2), ...] containing parents.
+
+         tau_max : int, optional (default: 0)
+             Maximum time lag. This may be used to make sure that estimates for
+             different lags in X, Z, all have the same sample size.
+
+         Returns
+         -------
+         score : float
+             Model score.
+         """
+
+         Y = [(j, 0)]
+         X = [(j, 0)]  # dummy variable here
+         Z = parents
+         array, xyz, _ = \
+             self.cond_ind_test.dataframe.construct_array(
+                 X=X, Y=Y, Z=Z,
+                 tau_max=tau_max,
+                 mask_type=self.cond_ind_test.mask_type,
+                 return_cleaned_xyz=False,
+                 do_checks=True,
+                 verbosity=self.verbosity)
+
+         dim, T = array.shape
+
+         _, logli = self._get_single_residuals(array,
+                                               target_var=1,
+                                               return_likelihood=True)
+
+         score = -logli
+         return score
+
+
+ class GPDCtorch(CondIndTest):
+     r"""GPDC conditional independence test based on Gaussian processes and
+     distance correlation, implemented with gpytorch.
+
+     GPDC is based on a Gaussian process (GP) regression and a distance
+     correlation test on the residuals [2]_. GP is estimated with gpytorch.
+     The distance correlation test is implemented with the dcor package
+     available from pip. Here the null distribution is not analytically
+     available, but can be precomputed with the function
+     generate_and_save_nulldists(...), which saves a \*.npz file containing
+     the null distribution for different sample sizes. This file can then be
+     supplied as null_dist_filename.
+
+     Notes
+     -----
+
+     GPDC is based on a Gaussian process (GP) regression and a distance
+     correlation test on the residuals. Distance correlation is described in
+     [2]_. To test :math:`X \perp Y | Z`, first :math:`Z` is regressed out from
+     :math:`X` and :math:`Y` assuming the model
+
+     .. math:: X & = f_X(Z) + \epsilon_{X} \\
+         Y & = f_Y(Z) + \epsilon_{Y} \\
+         \epsilon_{X,Y} &\sim \mathcal{N}(0, \sigma^2)
+
+     using GP regression. Here :math:`\sigma^2` and the kernel bandwidth are
+     optimized using ``gpytorch``. Then the residuals are transformed to
+     uniform marginals yielding :math:`r_X,r_Y` and their dependency is tested
+     with
+
+     .. math:: \mathcal{R}\left(r_X, r_Y\right)
+
+     The null distribution of the distance correlation should be pre-computed.
+     Otherwise it is computed during runtime.
+
+     Parameters
+     ----------
+     null_dist_filename : str, optional (default: None)
+         Path to file containing null distribution.
+
+     **kwargs :
+         Arguments passed on to the parent class CondIndTest.
+
+     """
+     @property
+     def measure(self):
+         """
+         Concrete property to return the measure of the independence test.
+         """
+         return self._measure
+
+     def __init__(self,
+                  null_dist_filename=None,
+                  **kwargs):
+         self._measure = 'gp_dc'
+         self.two_sided = False
+         self.residual_based = True
+         # Call the parent constructor
+         CondIndTest.__init__(self, **kwargs)
+         # Build the regressor
+         self.gauss_pr = GaussProcRegTorch(self.sig_samples,
+                                           self,
+                                           null_dist_filename=null_dist_filename,
+                                           verbosity=self.verbosity)
+
+         if self.verbosity > 0:
+             print("null_dist_filename = %s" % self.gauss_pr.null_dist_filename)
+             print("")
+
+     def _load_nulldist(self, filename):
+         r"""
+         Load a precomputed null distribution from a \*.npz file. This
+         distribution can be calculated using generate_and_save_nulldists(...).
+
+         Parameters
+         ----------
+         filename : string
+             Path to the \*.npz file.
+
+         Returns
+         -------
+         null_dists, null_samples : dict, int
+             The null distribution as a dictionary of distributions keyed by
+             sample size, and the number of null samples in total.
+         """
+         return self.gauss_pr._load_nulldist(filename)
+
+     def generate_nulldist(self, df, add_to_null_dists=True):
+         """Generates null distribution for pairwise independence tests.
+
+         Generates the null distribution for sample size df. Assumes pairwise
+         samples transformed to uniform marginals. Uses get_dependence_measure
+         available in class and generates self.sig_samples random samples. Adds
+         the null distributions to self.gauss_pr.null_dists.
+
+         Parameters
+         ----------
+         df : int
+             Degrees of freedom / sample size to generate null distribution for.
+
+         add_to_null_dists : bool, optional (default: True)
+             Whether to add the null dist to the dictionary of null dists or
+             just return it.
+
+         Returns
+         -------
+         null_dist : array of shape [null_samples,]
+             Only returned if add_to_null_dists is False.
+         """
+         return self.gauss_pr._generate_nulldist(df, add_to_null_dists)
+
+     def generate_and_save_nulldists(self, sample_sizes, null_dist_filename):
+         """Generates and saves null distribution for pairwise independence
+         tests.
+
+         Generates the null distribution for different sample sizes. Calls
+         generate_nulldist. Null dists are saved to disk as
+         self.null_dist_filename.npz. Also adds the null distributions to
+         self.gauss_pr.null_dists.
+
+         Parameters
+         ----------
+         sample_sizes : list
+             List of sample sizes.
+
+         null_dist_filename : str
+             Name of the file to save the null distributions to.
+         """
+         self.gauss_pr._generate_and_save_nulldists(sample_sizes,
+                                                    null_dist_filename)
+
+
+     def _get_single_residuals(self, array, target_var,
+                               return_means=False,
+                               standardize=True,
+                               return_likelihood=False,
+                               training_iter=50,
+                               lr=0.1):
+         """Returns residuals of Gaussian process regression.
+
+         Performs a GP regression of the variable indexed by target_var on the
+         conditions Z. Here array is assumed to contain X and Y as the first two
+         rows with the remaining rows (if present) containing the conditions Z.
+         Optionally returns the estimated mean and the likelihood.
+
+         Parameters
+         ----------
+         array : array-like
+             data array with X, Y, Z in rows and observations in columns
+
+         target_var : {0, 1}
+             Variable to regress out conditions from.
+
+         standardize : bool, optional (default: True)
+             Whether to standardize the array beforehand.
+
+         return_means : bool, optional (default: False)
+             Whether to return the estimated regression line.
+
+         return_likelihood : bool, optional (default: False)
+             Whether to return the log_marginal_likelihood of the fitted GP.
+
+         training_iter : int, optional (default: 50)
+             Number of training iterations.
+
+         lr : float, optional (default: 0.1)
+             Learning rate.
+
+         Returns
+         -------
+         resid [, mean, likelihood] : array-like
+             The residual of the regression and optionally the estimated mean
+             and/or the likelihood.
+         """
+         return self.gauss_pr._get_single_residuals(
+             array, target_var,
+             return_means,
+             standardize,
+             return_likelihood,
+             training_iter,
+             lr)
+
+     def get_model_selection_criterion(self, j, parents, tau_max=0):
+         """Returns log marginal likelihood for GP regression.
+
+         Fits a GP model of the parents to variable j and returns the negative
+         log marginal likelihood as a model selection score. It is used to
+         determine optimal hyperparameters in PCMCI, in particular the pc_alpha
+         value.
+
+         Parameters
+         ----------
+         j : int
+             Index of target variable in data array.
+
+         parents : list
+             List of form [(0, -1), (3, -2), ...] containing parents.
+
+         tau_max : int, optional (default: 0)
+             Maximum time lag. This may be used to make sure that estimates for
+             different lags in X, Z, all have the same sample size.
+
+         Returns
+         -------
+         score : float
+             Model score.
+         """
+         return self.gauss_pr._get_model_selection_criterion(j, parents, tau_max)
+
+     def get_dependence_measure(self, array, xyz, data_type=None):
+         """Return GPDC measure.
+
+         Estimated as the distance correlation of the residuals of a GP
+         regression.
+
+         Parameters
+         ----------
+         array : array-like
+             data array with X, Y, Z in rows and observations in columns
+
+         xyz : array of ints
+             XYZ identifier array of shape (dim,).
+
+         Returns
+         -------
+         val : float
+             GPDC test statistic.
+         """
+
+         x_vals = self._get_single_residuals(array, target_var=0)
+         y_vals = self._get_single_residuals(array, target_var=1)
+         val = self._get_dcorr(np.array([x_vals, y_vals]))
+         return val
+
+     def _get_dcorr(self, array_resid):
+         """Return distance correlation coefficient.
+
+         The variables are transformed to uniform marginals using the empirical
+         cumulative distribution function beforehand. Here the null distribution
+         is not analytically available, but can be precomputed with the function
+         generate_and_save_nulldists(...), which saves a *.npz file containing
+         the null distribution for different sample sizes. This file can then be
+         supplied as null_dist_filename.
+
+         Parameters
+         ----------
+         array_resid : array-like
+             data array must be of shape (2, T)
+
+         Returns
+         -------
+         val : float
+             Distance correlation coefficient.
+         """
+         # Remove ties before applying transformation to uniform marginals
+         # array_resid = self._remove_ties(array_resid, verbosity=4)
+         x_vals, y_vals = self._trafo2uniform(array_resid)
+         val = dcor.distance_correlation(x_vals, y_vals, method='AVL')
+         return val
+
+     def get_shuffle_significance(self, array, xyz, value,
+                                  return_null_dist=False,
+                                  data_type=None):
+         """Returns p-value for shuffle significance test.
+
+         For residual-based test statistics only the residuals are shuffled.
+
+         Parameters
+         ----------
+         array : array-like
+             data array with X, Y, Z in rows and observations in columns
+
+         xyz : array of ints
+             XYZ identifier array of shape (dim,).
+
+         value : number
+             Value of test statistic for unshuffled estimate.
+
+         Returns
+         -------
+         pval : float
+             p-value
+         """
+
+         x_vals = self._get_single_residuals(array, target_var=0)
+         y_vals = self._get_single_residuals(array, target_var=1)
+         array_resid = np.array([x_vals, y_vals])
+         xyz_resid = np.array([0, 1])
+
+         null_dist = self._get_shuffle_dist(array_resid, xyz_resid,
+                                            self.get_dependence_measure,
+                                            sig_samples=self.sig_samples,
+                                            sig_blocklength=self.sig_blocklength,
+                                            verbosity=self.verbosity)
+
+         # pval = (null_dist >= value).mean()
+         pval = float(np.sum(null_dist >= value) + 1) / (self.sig_samples + 1)
+
+         if return_null_dist:
+             return pval, null_dist
+         return pval
+
+     def get_analytic_significance(self, value, T, dim, xyz):
+         """Returns p-value for the distance correlation coefficient.
+
+         The null distribution for the necessary degrees of freedom (df) is
+         loaded. If not available, the null distribution is generated with the
+         function generate_nulldist(). It is recommended to generate the
+         nulldists for a wide range of sample sizes beforehand with the
+         function generate_and_save_nulldists(...). The distance correlation
+         coefficient is one-sided. If the degrees of freedom are less than 1,
+         numpy.nan is returned.
+
+         Parameters
+         ----------
+         value : float
+             Test statistic value.
+
+         T : int
+             Sample length.
+
+         dim : int
+             Dimensionality, i.e., number of features.
+
+         xyz : array of ints
+             XYZ identifier array of shape (dim,).
+
+         Returns
+         -------
+         pval : float or numpy.nan
+             p-value.
+         """
+
+         # GP regression approximately doesn't cost degrees of freedom
+         df = T
+
+         if df < 1:
+             pval = np.nan
+         else:
+             # idx_near = (np.abs(self.sample_sizes - df)).argmin()
+             if int(df) not in list(self.gauss_pr.null_dists):
+                 # if np.abs(self.sample_sizes[idx_near] - df) / float(df) > 0.01:
+                 if self.verbosity > 0:
+                     print("Null distribution for GPDC not available "
+                           "for deg. of freed. = %d." % df)
+                 self.generate_nulldist(df)
+             null_dist_here = self.gauss_pr.null_dists[int(df)]
+             pval = np.mean(null_dist_here > np.abs(value))
+         return pval
+
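For context, a minimal end-to-end sketch of using GPDCtorch as the conditional independence test inside PCMCI. Module paths follow the file list above; the data, variable names, and parameters are illustrative, and with significance='analytic' the null distributions are generated at runtime unless precomputed as described in get_analytic_significance:

    import numpy as np
    from tigramite.data_processing import DataFrame
    from tigramite.pcmci import PCMCI
    from tigramite.independence_tests.gpdc_torch import GPDCtorch

    # Toy data: 500 time steps of 3 variables
    data = np.random.default_rng(42).standard_normal((500, 3))
    dataframe = DataFrame(data, var_names=['X0', 'X1', 'X2'])

    gpdc = GPDCtorch(significance='analytic')
    pcmci = PCMCI(dataframe=dataframe, cond_ind_test=gpdc, verbosity=0)
    results = pcmci.run_pcmci(tau_max=2, pc_alpha=0.1)
    print(results['p_matrix'].shape)  # (3, 3, tau_max + 1)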