tigramite-fast 5.2.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. tigramite/__init__.py +0 -0
  2. tigramite/causal_effects.py +1525 -0
  3. tigramite/causal_mediation.py +1592 -0
  4. tigramite/data_processing.py +1574 -0
  5. tigramite/graphs.py +1509 -0
  6. tigramite/independence_tests/LBFGS.py +1114 -0
  7. tigramite/independence_tests/__init__.py +0 -0
  8. tigramite/independence_tests/cmiknn.py +661 -0
  9. tigramite/independence_tests/cmiknn_mixed.py +1397 -0
  10. tigramite/independence_tests/cmisymb.py +286 -0
  11. tigramite/independence_tests/gpdc.py +664 -0
  12. tigramite/independence_tests/gpdc_torch.py +820 -0
  13. tigramite/independence_tests/gsquared.py +190 -0
  14. tigramite/independence_tests/independence_tests_base.py +1310 -0
  15. tigramite/independence_tests/oracle_conditional_independence.py +1582 -0
  16. tigramite/independence_tests/pairwise_CI.py +383 -0
  17. tigramite/independence_tests/parcorr.py +369 -0
  18. tigramite/independence_tests/parcorr_mult.py +485 -0
  19. tigramite/independence_tests/parcorr_wls.py +451 -0
  20. tigramite/independence_tests/regressionCI.py +403 -0
  21. tigramite/independence_tests/robust_parcorr.py +403 -0
  22. tigramite/jpcmciplus.py +966 -0
  23. tigramite/lpcmci.py +3649 -0
  24. tigramite/models.py +2257 -0
  25. tigramite/pcmci.py +3935 -0
  26. tigramite/pcmci_base.py +1218 -0
  27. tigramite/plotting.py +4735 -0
  28. tigramite/rpcmci.py +467 -0
  29. tigramite/toymodels/__init__.py +0 -0
  30. tigramite/toymodels/context_model.py +261 -0
  31. tigramite/toymodels/non_additive.py +1231 -0
  32. tigramite/toymodels/structural_causal_processes.py +1201 -0
  33. tigramite/toymodels/surrogate_generator.py +319 -0
  34. tigramite_fast-5.2.10.1.dist-info/METADATA +182 -0
  35. tigramite_fast-5.2.10.1.dist-info/RECORD +38 -0
  36. tigramite_fast-5.2.10.1.dist-info/WHEEL +5 -0
  37. tigramite_fast-5.2.10.1.dist-info/licenses/license.txt +621 -0
  38. tigramite_fast-5.2.10.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1310 @@
1
+ """Tigramite causal discovery for time series."""
2
+
3
+ # Author: Jakob Runge <jakob@jakob-runge.com>
4
+ #
5
+ # License: GNU General Public License v3.0
6
+
7
+ from __future__ import print_function
8
+ import warnings
9
+ import math
10
+ import abc
11
+ import numpy as np
12
+ import six
13
+ from hashlib import sha1
14
+
15
+
16
@six.add_metaclass(abc.ABCMeta)
class CondIndTest():
    """Base class of conditional independence tests.

    Provides useful general functions for different independence tests such as
    shuffle significance testing and bootstrap confidence estimation. Also
    handles masked samples. Other test classes can inherit from this class.

    Parameters
    ----------
    seed : int, optional(default = 42)
        Seed for RandomState (default_rng)

    mask_type : str, optional (default = None)
        Must be in {None, 'y','x','z','xy','xz','yz','xyz'}
        Masking mode: Indicators for which variables in the dependence measure
        I(X; Y | Z) the samples should be masked. If None, the mask is not used.
        Explained in tutorial on masking and missing values.

    significance : str, optional (default: 'analytic')
        Type of significance test to use. In this package 'analytic',
        'fixed_thres' and 'shuffle_test' are available.

    fixed_thres : float, optional (default: 0.1)
        Deprecated.

    sig_samples : int, optional (default: 500)
        Number of samples for shuffle significance test.

    sig_blocklength : int, optional (default: None)
        Block length for block-shuffle significance test. If None, the
        block length is determined from the decay of the autocovariance as
        explained in [1]_.

    confidence : str, optional (default: None)
        Specify type of confidence estimation. If False, numpy.nan is returned.
        'bootstrap' can be used with any test, for ParCorr also 'analytic' is
        implemented.

    conf_lev : float, optional (default: 0.9)
        Two-sided confidence interval.

    conf_samples : int, optional (default: 100)
        Number of samples for bootstrap.

    conf_blocklength : int, optional (default: None)
        Block length for block-bootstrap. If None, the block length is
        determined from the decay of the autocovariance as explained in [1]_.

    recycle_residuals : bool, optional (default: False)
        Specifies whether residuals should be stored. This may be faster, but
        can cost considerable memory.

    verbosity : int, optional (default: 0)
        Level of verbosity.
    """
72
    @abc.abstractmethod
    def get_dependence_measure(self, array, xyz, data_type=None):
        """
        Abstract function that all concrete classes must instantiate.

        Parameters
        ----------
        array : array
            Data array of shape (dim, T) with X, Y, Z in rows.

        xyz : array of ints
            XYZ identifier array of shape (dim,) marking which row belongs
            to X (0), Y (1), or Z (2).

        data_type : array-like, optional (default: None)
            Binary data array of same shape as array which describes whether
            individual samples in a variable (or all samples) are continuous
            or discrete: 0s for continuous variables and 1s for discrete
            variables.
        """
        pass
78
+
79
+ @abc.abstractproperty
80
+ def measure(self):
81
+ """
82
+ Abstract property to store the type of independence test.
83
+ """
84
+ pass
85
+
86
+ def __init__(self,
87
+ seed=42,
88
+ mask_type=None,
89
+ significance='analytic',
90
+ fixed_thres=None,
91
+ sig_samples=500,
92
+ sig_blocklength=None,
93
+ confidence=None,
94
+ conf_lev=0.9,
95
+ conf_samples=100,
96
+ conf_blocklength=None,
97
+ recycle_residuals=False,
98
+ verbosity=0):
99
+ # Set the dataframe to None for now, will be reset during pcmci call
100
+ self.dataframe = None
101
+ # Set the options
102
+ self.random_state = np.random.default_rng(seed)
103
+ self.significance = significance
104
+ self.sig_samples = sig_samples
105
+ self.sig_blocklength = sig_blocklength
106
+ if fixed_thres is not None:
107
+ raise ValueError("fixed_thres is replaced by providing alpha_or_thres in run_test")
108
+ self.verbosity = verbosity
109
+ self.cached_ci_results = {}
110
+ self.ci_results = {}
111
+ # If we recycle residuals, then set up a residual cache
112
+ self.recycle_residuals = recycle_residuals
113
+ if self.recycle_residuals:
114
+ self.residuals = {}
115
+ # If we use a mask, we cannot recycle residuals
116
+ self.set_mask_type(mask_type)
117
+
118
+ # Set the confidence type and details
119
+ self.confidence = confidence
120
+ self.conf_lev = conf_lev
121
+ self.conf_samples = conf_samples
122
+ self.conf_blocklength = conf_blocklength
123
+
124
+ # Print information about the
125
+ if self.verbosity > 0:
126
+ self.print_info()
127
+
128
+ def set_mask_type(self, mask_type):
129
+ """
130
+ Setter for mask type to ensure that this option does not clash with
131
+ recycle_residuals.
132
+
133
+ Parameters
134
+ ----------
135
+ mask_type : str
136
+ Must be in {None, 'y','x','z','xy','xz','yz','xyz'}
137
+ Masking mode: Indicators for which variables in the dependence measure
138
+ I(X; Y | Z) the samples should be masked. If None, the mask is not used.
139
+ Explained in tutorial on masking and missing values.
140
+ """
141
+ # Set the mask type
142
+ self.mask_type = mask_type
143
+ # Check if this clashes with residual recycling
144
+ if self.mask_type is not None:
145
+ if self.recycle_residuals is True:
146
+ warnings.warn("Using a mask disables recycling residuals.")
147
+ self.recycle_residuals = False
148
+ # Check the mask type is keyed correctly
149
+ self._check_mask_type()
150
+
151
+ def print_info(self):
152
+ """
153
+ Print information about the conditional independence test parameters
154
+ """
155
+ info_str = "\n# Initialize conditional independence test\n\nParameters:"
156
+ info_str += "\nindependence test = %s" % self.measure
157
+ info_str += "\nsignificance = %s" % self.significance
158
+ # Check if we are using a shuffle test
159
+ if self.significance == 'shuffle_test':
160
+ info_str += "\nsig_samples = %s" % self.sig_samples
161
+ info_str += "\nsig_blocklength = %s" % self.sig_blocklength
162
+ # # Check if we are using a fixed threshold
163
+ # elif self.significance == 'fixed_thres':
164
+ # info_str += "\nfixed_thres = %s" % self.fixed_thres
165
+ # Check if we have a confidence type
166
+ if self.confidence:
167
+ info_str += "\nconfidence = %s" % self.confidence
168
+ info_str += "\nconf_lev = %s" % self.conf_lev
169
+ # Check if this confidence type is boostrapping
170
+ if self.confidence == 'bootstrap':
171
+ info_str += "\nconf_samples = %s" % self.conf_samples
172
+ info_str += "\nconf_blocklength = %s" %self.conf_blocklength
173
+ # Check if we use a non-trivial mask type
174
+ if self.mask_type is not None:
175
+ info_str += "\nmask_type = %s" % self.mask_type
176
+ # Check if we are recycling residuals or not
177
+ if self.recycle_residuals:
178
+ info_str += "\nrecycle_residuals = %s" % self.recycle_residuals
179
+ # Print the information string
180
+ print(info_str)
181
+
182
+ def _check_mask_type(self):
183
+ """
184
+ mask_type : str, optional (default = None)
185
+ Must be in {None, 'y','x','z','xy','xz','yz','xyz'}
186
+ Masking mode: Indicators for which variables in the dependence measure
187
+ I(X; Y | Z) the samples should be masked. If None, the mask is not used.
188
+ Explained in tutorial on masking and missing values.
189
+ """
190
+ if self.mask_type is not None:
191
+ mask_set = set(self.mask_type) - set(['x', 'y', 'z'])
192
+ if mask_set:
193
+ err_msg = "mask_type = %s," % self.mask_type + " but must be" +\
194
+ " list containing 'x','y','z', or any combination"
195
+ raise ValueError(err_msg)
196
+
197
+
198
+ def get_analytic_confidence(self, value, df, conf_lev):
199
+ """
200
+ Base class assumption that this is not implemented. Concrete classes
201
+ should override when possible.
202
+ """
203
+ raise NotImplementedError("Analytic confidence not"+\
204
+ " implemented for %s" % self.measure)
205
+
206
+ def get_model_selection_criterion(self, j, parents, tau_max=0):
207
+ """
208
+ Base class assumption that this is not implemented. Concrete classes
209
+ should override when possible.
210
+ """
211
+ raise NotImplementedError("Model selection not"+\
212
+ " implemented for %s" % self.measure)
213
+
214
+ def get_analytic_significance(self, value, T, dim):
215
+ """
216
+ Base class assumption that this is not implemented. Concrete classes
217
+ should override when possible.
218
+ """
219
+ raise NotImplementedError("Analytic significance not"+\
220
+ " implemented for %s" % self.measure)
221
+
222
+ def get_shuffle_significance(self, array, xyz, value,
223
+ data_type=None,
224
+ return_null_dist=False):
225
+ """
226
+ Base class assumption that this is not implemented. Concrete classes
227
+ should override when possible.
228
+ """
229
+ raise NotImplementedError("Shuffle significance not"+\
230
+ " implemented for %s" % self.measure)
231
+
232
+ def _get_single_residuals(self, array, target_var,
233
+ standardize=True, return_means=False):
234
+ """
235
+ Base class assumption that this is not implemented. Concrete classes
236
+ should override when possible.
237
+ """
238
+ raise NotImplementedError("Residual calculation not"+\
239
+ " implemented for %s" % self.measure)
240
+
241
+ def set_dataframe(self, dataframe):
242
+ """Initialize and check the dataframe.
243
+
244
+ Parameters
245
+ ----------
246
+ dataframe : data object
247
+ Set tigramite dataframe object. It must have the attributes
248
+ dataframe.values yielding a numpy array of shape (observations T,
249
+ variables N) and optionally a mask of the same shape and a missing
250
+ values flag.
251
+
252
+ """
253
+ self.dataframe = dataframe
254
+ if self.mask_type is not None:
255
+ if dataframe.mask is None:
256
+ raise ValueError("mask_type is not None, but no mask in dataframe.")
257
+ dataframe._check_mask(dataframe.mask)
258
+
259
+ def _keyfy(self, x, z):
260
+ """Helper function to make lists unique."""
261
+ return (tuple(set(x)), tuple(set(z)))
262
+
263
    def _get_array(self, X, Y, Z, tau_max=0, cut_off='2xtau_max',
                   remove_constant_data=False,
                   verbosity=0):
        """Convenience wrapper around dataframe.construct_array.

        Parameters
        ----------
        X, Y, Z : list of tuples
            Node lists of the form [(var, -tau), ...].
        tau_max : int, optional (default: 0)
            Maximum time lag passed through to construct_array.
        cut_off : {'2xtau_max', 'max_lag', 'max_lag_or_tau_max'}
            How many samples to cut off at the beginning.
        remove_constant_data : bool, optional (default: False)
            If True, additionally return copies of the array/xyz/XYZ with
            zero-variance (constant) rows removed.
        verbosity : int, optional (default: 0)
            Verbosity passed through to construct_array.

        Returns
        -------
        array, xyz, XYZ, type_array
            The constructed data array of shape (dim, T), its row identifier
            array, the cleaned node lists, and the type array (or None).
            If remove_constant_data is True, additionally returns
            nonzero_array, nonzero_xyz, nonzero_XYZ, nonzero_type_array with
            constant rows removed.
        """

        # These measures are only defined for univariate X and Y.
        if self.measure in ['par_corr', 'par_corr_wls', 'robust_par_corr', 'regressionCI',
                            'gsquared', 'gp_dc']:
            if len(X) > 1 or len(Y) > 1:
                raise ValueError("X and Y for %s must be univariate." %
                                 self.measure)

        if self.dataframe is None:
            raise ValueError("Call set_dataframe first when using CI test outside causal discovery classes.")

        # Call the wrapped function
        array, xyz, XYZ, type_array = self.dataframe.construct_array(X=X, Y=Y, Z=Z,
                                                                     tau_max=tau_max,
                                                                     mask_type=self.mask_type,
                                                                     return_cleaned_xyz=True,
                                                                     do_checks=True,
                                                                     remove_overlaps=True,
                                                                     cut_off=cut_off,
                                                                     verbosity=verbosity)

        if remove_constant_data:
            # Row indices whose samples are constant (zero standard deviation);
            # such rows carry no information for a CI test.
            zero_components = np.where(array.std(axis=1)==0.)[0]

            X, Y, Z = XYZ
            # For each part, keep only the nodes whose row is non-constant.
            x_indices = np.where(xyz == 0)[0]
            newX = [X[entry] for entry, ind in enumerate(x_indices) if ind not in zero_components]

            y_indices = np.where(xyz == 1)[0]
            newY = [Y[entry] for entry, ind in enumerate(y_indices) if ind not in zero_components]

            z_indices = np.where(xyz == 2)[0]
            newZ = [Z[entry] for entry, ind in enumerate(z_indices) if ind not in zero_components]

            nonzero_XYZ = (newX, newY, newZ)

            # Drop the constant rows from the data, identifier, and type arrays.
            nonzero_array = np.delete(array, zero_components, axis=0)
            nonzero_xyz = np.delete(xyz, zero_components, axis=0)
            if type_array is not None:
                nonzero_type_array = np.delete(type_array, zero_components, axis=0)
            else:
                nonzero_type_array = None

            return array, xyz, XYZ, type_array, nonzero_array, nonzero_xyz, nonzero_XYZ, nonzero_type_array

        return array, xyz, XYZ, type_array
312
+
313
+
314
+ def _get_array_hash(self, array, xyz, XYZ):
315
+ """Helper function to get hash of array.
316
+
317
+ For a CI test X _|_ Y | Z the order of variables within X or Y or Z
318
+ does not matter and also the order X and Y can be swapped.
319
+ Hence, to compare hashes of the whole array, we order accordingly
320
+ to create a unique, order-independent hash.
321
+
322
+ Parameters
323
+ ----------
324
+ array : Data array of shape (dim, T)
325
+ Data array.
326
+ xyz : array
327
+ Identifier array of shape (dim,) identifying which row in array
328
+ corresponds to X, Y, and Z
329
+ XYZ : list of tuples
330
+
331
+ Returns
332
+ -------
333
+ combined_hash : str
334
+ Hash that identifies uniquely an array of XYZ
335
+ """
336
+
337
+ X, Y, Z = XYZ
338
+
339
+ # First check whether CI result was already computed
340
+ # by checking whether hash of (xyz, array) already exists
341
+ # Individually sort X, Y, Z since for a CI test it does not matter
342
+ # how they are aranged
343
+ x_orderd = sorted(range(len(X)), key=X.__getitem__)
344
+ arr_x = array[xyz==0][x_orderd]
345
+ x_hash = sha1(np.ascontiguousarray(arr_x)).hexdigest()
346
+
347
+ y_orderd = sorted(range(len(Y)), key=Y.__getitem__)
348
+ arr_y = array[xyz==1][y_orderd]
349
+ y_hash = sha1(np.ascontiguousarray(arr_y)).hexdigest()
350
+
351
+ z_orderd = sorted(range(len(Z)), key=Z.__getitem__)
352
+ arr_z = array[xyz==2][z_orderd]
353
+ z_hash = sha1(np.ascontiguousarray(arr_z)).hexdigest()
354
+
355
+ sorted_xy = sorted([x_hash, y_hash])
356
+ combined_hash = (sorted_xy[0], sorted_xy[1], z_hash)
357
+ return combined_hash
358
+
359
+
360
    def run_test(self, X, Y, Z=None, tau_max=0, cut_off='2xtau_max', alpha_or_thres=None):
        """Perform conditional independence test.

        Calls the dependence measure and significance test functions. The child
        classes must specify a function get_dependence_measure and either or
        both functions get_analytic_significance and get_shuffle_significance.
        If recycle_residuals is True, also _get_single_residuals must be
        available.

        Parameters
        ----------
        X, Y, Z : list of tuples
            X,Y,Z are of the form [(var, -tau)], where var specifies the
            variable index and tau the time lag.
        tau_max : int, optional (default: 0)
            Maximum time lag. This may be used to make sure that estimates for
            different lags in X, Z, all have the same sample size.
        cut_off : {'2xtau_max', 'max_lag', 'max_lag_or_tau_max'}
            How many samples to cutoff at the beginning. The default is
            '2xtau_max', which guarantees that MCI tests are all conducted on
            the same samples. For modeling, 'max_lag_or_tau_max' can be used,
            which uses the maximum of tau_max and the conditions, which is
            useful to compare multiple models on the same sample. Last,
            'max_lag' uses as much samples as possible.
        alpha_or_thres : float (optional)
            Significance level (if significance='analytic' or 'shuffle_test') or
            threshold (if significance='fixed_thres'). If given, run_test returns
            the test decision dependent=True/False.

        Returns
        -------
        val, pval, [dependent] : Tuple of floats and bool
            The test statistic value and the p-value. If alpha_or_thres is
            given, run_test also returns the test decision dependent=True/False.
        """

        if self.significance == 'fixed_thres' and alpha_or_thres is None:
            raise ValueError("significance == 'fixed_thres' requires setting alpha_or_thres")

        # Get the array to test on; constant (zero-variance) rows are removed
        # and returned separately as the "nonzero" variants.
        (array, xyz, XYZ, data_type,
         nonzero_array, nonzero_xyz, nonzero_XYZ, nonzero_data_type) = self._get_array(
            X=X, Y=Y, Z=Z, tau_max=tau_max, cut_off=cut_off,
            remove_constant_data=True, verbosity=self.verbosity)
        X, Y, Z = XYZ
        nonzero_X, nonzero_Y, nonzero_Z = nonzero_XYZ

        # Ensure it is a valid array
        if np.any(np.isnan(array)):
            raise ValueError("nans in the array!")

        # Order-independent hash of the (X, Y, Z) data used as cache key.
        combined_hash = self._get_array_hash(array, xyz, XYZ)

        # Get test statistic value and p-value [cached if possible]
        if combined_hash in self.cached_ci_results.keys():
            cached = True
            val, pval = self.cached_ci_results[combined_hash]
        else:
            cached = False

            # If all X or all Y are zero, then return pval=1, val=0, dependent=False
            if len(nonzero_X) == 0 or len(nonzero_Y) == 0:
                val = 0.
                # Under fixed_thres the p-value is determined from the
                # decision further below, so leave it as None here.
                pval = None if self.significance == 'fixed_thres' else 1.
            else:
                # Get the dependence measure, reycling residuals if need be
                val = self._get_dependence_measure_recycle(nonzero_X, nonzero_Y, nonzero_Z,
                                                           nonzero_xyz, nonzero_array, nonzero_data_type)
                # Get the p-value (None if significance = 'fixed_thres')
                dim, T = nonzero_array.shape
                pval = self._get_p_value(val=val, array=nonzero_array, xyz=nonzero_xyz, T=T, dim=dim,
                                         data_type=nonzero_data_type)
            self.cached_ci_results[combined_hash] = (val, pval)

        # Make test decision
        if len(nonzero_X) == 0 or len(nonzero_Y) == 0:
            # Constant X or Y can never be found dependent.
            dependent = False
        else:
            if self.significance == 'fixed_thres':
                # NOTE(review): self.two_sided is not set in this base class;
                # it appears to be provided by concrete subclasses — confirm
                # before using fixed_thres with a given test.
                if self.two_sided:
                    dependent = np.abs(val) >= np.abs(alpha_or_thres)
                else:
                    dependent = val >= alpha_or_thres
                # Encode the threshold decision as a degenerate p-value.
                pval = 0. if dependent else 1.
            else:
                if alpha_or_thres is None:
                    dependent = None
                else:
                    dependent = pval <= alpha_or_thres

        # Saved here, but not currently used
        self.ci_results[(tuple(X), tuple(Y),tuple(Z))] = (val, pval, dependent)

        # Return the calculated value(s)
        if self.verbosity > 1:
            self._print_cond_ind_results(val=val, pval=pval, cached=cached, dependent=dependent,
                                         conf=None)

        if alpha_or_thres is None:
            return val, pval
        else:
            return val, pval, dependent
465
+
466
+
467
    def run_test_raw(self, x, y, z=None, x_type=None, y_type=None, z_type=None, alpha_or_thres=None):
        """Perform conditional independence test directly on input arrays x, y, z.

        Calls the dependence measure and significance test functions. The child
        classes must specify a function get_dependence_measure and either or
        both functions get_analytic_significance and get_shuffle_significance.

        Parameters
        ----------
        x, y, z : arrays
            x,y,z are of the form (samples, dimension).

        x_type, y_type, z_type : array-like
            data arrays of same shape as x, y and z respectively, which describes whether variables
            are continuous or discrete: 0s for continuous variables and
            1s for discrete variables

        alpha_or_thres : float (optional)
            Significance level (if significance='analytic' or 'shuffle_test') or
            threshold (if significance='fixed_thres'). If given, run_test returns
            the test decision dependent=True/False.

        Returns
        -------
        val, pval, [dependent] : Tuple of floats and bool
            The test statistic value and the p-value. If alpha_or_thres is
            given, run_test also returns the test decision dependent=True/False.
        """

        if np.ndim(x) != 2 or np.ndim(y) != 2:
            raise ValueError("x,y must be arrays of shape (samples, dimension)"
                             " where dimension can be 1.")

        if z is not None and np.ndim(z) != 2:
            raise ValueError("z must be array of shape (samples, dimension)"
                             " where dimension can be 1.")

        # Providing any type array switches on mixed-type handling.
        if x_type is not None or y_type is not None or z_type is not None:
            has_data_type = True
        else:
            has_data_type = False

        # Unspecified type arrays default to all-continuous (zeros).
        if x_type is None and has_data_type:
            x_type = np.zeros(x.shape, dtype='int')

        if y_type is None and has_data_type:
            y_type = np.zeros(y.shape, dtype='int')

        if z is None:
            # Get the array to test on: variables in rows, samples in columns.
            array = np.vstack((x.T, y.T))
            if has_data_type:
                data_type = np.vstack((x_type.T, y_type.T))

            # xyz is the dimension indicator
            xyz = np.array([0 for i in range(x.shape[1])] +
                           [1 for i in range(y.shape[1])])

        else:
            # Get the array to test on, including the conditioning set.
            array = np.vstack((x.T, y.T, z.T))
            if z_type is None and has_data_type:
                z_type = np.zeros(z.shape, dtype='int')

            if has_data_type:
                data_type = np.vstack((x_type.T, y_type.T, z_type.T))
            # xyz is the dimension indicator
            xyz = np.array([0 for i in range(x.shape[1])] +
                           [1 for i in range(y.shape[1])] +
                           [2 for i in range(z.shape[1])])

        if self.significance == 'fixed_thres' and alpha_or_thres is None:
            raise ValueError("significance == 'fixed_thres' requires setting alpha_or_thres")

        # Record the dimensions
        dim, T = array.shape
        # Ensure it is a valid array
        if np.isnan(array).sum() != 0:
            raise ValueError("nans in the array!")
        # Get the dependence measure
        if has_data_type:
            val = self.get_dependence_measure(array, xyz, data_type=data_type)
        else:
            val = self.get_dependence_measure(array, xyz)


        # Get the p-value (returns None if significance='fixed_thres')
        if has_data_type:
            pval = self._get_p_value(val=val, array=array, xyz=xyz,
                                     T=T, dim=dim, data_type=data_type)
        else:
            pval = self._get_p_value(val=val, array=array, xyz=xyz,
                                     T=T, dim=dim)

        # Make test decision
        if self.significance == 'fixed_thres':
            # NOTE(review): self.two_sided is not set in this base class; it
            # appears to be provided by concrete subclasses — confirm before
            # using fixed_thres with a given test.
            if self.two_sided:
                dependent = np.abs(val) >= np.abs(alpha_or_thres)
            else:
                dependent = val >= alpha_or_thres
            # Encode the threshold decision as a degenerate p-value.
            pval = 0. if dependent else 1.
        else:
            if alpha_or_thres is None:
                dependent = None
            else:
                dependent = pval <= alpha_or_thres

        # Return the value and the pvalue
        if alpha_or_thres is None:
            return val, pval
        else:
            return val, pval, dependent
579
+
580
    def get_dependence_measure_raw(self, x, y, z=None, x_type=None, y_type=None, z_type=None):
        """Return test statistic directly on input arrays x, y, z.

        Calls the dependence measure function. The child classes must specify
        a function get_dependence_measure.

        Parameters
        ----------
        x, y, z : arrays
            x,y,z are of the form (samples, dimension).

        x_type, y_type, z_type : array-like
            data arrays of same shape as x, y and z respectively, which describes whether variables
            are continuous or discrete: 0s for continuous variables and
            1s for discrete variables

        Returns
        -------
        val : float
            The test statistic value.
        """

        if np.ndim(x) != 2 or np.ndim(y) != 2:
            raise ValueError("x,y must be arrays of shape (samples, dimension)"
                             " where dimension can be 1.")

        if z is not None and np.ndim(z) != 2:
            raise ValueError("z must be array of shape (samples, dimension)"
                             " where dimension can be 1.")

        # Providing any type array switches on mixed-type handling.
        if x_type is not None or y_type is not None or z_type is not None:
            has_data_type = True
        else:
            has_data_type = False

        # Unspecified type arrays default to all-continuous (zeros).
        if x_type is None and has_data_type:
            x_type = np.zeros(x.shape, dtype='int')

        if y_type is None and has_data_type:
            y_type = np.zeros(y.shape, dtype='int')

        if z is None:
            # Get the array to test on: variables in rows, samples in columns.
            array = np.vstack((x.T, y.T))
            if has_data_type:
                data_type = np.vstack((x_type.T, y_type.T))

            # xyz is the dimension indicator
            xyz = np.array([0 for i in range(x.shape[1])] +
                           [1 for i in range(y.shape[1])])

        else:
            # Get the array to test on, including the conditioning set.
            array = np.vstack((x.T, y.T, z.T))
            if z_type is None and has_data_type:
                z_type = np.zeros(z.shape, dtype='int')

            if has_data_type:
                data_type = np.vstack((x_type.T, y_type.T, z_type.T))
            # xyz is the dimension indicator
            xyz = np.array([0 for i in range(x.shape[1])] +
                           [1 for i in range(y.shape[1])] +
                           [2 for i in range(z.shape[1])])

        # Record the dimensions
        dim, T = array.shape
        # Ensure it is a valid array
        if np.isnan(array).sum() != 0:
            raise ValueError("nans in the array!")
        # Get the dependence measure
        if has_data_type:
            val = self.get_dependence_measure(array, xyz, data_type=data_type)
        else:
            val = self.get_dependence_measure(array, xyz)

        return val
656
+
657
+ def _get_dependence_measure_recycle(self, X, Y, Z, xyz, array, data_type=None):
658
+ """Get the dependence_measure, optionally recycling residuals
659
+
660
+ If self.recycle_residuals is True, also _get_single_residuals must be
661
+ available.
662
+
663
+ Parameters
664
+ ----------
665
+ X, Y, Z : list of tuples
666
+ X,Y,Z are of the form [(var, -tau)], where var specifies the
667
+ variable index and tau the time lag.
668
+
669
+ xyz : array of ints
670
+ XYZ identifier array of shape (dim,).
671
+
672
+ array : array
673
+ Data array of shape (dim, T)
674
+
675
+ data_type : array-like
676
+ Binary data array of same shape as array which describes whether
677
+ individual samples in a variable (or all samples) are continuous
678
+ or discrete: 0s for continuous variables and 1s for discrete variables.
679
+
680
+ Return
681
+ ------
682
+ val : float
683
+ Test statistic
684
+ """
685
+ # Check if we are recycling residuals
686
+ if self.recycle_residuals:
687
+ # Get or calculate the cached residuals
688
+ x_resid = self._get_cached_residuals(X, Z, array, 0)
689
+ y_resid = self._get_cached_residuals(Y, Z, array, 1)
690
+ # Make a new residual array
691
+ array_resid = np.array([x_resid, y_resid])
692
+ xyz_resid = np.array([0, 1])
693
+ # Return the dependence measure
694
+ # data type can only be continuous in this case
695
+ return self.get_dependence_measure(array_resid, xyz_resid)
696
+
697
+ # If not, return the dependence measure on the array and xyz
698
+ if data_type is not None:
699
+ return self.get_dependence_measure(array, xyz,
700
+ data_type=data_type)
701
+ else:
702
+ return self.get_dependence_measure(array, xyz)
703
+
704
+ def _get_cached_residuals(self, x_nodes, z_nodes, array, target_var):
705
+ """
706
+ Retrieve or calculate the cached residuals for the given node sets.
707
+
708
+ Parameters
709
+ ----------
710
+ x_nodes : list of tuples
711
+ List of nodes, X or Y normally. Used to key the residual cache
712
+ during lookup
713
+
714
+ z_nodes : list of tuples
715
+ List of nodes, Z normally
716
+
717
+ target_var : int
718
+ Key to differentiate X from Y.
719
+ x_nodes == X => 0, x_nodes == Y => 1
720
+
721
+ array : array
722
+ Data array of shape (dim, T)
723
+
724
+ Returns
725
+ -------
726
+ x_resid : array
727
+ Residuals calculated by _get_single_residual
728
+ """
729
+ # Check if we have calculated these residuals
730
+ if self._keyfy(x_nodes, z_nodes) in self.residuals:
731
+ x_resid = self.residuals[self._keyfy(x_nodes, z_nodes)]
732
+ # If not, calculate the residuals
733
+ else:
734
+ x_resid = self._get_single_residuals(array, target_var=target_var)
735
+ if z_nodes:
736
+ self.residuals[self._keyfy(x_nodes, z_nodes)] = x_resid
737
+ # Return these residuals
738
+ return x_resid
739
+
740
+ def _get_p_value(self, val, array, xyz, T, dim,
741
+ data_type=None,
742
+ sig_override=None):
743
+ """
744
+ Returns the p-value from whichever significance function is specified
745
+ for this test. If an override is used, then it will call a different
746
+ function then specified by self.significance
747
+
748
+ Parameters
749
+ ----------
750
+ val : float
751
+ Test statistic value.
752
+
753
+ array : array-like
754
+ data array with X, Y, Z in rows and observations in columns
755
+
756
+ xyz : array of ints
757
+ XYZ identifier array of shape (dim,).
758
+
759
+ T : int
760
+ Sample length
761
+
762
+ dim : int
763
+ Dimensionality, ie, number of features.
764
+
765
+ data_type : array-like
766
+ Binary data array of same shape as array which describes whether
767
+ individual samples in a variable (or all samples) are continuous
768
+ or discrete: 0s for continuous variables and 1s for discrete variables.
769
+
770
+ sig_override : string
771
+ Must be in 'analytic', 'shuffle_test', 'fixed_thres'
772
+
773
+ Returns
774
+ -------
775
+ pval : float or numpy.nan
776
+ P-value.
777
+ """
778
+ # Defaults to the self.significance member value
779
+ use_sig = self.significance
780
+ if sig_override is not None:
781
+ use_sig = sig_override
782
+ # Check if we are using the analytic significance
783
+ if use_sig == 'analytic':
784
+ pval = self.get_analytic_significance(value=val, T=T, dim=dim, xyz=xyz)
785
+ # Check if we are using the shuffle significance
786
+ elif use_sig == 'shuffle_test':
787
+ pval = self.get_shuffle_significance(array=array,
788
+ xyz=xyz,
789
+ value=val,
790
+ data_type=data_type)
791
+ # Check if we are using the fixed_thres significance
792
+ elif use_sig == 'fixed_thres':
793
+ # Determined outside then
794
+ pval = None
795
+ # if self.two_sided:
796
+ # dependent = np.abs(val) >= np.abs(alpha_or_thres)
797
+ # else:
798
+ # dependent = val >= alpha_or_thres
799
+ # pval = 0. if dependent else 1.
800
+ # # pval = self.get_fixed_thres_significance(
801
+ # # value=val,
802
+ # # fixed_thres=self.fixed_thres)
803
+ else:
804
+ raise ValueError("%s not known." % self.significance)
805
+
806
+ # # Return the calculated value(s)
807
+ # if alpha_or_thres is not None:
808
+ # if use_sig != 'fixed_thres':
809
+ # dependent = pval <= alpha_or_thres
810
+ # return pval, dependent
811
+ # else:
812
+ return pval
813
+
814
    def get_measure(self, X, Y, Z=None, tau_max=0,
                    data_type=None):
        """Estimate dependence measure.

        Calls the dependence measure function. The child classes must specify
        a function get_dependence_measure.

        Parameters
        ----------
        X, Y [, Z] : list of tuples
            X,Y,Z are of the form [(var, -tau)], where var specifies the
            variable index and tau the time lag.

        tau_max : int, optional (default: 0)
            Maximum time lag. This may be used to make sure that estimates for
            different lags in X, Z, all have the same sample size.

        data_type : array-like
            Binary data array of same shape as array which describes whether
            individual samples in a variable (or all samples) are continuous
            or discrete: 0s for continuous variables and 1s for discrete variables.
            NOTE(review): this argument is immediately overwritten by the
            value returned from _get_array below and therefore has no effect.

        Returns
        -------
        val : float
            The test statistic value.

        """

        # Get the array to test on; constant (zero-variance) rows are removed
        # and returned separately as the "nonzero" variants. Note that the
        # data_type parameter is shadowed by the returned value here.
        (array, xyz, XYZ, data_type,
         nonzero_array, nonzero_xyz, nonzero_XYZ, nonzero_data_type) = self._get_array(
            X=X, Y=Y, Z=Z, tau_max=tau_max,
            remove_constant_data=True,
            verbosity=self.verbosity)
        X, Y, Z = XYZ
        nonzero_X, nonzero_Y, nonzero_Z = nonzero_XYZ

        # Ensure it is a valid array
        if np.any(np.isnan(array)):
            raise ValueError("nans in the array!")

        # If all X or all Y rows were constant, the measure is defined as 0.
        if len(nonzero_X) == 0 or len(nonzero_Y) == 0:
            val = 0.
        else:
            # Get the dependence measure, reycling residuals if need be
            val = self._get_dependence_measure_recycle(nonzero_X, nonzero_Y, nonzero_Z,
                                                       nonzero_xyz, nonzero_array, nonzero_data_type)

        return val
869
+
870
+ def get_confidence(self, X, Y, Z=None, tau_max=0,
871
+ data_type=None):
872
+ """Perform confidence interval estimation.
873
+
874
+ Calls the dependence measure and confidence test functions. The child
875
+ classes can specify a function get_dependence_measure and
876
+ get_analytic_confidence or get_bootstrap_confidence. If confidence is
877
+ False, (numpy.nan, numpy.nan) is returned.
878
+
879
+ Parameters
880
+ ----------
881
+ X, Y, Z : list of tuples
882
+ X,Y,Z are of the form [(var, -tau)], where var specifies the
883
+ variable index and tau the time lag.
884
+
885
+ tau_max : int, optional (default: 0)
886
+ Maximum time lag. This may be used to make sure that estimates for
887
+ different lags in X, Z, all have the same sample size.
888
+
889
+ data_type : array-like
890
+ Binary data array of same shape as array which describes whether
891
+ individual samples in a variable (or all samples) are continuous
892
+ or discrete: 0s for continuous variables and 1s for discrete variables.
893
+
894
+ Returns
895
+ -------
896
+ (conf_lower, conf_upper) : Tuple of floats
897
+ Upper and lower confidence bound of confidence interval.
898
+ """
899
+ # Check if a confidence type has been defined
900
+ if self.confidence:
901
+ # Ensure the confidence level given makes sense
902
+ if self.conf_lev < .5 or self.conf_lev >= 1.:
903
+ raise ValueError("conf_lev = %.2f, " % self.conf_lev +
904
+ "but must be between 0.5 and 1")
905
+ half_conf = self.conf_samples * (1. - self.conf_lev)/2.
906
+ if self.confidence == 'bootstrap' and half_conf < 1.:
907
+ raise ValueError("conf_samples*(1.-conf_lev)/2 is %.2f"
908
+ % half_conf + ", must be >> 1")
909
+
910
+ if self.confidence:
911
+ # Make and check the array
912
+ array, xyz, _, data_type = self._get_array(X=X, Y=Y, Z=Z, tau_max=tau_max,
913
+ remove_constant_data=False, verbosity=0)
914
+ dim, T = array.shape
915
+ if np.isnan(array).sum() != 0:
916
+ raise ValueError("nans in the array!")
917
+
918
+ # Check if we are using analytic confidence or bootstrapping it
919
+ if self.confidence == 'analytic':
920
+ val = self.get_dependence_measure(array, xyz)
921
+ (conf_lower, conf_upper) = \
922
+ self.get_analytic_confidence(df=T-dim,
923
+ value=val,
924
+ conf_lev=self.conf_lev)
925
+ elif self.confidence == 'bootstrap':
926
+ # Overwrite analytic values
927
+ (conf_lower, conf_upper) = \
928
+ self.get_bootstrap_confidence(
929
+ array, xyz,
930
+ conf_samples=self.conf_samples,
931
+ conf_blocklength=self.conf_blocklength,
932
+ conf_lev=self.conf_lev, verbosity=self.verbosity)
933
+ else:
934
+ raise ValueError("%s confidence estimation not implemented"
935
+ % self.confidence)
936
+ else:
937
+ return None
938
+
939
+ # Cache the confidence interval
940
+ self.conf = (conf_lower, conf_upper)
941
+ # Return the confidence interval
942
+ return (conf_lower, conf_upper)
943
+
944
+ def _print_cond_ind_results(self, val, pval=None, cached=None, dependent=None, conf=None):
945
+ """Print results from conditional independence test.
946
+
947
+ Parameters
948
+ ----------
949
+ val : float
950
+ Test stastistic value.
951
+
952
+ pval : float, optional (default: None)
953
+ p-value
954
+
955
+ dependent : bool
956
+ Test decision.
957
+
958
+ conf : tuple of floats, optional (default: None)
959
+ Confidence bounds.
960
+ """
961
+ printstr = " val = % .3f" % (val)
962
+ if pval is not None:
963
+ printstr += " | pval = %.5f" % (pval)
964
+ if dependent is not None:
965
+ printstr += " | dependent = %s" % (dependent)
966
+ if conf is not None:
967
+ printstr += " | conf bounds = (%.3f, %.3f)" % (
968
+ conf[0], conf[1])
969
+ if cached is not None:
970
+ printstr += " %s" % ({0:"", 1:"[cached]"}[cached])
971
+
972
+ print(printstr)
973
+
974
+ def get_bootstrap_confidence(self, array, xyz, dependence_measure=None,
975
+ conf_samples=100, conf_blocklength=None,
976
+ conf_lev=.95,
977
+ data_type=None,
978
+ verbosity=0):
979
+ """Perform bootstrap confidence interval estimation.
980
+
981
+ With conf_blocklength > 1 or None a block-bootstrap is performed.
982
+
983
+ Parameters
984
+ ----------
985
+ array : array-like
986
+ data array with X, Y, Z in rows and observations in columns
987
+
988
+ xyz : array of ints
989
+ XYZ identifier array of shape (dim,).
990
+
991
+ dependence_measure : function (default = self.get_dependence_measure)
992
+ Dependence measure function must be of form
993
+ dependence_measure(array, xyz) and return a numeric value
994
+
995
+ conf_lev : float, optional (default: 0.9)
996
+ Two-sided confidence interval.
997
+
998
+ conf_samples : int, optional (default: 100)
999
+ Number of samples for bootstrap.
1000
+
1001
+ conf_blocklength : int, optional (default: None)
1002
+ Block length for block-bootstrap. If None, the block length is
1003
+ determined from the decay of the autocovariance as explained in
1004
+ [1]_.
1005
+
1006
+ data_type : array-like
1007
+ Binary data array of same shape as array which describes whether
1008
+ individual samples in a variable (or all samples) are continuous
1009
+ or discrete: 0s for continuous variables and 1s for discrete variables.
1010
+
1011
+ verbosity : int, optional (default: 0)
1012
+ Level of verbosity.
1013
+
1014
+ Returns
1015
+ -------
1016
+ (conf_lower, conf_upper) : Tuple of floats
1017
+ Upper and lower confidence bound of confidence interval.
1018
+ """
1019
+
1020
+ # Check if a dependence measure if provided or if to use default
1021
+ if not dependence_measure:
1022
+ dependence_measure = self.get_dependence_measure
1023
+
1024
+ # confidence interval is two-sided
1025
+ c_int = 1. - (1. - conf_lev)/2.
1026
+ dim, T = array.shape
1027
+
1028
+ # If not block length is given, determine the optimal block length.
1029
+ # This has a maximum of 10% of the time sample length
1030
+ if conf_blocklength is None:
1031
+ conf_blocklength = \
1032
+ self._get_block_length(array, xyz, mode='confidence')
1033
+ # Determine the number of blocks total, rounding up for non-integer
1034
+ # amounts
1035
+ n_blks = int(math.ceil(float(T)/conf_blocklength))
1036
+
1037
+ # Print some information
1038
+ if verbosity > 2:
1039
+ print(" block_bootstrap confidence intervals"
1040
+ " with block-length = %d ..." % conf_blocklength)
1041
+
1042
+ # Generate the block bootstrapped distribution
1043
+ bootdist = np.zeros(conf_samples)
1044
+ for smpl in range(conf_samples):
1045
+ # Get the starting indices for the blocks
1046
+ blk_strt = self.random_state.integers(0, T - conf_blocklength + 1, n_blks)
1047
+ # Get the empty array of block resampled values
1048
+ array_bootstrap = \
1049
+ np.zeros((dim, n_blks*conf_blocklength), dtype=array.dtype)
1050
+ # Fill the array of block resamples
1051
+ for i in range(conf_blocklength):
1052
+ array_bootstrap[:, i::conf_blocklength] = array[:, blk_strt + i]
1053
+ # Cut to proper length
1054
+ array_bootstrap = array_bootstrap[:, :T]
1055
+
1056
+ bootdist[smpl] = dependence_measure(array_bootstrap, xyz)
1057
+
1058
+ # Sort and get quantile
1059
+ bootdist.sort()
1060
+ conf_lower = bootdist[int((1. - c_int) * conf_samples)]
1061
+ conf_upper = bootdist[int(c_int * conf_samples)]
1062
+ # Return the confidance limits as a tuple
1063
+ return (conf_lower, conf_upper)
1064
+
1065
+ def _get_acf(self, series, max_lag=None):
1066
+ """Returns autocorrelation function.
1067
+
1068
+ Parameters
1069
+ ----------
1070
+ series : 1D-array
1071
+ data series to compute autocorrelation from
1072
+
1073
+ max_lag : int, optional (default: None)
1074
+ maximum lag for autocorrelation function. If None is passed, 10% of
1075
+ the data series length are used.
1076
+
1077
+ Returns
1078
+ -------
1079
+ autocorr : array of shape (max_lag + 1,)
1080
+ Autocorrelation function.
1081
+ """
1082
+ # Set the default max lag
1083
+ if max_lag is None:
1084
+ max_lag = int(max(5, 0.1*len(series)))
1085
+ # Initialize the result
1086
+ T = len(series)
1087
+ autocorr = np.ones(max_lag + 1)
1088
+ # Compute autocorrelation using direct dot products
1089
+ # (avoids np.corrcoef overhead of building 2x2 matrix per lag)
1090
+ for lag in range(1, max_lag + 1):
1091
+ y1 = series[lag:]
1092
+ y2 = series[:T - lag]
1093
+ y1_centered = y1 - y1.mean()
1094
+ y2_centered = y2 - y2.mean()
1095
+ denom_sq = np.dot(y1_centered, y1_centered) * np.dot(y2_centered, y2_centered)
1096
+ if denom_sq > 0.:
1097
+ autocorr[lag] = np.dot(y1_centered, y2_centered) / np.sqrt(denom_sq)
1098
+ else:
1099
+ autocorr[lag] = 0.
1100
+ return autocorr
1101
+
1102
+ def _get_block_length(self, array, xyz, mode):
1103
+ """Returns optimal block length for significance and confidence tests.
1104
+
1105
+ Determine block length using approach in Mader (2013) [Eq. (6)] which
1106
+ improves the method of Peifer (2005) with non-overlapping blocks In
1107
+ case of multidimensional X, the max is used. Further details in [1]_.
1108
+ Two modes are available. For mode='significance', only the indices
1109
+ corresponding to X are shuffled in array. For mode='confidence' all
1110
+ variables are jointly shuffled. If the autocorrelation curve fit fails,
1111
+ a block length of 5% of T is used. The block length is limited to a
1112
+ maximum of 10% of T.
1113
+
1114
+ Mader et al., Journal of Neuroscience Methods,
1115
+ Volume 219, Issue 2, 15 October 2013, Pages 285-291
1116
+
1117
+ Parameters
1118
+ ----------
1119
+ array : array-like
1120
+ data array with X, Y, Z in rows and observations in columns
1121
+
1122
+ xyz : array of ints
1123
+ XYZ identifier array of shape (dim,).
1124
+
1125
+ mode : str
1126
+ Which mode to use.
1127
+
1128
+ Returns
1129
+ -------
1130
+ block_len : int
1131
+ Optimal block length.
1132
+ """
1133
+ # Inject a dependency on siganal, optimize
1134
+ from scipy import signal, optimize
1135
+ # Get the shape of the array
1136
+ dim, T = array.shape
1137
+ # Initiailize the indices
1138
+ indices = range(dim)
1139
+ if mode == 'significance':
1140
+ indices = np.where(xyz == 0)[0]
1141
+
1142
+ # Maximum lag for autocov estimation
1143
+ max_lag = int(0.1*T)
1144
+ # Define the function to optimize against
1145
+ def func(x_vals, a_const, decay):
1146
+ return a_const * decay**x_vals
1147
+
1148
+ # Calculate the block length
1149
+ block_len = 1
1150
+ for i in indices:
1151
+ # Get decay rate of envelope of autocorrelation functions
1152
+ # via hilbert trafo
1153
+ autocov = self._get_acf(series=array[i], max_lag=max_lag)
1154
+ autocov[0] = 1.
1155
+ hilbert = np.abs(signal.hilbert(autocov))
1156
+ # Try to fit the curve
1157
+ try:
1158
+ popt, _ = optimize.curve_fit(
1159
+ f=func,
1160
+ xdata=np.arange(0, max_lag+1),
1161
+ ydata=hilbert,
1162
+ )
1163
+ phi = popt[1]
1164
+ # Formula assuming non-overlapping blocks
1165
+ l_opt = (4. * T * (phi / (1. - phi) + phi**2 / (1. - phi)**2)**2
1166
+ / (1. + 2. * phi / (1. - phi))**2)**(1. / 3.)
1167
+ block_len = max(block_len, int(l_opt))
1168
+ except RuntimeError:
1169
+ print("Error - curve_fit failed in block_shuffle, using"
1170
+ " block_len = %d" % (int(.05 * T)))
1171
+ # block_len = max(int(.05 * T), block_len)
1172
+ # Limit block length to a maximum of 10% of T
1173
+ block_len = min(block_len, int(0.1 * T))
1174
+ return block_len
1175
+
1176
    def _get_shuffle_dist(self, array, xyz, dependence_measure,
                          sig_samples, sig_blocklength=None,
                          verbosity=0):
        """Returns shuffle distribution of test statistic.

        The rows in array corresponding to the X-variable are shuffled using
        a block-shuffle approach.

        Parameters
        ----------
        array : array-like
            data array with X, Y, Z in rows and observations in columns

        xyz : array of ints
            XYZ identifier array of shape (dim,).

        dependence_measure : object
            Dependence measure function must be of form
            dependence_measure(array, xyz) and return a numeric value

        sig_samples : int, optional (default: 100)
            Number of samples for shuffle significance test.

        sig_blocklength : int, optional (default: None)
            Block length for block-shuffle significance test. If None, the
            block length is determined from the decay of the autocovariance as
            explained in [1]_.

        verbosity : int, optional (default: 0)
            Level of verbosity.

        Returns
        -------
        null_dist : array of shape (sig_samples,)
            Contains the sorted test statistic values estimated from the
            shuffled arrays.
        """

        dim, T = array.shape

        # Rows of array belonging to the X-variable (xyz code 0); only these
        # rows are shuffled, Y and Z stay fixed.
        x_indices = np.where(xyz == 0)[0]
        dim_x = len(x_indices)

        if sig_blocklength is None:
            sig_blocklength = self._get_block_length(array, xyz,
                                                     mode='significance')

        # Number of whole blocks that fit into T samples (floor)
        n_blks = int(math.floor(float(T)/sig_blocklength))
        # print 'n_blks ', n_blks
        if verbosity > 2:
            print(" Significance test with block-length = %d "
                  "..." % (sig_blocklength))

        array_shuffled = np.copy(array)
        block_starts = np.arange(0, T - sig_blocklength + 1, sig_blocklength)

        # Dividing the array up into n_blks of length sig_blocklength may
        # leave a tail. This tail is later randomly inserted
        tail = array[x_indices, n_blks*sig_blocklength:]
        has_tail = tail.shape[1] > 0
        tail_len = tail.shape[1]

        # Pre-compute block offset indices for vectorized block shuffle
        blk_offsets = np.arange(sig_blocklength)
        blk_total = n_blks * sig_blocklength

        # Pre-allocate buffer to avoid np.insert reallocation each iteration
        if has_tail:
            x_buffer = np.empty((dim_x, T))

        null_dist = np.zeros(sig_samples)
        for sam in range(sig_samples):

            # Randomly permute the block order (drawn without replacement)
            blk_starts = self.random_state.permutation(block_starts)[:n_blks]

            # Vectorized block shuffle: compute all indices at once
            # blk_starts[:, None] + blk_offsets[None, :] gives (n_blks, sig_blocklength)
            # .T.ravel() interleaves to match blk::sig_blocklength pattern
            gather_indices = (blk_starts[:, np.newaxis] + blk_offsets[np.newaxis, :]).T.ravel()
            x_shuffled = array[np.ix_(x_indices, gather_indices)]

            # Insert tail randomly using pre-allocated buffer (avoids np.insert realloc)
            if has_tail:
                # Tail insertion point is drawn from the unpermuted starts
                insert_tail_at = self.random_state.choice(block_starts)
                x_buffer[:, :insert_tail_at] = x_shuffled[:, :insert_tail_at]
                x_buffer[:, insert_tail_at:insert_tail_at + tail_len] = tail
                x_buffer[:, insert_tail_at + tail_len:] = x_shuffled[:, insert_tail_at:]
                array_shuffled[x_indices] = x_buffer
            else:
                array_shuffled[x_indices] = x_shuffled

            null_dist[sam] = dependence_measure(array=array_shuffled,
                                                xyz=xyz)

        return null_dist
1271
+
1272
+ def get_fixed_thres_significance(self, value, fixed_thres):
1273
+ """DEPRECATED Returns signficance for thresholding test.
1274
+ """
1275
+ raise ValueError("fixed_thres is replaced by alpha_or_thres in run_test.")
1276
+ # if np.abs(value) < np.abs(fixed_thres):
1277
+ # pval = 1.
1278
+ # else:
1279
+ # pval = 0.
1280
+
1281
+ # return pval
1282
+
1283
+ def _trafo2uniform(self, x):
1284
+ """Transforms input array to uniform marginals.
1285
+
1286
+ Assumes x.shape = (dim, T)
1287
+
1288
+ Parameters
1289
+ ----------
1290
+ x : array-like
1291
+ Input array.
1292
+
1293
+ Returns
1294
+ -------
1295
+ u : array-like
1296
+ array with uniform marginals.
1297
+ """
1298
+
1299
+ def trafo(xi):
1300
+ xisorted = np.sort(xi)
1301
+ yi = np.linspace(1. / len(xi), 1, len(xi))
1302
+ return np.interp(xi, xisorted, yi)
1303
+
1304
+ if np.ndim(x) == 1:
1305
+ u = trafo(x)
1306
+ else:
1307
+ u = np.empty(x.shape)
1308
+ for i in range(x.shape[0]):
1309
+ u[i] = trafo(x[i])
1310
+ return u