tigramite-fast 5.2.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. tigramite/__init__.py +0 -0
  2. tigramite/causal_effects.py +1525 -0
  3. tigramite/causal_mediation.py +1592 -0
  4. tigramite/data_processing.py +1574 -0
  5. tigramite/graphs.py +1509 -0
  6. tigramite/independence_tests/LBFGS.py +1114 -0
  7. tigramite/independence_tests/__init__.py +0 -0
  8. tigramite/independence_tests/cmiknn.py +661 -0
  9. tigramite/independence_tests/cmiknn_mixed.py +1397 -0
  10. tigramite/independence_tests/cmisymb.py +286 -0
  11. tigramite/independence_tests/gpdc.py +664 -0
  12. tigramite/independence_tests/gpdc_torch.py +820 -0
  13. tigramite/independence_tests/gsquared.py +190 -0
  14. tigramite/independence_tests/independence_tests_base.py +1310 -0
  15. tigramite/independence_tests/oracle_conditional_independence.py +1582 -0
  16. tigramite/independence_tests/pairwise_CI.py +383 -0
  17. tigramite/independence_tests/parcorr.py +369 -0
  18. tigramite/independence_tests/parcorr_mult.py +485 -0
  19. tigramite/independence_tests/parcorr_wls.py +451 -0
  20. tigramite/independence_tests/regressionCI.py +403 -0
  21. tigramite/independence_tests/robust_parcorr.py +403 -0
  22. tigramite/jpcmciplus.py +966 -0
  23. tigramite/lpcmci.py +3649 -0
  24. tigramite/models.py +2257 -0
  25. tigramite/pcmci.py +3935 -0
  26. tigramite/pcmci_base.py +1218 -0
  27. tigramite/plotting.py +4735 -0
  28. tigramite/rpcmci.py +467 -0
  29. tigramite/toymodels/__init__.py +0 -0
  30. tigramite/toymodels/context_model.py +261 -0
  31. tigramite/toymodels/non_additive.py +1231 -0
  32. tigramite/toymodels/structural_causal_processes.py +1201 -0
  33. tigramite/toymodels/surrogate_generator.py +319 -0
  34. tigramite_fast-5.2.10.1.dist-info/METADATA +182 -0
  35. tigramite_fast-5.2.10.1.dist-info/RECORD +38 -0
  36. tigramite_fast-5.2.10.1.dist-info/WHEEL +5 -0
  37. tigramite_fast-5.2.10.1.dist-info/licenses/license.txt +621 -0
  38. tigramite_fast-5.2.10.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1574 @@
1
+ """Tigramite data processing functions."""
2
+
3
+ # Authors: Jakob Runge <jakob@jakob-runge.com>
4
+ # Andreas Gerhardus <andreas.gerhardus@dlr.de>
5
+ # License: GNU General Public License v3.0
6
+
7
+ from __future__ import print_function
8
+ from collections import defaultdict, OrderedDict
9
+ import sys
10
+ import warnings
11
+ from copy import deepcopy
12
+ import math
13
+ import numpy as np
14
+ import scipy.sparse
15
+ import scipy.sparse.linalg
16
+ from scipy import stats
17
+ # from numba import jit
18
+
19
+ class DataFrame():
20
+ """Data object containing single or multiple time series arrays and optional
21
+ mask, as well as variable definitions.
22
+
23
+ Parameters
24
+ ----------
25
+ data : array-like
26
+ if analysis_mode == 'single':
27
+ Numpy array of shape (observations T, variables N)
28
+ OR
29
+ Dictionary with a single entry whose value is a numpy array of
30
+ shape (observations T, variables N)
31
+ if analysis_mode == 'multiple':
32
+ Numpy array of shape (multiple datasets M, observations T,
33
+ variables N)
34
+ OR
35
+ Dictionary whose values are numpy arrays of shape
36
+ (observations T_i, variables N), where the number of observations
37
+ T_i may vary across the multiple datasets but the number of variables
38
+ N is fixed.
39
+ mask : array-like, optional (default: None)
40
+ Optional mask array, must be of same format and shape as data.
41
+ data_type : array-like
42
+ Binary data array of same shape as data which describes whether
43
+ individual samples in a variable (or all samples) are continuous
44
+ or discrete: 0s for continuous variables and 1s for discrete variables.
45
+ missing_flag : number, optional (default: None)
46
+ Flag for missing values in dataframe. Dismisses all time slices of
47
+ samples where missing values occur in any variable. For
48
+ remove_missing_upto_maxlag=True also flags samples for all lags up to
49
+ 2*tau_max (more precisely, this depends on the cut_off argument in
50
+ self.construct_array(), see further below). This avoids biases, see
51
+ section on masking in Supplement of Runge et al. SciAdv (2019).
52
+ vector_vars : dict
53
+ Dictionary of vector variables of the form,
54
+ Eg. {0: [(0, 0), (1, 0)], 1: [(2, 0)], 2: [(3, 0)], 3: [(4, 0)]}
55
+ The keys are the new vectorized variables and respective tuple values
56
+ are the individual components of the vector variables. In the method of
57
+ construct_array(), the individual components are parsed from vector_vars
58
+ and added (accounting for lags) to the list that creates X, Y and Z for
59
+ conditional independence test.
60
+ var_names : list of strings, optional (default: range(N))
61
+ Names of variables, must match the number of variables. If None is
62
+ passed, variables are enumerated as [0, 1, ...]
63
+ datatime : array-like, optional (default: None)
64
+ Timelabel array. If None, range(T) is used.
65
+ remove_missing_upto_maxlag : bool, optional (default: False)
66
+ Whether to remove not only missing samples, but also all neighboring
67
+ samples up to max_lag (as given by cut_off in construct_array).
68
+ analysis_mode : string, optional (default: 'single')
69
+ Must be 'single' or 'multiple'.
70
+ Determines whether data contains a single (potentially multivariate)
71
+ time series (--> 'single') or multiple time series (--> 'multiple').
72
+ reference_points : None, int, or list (or 1D array) of integers, optional (default: None)
74
+ Determines the time steps --- relative to the shared time axis as
75
+ defined by the optional time_offset argument (see below) --- that are
76
+ used to create samples. Set to [0, 1, ..., T_max-1] if None is passed,
77
+ where T_max is self.largest_time_step, see below.
78
+ All values smaller than 0 and bigger than T_max-1 will be ignored.
79
+ At least one value must be in [0, 1, ..., T_max-1].
80
+ time_offsets : None or dict, optional (default: None)
81
+ if analysis_mode == 'single':
82
+ Must be None.
83
+ Shared time axis defined by the time indices of the single time series
84
+ if analysis_mode == 'multiple' and data is numpy array:
85
+ Must be None.
86
+ All datasets are assumed to be already aligned in time with
87
+ respect to a shared time axis, which is the time axis of data
88
+ if analysis_mode == 'multiple' and data is dictionary:
89
+ Must be dictionary of the form {key(m): time_offset(m), ...} whose
90
+ set of keys agrees with the set of keys of data and whose values are
91
+ non-negative integers, at least one of which is 0. The value
92
+ time_offset(m) defines the time offset of dataset m with
93
+ respect to a shared time axis.
94
+
95
+ Attributes
96
+ ----------
97
+ self._initialized_from : string
98
+ Specifies the data format in which data was given at instantiation.
99
+ Possible values: '2d numpy array', '3d numpy array', 'dict'.
100
+ self.values : dictionary
101
+ Dictionary holding the observations given by data internally mapped to a
102
+ dictionary representation as follows:
103
+ If analysis_mode == 'single': for self._initialized_from == '2d numpy array' this
104
+ is {0: data} and for self._initialized_from == 'dict' this is data.
105
+ If analysis_mode == 'multiple': If self._initialized_from == '3d numpy array', this is
106
+ {m: data[m, :, :] for m in range(data.shape[0])} and for self._initialized_from == 'dict' this
107
+ is data.
108
+ self.datasets: list
109
+ List of the keys identifying the multiple datasets, i.e.,
110
+ list(self.values.keys())
111
+ self.mask : dictionary
112
+ Mask internally mapped to a dictionary representation in the same way as
113
+ data is mapped to self.values
114
+ self.data_type : array-like
115
+ Binary data array of same shape as array which describes whether
116
+ individual samples in a variable (or all samples) are continuous
117
+ or discrete: 0s for continuous variables and 1s for discrete variables.
118
+ self.missing_flag:
119
+ Is missing_flag
120
+ self.var_names:
121
+ If var_names is not None:
122
+ Is var_names
123
+ If var_names is None:
124
+ Is {i: i for i in range(self.N)}
125
+ self.datatime : dictionary
126
+ Time axis for each of the multiple datasets.
127
+ self.analysis_mode : string
128
+ Is analysis_mode
129
+ self.reference_points: array-like
130
+ If reference_points is not None:
131
+ 1D numpy array holding all specified reference_points, less those
132
+ smaller than 0 and larger than self.largest_time_step-1
133
+ If reference_points is None:
134
+ Is np.arange(self.largest_time_step)
135
+ self.time_offsets : dictionary
136
+ If time_offsets is not None:
137
+ Is time_offsets
138
+ If time_offsets is None:
139
+ Is {key: 0 for key in self.values.keys()}
140
+ self.M : int
141
+ Number of datasets
142
+ self.N : int
143
+ Number of variables (constant across datasets)
144
+ self.T : dictionary
145
+ Dictionary {key(m): T(m), ...}, where T(m) is the time length of
146
+ datasets m and key(m) its identifier as in self.values
147
+ self.largest_time_step : int
148
+ max_m [T(m) + time_offset(m)], i.e., the largest (latest)
149
+ time step relative to the shared time axis for which at least one
150
+ observation exists in the dataset.
151
+ self.bootstrap : dictionary
152
+ Whether to use bootstrap. Must be a dictionary with keys random_state,
153
+ boot_samples, and boot_blocklength.
154
+ """
155
+
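+ # Illustrative construction sketch (not part of the original source; the
+ # variable names below are hypothetical):
+ #
+ # import numpy as np
+ # # single dataset of 1000 time steps and 3 variables
+ # dataframe = DataFrame(np.random.randn(1000, 3), var_names=['X', 'Y', 'Z'])
+ #
+ # # two datasets of different lengths, aligned on a shared time axis
+ # data = {0: np.random.randn(500, 3), 1: np.random.randn(400, 3)}
+ # dataframe = DataFrame(data, analysis_mode='multiple',
+ #                       time_offsets={0: 0, 1: 50})
+ #
+ # # optional block-bootstrap draw used by construct_array
+ # dataframe.bootstrap = {'random_state': np.random.default_rng(0),
+ #                        'boot_samples': 200, 'boot_blocklength': 'cube_root'}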
156
+ def __init__(self, data, mask=None, missing_flag=None, vector_vars=None, var_names=None,
157
+ data_type=None, datatime=None, analysis_mode ='single', reference_points=None,
158
+ time_offsets=None, remove_missing_upto_maxlag=False):
159
+
160
+ # Check that a valid analysis mode, specified by the argument
161
+ # 'analysis_mode', has been chosen
162
+ if analysis_mode in ['single', 'multiple']:
163
+ self.analysis_mode = analysis_mode
164
+ else:
165
+ raise ValueError("'analysis_mode' is '{}', must be 'single' or "\
166
+ "'multiple'.".format(analysis_mode))
167
+
168
+ # Check for correct type and format of 'data', internally cast to the
169
+ # analysis mode 'multiple' case in dictionary representation
170
+ if self.analysis_mode == 'single':
171
+ # In this case the 'time_offset' functionality must not be used
172
+ if time_offsets is not None:
173
+ raise ValueError("'time_offsets' must be None in analysis "\
174
+ "mode'single'.")
175
+
176
+ # 'data' must be either
177
+ # - np.ndarray of shape (T, N)
178
+ # - np.ndarray of shape (1, T, N)
179
+ # - a dictionary with one element whose value is a np.ndarray of
180
+ # shape (T, N)
181
+
182
+ if isinstance(data, np.ndarray):
183
+ _data_shape = data.shape
184
+ if len(_data_shape) == 2:
185
+ self.values = {0: np.copy(data)}
186
+ self._initialized_from = "2d numpy array"
187
+ elif len(_data_shape) == 3 and _data_shape[0] == 1:
188
+ self.values = {0: np.copy(data[0, :, :])}
189
+ self._initialized_from = "3d numpy array"
190
+ else:
191
+ raise TypeError("In analysis mode 'single', 'data' given "\
192
+ "as np.ndarray. 'data' is of shape {}, must be of "\
193
+ "shape (T, N) or (1, T, N).".format(_data_shape))
194
+
195
+ elif isinstance(data, dict):
196
+ if len(data) == 1:
197
+ _data = next(iter(data.values()))
198
+ if isinstance(_data, np.ndarray):
199
+ if len(_data.shape) == 2:
200
+ self.values = data.copy()
201
+ self._initialized_from = "dict"
202
+ else:
203
+ raise TypeError("In analysis mode 'single', "\
204
+ "'data'given as dictionary. The single value "\
205
+ "is a np.ndarray of shape {}, must be of "\
206
+ "shape (T, N).".format(_data.shape))
207
+ else:
208
+ raise TypeError("In analysis mode 'single', 'data' "\
209
+ "given as dictionary. The single value is of type "\
210
+ "{}, must be np.ndarray.".format(type(_data)))
211
+
212
+ else:
213
+ raise ValueError("In analysis mode 'single', 'data' given "\
214
+ "as dictionary. There are {} entries in 'data', there "\
215
+ "must be exactly one entry.".format(len(data)))
216
+
217
+ else:
218
+ raise TypeError("In analysis mode 'single'. 'data' is of type "\
219
+ "{}, must be np.ndarray or dict.".format(type(data)))
220
+
221
+ elif self.analysis_mode == 'multiple':
222
+ # 'data' must either be a
223
+ # - np.ndarray of shape (M, T, N)
224
+ # - dict whose values of are np.ndarray of shape (T_i, N), where T_i
225
+ # may vary across the values
226
+
227
+ if isinstance(data, np.ndarray):
228
+ _data_shape = data.shape
229
+ if len(_data_shape) == 3:
230
+ self.values = {i: np.copy(data[i, :, :]) for i in range(_data_shape[0])}
231
+ self._initialized_from = "3d numpy array"
232
+ else:
233
+ raise TypeError("In analysis mode 'multiple', 'data' "\
234
+ "given as np.ndarray. 'data' is of shape {}, must be "\
235
+ "of shape (M, T, N).".format(_data_shape))
236
+
237
+ # In this case the 'time_offset' functionality must not be used
238
+ if time_offsets is not None:
239
+ raise ValueError("In analysis mode 'multiple'. Since "\
240
+ "'data' is given as np.ndarray, 'time_offsets' must "\
241
+ "be None.")
242
+
243
+ elif isinstance(data, dict):
244
+ _N_list = set()
245
+ for dataset_key, dataset_data in data.items():
246
+ if isinstance(dataset_data, np.ndarray):
247
+ _dataset_data_shape = dataset_data.shape
248
+ if len(_dataset_data_shape) == 2:
249
+ _N_list.add(_dataset_data_shape[1])
250
+ else:
251
+ raise TypeError("In analysis mode 'multiple', "\
252
+ "'data' given as dictionary. 'data'[{}] is of "\
253
+ "shape {}, must be of shape (T_i, N).".format(
254
+ dataset_key, _dataset_data_shape))
255
+
256
+ else:
257
+ raise TypeError("In analysis mode 'multiple', 'data' "\
258
+ "given as dictionary. 'data'[{}] is of type {}, "\
259
+ "must be np.ndarray.".format(dataset_key,
260
+ type(dataset_data)))
261
+
262
+ if len(_N_list) == 1:
263
+ self.values = data.copy()
264
+ self._initialized_from = "dict"
265
+ else:
266
+ raise ValueError("In analysis mode 'multiple', 'data' "\
267
+ "given as dictionary. All entries must be np.ndarrays "\
268
+ "of shape (T_i, N), where T_i may vary across the "\
269
+ "entries while N must not vary. In the given 'data' N "\
270
+ "varies.")
271
+
272
+ else:
273
+ raise TypeError("In analysis mode 'multiple'. 'data' is of "\
274
+ "type {}, must be np.ndarray or dict.".format(type(data)))
275
+
276
+ # Store the keys of the datasets in a separate attribute
277
+ self.datasets = list(self.values.keys())
278
+
279
+ # Save the data format and check for NaNs:
280
+ self.M = len(self.values) # (Number of datasets)
281
+
282
+ self.T = dict() # (Time lengths of the individual datasets)
283
+ for dataset_key, dataset_data in self.values.items():
284
+ if np.isnan(dataset_data).sum() != 0:
285
+ raise ValueError("NaNs in the data.")
286
+
287
+ _dataset_data_shape = dataset_data.shape
288
+ self.T[dataset_key] = _dataset_data_shape[0]
289
+ self.Ndata = _dataset_data_shape[1] # (Number of variables)
290
+ # N does not vary across the datasets
291
+
292
+ # Setup dictionary of variables for vector mode
293
+ self.vector_vars = vector_vars
294
+ if self.vector_vars is None:
295
+ self.vector_vars = dict(zip(range(self.Ndata), [[(i, 0)]
296
+ for i in range(self.Ndata)]))
297
+ self.has_vector_data = False
298
+ else:
299
+ self.has_vector_data = True
300
+
301
+
302
+ # TODO: check vector_vars!
303
+ self.N = len(self.vector_vars)
304
+
305
+ # Warnings
306
+ if self.analysis_mode == 'single' and self.N > next(iter(self.T.values())):
307
+ warnings.warn("In analysis mode 'single', 'data'.shape = ({}, {});"\
308
+ " is it of shape (observations, variables)?".format(self.T[0],
309
+ self.N))
310
+
311
+ if self.analysis_mode == 'multiple' and self.M == 1:
312
+ warnings.warn("In analysis mode 'multiple'. There is just a "\
313
+ "single dataset, is this as intended?'")
314
+
315
+
316
+ # Save the variable names. If unspecified, use the default
317
+ if var_names is None:
318
+ self.var_names = {i: i for i in range(self.N)}
319
+ else:
320
+ self.var_names = var_names
321
+
322
+ self.mask = None
323
+ if mask is not None:
324
+ self.mask = self._check_mask(mask = mask)
325
+
326
+ self.data_type = None
327
+ if data_type is not None:
328
+ self.data_type = self._check_mask(mask = data_type)
329
+
330
+ # Check and prepare the time offsets
331
+ self._check_and_set_time_offsets(time_offsets)
332
+ self.time_offsets_is_none = time_offsets is None
333
+
334
+ # Set the default datatime if unspecified
335
+ if datatime is None:
336
+ self.datatime = {m: np.arange(self.time_offsets[m],
337
+ self.time_offsets[m] + self.T[m]) for m in self.values.keys()}
338
+ else:
339
+ if not isinstance(datatime, dict):
340
+ self.datatime = {0: datatime}
341
+ else:
342
+ self.datatime = datatime
343
+
344
+ # Save the largest/smallest relevant time step
345
+ self.largest_time_step = np.add(np.asarray(list(self.T.values())), np.asarray(list(self.time_offsets.values()))).max()
346
+ self.smallest_time_step = np.add(np.asarray(list(self.T.values())), np.asarray(list(self.time_offsets.values()))).min()
347
+
348
+ # Check and prepare the reference points
349
+ self._check_and_set_reference_points(reference_points)
350
+ self.reference_points_is_none = reference_points is None
351
+
352
+ # Save the 'missing_flag' value
353
+ self.missing_flag = missing_flag
354
+ if self.missing_flag is not None:
355
+ for dataset_key in self.values:
356
+ self.values[dataset_key][self.values[dataset_key] == self.missing_flag] = np.nan
357
+ self.remove_missing_upto_maxlag = remove_missing_upto_maxlag
358
+
359
+ # If PCMCI.run_bootstrap_of is called, then the
360
+ # bootstrap random draw can be set here
361
+ self.bootstrap = None
362
+
363
+
364
+ def _check_mask(self, mask, check_data_type=False):
365
+ """Checks that the mask and data_type arrays:
366
+ * have same shape as the data
367
+ * are an numpy ndarray (or subtype)
368
+ * do not contain any NaN entries
369
+ * contain only 0 and 1
370
+
371
+ """
372
+ # Check that there is a mask if required
373
+ _use_mask = mask
374
+
375
+ # If we have a mask, check it
376
+ if _use_mask is not None:
377
+ # Check data type and generic format of 'mask', map to multiple datasets mode
378
+ # dictionary representation
379
+ if isinstance(_use_mask, np.ndarray):
380
+ if len(_use_mask.shape) == 2:
381
+ _use_mask_dict = {0: _use_mask}
382
+ elif len(_use_mask.shape) == 3:
383
+ if _use_mask.shape[0] == self.M:
384
+ _use_mask_dict = {i: _use_mask[i, :, :] for i in range(self.M)}
385
+ else:
386
+ raise ValueError("Shape mismatch: {} datasets "\
387
+ " in data but {} in (type) mask, must be "\
388
+ "identical.".format(self.M, _use_mask.shape[0]))
389
+
390
+ else:
391
+ raise TypeError("data given as 3d np.ndarray. "\
392
+ "(type) mask is np.ndarray of shape {}, must be of "\
393
+ "shape (M, T, N).".format(_use_mask.shape))
394
+
395
+ elif isinstance(_use_mask, dict):
396
+ if len(_use_mask) == self.M:
397
+ for dataset_key in self.values.keys():
398
+ if _use_mask.get(dataset_key) is None:
399
+ raise ValueError("data has key {} (type {}) "\
400
+ "but (type) mask does not, keys must be "\
401
+ "identical.".format(dataset_key,
402
+ type(dataset_key)))
403
+
404
+ _use_mask_dict = _use_mask
405
+
406
+ else:
407
+ raise ValueError("Shape mismatch: {} datasets "\
408
+ "in data but {} in (type) mask, must be "\
409
+ "identical.".format(self.M, len(_use_mask)))
410
+ else:
411
+ raise TypeError("(type) mask is of type "\
412
+ "{}, must be dict or array.".format(type(_use_mask)))
413
+
414
+ # Check for consistency with shape of 'self.values' and for NaNs
415
+ for dataset_key, dataset_data in self.values.items():
416
+ _use_mask_dict_data = _use_mask_dict[dataset_key]
417
+ if _use_mask_dict_data.shape == dataset_data.shape:
418
+ if np.sum(np.isnan(_use_mask_dict_data)) != 0:
419
+ raise ValueError("NaNs in the (type) data mask")
420
+ # if check_data_type:
421
+ if not set(np.unique(_use_mask_dict_data)).issubset(set([0, 1])):
422
+ raise ValueError("(Type) mask contains other values than 0 and 1")
423
+ else:
424
+ if self.analysis_mode == 'single':
425
+ raise ValueError("Shape mismatch: data is of shape "\
426
+ "{}, (type) mask is of shape {}. Must be "\
427
+ "identical.".format(dataset_data.shape,
428
+ _use_mask_dict_data.shape))
429
+ elif self.analysis_mode == 'multiple':
430
+ raise ValueError("Shape mismatch: dataset {} "\
431
+ "is of shape {} in data and of shape {} in "\
432
+ "(type) mask. Must be identical.".format(dataset_key,
433
+ dataset_data.shape,
434
+ _use_mask_dict_data.shape))
435
+
436
+ # Return the mask in dictionary format
437
+ return _use_mask_dict
438
+
439
+ def _check_and_set_time_offsets(self, time_offsets):
440
+ """Check the argument 'time_offsets' for consistency and bring into
441
+ canonical format"""
442
+
443
+ if time_offsets is not None:
444
+
445
+ assert self.analysis_mode == 'multiple'
446
+ assert self._initialized_from == 'dict'
447
+
448
+ # Check data type and generic format of 'time_offsets', map to
449
+ # dictionary representation
450
+ if isinstance(time_offsets, dict):
451
+ if len(time_offsets) == self.M:
452
+ for dataset_key in self.values.keys():
453
+ if time_offsets.get(dataset_key) is None:
454
+ raise ValueError("'data' has key {} (type {}) but "\
455
+ "'time_offsets' does not, keys must be "\
456
+ "identical.".format(dataset_key,
457
+ type(dataset_key)))
458
+
459
+ self.time_offsets = time_offsets
460
+
461
+ else:
462
+ raise ValueError("Shape mismatch: {} datasets in "\
463
+ "'data' but {} in 'time_offsets', must be "\
464
+ "identical.".format(self.M, len(time_offsets)))
465
+
466
+ else:
467
+ raise TypeError("'time_offsets' is of type {}, must be "\
468
+ "dict.".format(type(time_offsets)))
469
+
470
+ # All time offsets must be non-negative integers, at least one of
471
+ # which is zero
472
+ found_zero_time_offset = False
473
+ for time_offset in self.time_offsets.values():
474
+ if np.issubdtype(type(time_offset), np.integer):
475
+ if time_offset >= 0:
476
+ if time_offset == 0:
477
+ found_zero_time_offset = True
478
+ else:
479
+ raise ValueError("A dataset has time offset "\
480
+ "{}, must be non-negative.".format(time_offset))
481
+
482
+ else:
483
+ raise TypeError("There is a time offset of type {}, must "\
484
+ "be int.".format(type(time_offset)))
485
+
486
+ if not found_zero_time_offset:
487
+ raise ValueError("At least one time offset must be 0.")
488
+
489
+ else:
490
+ # If no time offsets are specified, all of them are zero
491
+ self.time_offsets = {dataset_key: 0 for dataset_key in self.values.keys()}
492
+
493
+ def _check_and_set_reference_points(self, reference_points):
494
+ """Check the argument 'reference_point' for consistency and bring into
495
+ canonical format"""
496
+
497
+ # Check type of 'reference_points' and its elements
498
+ if reference_points is None:
499
+ # If no reference point is specified, use as many reference points
500
+ # as possible
501
+ self.reference_points = np.arange(self.largest_time_step)
502
+
503
+ elif isinstance(reference_points, int):
504
+ # If a single reference point is specified as an int, convert it to
505
+ # a single element numpy array
506
+ self.reference_points = np.array([reference_points])
507
+
508
+ elif isinstance(reference_points, np.ndarray):
509
+ # Check that all reference points are ints
510
+ for ref_point in reference_points:
511
+ if not np.issubdtype(type(ref_point), np.integer):
512
+ raise TypeError("All reference points must be integers.")
513
+
514
+ self.reference_points = reference_points
515
+
516
+ elif isinstance(reference_points, list):
517
+ # Check that all reference points are ints
518
+ for ref_point in reference_points:
519
+ if not isinstance(ref_point, int):
520
+ raise TypeError("All reference points must be integers.")
521
+
522
+ # If given as a list, cast to numpy array
523
+ self.reference_points = np.asarray(reference_points)
524
+
525
+ else:
526
+ raise TypeError("Unsupported data type of 'reference_points': Is "\
527
+ "{}, must be None or int or a list or np.ndarray of "\
528
+ "ints.".format(type(reference_points)))
529
+
530
+ # Remove negative reference points
531
+ if np.sum(self.reference_points < 0) > 0:
532
+ warnings.warn("Some reference points were negative. These are "\
533
+ "removed.")
534
+ self.reference_points = self.reference_points[self.reference_points >= 0]
535
+
536
+ # Remove reference points that are larger than the largest time step
537
+ if np.sum(self.reference_points >= self.largest_time_step) > 0:
538
+ warnings.warn("Some reference points were larger than the largest "\
539
+ "relevant time step, which here is {}. These are "\
540
+ "removed.".format(self.largest_time_step - 1))
541
+ self.reference_points = self.reference_points[self.reference_points < self.largest_time_step]
542
+
543
+ # Raise an error if no valid reference points were specified
544
+ if len(self.reference_points) == 0:
545
+ raise ValueError("No valid reference point.")
546
+
547
+
548
+ def construct_array(self, X, Y, Z, tau_max,
549
+ extraZ=None,
550
+ mask=None,
551
+ mask_type=None,
552
+ data_type=None,
553
+ return_cleaned_xyz=False,
554
+ do_checks=True,
555
+ remove_overlaps=True,
556
+ cut_off='2xtau_max',
557
+ verbosity=0):
558
+ """Constructs array from variables X, Y, Z from data.
559
+ Data is of shape (T, N) if analysis_mode == 'single', where T is the
560
+ time series length and N the number of variables, and of shape (M, T, N)
+ if analysis_mode == 'multiple', where M is the number of datasets.
562
+
563
+ Parameters
564
+ ----------
565
+ X, Y, Z, extraZ : list of tuples
566
+ For a dependence measure I(X;Y|Z), X, Y, Z can be multivariate of
567
+ the form [(var1, -lag), (var2, -lag), ...]. At least one varlag in Y
568
+ has to be at lag zero. extraZ is only used in CausalEffects class.
569
+ tau_max : int
570
+ Maximum time lag. This may be used to make sure that estimates for
571
+ different lags in X and Z all have the same sample size.
572
+ mask : array-like, optional (default: None)
573
+ Optional mask array, must be of same shape as data. If it is set,
574
+ then it overrides the self.mask assigned to the dataframe. If it is
575
+ None, then the self.mask is used, if it exists.
576
+ mask_type : {None, 'y','x','z','xy','xz','yz','xyz'}
577
+ Masking mode: Indicators for which variables in the dependence
578
+ measure I(X; Y | Z) the samples should be masked. If None, the mask
579
+ is not used. Explained in tutorial on masking and missing values.
580
+ data_type : array-like
581
+ Binary data array of same shape as array which describes whether
582
+ individual samples in a variable (or all samples) are continuous
583
+ or discrete: 0s for continuous variables and 1s for discrete variables.
584
+ If it is set, then it overrides the self.data_type assigned to the dataframe.
585
+ return_cleaned_xyz : bool, optional (default: False)
586
+ Whether to return cleaned X,Y,Z, where possible duplicates are
587
+ removed.
588
+ do_checks : bool, optional (default: True)
589
+ Whether to perform sanity checks on input X,Y,Z
590
+ remove_overlaps : bool, optional (default: True)
591
+ Whether to remove variables from Z/extraZ if they overlap with X or Y.
592
+ cut_off : {'2xtau_max', 'tau_max', 'max_lag', 'max_lag_or_tau_max', '2xtau_max_future'}
593
+ If cut_off == '2xtau_max':
594
+ - 2*tau_max samples are cut off at the beginning of the time
595
+ series ('beginning' here refers to the temporally first
596
+ time steps). This guarantees that (as long as no mask is
597
+ used) all MCI tests are conducted on the same samples,
598
+ independent of X, Y, and Z.
599
+
600
+ - If at time step t_missing a data value is missing, then the
601
+ time steps t_missing, ..., t_missing + 2*tau_max are cut
602
+ out. The latter part only holds if
603
+ remove_missing_upto_maxlag=True.
604
+
605
+ If cut_off == 'max_lag':
606
+ - max_lag(X, Y, Z) samples are cut off at the beginning of the
607
+ time series, where max_lag(X, Y, Z) is the maximum lag of
608
+ all nodes in X, Y, and Z. These are all samples that can in
609
+ principle be used.
610
+
611
+ - If at time step t_missing a data value is missing, then the
612
+ time steps t_missing, ..., t_missing + max_lag(X, Y, Z) are
613
+ cut out. The latter part only holds if
614
+ remove_missing_upto_maxlag=True.
615
+
616
+ If cut_off == 'max_lag_or_tau_max':
617
+ - max(max_lag(X, Y, Z), tau_max) are cut off at the beginning.
618
+ This may be useful for modeling by comparing multiple
619
+ models on the same samples.
620
+
621
+ - If at time step t_missing a data value is missing, then the
622
+ time steps t_missing, ..., t_missing + max(max_lag(X, Y,
623
+ Z), tau_max) are cut out. The latter part only holds if
624
+ remove_missing_upto_maxlag=True.
625
+
626
+ If cut_off == 'tau_max':
627
+ - tau_max samples are cut off at the beginning. This may be
628
+ useful for modeling by comparing multiple models on the
629
+ same samples.
630
+
631
+ - If at time step t_missing a data value is missing, then the
632
+ time steps t_missing, ..., t_missing + tau_max are cut
+ out. The latter part only holds if
634
+ remove_missing_upto_maxlag=True.
635
+
636
+ If cut_off == '2xtau_max_future':
637
+ First, the relevant time steps are determined as for cut_off ==
638
+ 'max_lag'. Then, the temporally latest time steps are removed
639
+ such that the same number of time steps remains as there would
640
+ be for cut_off == '2xtau_max'. This may be useful when one is
641
+ mostly interested in the temporally first time steps and would
642
+ like all MCI tests to be performed on the same *number* of
643
+ samples. Note, however, that while the *number* of samples is
644
+ the same for all MCI tests, the samples themselves may be
645
+ different.
646
+ verbosity : int, optional (default: 0)
647
+ Level of verbosity.
648
+
649
+ Returns
650
+ -------
651
+ array, xyz [,XYZ], data_type : Tuple of data array of shape (dim, n_samples),
652
+ xyz identifier array of shape (dim,) identifying which row in array
653
+ corresponds to X, Y, and Z, and the type mask that indicates which samples
654
+ are continuous or discrete. For example: X = [(0, -1)],
655
+ Y = [(1, 0)], Z = [(1, -1), (0, -2)] yields an array of shape
656
+ (4, n_samples) and xyz is xyz = numpy.array([0,1,2,2]). If
657
+ return_cleaned_xyz is True, also outputs the cleaned XYZ lists.
658
+ """
659
+
660
+ # # This version does not yet work with bootstrap
661
+ # try:
662
+ # assert self.bootstrap is None
663
+ # except AssertionError:
664
+ # print("This version does not yet work with bootstrap.")
665
+ # raise
666
+
667
+ if extraZ is None:
668
+ extraZ = []
669
+
670
+ if Z is None:
671
+ Z = []
672
+
673
+ # If vector-valued variables exist, add them
674
+ def vectorize(varlag):
675
+ vectorized_var = []
676
+ for (var, lag) in varlag:
677
+ for (vector_var, vector_lag) in self.vector_vars[var]:
678
+ vectorized_var.append((vector_var, vector_lag + lag))
679
+ return vectorized_var
680
+
681
+ X = vectorize(X)
682
+ Y = vectorize(Y)
683
+ Z = vectorize(Z)
684
+ extraZ = vectorize(extraZ)
685
+
686
+ # Remove duplicates in X, Y, Z, extraZ
687
+ X = list(OrderedDict.fromkeys(X))
688
+ Y = list(OrderedDict.fromkeys(Y))
689
+ Z = list(OrderedDict.fromkeys(Z))
690
+ extraZ = list(OrderedDict.fromkeys(extraZ))
691
+
692
+ if remove_overlaps:
693
+ # If a node in Z occurs already in X or Y, remove it from Z
694
+ Z = [node for node in Z if (node not in X) and (node not in Y)]
695
+ extraZ = [node for node in extraZ if (node not in X) and (node not in Y) and (node not in Z)]
696
+
697
+ XYZ = X + Y + Z + extraZ
698
+ dim = len(XYZ)
699
+
700
+ # Check that all lags are non-positive and indices are in [0,N-1]
701
+ if do_checks:
702
+ self._check_nodes(Y, XYZ, self.Ndata, dim)
703
+
704
+ # Use the mask, override if needed
705
+ _mask = mask
706
+ if _mask is None:
707
+ _mask = self.mask
708
+ else:
709
+ _mask = self._check_mask(mask = _mask)
710
+
711
+ _data_type = data_type
712
+ if _data_type is None:
713
+ _data_type = self.data_type
714
+ else:
715
+ _data_type = self._check_mask(mask = _data_type, check_data_type=True)
716
+
717
+ # Figure out what cut off we will be using
718
+ if cut_off == '2xtau_max':
719
+ max_lag = 2*tau_max
720
+ elif cut_off == 'max_lag':
721
+ max_lag = abs(np.array(XYZ)[:, 1].min())
722
+ elif cut_off == 'tau_max':
723
+ max_lag = tau_max
724
+ elif cut_off == 'max_lag_or_tau_max':
725
+ max_lag = max(abs(np.array(XYZ)[:, 1].min()), tau_max)
726
+ elif cut_off == '2xtau_max_future':
727
+ ## TODO: CHECK THIS
728
+ max_lag = abs(np.array(XYZ)[:, 1].min())
729
+ # if vector_vars is not None or process_params is not None
730
+ # update maxlag as max(anylag, tau_max)
731
+ else:
732
+ raise ValueError("max_lag must be in {'2xtau_max', 'tau_max', 'max_lag', "\
733
+ "'max_lag_or_tau_max', '2xtau_max_future'}")
734
+
735
+ # Setup XYZ identifier
736
+ index_code = {'x' : 0,
737
+ 'y' : 1,
738
+ 'z' : 2,
739
+ 'e' : 3}
740
+ xyz = np.array([index_code[name]
741
+ for var, name in zip([X, Y, Z, extraZ], ['x', 'y', 'z', 'e'])
742
+ for _ in var])
743
+
744
+ # Run through all datasets and fill a dictionary holding the
745
+ # samples taken from the individual datasets
746
+ samples_datasets = dict()
747
+ data_types = dict()
748
+ self.use_indices_dataset_dict = dict()
749
+
750
+ for dataset_key, dataset_data in self.values.items():
751
+
752
+ # Apply time offset to the reference points
753
+ ref_points_here = self.reference_points - self.time_offsets[dataset_key]
754
+
755
+ # Remove reference points that are out of bounds or are to be
756
+ # excluded given the choice of 'cut_off'
757
+ ref_points_here = ref_points_here[ref_points_here >= max_lag]
758
+ ref_points_here = ref_points_here[ref_points_here < self.T[dataset_key]]
759
+
760
+ # Keep track of which reference points would have remained for
761
+ # max_lag == 2*tau_max
762
+ if cut_off == '2xtau_max_future':
763
+ ref_points_here_2_tau_max = self.reference_points - self.time_offsets[dataset_key]
764
+ ref_points_here_2_tau_max = ref_points_here_2_tau_max[ref_points_here_2_tau_max >= 2*tau_max]
765
+ ref_points_here_2_tau_max = ref_points_here_2_tau_max[ref_points_here_2_tau_max < self.T[dataset_key]]
766
+
767
+ # Sort the valid reference points (not needed, but might be useful
768
+ # for detailed debugging)
769
+ ref_points_here = np.sort(ref_points_here)
770
+
771
+ # For cut_off == '2xtau_max_future' reduce the sample size to the
772
+ # number of samples that would have been obtained for cut_off ==
773
+ # '2xtau_max', removing the temporally latest ones
774
+ if cut_off == '2xtau_max_future':
775
+ n_to_cut_off = len(ref_points_here) - len(ref_points_here_2_tau_max)
776
+ assert n_to_cut_off >= 0
777
+ if n_to_cut_off > 0:
778
+ ref_points_here = np.sort(ref_points_here)
779
+ ref_points_here = ref_points_here[:-n_to_cut_off]
780
+
781
+ # If no valid reference points are left, continue with the next dataset
782
+ if len(ref_points_here) == 0:
783
+ continue
784
+
785
+ if self.bootstrap is not None:
786
+
787
+ boot_blocklength = self.bootstrap['boot_blocklength']
788
+
789
+ if boot_blocklength == 'cube_root':
790
+ boot_blocklength = max(1, int(len(ref_points_here)**(1/3)))
791
+ # elif boot_blocklength == 'from_autocorrelation':
792
+ # boot_blocklength = \
793
+ # get_block_length(overlapping_residuals.T, xyz=np.zeros(N), mode='confidence')
794
+ elif type(boot_blocklength) is int and boot_blocklength > 0:
795
+ pass
796
+ else:
797
+ raise ValueError("boot_blocklength must be integer > 0, 'cube_root', or 'from_autocorrelation'")
798
+
799
+ # Chooses THE SAME random seed for every dataset, maybe that's what we want...
800
+ # If the reference points are all the same, this will give the same bootstrap
801
+ # draw. However, if they are NOT the same, they will differ.
802
+ # TODO: Decide whether bootstrap draws should be the same for each dataset and
803
+ # how to achieve that if the reference points differ...
804
+ # random_state = self.bootstrap['random_state']
805
+ random_state = deepcopy(self.bootstrap['random_state'])
806
+
807
+ # Determine the number of blocks total, rounding up for non-integer
808
+ # amounts
809
+ n_blks = int(math.ceil(float(len(ref_points_here))/boot_blocklength))
810
+
811
+ if n_blks < 2:
812
+ raise ValueError("Only %d block(s) for block-sampling," %n_blks +
813
+ " choose smaller boot_blocklength!")
814
+ elif n_blks < 10:
815
+ warnings.warn("Only %d block(s) for block-sampling," %n_blks +
816
+ " choose smaller boot_blocklength!")
817
+
818
+ # Get the starting indices for the blocks
819
+ blk_strt = random_state.choice(np.arange(len(ref_points_here) - boot_blocklength), size=n_blks, replace=True)
820
+ # Get the empty array of block resampled values
821
+ boot_draw = np.zeros(n_blks*boot_blocklength, dtype='int')
822
+ # Fill the array of block resamples
823
+ for i in range(boot_blocklength):
824
+ boot_draw[i::boot_blocklength] = ref_points_here[blk_strt + i]
825
+ # Cut to proper length
826
+ ref_points_here = boot_draw[:len(ref_points_here)]
827
+
828
+ # Construct the data array holding the samples taken from the
829
+ # current dataset
830
+ samples_datasets[dataset_key] = np.zeros((dim, len(ref_points_here)), dtype = dataset_data.dtype)
831
+ for i, (var, lag) in enumerate(XYZ):
832
+ samples_datasets[dataset_key][i, :] = dataset_data[ref_points_here + lag, var]
833
+
834
+ # Build the mask array corresponding to this dataset
835
+ if _mask is not None:
836
+ mask_dataset = np.zeros((dim, len(ref_points_here)), dtype = 'bool')
837
+ for i, (var, lag) in enumerate(XYZ):
838
+ mask_dataset[i, :] = _mask[dataset_key][ref_points_here + lag, var]
839
+
840
+ # Take care of masking
841
+ use_indices_dataset = np.ones(len(ref_points_here), dtype = 'int')
842
+
843
+ # Build the type mask array corresponding to this dataset
844
+ if _data_type is not None:
845
+ data_type_dataset = np.zeros((dim, len(ref_points_here)), dtype = 'bool')
846
+ for i, (var, lag) in enumerate(XYZ):
847
+ data_type_dataset[i, :] = _data_type[dataset_key][ref_points_here + lag, var]
848
+ data_types[dataset_key] = data_type_dataset
849
+
850
+ # Remove all values that have missing value flag, and optionally as well the time
851
+ # slices that occur up to max_lag after
852
+ if self.missing_flag is not None:
853
+ missing_anywhere = np.array(np.where(np.any(np.isnan(samples_datasets[dataset_key]), axis=0))[0])
854
+
855
+ if self.remove_missing_upto_maxlag:
856
+ if len(missing_anywhere) > 0:
857
+ expanded = missing_anywhere[:, np.newaxis] + np.arange(max_lag + 1)[np.newaxis, :]
858
+ idx_to_remove = np.unique(expanded.ravel())
859
+ idx_to_remove = idx_to_remove[idx_to_remove < len(use_indices_dataset)]
860
+ else:
861
+ idx_to_remove = missing_anywhere
862
+ else:
863
+ idx_to_remove = missing_anywhere
864
+
865
+ use_indices_dataset[idx_to_remove.astype('int')] = 0
866
+
867
+ if _mask is not None:
868
+ # Remove samples with mask == 1 conditional on which mask_type
869
+ # is used
870
+
871
+ # Iterate over defined mapping from letter index to number index,
872
+ # i.e. 'x' -> 0, 'y' -> 1, 'z'-> 2, 'e'-> 3
873
+ for idx, cde in index_code.items():
874
+ # Check if the letter index is in the mask type
875
+ if (mask_type is not None) and (idx in mask_type):
876
+ # If so, check if any of the data that correspond to the
877
+ # letter index is masked by taking the product along the
878
+ # node-data to return a time slice selection, where 0
879
+ # means the time slice will not be used
880
+ slice_select = np.prod(mask_dataset[xyz == cde, :] == False, axis=0)
881
+ use_indices_dataset *= slice_select
882
+
883
+ # Accordingly update the data array and data type array
884
+ samples_datasets[dataset_key] = samples_datasets[dataset_key][:, use_indices_dataset == 1]
885
+ if _data_type is not None:
886
+ data_types[dataset_key] = data_types[dataset_key][:, use_indices_dataset == 1]
887
+
888
+ ## end for dataset_key, dataset_data in self.values.items()
889
+
890
+ # Save used indices as attribute
891
+ if len(ref_points_here) > 0:
892
+ self.use_indices_dataset_dict[dataset_key] = ref_points_here[use_indices_dataset==1]
893
+ else:
894
+ self.use_indices_dataset_dict[dataset_key] = []
895
+
896
+ # Concatenate the arrays of all datasets
897
+ array = np.concatenate(tuple(samples_datasets.values()), axis = 1)
898
+ if _data_type is not None:
899
+ type_array = np.concatenate(tuple(data_types.values()), axis = 1)
900
+ else:
901
+ type_array = None
902
+
903
+ # print(np.where(np.isnan(array)))
904
+ # print(array.shape)
905
+
906
+ # Check whether there is any valid sample
907
+ if array.shape[1] == 0:
908
+ raise ValueError("No valid samples")
909
+
910
+ # Print information about the constructed array
911
+ if verbosity > 2:
912
+ self.print_array_info(array, X, Y, Z, self.missing_flag, mask_type, type_array, extraZ)
913
+
914
+ # Return the array and xyz and optionally (X, Y, Z)
915
+ if return_cleaned_xyz:
916
+ return array, xyz, (X, Y, Z), type_array
917
+
918
+ return array, xyz, type_array
919
+
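+ # Illustrative usage sketch (not part of the original source), mirroring the
+ # example in the docstring above:
+ #
+ # dataframe = DataFrame(np.random.randn(1000, 3))
+ # array, xyz, _ = dataframe.construct_array(
+ #     X=[(0, -1)], Y=[(1, 0)], Z=[(1, -1), (0, -2)], tau_max=2)
+ # # array has shape (4, n_samples), and xyz == np.array([0, 1, 2, 2]) marks
+ # # which row belongs to X, Y and Z, respectively.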
920
+ def _check_nodes(self, Y, XYZ, N, dim):
921
+ """
922
+ Checks that:
923
+ * The requested XYZ nodes have the correct shape
924
+ * All lags are non-positive
925
+ * All indices are less than N
926
+ * One of the Y nodes has zero lag
927
+
928
+ Parameters
929
+ ----------
930
+ Y : list of tuples
931
+ Of the form [(var, -tau)], where var specifies the variable
932
+ index and tau the time lag.
933
+ XYZ : list of tuples
934
+ List of nodes chosen for current independence test
935
+ N : int
936
+ Total number of variables in the data
937
+ dim : int
938
+ Number of nodes excluding repeated nodes
939
+ """
940
+ if np.array(XYZ).shape != (dim, 2):
941
+ raise ValueError("X, Y, Z must be lists of tuples in format"
942
+ " [(var, -lag),...], eg., [(2, -2), (1, 0), ...]")
943
+ if np.any(np.array(XYZ)[:, 1] > 0):
944
+ raise ValueError("nodes are %s, " % str(XYZ) +
945
+ "but all lags must be non-positive")
946
+ if (np.any(np.array(XYZ)[:, 0] >= N)
947
+ or np.any(np.array(XYZ)[:, 0] < 0)):
948
+ raise ValueError("var indices %s," % str(np.array(XYZ)[:, 0]) +
949
+ " but must be in [0, %d]" % (N - 1))
950
+ # if np.all(np.array(Y)[:, 1] != 0):
951
+ # raise ValueError("Y-nodes are %s, " % str(Y) +
952
+ # "but one of the Y-nodes must have zero lag")
953
+
954
+ def print_array_info(self, array, X, Y, Z, missing_flag, mask_type, data_type=None, extraZ=None):
955
+ """
956
+ Print info about the constructed array
957
+
958
+ Parameters
959
+ ----------
960
+ array : Data array of shape (dim, T)
961
+ Data array.
962
+ X, Y, Z, extraZ : list of tuples
963
+ For a dependence measure I(X;Y|Z), Y is of the form [(varY, 0)],
964
+ where var specifies the variable index. X typically is of the form
965
+ [(varX, -tau)] with tau denoting the time lag and Z can be
966
+ multivariate [(var1, -lag), (var2, -lag), ...] .
967
+ missing_flag : number, optional (default: None)
968
+ Flag for missing values. Dismisses all time slices of samples where
969
+ missing values occur in any variable and also flags samples for all
970
+ lags up to 2*tau_max. This avoids biases, see section on masking in
971
+ Supplement of [1]_.
972
+ mask_type : {'y','x','z','xy','xz','yz','xyz'}
973
+ Masking mode: Indicators for which variables in the dependence
974
+ measure I(X; Y | Z) the samples should be masked. If None, the mask
975
+ is not used. Explained in tutorial on masking and missing values.
976
+ data_type : array-like
977
+ Binary data array of same shape as array which describes whether
978
+ individual samples in a variable (or all samples) are continuous
979
+ or discrete: 0s for continuous variables and 1s for discrete variables.
980
+ """
981
+ if extraZ is None:
982
+ extraZ = []
983
+ indt = " " * 12
984
+ print(indt + "Constructed array of shape %s from"%str(array.shape) +
985
+ "\n" + indt + "X = %s" % str(X) +
986
+ "\n" + indt + "Y = %s" % str(Y) +
987
+ "\n" + indt + "Z = %s" % str(Z))
988
+ if len(extraZ) > 0:
989
+ print(indt + "extraZ = %s" % str(extraZ))
990
+ if self.mask is not None and mask_type is not None:
991
+ print(indt+"with masked samples in %s removed" % mask_type)
992
+ if self.data_type is not None:
993
+ print(indt+"with %s % discrete values" % np.sum(data_type)/data_type.size)
994
+ if self.missing_flag is not None:
995
+ print(indt+"with missing values = %s removed" % self.missing_flag)
996
+
997
+
998
+ def get_acf(series, max_lag=None):
999
+ """Returns autocorrelation function.
1000
+
1001
+ Parameters
1002
+ ----------
1003
+ series : 1D-array
1004
+ data series to compute autocorrelation from
1005
+
1006
+ max_lag : int, optional (default: None)
1007
+ maximum lag for autocorrelation function. If None is passed, 10% of
1008
+ the data series length is used.
1009
+
1010
+ Returns
1011
+ -------
1012
+ autocorr : array of shape (max_lag + 1,)
1013
+ Autocorrelation function.
1014
+ """
1015
+ # Set the default max lag
1016
+ if max_lag is None:
1017
+ max_lag = int(max(5, 0.1*len(series)))
1018
+ # Initialize the result
1019
+ autocorr = np.ones(max_lag + 1)
1020
+ # Iterate over possible lags
1021
+ for lag in range(1, max_lag + 1):
1022
+ # Set the values
1023
+ y1_vals = series[lag:]
1024
+ y2_vals = series[:len(series) - lag]
1025
+ # Calculate the autocorrelation
1026
+ autocorr[lag] = np.corrcoef(y1_vals, y2_vals, ddof=0)[0, 1]
1027
+ return autocorr
1028
+
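+ # Illustrative usage sketch (not part of the original source):
+ #
+ # series = np.random.randn(1000)
+ # acf = get_acf(series, max_lag=10)
+ # # acf[0] == 1. and acf[k] is the lag-k autocorrelation for k = 1, ..., 10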
1029
+ def get_block_length(array, xyz, mode):
1030
+ """Returns optimal block length for significance and confidence tests.
1031
+
1032
+ Determine block length using approach in Mader (2013) [Eq. (6)] which
1033
+ improves the method of Pfeifer (2005) with non-overlapping blocks In
1034
+ case of multidimensional X, the max is used. Further details in [1]_.
1035
+ Two modes are available. For mode='significance', only the indices
1036
+ corresponding to X are shuffled in array. For mode='confidence' all
1037
+ variables are jointly shuffled. If the autocorrelation curve fit fails,
1038
+ a block length of 5% of T is used. The block length is limited to a
1039
+ maximum of 10% of T.
1040
+
1041
+ Mader et al., Journal of Neuroscience Methods,
1042
+ Volume 219, Issue 2, 15 October 2013, Pages 285-291
1043
+
1044
+ Parameters
1045
+ ----------
1046
+ array : array-like
1047
+ data array with X, Y, Z in rows and observations in columns
1048
+
1049
+ xyz : array of ints
1050
+ XYZ identifier array of shape (dim,).
1051
+
1052
+ mode : str
1053
+ Which mode to use.
1054
+
1055
+ Returns
1056
+ -------
1057
+ block_len : int
1058
+ Optimal block length.
1059
+ """
1060
+ # Inject a dependency on signal, optimize
1061
+ from scipy import signal, optimize
1062
+ # Get the shape of the array
1063
+ dim, T = array.shape
1064
+ # Initialize the indices
1065
+ indices = range(dim)
1066
+ if mode == 'significance':
1067
+ indices = np.where(xyz == 0)[0]
1068
+
1069
+ # Maximum lag for autocov estimation
1070
+ max_lag = int(0.1*T)
1071
+ # Define the function to optimize against
1072
+ def func(x_vals, a_const, decay):
1073
+ return a_const * decay**x_vals
1074
+
1075
+ # Calculate the block length
1076
+ block_len = 1
1077
+ for i in indices:
1078
+ # Get decay rate of envelope of autocorrelation functions
1079
+ # via hilbert trafo
1080
+ autocov = get_acf(series=array[i], max_lag=max_lag)
1081
+ autocov[0] = 1.
1082
+ hilbert = np.abs(signal.hilbert(autocov))
1083
+ # Try to fit the curve
1084
+ try:
1085
+ popt, _ = optimize.curve_fit(
1086
+ f=func,
1087
+ xdata=np.arange(0, max_lag+1),
1088
+ ydata=hilbert,
1089
+ )
1090
+ phi = popt[1]
1091
+ # Formula of Pfeifer (2005) assuming non-overlapping blocks
1092
+ l_opt = (4. * T * (phi / (1. - phi) + phi**2 / (1. - phi)**2)**2
1093
+ / (1. + 2. * phi / (1. - phi))**2)**(1. / 3.)
1094
+ block_len = max(block_len, int(l_opt))
1095
+ except RuntimeError:
1096
+ warnings.warn("Error - curve_fit failed for estimating block_shuffle length, using"
1097
+ " block_len = %d" % (int(.05 * T)))
1098
+ block_len = max(int(.05 * T), block_len)
1099
+ # Limit block length to a maximum of 10% of T
1100
+ block_len = min(block_len, int(0.1 * T))
1101
+ return block_len
1102
+
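+ # Worked example of the block-length formula above (illustrative, not part of
+ # the original source): for a fitted decay rate phi = 0.5 and T = 1000,
+ # phi/(1-phi) = 1 and phi**2/(1-phi)**2 = 1, so
+ # l_opt = (4*1000*(1+1)**2 / (1+2*1)**2)**(1/3) = (16000/9)**(1/3) ≈ 12.1,
+ # which stays below the 10%-of-T cap of 100 and gives block_len = 12.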
1103
+
1104
+ def lowhighpass_filter(data, cutperiod, pass_periods='low'):
1105
+ """Butterworth low- or high pass filter.
1106
+
1107
+ This function applies a linear filter twice, once forward and once
1108
+ backwards. The combined filter has linear phase.
1109
+
1110
+ Parameters
1111
+ ----------
1112
+ data : array
1113
+ Data array of shape (time, variables).
1114
+ cutperiod : int
1115
+ Period of cutoff.
1116
+ pass_periods : str, optional (default: 'low')
1117
+ Either 'low' or 'high' to act as a low- or high-pass filter
1118
+
1119
+ Returns
1120
+ -------
1121
+ data : array
1122
+ Filtered data array.
1123
+ """
1124
+ try:
1125
+ from scipy.signal import butter, filtfilt
1126
+ except ImportError:
+ raise ImportError('Could not import scipy.signal for Butterworth filtering!')
1128
+
1129
+ fs = 1.
1130
+ order = 3
1131
+ ws = 1. / cutperiod / (0.5 * fs)
1132
+ b, a = butter(order, ws, pass_periods)
1133
+ if np.ndim(data) == 1:
1134
+ data = filtfilt(b, a, data)
1135
+ else:
1136
+ for i in range(data.shape[1]):
1137
+ data[:, i] = filtfilt(b, a, data[:, i])
1138
+
1139
+ return data
1140
+
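+ # Illustrative usage sketch (not part of the original source); note that the
+ # 2d case filters the array in place, hence the copies:
+ #
+ # slow = lowhighpass_filter(np.copy(data), cutperiod=12, pass_periods='low')
+ # fast = lowhighpass_filter(np.copy(data), cutperiod=12, pass_periods='high')
+ # # 'low' keeps periods longer than 12 time steps, 'high' keeps shorter ones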
1141
+
1142
+ def smooth(data, smooth_width, kernel='gaussian',
1143
+ mask=None, residuals=False, verbosity=0):
1144
+ """Returns either smoothed time series or its residuals.
1145
+
1146
+ the difference between the original and the smoothed time series
1147
+ (=residuals) of a kernel smoothing with gaussian (smoothing kernel width =
1148
+ twice the sigma!) or heaviside window, equivalent to a running mean.
1149
+
1150
+ Assumes data of shape (T, N) or (T,)
1151
+ :rtype: array
1152
+ :returns: smoothed/residual data
1153
+
1154
+ Parameters
1155
+ ----------
1156
+ data : array
1157
+ Data array of shape (time, variables).
1158
+ smooth_width : float
1159
+ Window width of smoothing, 2*sigma for a gaussian.
1160
+ kernel : str, optional (default: 'gaussian')
1161
+ Smoothing kernel, 'gaussian' or 'heaviside' for a running mean.
1162
+ mask : bool array, optional (default: None)
1163
+ Data mask where True labels masked samples.
1164
+ residuals : bool, optional (default: False)
1165
+ True if residuals should be returned instead of smoothed data.
1166
+ verbosity : int, optional (default: 0)
1167
+ Level of verbosity.
1168
+
1169
+ Returns
1170
+ -------
1171
+ data : array-like
1172
+ Smoothed/residual data.
1173
+ """
1174
+
1175
+ if verbosity > 0:
1176
+ print("%s %s smoothing with " % ({True: "Take residuals of a ",
1177
+ False: ""}[residuals], kernel) +
1178
+ "window width %.2f (=2*sigma for a gaussian!)" % (smooth_width))
1179
+
1180
+ totaltime = len(data)
1181
+ if kernel == 'gaussian':
1182
+ window = np.exp(-(np.arange(totaltime).reshape((1, totaltime)) -
1183
+ np.arange(totaltime).reshape((totaltime, 1))
1184
+ ) ** 2 / ((2. * smooth_width / 2.) ** 2))
1185
+ elif kernel == 'heaviside':
1186
+ import scipy.linalg
1187
+ wtmp = np.zeros(totaltime)
1188
+ wtmp[:int(np.ceil(smooth_width / 2.))] = 1
1189
+ window = scipy.linalg.toeplitz(wtmp)
1190
+
1191
+ if mask is None:
1192
+ if np.ndim(data) == 1:
1193
+ smoothed_data = (data * window).sum(axis=1) / window.sum(axis=1)
1194
+ else:
1195
+ smoothed_data = np.zeros(data.shape)
1196
+ for i in range(data.shape[1]):
1197
+ smoothed_data[:, i] = (
1198
+ data[:, i] * window).sum(axis=1) / window.sum(axis=1)
1199
+ else:
1200
+ if np.ndim(data) == 1:
1201
+ smoothed_data = ((data * window * (mask==False)).sum(axis=1) /
1202
+ (window * (mask==False)).sum(axis=1))
1203
+ else:
1204
+ smoothed_data = np.zeros(data.shape)
1205
+ for i in range(data.shape[1]):
1206
+ smoothed_data[:, i] = ((
1207
+ data[:, i] * window * (mask==False)[:, i]).sum(axis=1) /
1208
+ (window * (mask==False)[:, i]).sum(axis=1))
1209
+
1210
+ if residuals:
1211
+ return data - smoothed_data
1212
+ else:
1213
+ return smoothed_data
1214
+
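+ # Illustrative usage sketch (not part of the original source):
+ #
+ # trend = smooth(data, smooth_width=20., kernel='gaussian')
+ # detrended = smooth(data, smooth_width=20., kernel='gaussian', residuals=True)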
1215
+
1216
+ def weighted_avg_and_std(values, axis, weights):
1217
+ """Returns the weighted average and standard deviation.
1218
+
1219
+ Parameters
1220
+ ----------
1221
+ values : array
1222
+ Data array of shape (time, variables).
1223
+ axis : int
1224
+ Axis to average/std about
1225
+ weights : array
1226
+ Weight array of shape (time, variables).
1227
+
1228
+ Returns
1229
+ -------
1230
+ (average, std) : tuple of arrays
1231
+ Tuple of weighted average and standard deviation along axis.
1232
+ """
1233
+
1234
+ values[np.isnan(values)] = 0.
1235
+ average = np.ma.average(values, axis=axis, weights=weights)
1236
+
1237
+ variance = np.sum(weights * (values - np.expand_dims(average, axis)
1238
+ ) ** 2, axis=axis) / weights.sum(axis=axis)
1239
+
1240
+ return (average, np.sqrt(variance))
1241
+
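+ # Worked example (illustrative, not part of the original source): for
+ # values = np.array([[1., 3.]]) and weights = np.array([[1., 3.]]),
+ # weighted_avg_and_std(values, axis=1, weights=weights) gives an average of
+ # (1*1 + 3*3)/(1 + 3) = 2.5 and a std of
+ # sqrt((1*(1 - 2.5)**2 + 3*(3 - 2.5)**2)/4) = sqrt(0.75) ≈ 0.87.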
1242
+
1243
+ def time_bin_with_mask(data, time_bin_length, mask=None):
1244
+ """Returns time binned data where only about non-masked values is averaged.
1245
+
1246
+ Parameters
1247
+ ----------
1248
+ data : array
1249
+ Data array of shape (time, variables).
1250
+ time_bin_length : int
1251
+ Length of time bin.
1252
+ mask : bool array, optional (default: None)
1253
+ Data mask where True labels masked samples.
1254
+
1255
+ Returns
1256
+ -------
1257
+ (bindata, T) : tuple of array and int
1258
+ Tuple of time-binned data array and new length of array.
1259
+ """
1260
+
1261
+ T = len(data)
1262
+
1263
+ time_bin_length = int(time_bin_length)
1264
+
1265
+ if mask is None:
1266
+ sample_selector = np.ones(data.shape)
1267
+ else:
1268
+ # Invert mask
1269
+ sample_selector = (mask == False)
1270
+
1271
+ if np.ndim(data) == 1:
1272
+ data.shape = (T, 1)
1273
+ if mask is not None:
1274
+ mask.shape = (T, 1)
1275
+ else:
1276
+ sample_selector = np.ones(data.shape)
1277
+
1278
+ bindata = np.zeros(
1279
+ (T // time_bin_length,) + data.shape[1:], dtype="float32")
1280
+ for index, i in enumerate(range(0, T - time_bin_length + 1,
1281
+ time_bin_length)):
1282
+ # print weighted_avg_and_std(fulldata[i:i+time_bin_length], axis=0,
1283
+ # weights=sample_selector[i:i+time_bin_length])[0]
1284
+ bindata[index] = weighted_avg_and_std(data[i:i + time_bin_length],
1285
+ axis=0,
1286
+ weights=sample_selector[i:i +
1287
+ time_bin_length])[0]
1288
+
1289
+ T, grid_size = bindata.shape
1290
+
1291
+ return (bindata.squeeze(), T)
1292
+
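+ # Illustrative usage sketch (not part of the original source):
+ #
+ # data = np.arange(12.).reshape(6, 2)
+ # bindata, T_new = time_bin_with_mask(data, time_bin_length=3)
+ # # consecutive blocks of 3 time steps are averaged: T_new == 2 and
+ # # bindata[0] == [2., 3.], the per-variable mean of data[0:3]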
1293
+ def trafo2normal(data, mask=None, thres=0.001):
1294
+ """Transforms input data to standard normal marginals.
1295
+
1296
+ Assumes data.shape = (T, dim)
1297
+
1298
+ Parameters
1299
+ ----------
1300
+ data : array
1301
+ Data array of shape (time, variables).
1302
+ thres : float
1303
+ Set outer points in CDF to this value.
1304
+ mask : bool array, optional (default: None)
1305
+ Data mask where True labels masked samples.
1306
+
1307
+ Returns
1308
+ -------
1309
+ normal_data : array-like
1310
+ data with standard normal marginals.
1311
+ """
1312
+
1313
+ def trafo(xi):
1314
+ xisorted = np.sort(xi)
1315
+ yi = np.linspace(1. / len(xi), 1, len(xi))
1316
+ return np.interp(xi, xisorted, yi)
1317
+
1318
+ normal_data = np.copy(data)
1319
+
1320
+ if np.ndim(data) == 1:
1321
+ if mask is None:
1322
+ nonmasked = np.where(np.isnan(data) == False)[0]
1323
+ else:
1324
+ nonmasked = np.where((mask==0)*(np.isnan(data) == False))
1325
+
1326
+ u = trafo(data[nonmasked])
1327
+ u[u==0.] = thres
1328
+ u[u==1.] = 1. - thres
1329
+ normal_data[nonmasked] = stats.norm.ppf(u)
1330
+ else:
1331
+ for i in range(data.shape[1]):
1332
+ if mask is None:
1333
+ nonmasked = np.where(np.isnan(data[:,i]) == False)[0]
1334
+ else:
1335
+ nonmasked = np.where((mask[:, i]==0)*(np.isnan(data[:, i]) == False))
1336
+ # nonmasked = np.where(mask[:, i]==0)
1337
+ # print(data[:, i].shape, nonmasked.shape)
1338
+ uniform = trafo(data[:, i][nonmasked])
1339
+
1340
+ # print(data[-3:, i][nonmasked])
1341
+
1342
+ uniform[uniform==0.] = thres
1343
+ uniform[uniform==1.] = 1. - thres
1344
+ normal_data[:, i][nonmasked] = stats.norm.ppf(uniform)
1345
+
1346
+ return normal_data
1347
+
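+ # Illustrative usage sketch (not part of the original source):
+ #
+ # skewed = np.random.exponential(size=(1000, 2))
+ # gaussianized = trafo2normal(skewed)
+ # # each column of gaussianized now has approximately standard normal marginals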
1348
+ # @jit(nopython=True)
+ def _get_patterns(array, array_mask, patt, patt_mask, weights, dim, step, fac, N, T):
+     """Fill patt, patt_mask and weights with the ordinal pattern index,
+     combined mask value and embedding-vector variance for every time point
+     and variable. Pure-Python helper (the numba decoration is disabled above)."""
+     v = np.zeros(dim, dtype='float')
+
+     start = step * (dim - 1)
+     for n in range(0, N):
+         for t in range(start, T):
+             mask = 1
+             ave = 0.
+             for k in range(0, dim):
+                 tau = k * step
+                 v[k] = array[t - tau, n]
+                 ave += v[k]
+                 mask *= array_mask[t - tau, n]
+             ave /= dim
+             var = 0.
+             for k in range(0, dim):
+                 var += (v[k] - ave) ** 2
+             var /= dim
+             weights[t - start, n] = var
+             if (v[0] < v[1]):
+                 p = 1
+             else:
+                 p = 0
+             for i in range(2, dim):
+                 for j in range(0, i):
+                     if (v[j] < v[i]):
+                         p += fac[i]
+             patt[t - start, n] = p
+             patt_mask[t - start, n] = mask
+
+     return patt, patt_mask, weights
+
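+ # Worked example (added here for exposition, not part of the original
+ # module): reproduces the pattern-index encoding used in _get_patterns for a
+ # single embedding vector. The helper name `_example_pattern_index` is
+ # hypothetical.
+ def _example_pattern_index():
+     v = [0.2, -0.6, 1.2]     # example vector from the docstring below
+     dim = 3
+     fac = [math.factorial(i) for i in range(dim)]   # [1, 1, 2]
+     p = 1 if v[0] < v[1] else 0        # 0.2 < -0.6 is False, so p = 0
+     for i in range(2, dim):
+         for j in range(i):
+             if v[j] < v[i]:            # both 0.2 and -0.6 are < 1.2
+                 p += fac[i]            # p += 2, twice
+     assert p == 4
+     return p
+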
+ def ordinal_patt_array(array, array_mask=None, dim=2, step=1,
+                        weights=False, seed=None, verbosity=0):
+     """Returns symbolified array of ordinal patterns.
+
+     Each data vector (X_t, ..., X_t+(dim-1)*step) is converted to its rank
+     vector. E.g., (0.2, -.6, 1.2) --> (1,0,2) which is then assigned to a
+     unique integer (see Article). There are factorial(dim) possible rank
+     vectors.
+
+     Note that the symb_array is step*(dim-1) shorter than the original array!
+
+     Reference: B. Pompe and J. Runge (2011). Momentary information transfer as
+     a coupling measure of time series. Phys. Rev. E, 83(5), 1-12.
+     doi:10.1103/PhysRevE.83.051122
+
+     Parameters
+     ----------
+     array : array-like
+         Data array of shape (time, variables).
+     array_mask : bool array
+         Data mask where True labels masked samples.
+     dim : int, optional (default: 2)
+         Pattern dimension.
+     step : int, optional (default: 1)
+         Delay of pattern embedding vector.
+     weights : bool, optional (default: False)
+         Whether to return array of variances of embedding vectors as weights.
+     seed : int
+         For adding noise to break ties.
+     verbosity : int, optional (default: 0)
+         Level of verbosity.
+
+     Returns
+     -------
+     patt, patt_mask, patt_time [, weights_array] : tuple
+         Tuple of converted pattern array, pattern mask, new length, and, if
+         weights is True, the array of embedding-vector variances.
+     """
+     random_state = np.random.default_rng(seed)
+
+     from scipy.special import factorial
+
+     array = array.astype('float64')
+
+     if array_mask is not None:
+         assert array_mask.dtype == 'int32'
+     else:
+         array_mask = np.zeros(array.shape, dtype='int32')
+
+     if np.ndim(array) == 1:
+         T = len(array)
+         array = array.reshape(T, 1)
+         array_mask = array_mask.reshape(T, 1)
+
+     # Add noise to destroy ties...
+     array += (1E-6 * array.std(axis=0)
+               * random_state.random((array.shape[0], array.shape[1])).astype('float64'))
+
+     patt_time = int(array.shape[0] - step * (dim - 1))
+     T, N = array.shape
+
+     if dim <= 1 or patt_time <= 0:
+         raise ValueError("dim must be > 1 and the length of the delay vector "
+                          "smaller than the array length.")
+
+     patt = np.zeros((patt_time, N), dtype='int32')
+     weights_array = np.zeros((patt_time, N), dtype='float64')
+
+     patt_mask = np.zeros((patt_time, N), dtype='int32')
+
+     # Precompute factorials; patterns of dimension larger than 10 are not
+     # supported
+     fac = factorial(np.arange(10)).astype('int32')
+
+     # _get_patterns assumes mask=0 to be a masked value
+     array_mask = (array_mask == False).astype('int32')
+
+     (patt, patt_mask, weights_array) = _get_patterns(array, array_mask, patt,
+                                                      patt_mask, weights_array,
+                                                      dim, step, fac, N, T)
+
+     weights_array = np.asarray(weights_array)
+     patt = np.asarray(patt)
+     # Transform back to mask=1 implying a masked value
+     patt_mask = np.asarray(patt_mask) == False
+
+     if weights:
+         return patt, patt_mask, patt_time, weights_array
+     else:
+         return patt, patt_mask, patt_time
+
+
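+ # Illustrative usage sketch (added here for exposition, not part of the
+ # original module): symbolify a short univariate series with pattern
+ # dimension 2. The helper name `_example_ordinal_patterns` and the toy
+ # series are hypothetical.
+ def _example_ordinal_patterns():
+     series = np.array([0., 1., 0.5, 2., 1.5])
+     patt, patt_mask, patt_time = ordinal_patt_array(series, dim=2, step=1,
+                                                     seed=42)
+     # A series of length 5 yields 5 - step*(dim-1) = 4 patterns.
+     assert patt_time == 4 and patt.shape == (4, 1)
+     return patt
+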
+ def quantile_bin_array(data, bins=6):
+     """Returns symbolified array with equal-quantile binning.
+
+     Parameters
+     ----------
+     data : array
+         Data array of shape (time, variables).
+     bins : int, optional (default: 6)
+         Number of bins.
+
+     Returns
+     -------
+     symb_array : array
+         Converted data of integer type.
+     """
+     T, N = data.shape
+
+     # get the bin quantile steps
+     bin_edge = int(np.ceil(T / float(bins)))
+
+     symb_array = np.zeros((T, N), dtype='int32')
+
+     # get the lower edges of the bins for every time series
+     edges = np.sort(data, axis=0)[::bin_edge, :].T
+     bins = edges.shape[1]
+
+     # This gives the symbolic time series
+     symb_array = (data.reshape(T, N, 1) >= edges.reshape(1, N, bins)).sum(
+         axis=2) - 1
+
+     return symb_array.astype('int32')
+
+
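+ # Illustrative usage sketch (added here for exposition, not part of the
+ # original module): bin two variables into 4 roughly equally populated
+ # symbols. The helper name `_example_quantile_bin` and the toy data are
+ # hypothetical.
+ def _example_quantile_bin():
+     rng = np.random.default_rng(0)
+     data = rng.standard_normal((120, 2))
+     symb = quantile_bin_array(data, bins=4)
+     # Each entry is an integer bin index in {0, 1, 2, 3}.
+     assert symb.shape == (120, 2) and symb.min() >= 0 and symb.max() <= 3
+     return symb
+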
+ def var_process(parents_neighbors_coeffs, T=1000, use='inv_inno_cov',
+                 verbosity=0, initial_values=None):
+     """Returns a vector-autoregressive process with correlated innovations.
+
+     Wrapper around var_network with possibly more user-friendly input options.
+
+     DEPRECATED. Will be removed in the future.
+     """
+     print("data generating models are now in toymodels folder: "
+           "from tigramite.toymodels import structural_causal_processes as toys.")
+     return None
+
+ def structural_causal_process(links, T, noises=None,
+                               intervention=None, intervention_type='hard',
+                               seed=None):
+     """Returns a structural causal process with contemporaneous and lagged
+     dependencies.
+
+     DEPRECATED. Will be removed in the future.
+     """
+     print("data generating models are now in toymodels folder: "
+           "from tigramite.toymodels import structural_causal_processes as toys.")
+     return None
+
+
+ if __name__ == '__main__':
+
+     from tigramite.toymodels.structural_causal_processes import structural_causal_process
+     ## Generate some time series from a structural causal process
+     def lin_f(x): return x
+     def nonlin_f(x): return (x + 5. * x**2 * np.exp(-x**2 / 20.))
+
+     links = {0: [((0, -1), 0.9, lin_f)],
+              1: [((1, -1), 0.8, lin_f), ((0, -1), 0.3, nonlin_f)],
+              2: [((2, -1), 0.7, lin_f), ((1, 0), -0.2, lin_f)],
+              }
+
+     random_state_1 = np.random.default_rng(seed=1)
+     random_state_2 = np.random.default_rng(seed=2)
+     random_state_3 = np.random.default_rng(seed=3)
+
+     noises = [random_state_1.standard_normal,
+               random_state_2.standard_normal,
+               random_state_3.standard_normal]
+
+     ens = 3
+     data_ens = {}
+     for i in range(ens):
+         data, nonstat = structural_causal_process(links,
+                                                   T=100, noises=noises)
+         # Mark one sample as missing (cf. missing_flag below)
+         data[10, 1] = 999.
+         data_ens[i] = data
+
+     frame = DataFrame(data_ens, missing_flag=999.,
+                       analysis_mode='multiple')
+
+     print(frame.T)
+
+     # X=[(0, 0)]
+     # Y=[(0, 0)]
+     # Z=[(0, -3)]
+     # tau_max=5
+     # frame.construct_array(X, Y, Z, tau_max,
+     #                       extraZ=None,
+     #                       mask=None,
+     #                       mask_type=None,
+     #                       return_cleaned_xyz=False,
+     #                       do_checks=True,
+     #                       cut_off='2xtau_max',
+     #                       verbosity=4)
+
+     # print(ordinal_patt_array(data, array_mask=None, dim=2, step=1,
+     #                          weights=False, verbosity=0)[0])