tigramite-fast 5.2.10.1 (tigramite_fast-5.2.10.1-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tigramite/__init__.py +0 -0
- tigramite/causal_effects.py +1525 -0
- tigramite/causal_mediation.py +1592 -0
- tigramite/data_processing.py +1574 -0
- tigramite/graphs.py +1509 -0
- tigramite/independence_tests/LBFGS.py +1114 -0
- tigramite/independence_tests/__init__.py +0 -0
- tigramite/independence_tests/cmiknn.py +661 -0
- tigramite/independence_tests/cmiknn_mixed.py +1397 -0
- tigramite/independence_tests/cmisymb.py +286 -0
- tigramite/independence_tests/gpdc.py +664 -0
- tigramite/independence_tests/gpdc_torch.py +820 -0
- tigramite/independence_tests/gsquared.py +190 -0
- tigramite/independence_tests/independence_tests_base.py +1310 -0
- tigramite/independence_tests/oracle_conditional_independence.py +1582 -0
- tigramite/independence_tests/pairwise_CI.py +383 -0
- tigramite/independence_tests/parcorr.py +369 -0
- tigramite/independence_tests/parcorr_mult.py +485 -0
- tigramite/independence_tests/parcorr_wls.py +451 -0
- tigramite/independence_tests/regressionCI.py +403 -0
- tigramite/independence_tests/robust_parcorr.py +403 -0
- tigramite/jpcmciplus.py +966 -0
- tigramite/lpcmci.py +3649 -0
- tigramite/models.py +2257 -0
- tigramite/pcmci.py +3935 -0
- tigramite/pcmci_base.py +1218 -0
- tigramite/plotting.py +4735 -0
- tigramite/rpcmci.py +467 -0
- tigramite/toymodels/__init__.py +0 -0
- tigramite/toymodels/context_model.py +261 -0
- tigramite/toymodels/non_additive.py +1231 -0
- tigramite/toymodels/structural_causal_processes.py +1201 -0
- tigramite/toymodels/surrogate_generator.py +319 -0
- tigramite_fast-5.2.10.1.dist-info/METADATA +182 -0
- tigramite_fast-5.2.10.1.dist-info/RECORD +38 -0
- tigramite_fast-5.2.10.1.dist-info/WHEEL +5 -0
- tigramite_fast-5.2.10.1.dist-info/licenses/license.txt +621 -0
- tigramite_fast-5.2.10.1.dist-info/top_level.txt +1 -0
tigramite/pcmci_base.py
ADDED
@@ -0,0 +1,1218 @@
"""Tigramite causal discovery for time series."""

# Author: Jakob Runge <jakob@jakob-runge.com>
#
# License: GNU General Public License v3.0

from __future__ import print_function
import warnings
import itertools
from collections import defaultdict
from copy import deepcopy
import numpy as np
import scipy.stats
import math
from joblib import Parallel, delayed


class PCMCIbase():
    r"""PCMCI base class.

    Parameters
    ----------
    dataframe : data object
        This is the Tigramite dataframe object. Among others, it has the
        attribute dataframe.values yielding a numpy array of shape
        (observations T, variables N) and optionally a mask of the same
        shape.
    cond_ind_test : conditional independence test object
        This can be ParCorr or other classes from
        ``tigramite.independence_tests`` or an external test passed as a
        callable. This test can be based on the class
        tigramite.independence_tests.CondIndTest.
    verbosity : int, optional (default: 0)
        Verbose levels 0, 1, ...

    Attributes
    ----------
    all_parents : dictionary
        Dictionary of form {0: [(0, -1), (3, -2), ...], 1: [], ...}
        containing the conditioning parents estimated with the PC algorithm.
    val_min : dictionary
        Dictionary of form val_min[j][(i, -tau)] = float containing the
        minimum test statistic value for each link estimated in the PC
        algorithm.
    pval_max : dictionary
        Dictionary of form pval_max[j][(i, -tau)] = float containing the
        maximum p-value for each link estimated in the PC algorithm.
    iterations : dictionary
        Dictionary containing further information on algorithm steps.
    N : int
        Number of variables.
    T : dict
        Time series sample length of dataset(s).
    """

    def __init__(self, dataframe,
                 cond_ind_test,
                 verbosity=0):
        # Set the data for this iteration of the algorithm
        self.dataframe = dataframe
        # Set the conditional independence test to be used
        self.cond_ind_test = deepcopy(cond_ind_test)
        if isinstance(self.cond_ind_test, type):
            raise ValueError("PCMCI requires that cond_ind_test "
                             "is instantiated, e.g. cond_ind_test = "
                             "ParCorr().")
        self.cond_ind_test.set_dataframe(self.dataframe)
        # Set the verbosity for debugging/logging messages
        self.verbosity = verbosity
        # Set the variable names
        self.var_names = self.dataframe.var_names

        # Store the shape of the data in the T and N variables
        self.T = self.dataframe.T
        self.N = self.dataframe.N

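    # Illustrative construction sketch (not part of the original source;
    # assumes tigramite's standard DataFrame and ParCorr classes):
    #
    #     import numpy as np
    #     from tigramite.data_processing import DataFrame
    #     from tigramite.independence_tests.parcorr import ParCorr
    #
    #     data = np.random.randn(500, 3)   # T=500 samples, N=3 variables
    #     df = DataFrame(data, var_names=['X', 'Y', 'Z'])
    #     base = PCMCIbase(dataframe=df, cond_ind_test=ParCorr())
    #     # Passing the class itself, e.g. cond_ind_test=ParCorr, raises
    #     # ValueError because the test must be instantiated.
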
    def _reverse_link(self, link):
        """Reverse a given link, taking care to replace > with < and vice
        versa."""

        if link == "":
            return ""

        if link[2] == ">":
            left_mark = "<"
        else:
            left_mark = link[2]

        if link[0] == "<":
            right_mark = ">"
        else:
            right_mark = link[0]

        return left_mark + link[1] + right_mark

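    # For example, _reverse_link("-->") returns "<--", _reverse_link("-?>")
    # returns "<?-", and the empty link "" is returned unchanged.
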
    def _check_cyclic(self, link_dict):
        """Return True if the link_dict has a contemporaneous cycle."""

        path = set()
        visited = set()

        def visit(vertex):
            if vertex in visited:
                return False
            visited.add(vertex)
            path.add(vertex)
            for itaui in link_dict.get(vertex, ()):
                i, taui = itaui
                link_type = link_dict[vertex][itaui]
                if taui == 0 and link_type in ['-->', '-?>']:
                    if i in path or visit(i):
                        return True
            path.remove(vertex)
            return False

        return any(visit(v) for v in link_dict)

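    # Illustrative sketch (hypothetical inputs): a contemporaneous 2-cycle
    # is detected, while a purely lagged feedback loop is not.
    #
    #     cyclic = {0: {(1, 0): '-->'}, 1: {(0, 0): '-->'}}
    #     lagged = {0: {(1, -1): '-->'}, 1: {(0, -1): '-->'}}
    #     self._check_cyclic(cyclic)  # True
    #     self._check_cyclic(lagged)  # False
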
    def _set_link_assumptions(self, link_assumptions, tau_min, tau_max,
                              remove_contemp=False):
        """Helper function to set and check the link_assumptions argument.

        Parameters
        ----------
        link_assumptions : dict
            Dictionary of form {j: {(i, -tau): link_type, ...}, ...}
            specifying assumptions about links. This initializes the graph
            with entries graph[i,j,tau] = link_type. For example,
            graph[i,j,0] = '-->' implies that a directed link from i to j
            at lag 0 must exist. Valid link types are 'o-o', '-->', '<--'.
            In addition, the middle mark can be '?' instead of '-'. Then
            '-?>' implies that this link may not exist, but if it exists,
            its orientation is '-->'. Link assumptions need to be
            consistent, i.e., graph[i,j,0] = '-->' requires graph[j,i,0] =
            '<--' and acyclicity must hold. If a link does not appear in
            the dictionary, it is assumed absent. That is, if
            link_assumptions is not None, then all links have to be
            specified or the links are assumed absent.
        tau_min : int
            Minimum time delay to test.
        tau_max : int
            Maximum time delay to test.
        remove_contemp : bool
            Whether contemporaneous links (at lag zero) should be removed.

        Returns
        -------
        link_assumptions : dict
            Cleaned links.
        """
        # Copy and pass into the function. A shallow nested copy suffices:
        # keys are tuples (immutable) and values are strings (immutable).
        if link_assumptions is not None:
            _int_link_assumptions = {j: dict(link_assumptions[j])
                                     for j in link_assumptions}
        else:
            _int_link_assumptions = None
        # Set the default selected links if none are set
        _vars = list(range(self.N))
        _lags = list(range(-(tau_max), -tau_min + 1, 1))
        if _int_link_assumptions is None:
            _int_link_assumptions = {}
            # Set the default as all combinations
            for j in _vars:
                _int_link_assumptions[j] = {}
                for i in _vars:
                    for lag in range(tau_min, tau_max + 1):
                        if not (i == j and lag == 0):
                            if lag == 0:
                                _int_link_assumptions[j][(i, 0)] = 'o?o'
                            else:
                                _int_link_assumptions[j][(i, -lag)] = '-?>'

        else:

            if remove_contemp:
                for j in _int_link_assumptions.keys():
                    _int_link_assumptions[j] = {
                        link: _int_link_assumptions[j][link]
                        for link in _int_link_assumptions[j]
                        if link[1] != 0}

            # Make contemporaneous assumptions consistent and orient lagged
            # links
            for j in _vars:
                for link in _int_link_assumptions[j]:
                    i, tau = link
                    link_type = _int_link_assumptions[j][link]
                    if tau == 0:
                        if (j, 0) in _int_link_assumptions[i]:
                            if _int_link_assumptions[j][link] != \
                                    self._reverse_link(
                                        _int_link_assumptions[i][(j, 0)]):
                                raise ValueError(
                                    "Inconsistent link assumptions for "
                                    "indices %d - %d " % (i, j))
                        else:
                            _int_link_assumptions[i][(j, 0)] = \
                                self._reverse_link(
                                    _int_link_assumptions[j][link])
                    else:
                        # Orient lagged links by time order while leaving
                        # the middle mark
                        new_link_type = '-' + link_type[1] + '>'
                        _int_link_assumptions[j][link] = new_link_type

        # Otherwise, check that our assumptions are sane: the
        # link_assumptions must refer to links that are inside the data
        # range and of valid types
        _key_set = set(_int_link_assumptions.keys())
        valid_entries = _key_set == set(range(self.N))

        valid_types = [
            'o-o',
            'o?o',
            '-->',
            '-?>',
            '<--',
            '<?-',
        ]

        for links in _int_link_assumptions.values():
            if isinstance(links, dict) and len(links) == 0:
                continue
            for var, lag in links:
                if var not in _vars or lag not in _lags:
                    valid_entries = False
                if links[(var, lag)] not in valid_types:
                    valid_entries = False

        if not valid_entries:
            raise ValueError("link_assumptions"
                             " must be dictionary with keys for all [0,...,N-1]"
                             " variables and contain only links from "
                             "these variables in range [tau_min, tau_max] "
                             "and with link types in %s" % str(valid_types))

        # Check for contemporaneous cycles
        if self._check_cyclic(_int_link_assumptions):
            raise ValueError("link_assumptions has contemporaneous cycle(s).")

        return _int_link_assumptions

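    # Illustrative link_assumptions sketch (hypothetical, N=2, tau_max=1):
    # assume a directed contemporaneous link 0 -> 1 and leave the lagged
    # link 0 (t-1) -> 1 undecided; all unlisted links are assumed absent.
    #
    #     link_assumptions = {
    #         0: {},
    #         1: {(0, 0): '-->', (0, -1): '-?>'},
    #     }
    #     cleaned = self._set_link_assumptions(link_assumptions,
    #                                          tau_min=0, tau_max=1)
    #     # cleaned[0][(1, 0)] == '<--' is filled in automatically.
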
    def _dict_to_matrix(self, val_dict, tau_max, n_vars, default=1):
        """Helper function to convert dictionary to matrix format.

        Parameters
        ----------
        val_dict : dict
            Dictionary of form {0: {(0, -1): float, ...}, 1: {...}, ...}.
        tau_max : int
            Maximum lag.
        n_vars : int
            Number of variables.
        default : int
            Default value for entries not part of val_dict.

        Returns
        -------
        matrix : array of shape (N, N, tau_max+1)
            Matrix format of p-values and test statistic values.
        """
        matrix = np.ones((n_vars, n_vars, tau_max + 1))
        matrix *= default

        for j in val_dict.keys():
            for link in val_dict[j].keys():
                k, tau = link
                if tau == 0:
                    matrix[k, j, 0] = matrix[j, k, 0] = val_dict[j][link]
                else:
                    matrix[k, j, abs(tau)] = val_dict[j][link]
        return matrix

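    # Illustrative sketch (hypothetical values): with tau_max=1, n_vars=2,
    #
    #     val_dict = {1: {(0, -1): 0.5, (0, 0): 0.3}}
    #     matrix = self._dict_to_matrix(val_dict, tau_max=1, n_vars=2)
    #     # matrix[0, 1, 1] == 0.5, matrix[0, 1, 0] == matrix[1, 0, 0] == 0.3,
    #     # all other entries equal the default of 1.
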
    def get_corrected_pvalues(self, p_matrix,
                              fdr_method='fdr_bh',
                              exclude_contemporaneous=True,
                              tau_min=0,
                              tau_max=1,
                              link_assumptions=None,
                              ):
        """Returns p-values corrected for multiple testing.

        Currently implemented is the Benjamini-Hochberg False Discovery
        Rate method. Correction is performed either among all links if
        exclude_contemporaneous==False, or only among lagged links.

        Parameters
        ----------
        p_matrix : array-like
            Matrix of p-values. Must be of shape (N, N, tau_max + 1).
        tau_min : int, default: 0
            Minimum time lag. Only used as consistency check of
            link_assumptions.
        tau_max : int, default: 1
            Maximum time lag. Must be larger or equal to tau_min. Only used
            as consistency check of link_assumptions.
        link_assumptions : dict or None
            Dictionary of form {j: {(i, -tau): link_type, ...}, ...}
            specifying assumptions about links. This initializes the graph
            with entries graph[i,j,tau] = link_type. For example,
            graph[i,j,0] = '-->' implies that a directed link from i to j
            at lag 0 must exist. Valid link types are 'o-o', '-->', '<--'.
            In addition, the middle mark can be '?' instead of '-'. Then
            '-?>' implies that this link may not exist, but if it exists,
            its orientation is '-->'. Link assumptions need to be
            consistent, i.e., graph[i,j,0] = '-->' requires graph[j,i,0] =
            '<--' and acyclicity must hold. If a link does not appear in
            the dictionary, it is assumed absent. That is, if
            link_assumptions is not None, then all links have to be
            specified or the links are assumed absent.
        fdr_method : str, optional (default: 'fdr_bh')
            Correction method, currently implemented is the
            Benjamini-Hochberg False Discovery Rate method.
        exclude_contemporaneous : bool, optional (default: True)
            Whether to exclude contemporaneous links from the correction.

        Returns
        -------
        q_matrix : array-like
            Matrix of shape (N, N, tau_max + 1) containing corrected
            p-values.
        """

        def _ecdf(x):
            """No-frills empirical cdf used in FDR correction."""
            nobs = len(x)
            return np.arange(1, nobs + 1) / float(nobs)

        # Get the shape parameters from the p_matrix
        _, N, tau_max_plusone = p_matrix.shape
        # Check the limits on tau
        self._check_tau_limits(tau_min, tau_max)
        # Include only link_assumptions if given
        if link_assumptions is not None:
            # Create a mask for these values
            mask = np.zeros((N, N, tau_max_plusone), dtype='bool')
            _int_link_assumptions = self._set_link_assumptions(
                link_assumptions, tau_min, tau_max)
            for j, links_ in _int_link_assumptions.items():
                for link in links_:
                    i, lag = link
                    if _int_link_assumptions[j][link] not in ["<--", "<?-"]:
                        mask[i, j, abs(lag)] = True
        else:
            # Create a mask for these values
            mask = np.ones((N, N, tau_max_plusone), dtype='bool')
        # Ignore values from lag-zero 'autocorrelation' indices
        mask[range(N), range(N), 0] = False
        # Exclude all contemporaneous values if requested
        if exclude_contemporaneous:
            mask[:, :, 0] = False
        # Create the return value
        q_matrix = np.array(p_matrix)
        # Use the multiple tests function
        if fdr_method is None or fdr_method == 'none':
            pass
        elif fdr_method == 'fdr_bh':
            pvs = p_matrix[mask]
            pvals_sortind = np.argsort(pvs)
            pvals_sorted = np.take(pvs, pvals_sortind)

            ecdffactor = _ecdf(pvals_sorted)

            pvals_corrected_raw = pvals_sorted / ecdffactor
            pvals_corrected = np.minimum.accumulate(
                pvals_corrected_raw[::-1])[::-1]
            del pvals_corrected_raw

            pvals_corrected[pvals_corrected > 1] = 1
            pvals_corrected_ = np.empty_like(pvals_corrected)
            pvals_corrected_[pvals_sortind] = pvals_corrected
            del pvals_corrected

            q_matrix[mask] = pvals_corrected_

        else:
            raise ValueError('Only FDR method fdr_bh implemented')

        # Return the new matrix
        return q_matrix

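    # Illustrative usage sketch (assumes an instance `pcmci` of a PCMCI
    # subclass and `results` from a prior run with the same tau_max):
    #
    #     q_matrix = pcmci.get_corrected_pvalues(results['p_matrix'],
    #                                            fdr_method='fdr_bh',
    #                                            tau_max=tau_max)
    #     # Lagged links with q_matrix < alpha survive the BH correction;
    #     # contemporaneous entries are left uncorrected unless
    #     # exclude_contemporaneous=False.
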
    def _get_adj_time_series(self, graph, include_conflicts=True, sort_by=None):
        """Helper function that returns dictionary of adjacencies from graph.

        Parameters
        ----------
        graph : array of shape [N, N, tau_max+1]
            Resulting causal graph, see description above for interpretation.
        include_conflicts : bool, optional (default: True)
            Whether conflicting links (marked as 2 in graph) should be
            returned.
        sort_by : dict or None, optional (default: None)
            If not None, the adjacencies are sorted by the absolute values
            of the corresponding entries.

        Returns
        -------
        adjt : dictionary
            Adjacency dictionary.
        """
        N, N, tau_max_plusone = graph.shape
        adjt = {}
        if include_conflicts:
            for j in range(N):
                where = np.where(graph[:, j, :] != "")
                adjt[j] = list(zip(*(where[0], -where[1])))
        else:
            for j in range(N):
                where = np.where(np.logical_and.reduce((graph[:, j, :] != "",
                                                        graph[:, j, :] != "x-x",
                                                        graph[:, j, :] != "x?x")))
                adjt[j] = list(zip(*(where[0], -where[1])))

        if sort_by is not None:
            for j in range(N):
                # Get the absolute value for all the test statistics
                abs_values = {k: np.abs(sort_by[j][k]) for k in list(sort_by[j])
                              if k in adjt[j]}
                adjt[j] = sorted(abs_values, key=abs_values.get, reverse=True)

        return adjt

    def _get_adj_time_series_contemp(self, graph, include_conflicts=True,
                                     sort_by=None):
        """Helper function that returns dictionary of contemporaneous
        adjacencies from graph.

        Parameters
        ----------
        graph : array of shape [N, N, tau_max+1]
            Resulting causal graph, see description above for interpretation.
        include_conflicts : bool, optional (default: True)
            Whether conflicting links (marked as 2 in graph) should be
            returned.
        sort_by : dict or None, optional (default: None)
            If not None, the adjacencies are sorted by the absolute values
            of the corresponding entries.

        Returns
        -------
        adjt : dictionary
            Contemporaneous adjacency dictionary.
        """
        N, N, tau_max_plusone = graph.shape
        adjt = self._get_adj_time_series(graph,
                                         include_conflicts=include_conflicts,
                                         sort_by=sort_by)
        for j in range(N):
            adjt[j] = [a for a in adjt[j] if a[1] == 0]

        return adjt

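    # Sketch of the returned format (hypothetical 2-variable string graph):
    # for graph[0, 1, 1] == '-->' and all other entries '', the call
    # self._get_adj_time_series(graph) returns {0: [], 1: [(0, -1)]}.
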
    def _get_simplicial_node(self, circle_cpdag, variable_order):
        """Find simplicial nodes in circle component CPDAG.

        A vertex V is simplicial if all vertices adjacent to V are also
        adjacent to each other (form a clique).

        Parameters
        ----------
        circle_cpdag : array of shape (N, N, tau_max+1)
            Circle component of PCMCIplus graph.
        variable_order : list of length N
            Order of variables in which to search for simplicial nodes.

        Returns
        -------
        (j, adj_j) or None
            First found simplicial node and its adjacencies.
        """

        for j in variable_order:
            adj_j = np.where(np.logical_or(circle_cpdag[:, j, 0] == "o-o",
                                           circle_cpdag[:, j, 0] == "o?o"))[0].tolist()

            # Make sure the node has any adjacencies
            all_adjacent = len(adj_j) > 0

            # If it has just one adjacency, it's also simplicial
            if len(adj_j) == 1:
                return (j, adj_j)
            else:
                for (var1, var2) in itertools.combinations(adj_j, 2):
                    if circle_cpdag[var1, var2, 0] == "":
                        all_adjacent = False
                        break

                if all_adjacent:
                    return (j, adj_j)

        return None

    def _get_dag_from_cpdag(self, cpdag_graph, variable_order):
        """Yields one member of the Markov equivalence class of a CPDAG.

        Removes conflicting edges.

        Used in PCMCI to run model selection on the output of PCMCIplus in
        order to, e.g., optimize pc_alpha.

        Based on Zhang 2008, Theorem 2 (simplified for CPDAGs): Let H be
        the graph resulting from the following procedure applied to a
        CPDAG: Consider the circle component of the CPDAG (the subgraph
        consisting of all o-o edges, i.e., only contemporaneous links),
        CPDAG^C, and turn it into a DAG with no unshielded colliders. Then
        H is a member of the Markov equivalence class of the CPDAG.

        We use the approach mentioned in Colombo and Maathuis (2015),
        Lemma 7.6: First note that CPDAG^C is chordal, that is, any cycle
        of length four or more has a chord, which is an edge joining two
        vertices that are not adjacent in the cycle; see the proof of
        Lemma 4.1 of Zhang (2008b). Any chordal graph with more than one
        vertex has two simplicial vertices, that is, vertices V such that
        all vertices adjacent to V are also adjacent to each other. We
        choose such a vertex V1 and orient any edges incident to V1 into
        V1. Since V1 is simplicial, this does not create unshielded
        colliders. We then remove V1 and these edges from the graph. The
        resulting graph is again chordal and therefore again has at least
        two simplicial vertices. Choose such a vertex V2, and orient any
        edges incident to V2 into V2. We continue this procedure until all
        edges are oriented. The resulting ordering is called a perfect
        elimination scheme for CPDAG^C. Then the combined graph with the
        directed edges already contained in the CPDAG is returned.

        Parameters
        ----------
        cpdag_graph : array of shape (N, N, tau_max+1)
            Result of PCMCIplus, a CPDAG.
        variable_order : list of length N
            Order of variables in which to search for simplicial nodes.

        Returns
        -------
        dag : array of shape (N, N, tau_max+1)
            One member of the Markov equivalence class of the CPDAG.
        """

        # TODO: Check whether CPDAG is chordal

        # Initialize resulting MAG
        dag = np.copy(cpdag_graph)

        # Turn circle component CPDAG^C into a DAG with no unshielded
        # colliders.
        circle_cpdag = np.copy(cpdag_graph)
        # All lagged links are directed by time, remove them here
        circle_cpdag[:, :, 1:] = ""
        # Also remove conflicting links
        circle_cpdag[circle_cpdag == "x-x"] = ""
        # Find undirected links, remove directed links
        for i, j, tau in zip(*np.where(circle_cpdag != "")):
            if circle_cpdag[i, j, 0][1] == '?':
                raise ValueError("Invalid middle mark.")
            if circle_cpdag[i, j, 0] == "-->":
                circle_cpdag[i, j, 0] = ""

        # Iterate through simplicial nodes
        simplicial_node = self._get_simplicial_node(circle_cpdag,
                                                    variable_order)
        while simplicial_node is not None:

            # Choose such a vertex V1, orient any edges incident to V1 into
            # V1 in the MAG, and remove V1 and these edges from the circle
            # component PAG
            (j, adj_j) = simplicial_node
            for var in adj_j:
                dag[var, j, 0] = "-->"
                dag[j, var, 0] = "<--"
                circle_cpdag[var, j, 0] = circle_cpdag[j, var, 0] = ""

            # Iterate
            simplicial_node = self._get_simplicial_node(circle_cpdag,
                                                        variable_order)

        return dag

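    # Illustrative sketch (hypothetical CPDAG with one undirected
    # contemporaneous edge between variables 0 and 1):
    #
    #     cpdag = np.zeros((2, 2, 1), dtype='<U3')
    #     cpdag[0, 1, 0] = cpdag[1, 0, 0] = "o-o"
    #     dag = self._get_dag_from_cpdag(cpdag, variable_order=[0, 1])
    #     # Variable 0 is found simplicial first, so the edge is oriented
    #     # into it: dag[1, 0, 0] == "-->" and dag[0, 1, 0] == "<--".
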
    def convert_to_string_graph(self, graph_bool):
        """Converts the 0,1-based graph returned by PCMCI to a string array
        with links '-->'.

        Parameters
        ----------
        graph_bool : array
            0,1-based graph array output by PCMCI.

        Returns
        -------
        graph : array
            Graph as string array with links '-->'.
        """

        graph = np.zeros(graph_bool.shape, dtype='<U3')
        graph[:] = ""
        # Lagged links
        graph[:, :, 1:][graph_bool[:, :, 1:] == 1] = "-->"
        # Unoriented contemporaneous links
        graph[:, :, 0][np.logical_and(graph_bool[:, :, 0] == 1,
                                      graph_bool[:, :, 0].T == 1)] = "o-o"
        # Conflicting contemporaneous links
        graph[:, :, 0][np.logical_and(graph_bool[:, :, 0] == 2,
                                      graph_bool[:, :, 0].T == 2)] = "x-x"
        # Directed contemporaneous links
        for (i, j) in zip(*np.where(
                np.logical_and(graph_bool[:, :, 0] == 1,
                               graph_bool[:, :, 0].T == 0))):
            graph[i, j, 0] = "-->"
            graph[j, i, 0] = "<--"

        return graph

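    # Sketch (hypothetical boolean input): for a 2-variable graph with
    # graph_bool[0, 1, 1] = 1 (lagged) and graph_bool[0, 1, 0] =
    # graph_bool[1, 0, 0] = 1 (contemporaneous, both directions), the
    # output has graph[0, 1, 1] == '-->' and
    # graph[0, 1, 0] == graph[1, 0, 0] == 'o-o'.
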
    def symmetrize_p_and_val_matrix(self, p_matrix, val_matrix,
                                    link_assumptions, conf_matrix=None):
        """Symmetrizes the p_matrix, val_matrix, and conf_matrix based on
        link_assumptions and the larger p-value.

        Parameters
        ----------
        val_matrix : array of shape [N, N, tau_max+1]
            Estimated matrix of test statistic values.
        p_matrix : array of shape [N, N, tau_max+1]
            Estimated matrix of p-values. Set to 1 if val_only=True.
        conf_matrix : array of shape [N, N, tau_max+1, 2]
            Estimated matrix of confidence intervals of test statistic
            values. Only computed if set in cond_ind_test, where also the
            percentiles are set.
        link_assumptions : dict or None
            Dictionary of form {j: {(i, -tau): link_type, ...}, ...}
            specifying assumptions about links. This initializes the graph
            with entries graph[i,j,tau] = link_type. For example,
            graph[i,j,0] = '-->' implies that a directed link from i to j
            at lag 0 must exist. Valid link types are 'o-o', '-->', '<--'.
            In addition, the middle mark can be '?' instead of '-'. Then
            '-?>' implies that this link may not exist, but if it exists,
            its orientation is '-->'. Link assumptions need to be
            consistent, i.e., graph[i,j,0] = '-->' requires graph[j,i,0] =
            '<--' and acyclicity must hold. If a link does not appear in
            the dictionary, it is assumed absent. That is, if
            link_assumptions is not None, then all links have to be
            specified or the links are assumed absent.

        Returns
        -------
        val_matrix : array of shape [N, N, tau_max+1]
            Estimated matrix of test statistic values.
        p_matrix : array of shape [N, N, tau_max+1]
            Estimated matrix of p-values. Set to 1 if val_only=True.
        conf_matrix : array of shape [N, N, tau_max+1, 2]
            Estimated matrix of confidence intervals of test statistic
            values. Only computed if set in cond_ind_test, where also the
            percentiles are set.
        """

        # Symmetrize p_matrix, val_matrix, and conf_matrix
        for i in range(self.N):
            for j in range(self.N):
                if (i, 0) in link_assumptions[j]:
                    # If both links are present in link_assumptions,
                    # symmetrize using the maximum p-value
                    if link_assumptions[j][(i, 0)] in ["o-o", 'o?o']:
                        if (p_matrix[i, j, 0]
                                >= p_matrix[j, i, 0]):
                            p_matrix[j, i, 0] = p_matrix[i, j, 0]
                            val_matrix[j, i, 0] = val_matrix[i, j, 0]
                            if conf_matrix is not None:
                                conf_matrix[j, i, 0] = conf_matrix[i, j, 0]

                    # If only one of the links is present in
                    # link_assumptions, symmetrize using the p-value of the
                    # present link
                    elif link_assumptions[j][(i, 0)] in ["-->", '-?>']:
                        p_matrix[j, i, 0] = p_matrix[i, j, 0]
                        val_matrix[j, i, 0] = val_matrix[i, j, 0]
                        if conf_matrix is not None:
                            conf_matrix[j, i, 0] = conf_matrix[i, j, 0]
                    else:
                        # Links not present in link_assumptions
                        pass

        # Return the values as a dictionary and store in class
        results = {'val_matrix': val_matrix,
                   'p_matrix': p_matrix,
                   'conf_matrix': conf_matrix}
        return results

    def run_sliding_window_of(self, method, method_args,
                              window_step,
                              window_length,
                              conf_lev=0.9,
                              ):
        """Runs chosen method on sliding windows taken from DataFrame.

        The function returns summary_results and all_results (containing
        the individual window results). summary_results contains
        val_matrix_mean and val_matrix_interval, the latter containing the
        confidence bounds for conf_lev. If the method also returns a graph,
        then 'most_frequent_links', containing the most frequent link
        outcome (either 0 or 1 or a specific link type) in each entry of
        graph, as well as 'link_frequency', containing the occurrence
        frequency of the most frequent link outcome, are returned.

        Parameters
        ----------
        method : str
            Chosen method among valid functions in PCMCI.
        method_args : dict
            Arguments passed to method.
        window_step : int
            Time step of windows.
        window_length : int
            Length of sliding window.
        conf_lev : float, optional (default: 0.9)
            Two-sided confidence interval for summary results.

        Returns
        -------
        Dictionary of results for every sliding window.
        """

        valid_methods = ['run_pc_stable',
                         'run_mci',
                         'get_lagged_dependencies',
                         'run_fullci',
                         'run_bivci',
                         'run_pcmci',
                         'run_pcalg',
                         'run_lpcmci',
                         'run_jpcmciplus',
                         # 'run_pcalg_non_timeseries_data',
                         'run_pcmciplus', ]

        if method not in valid_methods:
            raise ValueError("method must be one of %s" % str(valid_methods))

        if self.dataframe.reference_points_is_none is False:
            raise ValueError("Reference points are not accepted in "
                             "sliding windows analysis, align data before "
                             "and use masking and/or missing values.")

        T = self.dataframe.largest_time_step

        if self.cond_ind_test.recycle_residuals:
            # recycle_residuals clashes with sliding windows...
            raise ValueError("cond_ind_test.recycle_residuals must be False.")

        if self.verbosity > 0:
            print("\n##\n## Running sliding window analysis of %s " % method +
                  "\n##\n" +
                  "\nwindow_step = %s \n" % window_step +
                  "\nwindow_length = %s \n" % window_length
                  )

        original_reference_points = deepcopy(self.dataframe.reference_points)

        window_start_points = np.arange(0, T - window_length, window_step)
        n_windows = len(window_start_points)

        if len(window_start_points) == 0:
            raise ValueError("Empty list of windows, check window_length "
                             "and window_step!")

        window_results = {}
        for iw, w in enumerate(window_start_points):
            if self.verbosity > 0:
                print("\n# Window start %s (%d/%d) \n"
                      % (w, iw + 1, len(window_start_points)))
            # Construct reference_points from window
            time_window = np.arange(w, w + window_length, 1)
            # Remove points beyond T
            time_window = time_window[time_window < T]

            self.dataframe.reference_points = time_window
            window_res = deepcopy(getattr(self, method)(**method_args))

            # Aggregate val_matrix and other arrays to new arrays with
            # windows as first dimension. Lists and other objects are
            # stored in a dictionary.
            for key in window_res:
                res_item = window_res[key]
                if iw == 0:
                    if type(res_item) is np.ndarray:
                        window_results[key] = np.empty((n_windows,)
                                                       + res_item.shape,
                                                       dtype=res_item.dtype)
                    else:
                        window_results[key] = {}

                window_results[key][iw] = res_item

        # Reset to the original reference_points for further analyses
        self.dataframe.reference_points = original_reference_points

        # Generate summary results
        summary_results = self.return_summary_results(results=window_results,
                                                      conf_lev=conf_lev)

        return {'summary_results': summary_results,
                'window_results': window_results}

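    # Illustrative usage sketch (assumes an instance `pcmci` of a PCMCI
    # subclass built on a sufficiently long DataFrame):
    #
    #     res = pcmci.run_sliding_window_of(
    #         method='run_pcmci',
    #         method_args={'tau_max': 2, 'pc_alpha': 0.01},
    #         window_step=50, window_length=200)
    #     res['summary_results']['val_matrix_mean']   # mean over windows
    #     res['window_results']['val_matrix'][0]      # first window
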
    def run_bootstrap_of(self, method, method_args,
                         boot_samples=100,
                         boot_blocklength=1,
                         conf_lev=0.9, aggregation="majority", seed=None):
        """Runs chosen method on bootstrap samples drawn from DataFrame.

        Bootstraps for tau=0 are drawn from [2xtau_max, ..., T] and all
        lagged variables constructed in DataFrame.construct_array are
        consistently shifted with respect to this bootstrap sample to
        ensure that lagged relations in the bootstrap sample are preserved.

        The function returns summary_results and all_results (containing
        the individual bootstrap results). summary_results contains
        val_matrix_mean and val_matrix_interval, the latter containing the
        confidence bounds for conf_lev. If the method also returns a graph,
        then 'most_frequent_links', containing the most frequent link
        outcome (specific link type) in each entry of graph, as well as
        'link_frequency', containing the occurrence frequency of the most
        frequent link outcome, are returned. Two aggregation methods are
        available for 'most_frequent_links'. By default, "majority"
        provides the most frequent link outcome. Alternatively,
        "no_edge_majority" provides an alternative aggregation strategy. As
        explained in Debeire et al. (2024), in the first step of this
        alternative approach, the orientation of edges is ignored, and the
        focus is only on determining the adjacency of each pair of
        vertices. This is done through majority voting between no edge and
        all other edge types. In the second step, the adjacencies
        identified in the first step are oriented based on majority voting.
        This alternative approach ensures that no edge can only be voted on
        if it appears in more than half of the bootstrap ensemble of
        graphs.

        Assumes that method uses the cond_ind_test.run_test() function with
        cut_off='2xtau_max'.

        Utilizes parallelization via joblib.

        Parameters
        ----------
        method : str
            Chosen method among valid functions in PCMCI.
        method_args : dict
            Arguments passed to method.
        boot_samples : int
            Number of bootstrap samples to draw.
        boot_blocklength : int, optional (default: 1)
            Block length for block-bootstrap.
        conf_lev : float, optional (default: 0.9)
            Two-sided confidence interval for summary results.
        seed : int, optional (default: None)
            Seed for SeedSequence/default_rng.
        aggregation : str, optional (default: "majority")
            Chosen aggregation strategy: "majority" or "no_edge_majority".

        Returns
        -------
        Dictionary of summary results and results for every bootstrap
        sample.
        """

        valid_methods = ['run_pc_stable',
                         'run_mci',
                         'get_lagged_dependencies',
                         'run_fullci',
                         'run_bivci',
                         'run_pcmci',
                         'run_pcalg',
                         'run_pcalg_non_timeseries_data',
                         'run_pcmciplus',
                         'run_lpcmci',
                         'run_jpcmciplus',
                         ]
        if method not in valid_methods:
            raise ValueError("method must be one of %s" % str(valid_methods))

        T = self.dataframe.largest_time_step
        seed_sequence = np.random.SeedSequence(seed)

        # Extract tau_max to construct bootstrap draws
        if 'tau_max' not in method_args:
            raise ValueError("tau_max must be explicitly set in method_args.")
        tau_max = method_args['tau_max']

        if self.cond_ind_test.recycle_residuals:
            # recycle_residuals clashes with bootstrap draws...
            raise ValueError("cond_ind_test.recycle_residuals must be False.")

        if self.verbosity > 0:
            print("\n##\n## Running Bootstrap of %s " % method +
                  "\n##\n" +
                  "\nboot_samples = %s \n" % boot_samples +
                  "\nboot_blocklength = %s \n" % boot_blocklength
                  )

        # Set bootstrap attribute to be passed to dataframe
        self.dataframe.bootstrap = {}
        self.dataframe.bootstrap['boot_blocklength'] = boot_blocklength

        boot_results = {}

        # Spawn independent child seeds, one per bootstrap draw
        child_seeds = seed_sequence.spawn(boot_samples)

        aggregated_results = Parallel(n_jobs=-1)(
            delayed(self.parallelized_bootstraps)(
                method, method_args, boot_seed=child_seeds[b])
            for b in range(boot_samples))

        for b in range(boot_samples):
            # Aggregate val_matrix and other arrays to new arrays with
            # boot_samples as first dimension. Lists and other objects are
            # stored in a dictionary.
            boot_res = aggregated_results[b]
            for key in boot_res:
                res_item = boot_res[key]
                if type(res_item) is np.ndarray:
                    if b == 0:
                        boot_results[key] = np.empty((boot_samples,)
                                                     + res_item.shape,
                                                     dtype=res_item.dtype)
                    boot_results[key][b] = res_item
                else:
                    if b == 0:
                        boot_results[key] = {}
                    boot_results[key][b] = res_item

        # Generate summary results
        summary_results = self.return_summary_results(results=boot_results,
                                                      conf_lev=conf_lev,
                                                      aggregation=aggregation)

        # Reset bootstrap to None
        self.dataframe.bootstrap = None

        return {'summary_results': summary_results,
                'boot_results': boot_results}

    def parallelized_bootstraps(self, method, method_args, boot_seed):
        # Pass the seed for this bootstrap draw and set it in the dataframe,
        # which will generate a draw with replacement
        boot_random_state = np.random.default_rng(boot_seed)
        self.dataframe.bootstrap['random_state'] = boot_random_state
        boot_res = getattr(self, method)(**method_args)
        return boot_res

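    # Illustrative usage sketch (assumes an instance `pcmci` of a
    # PCMCIplus-capable subclass; tau_max must appear in method_args):
    #
    #     boot = pcmci.run_bootstrap_of(
    #         method='run_pcmciplus',
    #         method_args={'tau_max': 2, 'pc_alpha': 0.01},
    #         boot_samples=200, boot_blocklength=3, seed=42)
    #     boot['summary_results']['most_frequent_links']  # aggregated graph
    #     boot['summary_results']['link_frequency']       # bootstrap support
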
    @staticmethod
    def return_summary_results(results, conf_lev=0.9, aggregation="majority"):
        """Return summary results for causal graphs.

        The function returns summary_results of an array of PCMCI(+)
        results. summary_results contains val_matrix_mean and
        val_matrix_interval, the latter containing the confidence bounds
        for conf_lev. If the method also returns a graph, then
        'most_frequent_links', containing the most frequent link outcome
        (either 0 or 1 or a specific link type) in each entry of graph, as
        well as 'link_frequency', containing the occurrence frequency of
        the most frequent link outcome, are returned. Two aggregation
        methods are available for 'most_frequent_links'. By default,
        "majority" provides the most frequent link outcome. Alternatively,
        "no_edge_majority" provides an alternative aggregation strategy. As
        explained in Debeire et al. (2024), in the first step of this
        alternative approach, the orientation of edges is ignored, and the
        focus is only on determining the adjacency of each pair of
        vertices. This is done through majority voting between no edge and
        all other edge types. In the second step, the adjacencies
        identified in the first step are oriented based on majority voting.
        This alternative approach ensures that no edge can only be voted on
        if it appears in more than half of the bootstrap ensemble of
        graphs.

        Parameters
        ----------
        results : dict
            Results dictionary where the numpy arrays graph and val_matrix
            are of shape (n_results, N, N, tau_max + 1).
        conf_lev : float, optional (default: 0.9)
            Two-sided confidence interval for summary results.
        aggregation : str, optional (default: "majority")
            Chosen aggregation strategy: "majority" or "no_edge_majority".

        Returns
        -------
        Dictionary of summary results.
        """

        valid_aggregations = {"majority", "no_edge_majority"}
        if aggregation not in valid_aggregations:
            raise ValueError(f"Invalid aggregation mode: {aggregation}. "
                             f"Expected one of {valid_aggregations}")

        # Generate summary results
        summary_results = {}

        if 'graph' in results:
            n_results, N, N, tau_max_plusone = results['graph'].shape
            tau_max = tau_max_plusone - 1
            summary_results['most_frequent_links'] = np.zeros(
                (N, N, tau_max_plusone), dtype=results['graph'][0].dtype)
            summary_results['link_frequency'] = np.zeros(
                (N, N, tau_max_plusone), dtype='float')

            # Preferred order in case of ties, in the spirit of keeping the
            # least assertive and most cautious claims. In case of ties
            # between other link types, a conflicting link "x-x" is
            # assigned.
            preferred_order = [
                "",     # No link (most conservative)
                "x-x",  # Conflict (used to break <--> vs --> vs <-- ties)
                "o-o",  # Undirected link (lag 0 only)
                # The remaining types ("<-o", "o->", "<->", "-->", "<--")
                # are resolved as conflicts.
            ]

            for (i, j) in itertools.product(range(N), range(N)):
                for abstau in range(0, tau_max + 1):
                    links, counts = np.unique(results['graph'][:, i, j, abstau],
                                              return_counts=True)
                    list_of_most_freq = links[counts == counts.max()]
                    if aggregation == "majority":
                        if len(list_of_most_freq) == 1:
                            choice = list_of_most_freq[0]
                        else:
                            ordered_list = [link for link in preferred_order
                                            if link in list_of_most_freq]
                            if len(ordered_list) == 0:
                                choice = "x-x"
                            else:
                                choice = ordered_list[0]
                        summary_results['most_frequent_links'][i, j, abstau] = choice
                        summary_results['link_frequency'][i, j, abstau] = \
                            counts[counts == counts.max()].sum() / float(n_results)

                    elif aggregation == "no_edge_majority":
                        # Handle the case where there is no "" in links
                        if counts[links == ""].size == 0:
                            freq_of_no_edge = 0
                        else:
                            # Make a scalar count (counts[...] returns a
                            # 1-element array)
                            freq_of_no_edge = int(counts[links == ""].sum())

                        freq_of_adjacency = n_results - freq_of_no_edge
                        if freq_of_adjacency > freq_of_no_edge:
                            adja_links = np.delete(links, np.where(links == ""))
                            adja_counts = np.delete(counts, np.where(links == ""))
                            list_of_most_freq_adja = \
                                adja_links[adja_counts == adja_counts.max()]
                            if len(list_of_most_freq_adja) == 1:
                                choice = list_of_most_freq_adja[0]
                            else:
                                ordered_list = [link for link in preferred_order
                                                if link in list_of_most_freq_adja]
                                if len(ordered_list) == 0:
                                    choice = "x-x"
                                else:
                                    choice = ordered_list[0]
                            summary_results['most_frequent_links'][i, j, abstau] = choice
                            summary_results['link_frequency'][i, j, abstau] = \
                                adja_counts[adja_counts == adja_counts.max()].sum() / float(n_results)
                        else:
                            choice = ""
                            summary_results['most_frequent_links'][i, j, abstau] = choice
                            summary_results['link_frequency'][i, j, abstau] = \
                                freq_of_no_edge / float(n_results)

        # Confidence intervals for val_matrix; interval is two-sided
        c_int = (1. - (1. - conf_lev) / 2.)
        summary_results['val_matrix_mean'] = np.mean(
            results['val_matrix'], axis=0)

        summary_results['val_matrix_interval'] = np.stack(np.percentile(
            results['val_matrix'], axis=0,
            q=[100 * (1. - c_int), 100 * c_int]), axis=3)
        return summary_results

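    # Note that return_summary_results is a staticmethod and can also be
    # applied directly to stacked results from several independent runs, as
    # long as results['graph'] and results['val_matrix'] carry the run
    # dimension first, i.e. have shape (n_results, N, N, tau_max + 1).
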
    @staticmethod
    def graph_to_dict(graph):
        """Helper function to convert graph to dictionary of links.

        Parameters
        ----------
        graph : array of shape (N, N, tau_max+1)
            Matrix format of graph in string format.

        Returns
        -------
        links : dict
            Dictionary of form {0: {(0, -1): 'o-o', ...}, 1: {...}, ...}.
        """
        N = graph.shape[0]

        links = dict([(j, {}) for j in range(N)])

        for (i, j, tau) in zip(*np.where(graph != '')):
            links[j][(i, -tau)] = graph[i, j, tau]

        return links

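    # Round-trip sketch (hypothetical): graph_to_dict inverts
    # _dict_to_graph for string graphs,
    #
    #     links = PCMCIbase.graph_to_dict(graph)
    #     graph_again = self._dict_to_graph(links,
    #                                       tau_max=graph.shape[2] - 1)
    #     # np.array_equal(graph, graph_again) should hold.
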
    def _dict_to_graph(self, links, tau_max=None):
        """Helper function to convert dictionary of links to graph.

        Parameters
        ----------
        links : dict
            Dictionary of form {0: {(0, -1): 'o-o', ...}, 1: {...}, ...}.

        Returns
        -------
        graph : array of shape (N, N, tau_max+1)
            Matrix format of graph in string format.
        """

        N = len(links)

        # Get maximum time lag
        max_lag = 0
        for j in range(N):
            for link in links[j]:
                var, lag = link
                if isinstance(links[j], dict):
                    link_type = links[j][link]
                    if link_type != "":
                        max_lag = max(max_lag, abs(lag))
                else:
                    max_lag = max(max_lag, abs(lag))

        if tau_max is None:
            tau_max = max_lag
        else:
            if tau_max < max_lag:
                raise ValueError("maxlag(links) > tau_max")

        graph = np.zeros((N, N, tau_max + 1), dtype='<U3')
        graph[:] = ""
        for j in range(N):
            for link in links[j]:
                i, tau = link
                if isinstance(links[j], dict):
                    link_type = links[j][link]
                    graph[i, j, abs(tau)] = link_type
                else:
                    graph[i, j, abs(tau)] = '-->'

        return graph

    @staticmethod
    def get_graph_from_dict(links, tau_max=None):
        """Helper function to convert dictionary of links to graph array
        format.

        Parameters
        ----------
        links : dict
            Dictionary of form {0: [((0, -1), coeff, func), ...], 1: [...],
            ...}. Also the format {0: [(0, -1), ...], 1: [...], ...} is
            allowed.
        tau_max : int or None
            Maximum lag. If None, the maximum lag in links is used.

        Returns
        -------
        graph : array of shape (N, N, tau_max+1)
            Matrix format of graph with '-->' for true links and '' else.
        """

        def _get_minmax_lag(links):
            """Helper function to retrieve tau_min and tau_max from links."""

            N = len(links)

            # Get maximum time lag
            min_lag = np.inf
            max_lag = 0
            for j in range(N):
                for link_props in links[j]:
                    if len(link_props) > 2:
                        var, lag = link_props[0]
                        coeff = link_props[1]
                        if coeff != 0.:
                            min_lag = min(min_lag, abs(lag))
                            max_lag = max(max_lag, abs(lag))
                    else:
                        var, lag = link_props
                        min_lag = min(min_lag, abs(lag))
                        max_lag = max(max_lag, abs(lag))

            return min_lag, max_lag

        N = len(links)

        # Get maximum time lag
        min_lag, max_lag = _get_minmax_lag(links)

        # Set maximum lag
        if tau_max is None:
            tau_max = max_lag
        else:
            if max_lag > tau_max:
                raise ValueError("tau_max is smaller than maximum lag = %d "
                                 "found in links, use tau_max=None or larger "
                                 "value" % max_lag)

        graph = np.zeros((N, N, tau_max + 1), dtype='<U3')
        for j in links.keys():
            for link_props in links[j]:
                if len(link_props) > 2:
                    var, lag = link_props[0]
                    coeff = link_props[1]
                    if coeff != 0.:
                        graph[var, j, abs(lag)] = "-->"
                        if lag == 0:
                            graph[j, var, 0] = "<--"
                else:
                    var, lag = link_props
                    graph[var, j, abs(lag)] = "-->"
                    if lag == 0:
                        graph[j, var, 0] = "<--"

        return graph

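    # Illustrative sketch (hypothetical structural-causal-model links, in
    # the format produced by tigramite's toymodels):
    #
    #     links = {0: [((0, -1), 0.8, None)],
    #              1: [((1, -1), 0.7, None), ((0, -1), 0.5, None)]}
    #     graph = PCMCIbase.get_graph_from_dict(links, tau_max=2)
    #     # graph[0, 0, 1] == graph[1, 1, 1] == graph[0, 1, 1] == '-->',
    #     # all other entries are ''.
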
    @staticmethod
    def build_link_assumptions(link_assumptions_absent_link_means_no_knowledge,
                               n_component_time_series,
                               tau_max,
                               tau_min=0):
        """Build a complete link_assumptions dictionary from partial
        knowledge: start from fully agnostic defaults ('o?o' at lag 0,
        'o?>' for lagged links) and overwrite them with the given entries,
        where an empty link type '' removes the link."""

        out = {j: {(i, -tau_i): ("o?>" if tau_i > 0 else "o?o")
                   for i in range(n_component_time_series)
                   for tau_i in range(tau_min, tau_max + 1)
                   if (tau_i > 0 or i != j)}
               for j in range(n_component_time_series)}

        for j, links_j in link_assumptions_absent_link_means_no_knowledge.items():
            for (i, lag_i), link_ij in links_j.items():
                if link_ij == "":
                    del out[j][(i, lag_i)]
                else:
                    out[j][(i, lag_i)] = link_ij
        return out

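    # Illustrative sketch (hypothetical): start from partial knowledge
    # where only the absence of the link (0, -1) -> 1 is asserted; all
    # remaining links are filled in with the agnostic defaults.
    #
    #     partial = {0: {}, 1: {(0, -1): ''}}
    #     full = PCMCIbase.build_link_assumptions(
    #         partial, n_component_time_series=2, tau_max=1)
    #     # (0, -1) is removed from full[1]; full[1][(0, 0)] == 'o?o'.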