GLDF 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- GLDF/__init__.py +2 -0
- GLDF/bridges/__init__.py +0 -0
- GLDF/bridges/causal_learn.py +185 -0
- GLDF/bridges/tigramite.py +143 -0
- GLDF/bridges/tigramite_plotting_modified.py +4764 -0
- GLDF/cit.py +274 -0
- GLDF/data_management.py +588 -0
- GLDF/data_processing.py +754 -0
- GLDF/frontend.py +537 -0
- GLDF/hccd.py +403 -0
- GLDF/hyperparams.py +205 -0
- GLDF/independence_atoms.py +78 -0
- GLDF/state_space_construction.py +288 -0
- GLDF/tutorials/01_preconfigured_quickstart.ipynb +302 -0
- GLDF/tutorials/02_detailed_configuration.ipynb +394 -0
- GLDF/tutorials/03_custom_patterns.ipynb +447 -0
- gldf-0.9.0.dist-info/METADATA +101 -0
- gldf-0.9.0.dist-info/RECORD +20 -0
- gldf-0.9.0.dist-info/WHEEL +4 -0
- gldf-0.9.0.dist-info/licenses/LICENSE +621 -0
GLDF/cit.py
ADDED
@@ -0,0 +1,274 @@
import numpy as np
from scipy.stats import norm
from numpy.linalg import inv as matrix_inv
from .data_management import CIT_DataPatterned, BlockView
from .data_processing import ITestCI, IProvideAnalyticQuantilesForCIT, IProvideVarianceForCIT
from typing import Literal

class ParCorr(ITestCI, IProvideAnalyticQuantilesForCIT, IProvideVarianceForCIT):
    """Implementation of the partial-correlation independence test and interface for use with mCIT.

    Can run on many blocks at once efficiently.
    """

    def __init__(self, alpha: float=0.05, lower_bound_clip_value: float=0.3, force_regression_global: bool=False,
                 analytic_approximation_for_cutoff: Literal["by effective count", "by large N expansion"]="by effective count"):
        """Constructor for the partial-correlation CIT.

        :param alpha: target for FPR control, defaults to 0.05
        :type alpha: float, optional
        :param lower_bound_clip_value: to avoid numeric instability for strong dependencies, the implementation
            provided for :py:meth:`IProvideAnalyticQuantilesForCIT.cit_quantile_estimate<GLDF.data_processing.IProvideAnalyticQuantilesForCIT.cit_quantile_estimate>`
            clips bounds to a predefined range (this is consistent and does not cost substantial power),
            defaults to 0.3
        :type lower_bound_clip_value: float, optional
        :param force_regression_global: by default (disabled), regressions are computed locally per block;
            while in principle slightly less sample-efficient on IID data, this is more robust against
            non-stationarities. Defaults to False
        :type force_regression_global: bool, optional
        :param analytic_approximation_for_cutoff: analytic approximation used to implement
            :py:class:`IProvideAnalyticQuantilesForCIT<GLDF.data_processing.IProvideAnalyticQuantilesForCIT>`,
            defaults to "by effective count"
        :type analytic_approximation_for_cutoff: Literal["by effective count", "by large N expansion"]
        """
        self.alpha = alpha
        self.lower_bound_clip_value = lower_bound_clip_value
        self.force_regression_global = force_regression_global
        self.analytic_approximation_for_cutoff = analytic_approximation_for_cutoff

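    # Illustrative configuration (hypothetical parameter values, not taken from the package docs):
    #   cit = ParCorr(alpha=0.01, force_regression_global=True,
    #                 analytic_approximation_for_cutoff="by large N expansion")
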
    def run_single(self, data: CIT_DataPatterned) -> ITestCI.Result:
        global_score = self.score_single(data)
        pvalue = self.pvalue(score=global_score, N=data.sample_count(), dim_Z=data.z_dim())
        return ITestCI.Result(
            global_score=global_score,
            dependent=self.is_pvalue_dependent(pvalue)
        )

    def run_many(self, data: BlockView) -> ITestCI.Result:
        block_scores = self.score_many(data)
        global_score = np.mean(block_scores)
        pvalue = self.pvalue_of_mean(score_mean=global_score, block_size=data.block_size(), block_count=data.block_count(), dim_Z=data.z_dim())
        return ITestCI.Result(
            global_score=global_score,
            block_scores=block_scores,
            dependent=self.is_pvalue_dependent(pvalue)
        )

    def cit_quantile_estimate(self, data: BlockView, cit_result: ITestCI.Result, beta: float, cit_obj: ITestCI) -> float:
        assert type(cit_obj) == type(self)
        d1 = cit_result.global_score
        d1_abs = abs(d1)
        d1_is_positive = (d1 > 0.0)

        cutoff_abs = self._lower_bound_from_pvalue(
            d1_abs, beta, data.block_size(), conditioning_set_size=data.z_dim(),
            how=self.analytic_approximation_for_cutoff, N_global=data.sample_count_used()
        )

        return cutoff_abs if d1_is_positive else -cutoff_abs

    def get_variance_estimate(self, N: int, dim_Z: int, cit_obj: ITestCI) -> float:
        assert type(cit_obj) == type(self)
        return self.analytic_score_var(n=N, z_dim=dim_Z)

    @staticmethod
    def effective_sample_count(n: int, z_dim: int) -> int:
        """Compute the effective sample size.

        :param n: actual sample size
        :type n: int
        :param z_dim: size of conditioning set
        :type z_dim: int
        :return: effective sample size
        :rtype: int
        """
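        # Classical Fisher-z degrees-of-freedom correction for partial correlation:
        # e.g. n=100 samples with |Z|=2 conditioners yield an effective count of 95,
        # i.e. a score variance of roughly 1/95 under independence.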
        return n - 3 - z_dim

    @staticmethod
    def _n_required_for_eff_sample_count(effective_sample_count: int, z_dim: int) -> int:
        return effective_sample_count + z_dim + 3

    @classmethod
    def _analytic_score_var_at_effective_sample_size(cls, effective_size: int) -> float:
        return 1.0 / effective_size

    @classmethod
    def analytic_score_var(cls, n: int, z_dim: int) -> float:
        """Analytic approximation for the score variance.

        :param n: sample size
        :type n: int
        :param z_dim: size of conditioning set
        :type z_dim: int
        :return: score variance
        :rtype: float
        """
        return 1.0 / cls.effective_sample_count(n, z_dim)

    @classmethod
    def analytic_score_std(cls, n: int, z_dim: int) -> float:
        """Analytic approximation for the score standard deviation.

        :param n: sample size
        :type n: int
        :param z_dim: size of conditioning set
        :type z_dim: int
        :return: score standard deviation
        :rtype: float
        """
        return np.sqrt(cls.analytic_score_var(n, z_dim))

    @staticmethod
    def _score_z_pair(x_blocks: np.ndarray, y_blocks: np.ndarray, var_ddof: int=1) -> np.ndarray:
        mean_x = np.mean(x_blocks, axis=1).reshape(-1, 1)
        mean_y = np.mean(y_blocks, axis=1).reshape(-1, 1)
        covars = np.mean((x_blocks - mean_x) * (y_blocks - mean_y), axis=1)
        var_x = np.var(x_blocks, ddof=var_ddof, axis=1) + 0.001  # add a small value to avoid instability
        var_y = np.var(y_blocks, ddof=var_ddof, axis=1) + 0.001
        corr = np.clip(covars / np.sqrt(var_x * var_y), -0.999, 0.999)  # clip to avoid instability
        z = np.arctanh(corr)  # np.atanh is only available in NumPy >= 2.0, so use np.arctanh
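        # arctanh is the Fisher z-transform: under independence the statistic is
        # approximately normal with variance ~ 1/(block_size - 3 - |Z|), which is
        # what allows run_many() to average block scores and use a Gaussian p-value.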
        return z

    @staticmethod
    def _regression_coefficients_many(source_blocks_mean_0: np.ndarray, target_blocks_mean_0: np.ndarray) -> np.ndarray:
        # NumPy's lstsq does not parallelize well across blocks, so use np.linalg.inv directly instead.
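        # Batched ordinary least squares via the normal equations,
        # beta = (X^T X)^{-1} X^T y, evaluated for all blocks at once through
        # broadcasted matmul.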
        block_count, block_size, Z_dim = source_blocks_mean_0.shape
        assert block_size > Z_dim, "Cannot invert the matrix for regression with Z_dim >= block_size."
        X = source_blocks_mean_0
        X_transpose = np.transpose(source_blocks_mean_0, [0, 2, 1])
        X_t_X = np.matmul(X_transpose, X)
        X_t_X_inv = matrix_inv(X_t_X)
        X_t_X_inv_X_t = np.matmul(X_t_X_inv, X_transpose)
        coeffs = np.matmul(X_t_X_inv_X_t, target_blocks_mean_0.reshape(block_count, block_size, 1))
        return coeffs.reshape(block_count, Z_dim)

    @classmethod
    def _regress_out_raw(cls, x_blocks_mean_0: np.ndarray, y_blocks_mean_0: np.ndarray, z_blocks_mean_0: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        block_count, block_size, var_count = z_blocks_mean_0.shape

        coeffs_zx = cls._regression_coefficients_many(z_blocks_mean_0, x_blocks_mean_0)
        coeffs_zy = cls._regression_coefficients_many(z_blocks_mean_0, y_blocks_mean_0)

        residuals_x = x_blocks_mean_0 - np.sum(coeffs_zx.reshape([block_count, 1, var_count]) * z_blocks_mean_0, axis=2)
        residuals_y = y_blocks_mean_0 - np.sum(coeffs_zy.reshape([block_count, 1, var_count]) * z_blocks_mean_0, axis=2)

        return residuals_x, residuals_y

    def _regress_out(self, data: BlockView) -> tuple[np.ndarray, np.ndarray]:
        if self.force_regression_global:
            data_centered = data.trivialize().copy_and_center()
            residuals = self._regress_out_raw(data_centered.x_blocks, data_centered.y_blocks, data_centered.z_blocks)
            residuals = data.apply_blockformat(*residuals)
            return residuals.x_blocks, residuals.y_blocks
        else:
            data_centered = data.copy_and_center()
            return self._regress_out_raw(data_centered.x_blocks, data_centered.y_blocks, data_centered.z_blocks)

    def score_many(self, data: BlockView) -> np.ndarray:
        """Compute the score (z-transformed partial correlation) on blocks.

        :param data: data blocks
        :type data: BlockView
        :return: score per block
        :rtype: np.ndarray
        """
        if data.z_dim() > 0:
            return self._score_z_pair(*self._regress_out(data))
        else:
            return self._score_z_pair(data.x_blocks, data.y_blocks)

    def score_single(self, data: CIT_DataPatterned) -> float:
        """Compute the score (z-transformed partial correlation).

        :param data: data
        :type data: CIT_DataPatterned
        :return: score
        :rtype: float
        """
        return float(self.score_many(data.view_blocks_trivial())[0])

    def _pvalue(self, score: float | np.ndarray, sigma: float) -> float | np.ndarray:
        return 2.0 * (1.0 - norm.cdf(np.abs(score), scale=sigma))

    def pvalue(self, score: float | np.ndarray, N: int, dim_Z: int) -> float | np.ndarray:
        """Compute the p-value for a given score and setup (possibly per block).

        :param score: z-value(s)
        :type score: float | ndarray
        :param N: sample size
        :type N: int
        :param dim_Z: size of conditioning set
        :type dim_Z: int
        :return: p-value(s)
        :rtype: float | ndarray
        """
        return self._pvalue(score, sigma=self.analytic_score_std(N, dim_Z))

    def pvalue_of_mean(self, score_mean: float, block_size: int, block_count: int, dim_Z: int) -> float:
        """Compute the p-value for a given score mean over blocks and setup.

        :param score_mean: mean z-value
        :type score_mean: float
        :param block_size: block size
        :type block_size: int
        :param block_count: block count
        :type block_count: int
        :param dim_Z: size of conditioning set
        :type dim_Z: int
        :return: p-value
        :rtype: float
        """
        if self.force_regression_global:
            v_block = self.analytic_score_var(block_size, z_dim=0)
            n = block_count * block_size
            n_eff = self.effective_sample_count(n=n, z_dim=dim_Z)
            v_global = (v_block / block_count) * (n / n_eff)
        else:
            v_block = self.analytic_score_var(block_size, dim_Z)
            v_global = v_block / block_count
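        # Example with local regressions: block_size=50, block_count=10, dim_Z=2
        # gives v_block = 1/45 and v_global = 1/450, i.e. sigma of about 0.047.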
        return self._pvalue(score_mean, sigma=np.sqrt(v_global))

    def is_pvalue_dependent(self, pvalue: float) -> bool:
        """Decide whether a given p-value should be considered evidence for a dependent test.

        :param pvalue: p-value
        :type pvalue: float
        :return: test considered dependent
        :rtype: bool
        """
        return pvalue < self.alpha

    def _lower_bound_from_pvalue(self, reference: float, pvalue: float, count: int, conditioning_set_size: int, how: Literal["by effective count", "by large N expansion"], N_global: int | None = None) -> float:
        if how == "by effective count":
            return self._lower_bound_from_pvalue_by_effective_count(reference, pvalue, count, conditioning_set_size, N_global=N_global)
        elif how == "by large N expansion":
            return self._lower_bound_from_pvalue_by_large_N_expansion(reference, pvalue, count, conditioning_set_size, N_global=N_global)
        else:
            assert False, "unknown analytical approximation"

    def _lower_bound_from_pvalue_by_effective_count(self, reference: float, pvalue: float, count: int, conditioning_set_size: int, N_global: int) -> float:
        # z is variance-stabilized, so the bound should not depend on the reference value
        assert reference >= 0.0, "remove sign first"
        if self.force_regression_global:
            # Heuristically account for the |Z| samples "lost" globally by the fraction of samples (count/N_global) used here.
            eff_n = count - 3 - conditioning_set_size * (count / N_global)
        else:
            eff_n = self.effective_sample_count(count, conditioning_set_size)
        distance = norm.ppf(1.0 - pvalue) / np.sqrt(eff_n)
        return self._clip_lower_bound(reference - distance)

    def _lower_bound_from_pvalue_by_large_N_expansion(self, reference: float, pvalue: float, count: int, conditioning_set_size: int, N_global: int) -> float:
        assert reference >= 0.0, "remove sign first"
        corr = np.tanh(reference)
        if self.force_regression_global:
            # Heuristically account for the |Z| samples "lost" globally by the fraction of samples (count/N_global) used here.
            N = count - conditioning_set_size * (count / N_global)
        else:
            N = count - conditioning_set_size
        v = 1 / N + (6.0 - corr * corr) / (2 * N * N)  # leading terms of the 1/N expansion
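        # To this order, v matches the classical expansion of the Fisher-z variance,
        # 1/(N-1) + (4 - corr^2)/(2*(N-1)^2) (Hotelling, 1953), re-expanded in powers of 1/N.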
        distance = norm.ppf(1.0 - pvalue) * np.sqrt(v)
        return self._clip_lower_bound(reference - distance)

    def _clip_lower_bound(self, lower_bound_raw: float) -> float:
        # Avoid instability for large dependence values (does not appear to affect relevant power vs. true regimes).
        return min(lower_bound_raw, self.lower_bound_clip_value)
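
For orientation, here is a minimal, self-contained sketch (not part of the package) of how the block scoring and p-value aggregation above fit together. It assumes the wheel's GLDF package is importable, fabricates synthetic data, and calls the private helper _score_z_pair directly purely for illustration; the intended entry points are run_single/run_many with the package's data containers.

import numpy as np
from GLDF.cit import ParCorr

rng = np.random.default_rng(0)
block_count, block_size = 10, 50
x_blocks = rng.normal(size=(block_count, block_size))
y_blocks = 0.5 * x_blocks + rng.normal(size=(block_count, block_size))  # a dependent pair

cit = ParCorr(alpha=0.05)
block_scores = ParCorr._score_z_pair(x_blocks, y_blocks)  # Fisher z per block (private helper, illustration only)
score_mean = float(np.mean(block_scores))
p = cit.pvalue_of_mean(score_mean, block_size=block_size, block_count=block_count, dim_Z=0)
print(score_mean, p, cit.is_pvalue_dependent(p))  # small p-value => flagged as dependent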