GLDF 0.9.0__py3-none-any.whl

GLDF/cit.py ADDED
@@ -0,0 +1,274 @@
+ import numpy as np
+ from scipy.stats import norm
+ from numpy.linalg import inv as matrix_inv
+ from .data_management import CIT_DataPatterned, BlockView
+ from .data_processing import ITestCI, IProvideAnalyticQuantilesForCIT, IProvideVarianceForCIT
+ from typing import Literal
+
+
+ class ParCorr(ITestCI, IProvideAnalyticQuantilesForCIT, IProvideVarianceForCIT):
+     """Implementation of the partial-correlation conditional independence test (CIT) and interface for use with mCIT.
+     Can run on many blocks at once efficiently.
+     """
+
+     def __init__(self, alpha: float=0.05, lower_bound_clip_value: float=0.3, force_regression_global: bool=False,
+                  analytic_approximation_for_cutoff: Literal["by effective count", "by large N expansion"]="by effective count"):
+         """Constructor for the partial correlation CIT.
+
+         :param alpha: target for FPR control, defaults to 0.05
+         :type alpha: float, optional
+         :param lower_bound_clip_value: to avoid numeric instability for strong dependencies, the implementation
+             provided for :py:meth:`IProvideAnalyticQuantilesForCIT.cit_quantile_estimate<GLDF.data_processing.IProvideAnalyticQuantilesForCIT.cit_quantile_estimate>`
+             clips bounds to a predefined range
+             (this is consistent and does not cost substantial power), defaults to 0.3
+         :type lower_bound_clip_value: float, optional
+         :param force_regression_global: by default (disabled), regressions are computed locally per block; while in
+             principle slightly less sample-efficient on IID data, this is more robust against non-stationarities,
+             defaults to False
+         :type force_regression_global: bool, optional
+         :param analytic_approximation_for_cutoff: analytic approximation used to implement
+             :py:class:`IProvideAnalyticQuantilesForCIT<GLDF.data_processing.IProvideAnalyticQuantilesForCIT>`,
+             defaults to "by effective count"
+         :type analytic_approximation_for_cutoff: Literal["by effective count", "by large N expansion"]
+         """
+         self.alpha = alpha
+         self.lower_bound_clip_value = lower_bound_clip_value
+         self.force_regression_global = force_regression_global
+         self.analytic_approximation_for_cutoff = analytic_approximation_for_cutoff
+
+     def run_single(self, data: CIT_DataPatterned) -> ITestCI.Result:
+         global_score = self.score_single(data)
+         pvalue = self.pvalue(score=global_score, N=data.sample_count(), dim_Z=data.z_dim())
+         return ITestCI.Result(
+             global_score=global_score,
+             dependent=self.is_pvalue_dependent(pvalue)
+         )
+
+     def run_many(self, data: BlockView) -> ITestCI.Result:
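+         # Score each block separately, then test the mean of the per-block scores
+         # against its analytic null distribution.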
+         block_scores = self.score_many(data)
+         global_score = np.mean(block_scores)
+         pvalue = self.pvalue_of_mean(score_mean=global_score, block_size=data.block_size(),
+                                      block_count=data.block_count(), dim_Z=data.z_dim())
+         return ITestCI.Result(
+             global_score=global_score,
+             block_scores=block_scores,
+             dependent=self.is_pvalue_dependent(pvalue)
+         )
+
+     def cit_quantile_estimate(self, data: BlockView, cit_result: ITestCI.Result, beta: float, cit_obj: ITestCI) -> float:
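+         # Lower bound on the absolute score, with confidence controlled by beta;
+         # the sign of the observed score is restored at the end.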
+         assert type(cit_obj) == type(self)
+         d1 = cit_result.global_score
+         d1_abs = abs(d1)
+         d1_is_positive = (d1 > 0.0)
+
+         cutoff_abs = self._lower_bound_from_pvalue(
+             d1_abs, beta, data.block_size(), conditioning_set_size=data.z_dim(),
+             how=self.analytic_approximation_for_cutoff, N_global=data.sample_count_used()
+         )
+
+         return cutoff_abs if d1_is_positive else -cutoff_abs
+
+     def get_variance_estimate(self, N: int, dim_Z: int, cit_obj: ITestCI) -> float:
+         assert type(cit_obj) == type(self)
+         return self.analytic_score_var(n=N, z_dim=dim_Z)
+
+     @staticmethod
+     def effective_sample_count(n: int, z_dim: int) -> int:
+         """Compute the effective sample size.
+
+         :param n: actual sample size
+         :type n: int
+         :param z_dim: size of conditioning set
+         :type z_dim: int
+         :return: effective sample size
+         :rtype: int
+         """
+         return n - 3 - z_dim
+
+     @staticmethod
+     def _n_required_for_eff_sample_count(effective_sample_count: int, z_dim: int) -> int:
+         return effective_sample_count + z_dim + 3
+
+     @classmethod
+     def _analytic_score_var_at_effective_sample_size(cls, effective_size: int) -> float:
+         return 1.0 / effective_size
+
+     @classmethod
+     def analytic_score_var(cls, n: int, z_dim: int) -> float:
+         """Analytic approximation for the score variance.
+
+         :param n: sample size
+         :type n: int
+         :param z_dim: size of conditioning set
+         :type z_dim: int
+         :return: score variance
+         :rtype: float
+         """
+         return 1.0 / cls.effective_sample_count(n, z_dim)
+
+     @classmethod
+     def analytic_score_std(cls, n: int, z_dim: int) -> float:
+         """Analytic approximation for the score standard deviation.
+
+         :param n: sample size
+         :type n: int
+         :param z_dim: size of conditioning set
+         :type z_dim: int
+         :return: score standard deviation
+         :rtype: float
+         """
+         return np.sqrt(cls.analytic_score_var(n, z_dim))
+
+     @staticmethod
+     def _score_z_pair(x_blocks: np.ndarray, y_blocks: np.ndarray, var_ddof: int=1) -> np.ndarray:
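+         # Per-block Pearson correlation followed by the Fisher z-transform (arctanh),
+         # which approximately stabilizes the variance of the estimate.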
+         mean_x = np.mean(x_blocks, axis=1).reshape(-1, 1)
+         mean_y = np.mean(y_blocks, axis=1).reshape(-1, 1)
+         covars = np.mean((x_blocks - mean_x) * (y_blocks - mean_y), axis=1)
+         var_x = np.var(x_blocks, ddof=var_ddof, axis=1) + 0.001  # add small value to avoid instability
+         var_y = np.var(y_blocks, ddof=var_ddof, axis=1) + 0.001
+         corr = np.clip(covars / np.sqrt(var_x * var_y), -0.999, 0.999)  # clip to avoid instability
+         z = np.arctanh(corr)  # np.atanh only exists in NumPy >= 2.0, so use np.arctanh
+         return z
+
+     @staticmethod
+     def _regression_coefficients_many(source_blocks_mean_0: np.ndarray, target_blocks_mean_0: np.ndarray) -> np.ndarray:
+         # Somehow numpy's lstsq does not parallelize well over blocks, so use np.linalg.inv directly instead.
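+         # OLS via the normal equations, batched over blocks with matmul:
+         # coeffs = (X^T X)^{-1} X^T y for each block's centered regressor matrix X.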
+         block_count, block_size, Z_dim = source_blocks_mean_0.shape
+         assert block_size > Z_dim, "Cannot invert the matrix for regression with Z_dim >= block_size."
+         X = source_blocks_mean_0
+         X_transpose = np.transpose(source_blocks_mean_0, [0, 2, 1])
+         X_t_X = np.matmul(X_transpose, X)
+         X_t_X_inv = matrix_inv(X_t_X)
+         X_t_X_inv_X_t = np.matmul(X_t_X_inv, X_transpose)
+         coeffs = np.matmul(X_t_X_inv_X_t, target_blocks_mean_0.reshape(block_count, block_size, 1))
+         return coeffs.reshape(block_count, Z_dim)
+
+     @classmethod
+     def _regress_out_raw(cls, x_blocks_mean_0: np.ndarray, y_blocks_mean_0: np.ndarray, z_blocks_mean_0: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+         block_count, block_size, var_count = z_blocks_mean_0.shape
+
+         coeffs_zx = cls._regression_coefficients_many(z_blocks_mean_0, x_blocks_mean_0)
+         coeffs_zy = cls._regression_coefficients_many(z_blocks_mean_0, y_blocks_mean_0)
+
+         residuals_x = x_blocks_mean_0 - np.sum(coeffs_zx.reshape([block_count, 1, var_count]) * z_blocks_mean_0, axis=2)
+         residuals_y = y_blocks_mean_0 - np.sum(coeffs_zy.reshape([block_count, 1, var_count]) * z_blocks_mean_0, axis=2)
+
+         return residuals_x, residuals_y
+
+     def _regress_out(self, data: BlockView) -> tuple[np.ndarray, np.ndarray]:
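+         # Global mode: regress on all samples at once (data viewed as a single block),
+         # then reshape the residuals back into blocks; local mode: regress per block.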
+         if self.force_regression_global:
+             data_centered = data.trivialize().copy_and_center()
+             residuals = self._regress_out_raw(data_centered.x_blocks, data_centered.y_blocks, data_centered.z_blocks)
+             residuals = data.apply_blockformat(*residuals)
+             return residuals.x_blocks, residuals.y_blocks
+         else:
+             data_centered = data.copy_and_center()
+             return self._regress_out_raw(data_centered.x_blocks, data_centered.y_blocks, data_centered.z_blocks)
+
+     def score_many(self, data: BlockView) -> np.ndarray:
+         """Compute score (z-transformed partial correlation) on blocks.
+
+         :param data: data blocks
+         :type data: BlockView
+         :return: score per block
+         :rtype: np.ndarray
+         """
+         if data.z_dim() > 0:
+             return self._score_z_pair(*self._regress_out(data))
+         else:
+             return self._score_z_pair(data.x_blocks, data.y_blocks)
+
+     def score_single(self, data: CIT_DataPatterned) -> float:
+         """Compute score (z-transformed partial correlation).
+
+         :param data: data
+         :type data: CIT_DataPatterned
+         :return: score
+         :rtype: float
+         """
+         return float(self.score_many(data.view_blocks_trivial())[0])
+
+     def _pvalue(self, score: float|np.ndarray, sigma: float) -> float|np.ndarray:
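+         # Two-sided p-value under a centered normal null with standard deviation sigma.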
+         return 2.0 * (1.0 - norm.cdf(np.abs(score), scale=sigma))
+
+     def pvalue(self, score: float|np.ndarray, N: int, dim_Z: int) -> float|np.ndarray:
+         """Compute p-value for a given score and setup (possibly per block).
+
+         :param score: z-value(s)
+         :type score: float | np.ndarray
+         :param N: sample size
+         :type N: int
+         :param dim_Z: size of conditioning set
+         :type dim_Z: int
+         :return: p-value(s)
+         :rtype: float | np.ndarray
+         """
+         return self._pvalue(score, sigma=self.analytic_score_std(N, dim_Z))
+
+     def pvalue_of_mean(self, score_mean: float, block_size: int, block_count: int, dim_Z: int) -> float:
+         """Compute p-value for a given mean of block scores and setup.
+
+         :param score_mean: mean z-value
+         :type score_mean: float
+         :param block_size: block size
+         :type block_size: int
+         :param block_count: block count
+         :type block_count: int
+         :param dim_Z: size of conditioning set
+         :type dim_Z: int
+         :return: p-value
+         :rtype: float
+         """
+         if self.force_regression_global:
+             v_block = self.analytic_score_var(block_size, z_dim=0)
+             n = block_count * block_size
+             n_eff = self.effective_sample_count(n=n, z_dim=dim_Z)
+             v_global = (v_block / block_count) * (n / n_eff)
+         else:
+             v_block = self.analytic_score_var(block_size, dim_Z)
+             v_global = v_block / block_count
+         return self._pvalue(score_mean, sigma=np.sqrt(v_global))
+
+     def is_pvalue_dependent(self, pvalue: float) -> bool:
+         """Decide if a given p-value should be considered evidence for a dependent test.
+
+         :param pvalue: p-value
+         :type pvalue: float
+         :return: test considered dependent
+         :rtype: bool
+         """
+         return pvalue < self.alpha
+
+
+     def _lower_bound_from_pvalue(self, reference: float, pvalue: float, count: int, conditioning_set_size: int,
+                                  how: Literal["by effective count", "by large N expansion"], N_global: int|None=None) -> float:
+         if how == "by effective count":
+             return self._lower_bound_from_pvalue_by_effective_count(reference, pvalue, count, conditioning_set_size, N_global=N_global)
+         elif how == "by large N expansion":
+             return self._lower_bound_from_pvalue_by_large_N_expansion(reference, pvalue, count, conditioning_set_size, N_global=N_global)
+         else:
+             raise ValueError(f"unknown analytic approximation: {how!r}")
+
+     def _lower_bound_from_pvalue_by_effective_count(self, reference: float, pvalue: float, count: int, conditioning_set_size: int, N_global: int) -> float:
+         # z is variance-stabilized, so the bound width should not depend on the reference value.
+         assert reference >= 0.0, "remove sign first"
+         if self.force_regression_global:
+             # Heuristically account for the |Z| samples "lost" globally by the fraction of samples (count/N_global) used here.
+             eff_n = count - 3 - conditioning_set_size * (count / N_global)
+         else:
+             eff_n = self.effective_sample_count(count, conditioning_set_size)
+         distance = norm.ppf(1.0 - pvalue) / np.sqrt(eff_n)
+         return self._clip_lower_bound(reference - distance)
+
+     def _lower_bound_from_pvalue_by_large_N_expansion(self, reference: float, pvalue: float, count: int, conditioning_set_size: int, N_global: int) -> float:
+         assert reference >= 0.0, "remove sign first"
+         corr = np.tanh(reference)
+         if self.force_regression_global:
+             # Heuristically account for the |Z| samples "lost" globally by the fraction of samples (count/N_global) used here.
+             N = count - conditioning_set_size * (count / N_global)
+         else:
+             N = count - conditioning_set_size
+         v = 1 / N + (6.0 - corr*corr) / (2 * N * N)  # leading terms in the 1/N expansion of the score variance
+         distance = norm.ppf(1.0 - pvalue) * np.sqrt(v)
+         return self._clip_lower_bound(reference - distance)
+
+     def _clip_lower_bound(self, lower_bound_raw: float) -> float:
+         # Avoid instability for large dependence values (does not seem to affect relevant power against truly dependent regimes).
+         return self.lower_bound_clip_value if lower_bound_raw > self.lower_bound_clip_value else lower_bound_raw
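
For orientation, a minimal usage sketch (not part of the package source): it assumes a BlockView instance `blocks` has already been constructed via GLDF.data_management (construction is not shown in this diff) and uses only the ParCorr API added above.

    from GLDF.cit import ParCorr

    test = ParCorr(alpha=0.01)      # 1% target false-positive rate
    result = test.run_many(blocks)  # blocks: a BlockView with X, Y and conditioning set Z
    print(result.global_score)      # mean Fisher-z score over blocks
    print(result.block_scores)      # per-block scores
    print(result.dependent)         # True if the p-value falls below alpha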