tigramite-fast 5.2.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tigramite/__init__.py +0 -0
- tigramite/causal_effects.py +1525 -0
- tigramite/causal_mediation.py +1592 -0
- tigramite/data_processing.py +1574 -0
- tigramite/graphs.py +1509 -0
- tigramite/independence_tests/LBFGS.py +1114 -0
- tigramite/independence_tests/__init__.py +0 -0
- tigramite/independence_tests/cmiknn.py +661 -0
- tigramite/independence_tests/cmiknn_mixed.py +1397 -0
- tigramite/independence_tests/cmisymb.py +286 -0
- tigramite/independence_tests/gpdc.py +664 -0
- tigramite/independence_tests/gpdc_torch.py +820 -0
- tigramite/independence_tests/gsquared.py +190 -0
- tigramite/independence_tests/independence_tests_base.py +1310 -0
- tigramite/independence_tests/oracle_conditional_independence.py +1582 -0
- tigramite/independence_tests/pairwise_CI.py +383 -0
- tigramite/independence_tests/parcorr.py +369 -0
- tigramite/independence_tests/parcorr_mult.py +485 -0
- tigramite/independence_tests/parcorr_wls.py +451 -0
- tigramite/independence_tests/regressionCI.py +403 -0
- tigramite/independence_tests/robust_parcorr.py +403 -0
- tigramite/jpcmciplus.py +966 -0
- tigramite/lpcmci.py +3649 -0
- tigramite/models.py +2257 -0
- tigramite/pcmci.py +3935 -0
- tigramite/pcmci_base.py +1218 -0
- tigramite/plotting.py +4735 -0
- tigramite/rpcmci.py +467 -0
- tigramite/toymodels/__init__.py +0 -0
- tigramite/toymodels/context_model.py +261 -0
- tigramite/toymodels/non_additive.py +1231 -0
- tigramite/toymodels/structural_causal_processes.py +1201 -0
- tigramite/toymodels/surrogate_generator.py +319 -0
- tigramite_fast-5.2.10.1.dist-info/METADATA +182 -0
- tigramite_fast-5.2.10.1.dist-info/RECORD +38 -0
- tigramite_fast-5.2.10.1.dist-info/WHEEL +5 -0
- tigramite_fast-5.2.10.1.dist-info/licenses/license.txt +621 -0
- tigramite_fast-5.2.10.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1397 @@ tigramite/independence_tests/cmiknn_mixed.py

from __future__ import print_function
from scipy import special, spatial
from sklearn.neighbors import BallTree, NearestNeighbors
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.utils.extmath import cartesian
import numpy as np
import math
import warnings

from tigramite.independence_tests.independence_tests_base import CondIndTest

class CMIknnMixed(CondIndTest):
    r"""Conditional mutual information test based on nearest-neighbor estimators.

    Conditional mutual information is a general dependency measure coming
    from an information-theoretic framework. It makes almost no assumptions
    about the parametric form of the dependencies by directly estimating the
    underlying joint density. The tests here are based on entropy estimation
    using k-nearest neighbors. We implement three different approaches:

    (1) Mesner & Shalizi [1]
    (2) Conditional variant [2]
    (3) Our variant [3]

    These approaches differ in how the distance metrics are defined when
    searching for neighbors:

    (1) The distance on the discrete dimensions is 1 for unequal values,
        otherwise 0.
    (2) This approach splits the space into clusters for which the discrete
        values are all equal, then computes distances between those points
        (which now have only continuous values).
    (3) This approach builds on (1), but defines the distance for points
        with unequal discrete values as infinite, and ignores all neighbors
        that have infinite distances.

    The tests can be combined with a shuffle test to generate the distribution
    under the null hypothesis of independence, described in [4]. The
    knn-estimator is suitable for heterogeneous variables
    (mixed-type, multivariate with discrete and continuous dimensions). For
    mixture-type variables, use only (1) or (3).

    For continuous variables, use the CMIknn class. For discrete variables,
    use the CMIsymb or Gsquared class.

    Notes
    -----
    These estimators have as a parameter the number of
    nearest-neighbors :math:`k` which determines the size of hyper-cubes
    around each (high-dimensional) sample point.

    For variants (2) and (3), :math:`k` is used locally, meaning that it
    defines how many neighbors from a respective subsample should be
    considered.

    :math:`k` can be viewed as a density smoothing parameter (although it is
    data-adaptive unlike fixed-bandwidth estimators). For large :math:`k`, the
    underlying dependencies are more smoothed and CMI has a larger bias,
    but lower variance, which is more important for significance testing. Note
    that the estimated CMI values can be slightly negative while CMI is a
    non-negative quantity.

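    The 'MS' and 'MSinf' variants reduce to digamma expressions in
    nearest-neighbor counts. A sketch of the estimator implemented in
    ``get_dependence_measure_MS`` and ``get_dependence_measure_MSinf`` below,
    with :math:`k_t` the number of joint-space neighbors of sample :math:`t`
    within its :math:`k`-th neighbor distance and :math:`k_{XZ,t}, k_{YZ,t},
    k_{Z,t}` the counts within the same distance in the subspaces:

    .. math:: \widehat{I}(X;Y|Z) = \frac{1}{T}\sum_{t=1}^{T}
              \left[ \psi(k_t) - \psi(k_{XZ,t}) - \psi(k_{YZ,t})
              + \psi(k_{Z,t}) \right]
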
    This method requires the scipy package.

    References
    ----------
    .. [1] Mesner, O.C., & Shalizi, C.R. (2019): Conditional Mutual Information
           Estimation for Mixed Discrete and Continuous Variables with
           Nearest Neighbors. arXiv: Statistics Theory.
           https://arxiv.org/abs/1912.03387

    .. [2] Zan, L., Meynaoui, A., Assaad, C.K., Devijver, E., & Gaussier,
           É. (2022): A Conditional Mutual Information Estimator for
           Mixed Data and an Associated Conditional Independence Test.
           Entropy, 24.
           https://www.mdpi.com/1099-4300/24/9/1234/html

    .. [3] Oana-Iuliana Popescu, Andreas Gerhardus, Martin Rabel, Jakob Runge
           (2024), accepted at CLeaR.
           https://arxiv.org/abs/2310.11132

    .. [4] J. Runge (2018): Conditional Independence Testing Based on a
           Nearest-Neighbor Estimator of Conditional Mutual Information.
           In Proceedings of the 21st International Conference on Artificial
           Intelligence and Statistics.
           http://proceedings.mlr.press/v84/runge18a.html

    Parameters
    ----------
    knn : int or float, optional (default: 0.2)
        Number of nearest-neighbors which determines the size of hyper-cubes
        around each (high-dimensional) sample point. If smaller than 1, this is
        computed as a fraction of T, hence knn=knn*T. For knn larger or equal
        to 1, this is the absolute number.

    knn_type : string, optional (default: 'global')
        Sets the type of heuristic for the MSinf estimator (see paper). Can
        be 'local', 'global', or 'cluster_size'. Use 'global' for the most
        computationally efficient variant of the estimator.

    estimator : string, optional (default: 'MSinf')
        The type of estimator to be used. Three options are available:
        'MS' for approach (1) (Mesner and Shalizi [1]),
        'ZMADG' for approach (2) (Zan et al. [2]), and
        'MSinf' for approach (3) (approach (1) with infinite distance for
        points from different categories).

    shuffle_neighbors : int, optional (default: 5)
        Number of nearest-neighbors within Z for the shuffle surrogates which
        determines the size of hyper-cubes around each (high-dimensional)
        sample point.

    transform : {'ranks', 'standardize', 'scale', 'none'}, optional
        (default: 'ranks')
        Whether to transform the array beforehand by transforming to ranks,
        standardizing, or scaling to the range (a, b).

    scale_range : tuple, optional (default: (0, 1))
        The range (a, b) to use if transform is set to 'scale'.

    max_with_0 : bool, optional (default: False)
        Whether to clip slightly negative CMI estimates to 0.

    perc : float, optional (default: None)
        The value to be used as percentage of the cluster size for the
        realization of a discrete value when using the 'MSinf' method. If set
        to None, it is the same as the knn value.

    workers : int, optional (default: -1)
        Number of workers to use for parallel processing. If -1 is given,
        all processors are used.

    rho : list of float, optional (default: [np.inf])
        Hyperparameters used for weighting the discrete variable distances.
        If not initialized, the distance will be set to np.inf, such that
        discrete variables with different values will never be considered
        neighbors. Otherwise the rho
        ...

    significance : str, optional (default: 'shuffle_test')
        Type of significance test to use. For CMIknnMixed, only 'fixed_thres'
        and 'shuffle_test' are available.

    **kwargs :
        Arguments passed on to parent class CondIndTest.
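
    Example
    -------
    A minimal usage sketch, mirroring the ``__main__`` block at the end of
    this module (data values are illustrative; the type masks mark every
    sample as continuous here):

    >>> import numpy as np
    >>> rs = np.random.default_rng(seed=42)
    >>> T = 500
    >>> z = rs.standard_normal((T, 1))
    >>> x = rs.standard_normal((T, 1))
    >>> y = (5. * z[:, 0] + rs.standard_normal(T)).reshape(T, 1)
    >>> cmi = CMIknnMixed(estimator='MSinf', significance='shuffle_test')
    >>> result = cmi.run_test_raw(x=x, y=y, z=z,
    ...                           x_type=np.zeros(x.shape),
    ...                           y_type=np.zeros(y.shape),
    ...                           z_type=np.zeros(z.shape))
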
    """
    @property
    def measure(self):
        """
        Concrete property to return the measure of the independence test.
        """
        return self._measure

    def __init__(self,
                 knn=0.2,
                 knn_type='global',
                 estimator='MSinf',
                 shuffle_neighbors=5,
                 significance='shuffle_test',
                 transform='ranks',
                 scale_range=(0, 1),
                 max_with_0=False,
                 workers=-1,
                 **kwargs):
        # Set the member variables
        self.knn = knn
        self.knn_type = knn_type
        self.estimator = estimator
        self.shuffle_neighbors = shuffle_neighbors
        self.transform = transform
        self.max_with_0 = max_with_0
        self.scale_range = scale_range
        self._measure = 'cmi_knn_mixed'
        self.two_sided = False
        self.residual_based = False
        self.recycle_residuals = False
        self.workers = workers
        self.eps = 1e-5

        # Call the parent constructor
        CondIndTest.__init__(self, significance=significance, **kwargs)
        # Print some information about construction
        if self.verbosity > 0:
            if self.knn < 1:
                print("knn/T = %s" % self.knn)
            else:
                print("knn = %s" % self.knn)
            print("shuffle_neighbors = %d\n" % self.shuffle_neighbors)

    def _standardize_array(self, array, dim):
        """Standardizes a given array with dimensions dim.

        Parameters
        ----------
        array : array-like
            data array with X, Y, Z in rows and observations in columns

        dim : int
            number of dimensions of the data.

        Returns
        -------
        array : array-like
            The standardized array.
        """
        array = array.astype(np.float64)
        array -= array.mean(axis=1).reshape(dim, 1)
        std = array.std(axis=1)
        for i in range(dim):
            if std[i] != 0.:
                array[i] /= std[i]
        # array /= array.std(axis=1).reshape(dim, 1)
        # FIXME: If the time series is constant, return nan rather than
        # raising Exception
        if np.any(std == 0.):
            warnings.warn("Possibly constant array!")
            # raise ValueError("nans after standardizing, "
            #                  "possibly constant array!")
        return array

    def _scale_array(self, array, minmax=(0, 1)):
        """Scales a given array to range minmax dimension-wise.

        Parameters
        ----------
        array : array-like
            data array with X, Y, Z in rows and observations in columns

        minmax : tuple (a, b)
            the min and the max values (a, b) for the scaling

        Returns
        -------
        array : array-like
            The scaled array.
        """
        scaler = MinMaxScaler(minmax)
        return scaler.fit_transform(array.T).T

    def _rank_array(self, array):
        """Transforms a given array to ranks.

        Parameters
        ----------
        array : array-like
            data array with X, Y, Z in rows and observations in columns

        Returns
        -------
        array : array-like
            The rank-transformed array.
        """
        return array.argsort(axis=1).argsort(axis=1).astype(np.float64)

    def _transform_mixed_data(self, array, data_type=None, add_noise=True):
        """Applies data transformations to the continuous dimensions of the given data.

        Parameters
        ----------
        array : array-like
            data array with X, Y, Z in rows and observations in columns

        add_noise : bool, optional (default: True)
            Defines whether to add small normal noise to the continuous data
            (to break ties).

        data_type : array-like
            data array of same shape as array which describes whether variables
            are continuous or discrete: 0s for continuous variables and
            1s for discrete variables

        Returns
        -------
        array : array-like
            The array with the continuous data transformed.
        """
        continuous_idxs = np.where(np.all(data_type == 0, axis=1))[0]
        cont_dim = len(continuous_idxs)

        if add_noise:
            # Add noise to destroy ties
            array[continuous_idxs, :] += (1E-16 * array[continuous_idxs, :].std(axis=1).reshape(cont_dim, 1)
                * self.random_state.random((array[continuous_idxs, :].shape[0], array[continuous_idxs, :].shape[1])))

        if self.transform == 'standardize':
            array[continuous_idxs, :] = self._standardize_array(array[continuous_idxs, :], cont_dim)
        elif self.transform == 'scale':
            array[continuous_idxs, :] = self._scale_array(array[continuous_idxs, :], minmax=self.scale_range)
        elif self.transform == 'ranks':
            # if self.estimator == 'MS' or self.estimator == 'FPinf':
            array[continuous_idxs, :] = self._rank_array(array[continuous_idxs, :])
            # else:
            #     for conditional, compute ranks for each
            #     pass
        elif self.transform == 'none':
            pass
        else:
            warnings.warn('Unknown transform')

        return array

    def _transform_to_one_hot_mixed(self, array, xyz, data_type,
                                    zero_inf=False):
        """Applies one-hot encoding to the discrete dimensions of the array.

        Parameters
        ----------
        array : array-like
            data array with X, Y, Z in rows and observations in columns

        xyz : list
            List that indicates which dimensions belong to which variable, e.g.
            for X, Y, Z one-dimensional xyz = [0, 1, 2]

        data_type : array-like
            data array of same shape as array which describes whether variables
            are continuous or discrete: 0s for continuous variables and
            1s for discrete variables

        zero_inf : bool, optional (default: False)
            defines whether to set infinite distances between points with
            different values for the discrete dimensions

        Returns
        -------
        narray, nxyz, ndata_type, ndiscrete_idx_list : tuple
            The array with the discrete dimensions one-hot encoded, together
            with the correspondingly expanded xyz identifiers, data types, and
            the indicator list of discrete columns.
        """

        discrete_idx_list = np.where(np.all(data_type == 1, axis=0), 1, 0)
        mixed_idx_list = np.where(np.any(data_type == 1, axis=0), 1, 0)

        narray = np.copy(array)
        nxyz = np.copy(xyz)
        ndata_type = np.copy(data_type)

        appended_columns = 0
        for i in range(len(discrete_idx_list)):
            if discrete_idx_list[i] == 1:
                encoder = OneHotEncoder(handle_unknown='ignore')
                i += appended_columns
                data = narray[:, i]
                xyz_val = nxyz[i]
                encoder_df = encoder.fit_transform(data.reshape(-1, 1)).toarray()
                if zero_inf:
                    encoder_df = np.where(encoder_df == 1, 9999999, 0)

                xyz_val = [nxyz[i]] * encoder_df.shape[-1]
                narray = np.concatenate([narray[:, :i], encoder_df, narray[:, i+1:]], axis=-1)

                nxyz = np.concatenate([nxyz[:i], xyz_val, nxyz[i+1:]])
                ndata_type = np.concatenate([ndata_type[:, :i],
                                             np.ones(encoder_df.shape),
                                             ndata_type[:, i+1:]],
                                            axis=-1)
                appended_columns += encoder_df.shape[-1] - 1

            elif mixed_idx_list[i] == 1 and zero_inf == True:
                i += appended_columns
                data = narray[:, i]
                xyz_val = nxyz[i]

                # find categories
                categories = np.unique(narray[:, i] * ndata_type[:, i])
                categories = np.delete(categories, categories == 0.)
                cont_vars = np.unique(narray[:, i] * (1 - ndata_type[:, i]))

                encoder = OneHotEncoder(categories=[categories], handle_unknown='ignore')
                xyz_val = nxyz[i]
                encoder_df = encoder.fit_transform(data.reshape(-1, 1)).toarray()
                if zero_inf:
                    encoder_df = np.where(encoder_df == 1, 9999999, 0)

                xyz_val = [nxyz[i]] * (encoder_df.shape[-1] + 1)
                cont_column = np.expand_dims(narray[:, i] * (1 - ndata_type[:, i]), -1)
                narray = np.concatenate([narray[:, :i], cont_column, encoder_df, narray[:, i+1:]], axis=-1)

                nxyz = np.concatenate([nxyz[:i], xyz_val, nxyz[i+1:]])
                ndata_type = np.concatenate([ndata_type[:, :i],
                                             np.zeros(cont_column.shape),
                                             np.ones(encoder_df.shape),
                                             ndata_type[:, i+1:]],
                                            axis=-1)
                appended_columns += encoder_df.shape[-1]

        ndiscrete_idx_list = np.where(np.any(ndata_type == 1, axis=0), 1, 0)

        return narray, nxyz, ndata_type, ndiscrete_idx_list

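    # Illustration of the zero_inf trick above (values assumed for the
    # sketch): a discrete column [0, 1, 0] one-hot encodes to
    # [[1, 0], [0, 1], [1, 0]]; with zero_inf=True the 1-entries become
    # 9999999, so under the max-norm any two samples with different discrete
    # values are (effectively) infinitely far apart and are never counted as
    # neighbors (the k-NN queries below cap distances at 9999999).
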
    def get_smallest_cluster_size(self, array, data_type=None):
        """Computes the smallest number of samples for each realization
        of the discrete variables.
        Used for computation of the "local" knn.

        Parameters
        ----------
        array : array-like
            data array with X, Y, Z in rows and observations in columns

        data_type : array-like
            data array of same shape as array which describes whether variables
            are continuous or discrete: 0s for continuous variables and
            1s for discrete variables

        Returns
        -------
        min_nc : integer
            The smallest number of samples in a cluster.
        """
        discrete_idx_list = np.where(np.any(data_type == 1, axis=0), 1, 0)
        discrete_xyz_idx = np.where(np.asarray(discrete_idx_list) == 1)[0]

        num_xyz_classes = [np.unique(array[:, index]) for index in range(len(discrete_idx_list)) if (discrete_idx_list[index] == 1)]

        xyz_cartesian_product = []

        if len(num_xyz_classes) > 1:
            xyz_cartesian_product = cartesian(num_xyz_classes)
        elif len(num_xyz_classes) > 0:
            xyz_cartesian_product = num_xyz_classes[0]

        min_nc = array.shape[0]

        if len(xyz_cartesian_product) > 0:
            for i, entry in enumerate(xyz_cartesian_product):
                current_array = array[np.sum(array[:, discrete_xyz_idx] == entry,
                                             axis=-1) == len(discrete_xyz_idx)]
                if current_array.shape[0] > 0 and current_array.shape[0] < min_nc:
                    min_nc = current_array.shape[0]

        return min_nc

    # @jit(forceobj=True)
    def _get_nearest_neighbors_zeroinf_onehot(self, array, xyz, knn,
                                              data_type=None):
        """Returns nearest-neighbor counts according to [1] with an
        altered distance metric: the 0-inf metric, which attributes
        infinite distance to points where the values for the discrete
        dimensions do not coincide.

        Retrieves the distances eps to the k-th nearest neighbors for every
        sample in joint space XYZ and returns the numbers of nearest neighbors
        within eps in subspaces Z, XZ, YZ. Uses the 0-inf metric for
        discrete variables.

        Parameters
        ----------
        array : array-like
            data array with X, Y, Z in rows and observations in columns

        xyz : array of ints
            XYZ identifier array of shape (dim,).

        knn : int or float
            Number of nearest-neighbors which determines the size of hyper-cubes
            around each (high-dimensional) sample point. If smaller than 1, this
            is computed as a fraction of T, hence knn=knn*T. For knn larger or
            equal to 1, this is the absolute number.

        data_type : array-like
            data array of same shape as array which describes whether variables
            are continuous or discrete: 0s for continuous variables and
            1s for discrete variables

        Returns
        -------
        k_tilde, k_xz, k_yz, k_z : tuple of arrays of shape (T,)
            Nearest neighbors in XYZ, XZ, YZ, and Z subspaces.
        """
        dim, T = array.shape

        array = array.astype(np.float64)
        xyz = xyz.astype(np.int32)

        array = self._transform_mixed_data(array, data_type)

        array = array.T
        data_type = data_type.T

        narray, nxyz, ndata_type, discrete_idx_list = self._transform_to_one_hot_mixed(array, xyz, data_type,
                                                                                       zero_inf=True)

        # Subsample indices
        x_indices = np.where(nxyz == 0)[0]
        y_indices = np.where(nxyz == 1)[0]
        z_indices = np.where(nxyz == 2)[0]
        xz_indices = np.concatenate([x_indices, z_indices])
        yz_indices = np.concatenate([y_indices, z_indices])

        # Fit trees
        tree_xyz = spatial.cKDTree(narray)
        neighbors = tree_xyz.query(narray, k=knn+1, p=np.inf,
                                   workers=self.workers,
                                   distance_upper_bound=9999999)
        n, k = neighbors[0].shape

        epsarray = np.zeros(n)
        for i in range(n):
            if neighbors[0][i, knn] == np.inf:
                # number of non-inf neighbors
                replacement_idx = np.where(neighbors[0][i] != np.inf)[0][-1]
                if self.knn_type == 'global':
                    # look at at least one neighbor
                    r = max(int(replacement_idx * self.perc), 1)
                elif self.knn_type == 'cluster_size' or self.knn_type == 'local':
                    r = replacement_idx
                epsarray[i] = neighbors[0][i, r]
            else:
                epsarray[i] = neighbors[0][i, knn]

        neighbors_radius_xyz = tree_xyz.query_ball_point(narray, epsarray, p=np.inf, workers=self.workers)

        k_tilde = [len(neighbors_radius_xyz[i]) - 1 if len(neighbors_radius_xyz[i]) > 1 else len(neighbors_radius_xyz[i]) for i in range(len(neighbors_radius_xyz))]
        # k_tilde = [len(neighbors_radius_xyz[i]) for i in range(len(neighbors_radius_xyz))]

        # compute nearest neighbors in subspaces
        xz = narray[:, xz_indices]
        tree_xz = spatial.cKDTree(xz)
        k_xz = tree_xz.query_ball_point(xz, r=epsarray, p=np.inf, workers=self.workers, return_length=True)

        yz = narray[:, yz_indices]
        tree_yz = spatial.cKDTree(yz)
        k_yz = tree_yz.query_ball_point(yz, r=epsarray, p=np.inf, workers=self.workers, return_length=True)

        if len(z_indices) > 0:
            z = narray[:, z_indices]
            tree_z = spatial.cKDTree(z)
            k_z = tree_z.query_ball_point(z, r=epsarray, p=np.inf, workers=self.workers, return_length=True)
        else:
            # Number of neighbors is T when z is empty.
            k_z = np.full(T, T, dtype='float')

        k_xz = np.asarray([i - 1 if i > 1 else i for i in k_xz])
        k_yz = np.asarray([i - 1 if i > 1 else i for i in k_yz])
        k_z = np.asarray([i - 1 if i > 1 else i for i in k_z])

        return k_tilde, k_xz, k_yz, k_z

    def get_dependence_measure_MSinf(self, array, xyz,
                                     data_type=None):
        """Returns CMI estimate according to Frenzel and Pompe with an
        altered distance metric: the 0-inf metric, which attributes
        infinite distance to points where the values for the discrete
        dimensions do not coincide.

        Parameters
        ----------
        array : array-like
            data array with X, Y, Z in rows and observations in columns

        xyz : array of ints
            XYZ identifier array of shape (dim,).

        data_type : array-like
            data array of same shape as array which describes whether variables
            are continuous or discrete: 0s for continuous variables and
            1s for discrete variables

        Returns
        -------
        val : float
            Conditional mutual information estimate.
        """
        dim, T = array.shape

        # compute knn according to knn type
        if self.knn < 1:
            if self.knn_type == 'global':
                # compute knn
                knn = max(1, int(self.knn*T))
                self.perc = self.knn
            elif self.knn_type == 'cluster_size':
                knn = max(1, int(self.knn*T))
            elif self.knn_type == 'local':
                min_nc = self.get_smallest_cluster_size(array.T, data_type.T)
                knn = max(1, int(self.knn*min_nc))
        else:
            if self.knn_type == 'global':
                knn = max(1, int(self.knn))
                self.perc = self.knn
            elif self.knn_type == 'cluster_size':
                knn = max(1, int(self.knn))
            else:
                raise ValueError("MSinf with knn_type == 'local' needs knn value as percentage (value < 1), not number of neighbors!")

        knn_tilde, k_xz, k_yz, k_z = self._get_nearest_neighbors_zeroinf_onehot(array=array,
                                                                                xyz=xyz,
                                                                                knn=knn,
                                                                                data_type=data_type)
        non_zero = knn_tilde - k_xz - k_yz + k_z

        non_zero_count = np.count_nonzero(non_zero) / len(non_zero)

        val = (special.digamma(knn_tilde) - special.digamma(k_xz) -
               special.digamma(k_yz) +
               special.digamma(k_z))

        val = val[np.isfinite(val)].mean()

        if self.max_with_0 and val < 0.:
            val = 0.

        return val

    # @jit(forceobj=True)
    def _get_nearest_neighbors_MS_one_hot(self, array, xyz,
                                          knn, data_type=None):
        """Returns nearest neighbors according to [1].

        Retrieves the distances eps to the k-th nearest neighbors for every
        sample in joint space XYZ and returns the numbers of nearest neighbors
        within eps in subspaces Z, XZ, YZ. Uses a custom-defined metric for
        discrete variables.

        Parameters
        ----------
        array : array-like
            data array with X, Y, Z in rows and observations in columns

        xyz : array of ints
            XYZ identifier array of shape (dim,).

        knn : int or float
            Number of nearest-neighbors which determines the size of hyper-cubes
            around each (high-dimensional) sample point. If smaller than 1, this
            is computed as a fraction of T, hence knn=knn*T. For knn larger or
            equal to 1, this is the absolute number.

        data_type : array-like
            data array of same shape as array which describes whether variables
            are continuous or discrete: 0s for continuous variables and
            1s for discrete variables

        Returns
        -------
        k_tilde, k_xz, k_yz, k_z : tuple of arrays of shape (T,)
            Nearest neighbors in XYZ, XZ, YZ, and Z subspaces.
        """

        dim, T = array.shape

        array = array.astype(np.float64)
        xyz = xyz.astype(np.int32)

        array = self._transform_mixed_data(array, data_type)

        array = array.T
        data_type = data_type.T

        narray, nxyz, ndata_type, discrete_idx_list = self._transform_to_one_hot_mixed(array,
                                                                                       xyz,
                                                                                       data_type)

        # Subsample indices
        x_indices = np.where(nxyz == 0)[0]
        y_indices = np.where(nxyz == 1)[0]
        z_indices = np.where(nxyz == 2)[0]

        xz_indices = np.concatenate([x_indices, z_indices])
        yz_indices = np.concatenate([y_indices, z_indices])

        # Fit trees
        tree_xyz = spatial.cKDTree(narray)
        neighbors = tree_xyz.query(narray, k=knn+1, p=np.inf, workers=self.workers)

        epsarray = neighbors[0][:, -1].astype(np.float64)

        neighbors_radius_xyz = tree_xyz.query_ball_point(narray, epsarray, p=np.inf,
                                                         workers=self.workers)

        # search again for neighbors in the radius to find all of them;
        # in the discrete case k_tilde can be larger than the given knn
        k_tilde = np.asarray([len(neighbors_radius_xyz[i]) - 1 if len(neighbors_radius_xyz[i]) > 1 else len(neighbors_radius_xyz[i]) for i in range(len(neighbors_radius_xyz))])

        # count neighbors in subspaces
        xz = narray[:, xz_indices]
        tree_xz = spatial.cKDTree(xz)
        k_xz = tree_xz.query_ball_point(xz, r=epsarray, p=np.inf,
                                        workers=self.workers, return_length=True)

        yz = narray[:, yz_indices]
        tree_yz = spatial.cKDTree(yz)
        k_yz = tree_yz.query_ball_point(yz, r=epsarray, p=np.inf,
                                        workers=self.workers, return_length=True)

        if len(z_indices) > 0:
            z = narray[:, z_indices]
            tree_z = spatial.cKDTree(z)
            k_z = tree_z.query_ball_point(z, r=epsarray, p=np.inf,
                                          workers=self.workers, return_length=True)
        else:
            # Number of neighbors is T when z is empty.
            k_z = np.full(T, T, dtype='float')

        k_xz = np.asarray([i - 1 if i > 1 else i for i in k_xz])
        k_yz = np.asarray([i - 1 if i > 1 else i for i in k_yz])
        k_z = np.asarray([i - 1 if i > 1 else i for i in k_z])

        return k_tilde, k_xz, k_yz, k_z

    def get_dependence_measure_MS(self, array, xyz,
                                  data_type=None):
        """Returns CMI estimate as described in Mesner and Shalizi [1].

        Parameters
        ----------
        array : array-like
            data array with X, Y, Z in rows and observations in columns

        xyz : array of ints
            XYZ identifier array of shape (dim,).

        data_type : array-like
            data array of same shape as array which describes whether variables
            are continuous or discrete: 0s for continuous variables and
            1s for discrete variables

        Returns
        -------
        val : float
            Conditional mutual information estimate.
        """
        dim, T = array.shape

        # compute knn
        if self.knn < 1:
            knn = max(1, int(self.knn*T))
        else:
            knn = max(1, self.knn)

        knn_tilde, k_xz, k_yz, k_z = self._get_nearest_neighbors_MS_one_hot(array=array,
                                                                            xyz=xyz,
                                                                            knn=knn,
                                                                            data_type=data_type)

        non_zero = knn_tilde - k_xz - k_yz + k_z

        non_zero_count = np.count_nonzero(non_zero) / len(non_zero)

        val = (special.digamma(knn_tilde) - special.digamma(k_xz) -
               special.digamma(k_yz) +
               special.digamma(k_z))

        val = val[np.isfinite(val)].mean()

        if self.max_with_0 and val < 0.:
            val = 0.

        return val

    def _compute_entropies_for_discrete_entry(self, array,
                                              discrete_values,
                                              discrete_idxs,
                                              continuous_idxs,
                                              total_num_samples,
                                              knn):
        """Computes the continuous and discrete entropy contributions of one
        realization of the discrete dimensions (used by the ZMADG estimator).
        """
        # select data for which the discrete values are the given ones
        current_array = array[np.sum(array[:, discrete_idxs] == discrete_values,
                                     axis=-1) == len(discrete_idxs)]

        # if we do not have samples, we cannot estimate CMI
        if np.size(current_array) == 0:
            return 0., 0.

        T, dim = current_array.shape

        # if we have more samples than knns and samples are not purely
        # discrete, we can compute CMI
        if len(continuous_idxs) > 0 and T > knn:
            val_continuous_entropy = self._compute_continuous_entropy(current_array[:, continuous_idxs], knn)
        else:
            val_continuous_entropy = 0.

        prob = float(T) / total_num_samples

        # multiply by probabilities of occurrence
        val_continuous_entropy *= prob
        # compute entropy for that occurrence
        val_discrete_entropy = prob * np.log(prob)

        return val_continuous_entropy, val_discrete_entropy

    def _compute_continuous_entropy(self, array, knn):
        """Estimates the (differential) entropy of a purely continuous sample
        with a k-nearest-neighbor estimator.
        """
        T, dim = array.shape
        if T == 1:
            return 0.

        if knn < 1:
            knn = int(max(np.rint(knn * T), 1))

        tree = spatial.cKDTree(array)
        epsarray = tree.query(array, k=[knn+1], p=np.inf,
                              workers=self.workers,
                              eps=0.)[0][:, 0].astype(np.float64)
        epsarray = epsarray[epsarray != 0]
        num_non_zero = len(epsarray)
        if num_non_zero == 0:
            cmi_hat = 0.
        else:
            avg_dist = float(array.shape[-1]) / float(num_non_zero) * np.sum(np.log(2 * epsarray))
            cmi_hat = special.digamma(num_non_zero) - special.digamma(knn) + avg_dist

        return cmi_hat

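    # The helper above is (a sketch of) the Kozachenko-Leonenko k-NN entropy
    # estimator under the max-norm,
    #     H_hat(X) = psi(N) - psi(k) + (d / N) * sum_i log(2 * eps_i),
    # where eps_i is the distance of sample i to its k-th neighbor, d the
    # number of dimensions, and N the number of samples with nonzero
    # neighbor distance.
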
    def get_dependence_measure_ZMADG(self, array, xyz,
                                     data_type=None):
        """Returns CMI estimate as described in [2].

        Parameters
        ----------
        array : array-like
            data array with X, Y, Z in rows and observations in columns

        xyz : array of ints
            XYZ identifier array of shape (dim,).

        data_type : array-like
            data array of same shape as array which describes whether variables
            are continuous or discrete: 0s for continuous variables and
            1s for discrete variables

        Returns
        -------
        val : float
            Conditional mutual information estimate.
        """

        dim, T = array.shape

        if self.knn > 1:
            raise ValueError("ZMADG needs knn value as percentage (value < 1), not number of neighbors!")
        else:
            knn = self.knn

        array = array.astype(np.float64)
        xyz = xyz.astype(np.int32)

        array = self._transform_mixed_data(array, data_type)

        array = array.T
        data_type = data_type.T

        discrete_idx_list = np.where(np.any(data_type == 1, axis=0), 1, 0)

        if np.sum(discrete_idx_list) == 0:
            raise ValueError("Variables are continuous, cannot use CMIknnMixed ZMADG!")

        # Subsample indices
        x_indices = np.where(xyz == 0)[0]
        y_indices = np.where(xyz == 1)[0]
        z_indices = np.where(xyz == 2)[0]
        xz_indices = np.concatenate([x_indices, z_indices])
        yz_indices = np.concatenate([y_indices, z_indices])

        discrete_xz_indices = discrete_idx_list[xz_indices]
        discrete_yz_indices = discrete_idx_list[yz_indices]
        discrete_z_indices = discrete_idx_list[z_indices]

        discrete_xyz_idx = np.where(np.asarray(discrete_idx_list) == 1)[0]
        discrete_xz_idx = np.where(np.asarray(discrete_xz_indices) == 1)[0]
        discrete_yz_idx = np.where(np.asarray(discrete_yz_indices) == 1)[0]
        discrete_z_idx = np.where(np.asarray(discrete_z_indices) == 1)[0]

        continuous_xyz_idx = np.where(np.asarray(discrete_idx_list) == 0)[0]
        continuous_xz_idx = np.where(np.asarray(discrete_xz_indices) == 0)[0]
        continuous_yz_idx = np.where(np.asarray(discrete_yz_indices) == 0)[0]
        continuous_z_idx = np.where(np.asarray(discrete_z_indices) == 0)[0]

        # get the number of unique values for each category of the discrete
        # variables; add empty set for the code not to break when accessing [0]
        num_xz_classes = [np.unique(array[:, xz_indices][:, index]) for index in range(len(discrete_xz_indices)) if (discrete_xz_indices[index] == 1)]
        num_yz_classes = [np.unique(array[:, yz_indices][:, index]) for index in range(len(discrete_yz_indices)) if (discrete_yz_indices[index] == 1)]
        num_z_classes = [np.unique(array[:, z_indices][:, index]) for index in range(len(discrete_z_indices)) if (discrete_z_indices[index] == 1)]
        num_xyz_classes = [np.unique(array[:, index]) for index in range(len(discrete_idx_list)) if (discrete_idx_list[index] == 1)]

        xyz_cartesian_product = []
        xz_cartesian_product = []
        yz_cartesian_product = []
        z_cartesian_product = []

        if len(num_xyz_classes) > 1:
            xyz_cartesian_product = cartesian(num_xyz_classes)
        elif len(num_xyz_classes) > 0:
            xyz_cartesian_product = num_xyz_classes[0]

        if len(num_xz_classes) > 1:
            xz_cartesian_product = cartesian(num_xz_classes)
        elif len(num_xz_classes) > 0:
            xz_cartesian_product = num_xz_classes[0]

        if len(num_yz_classes) > 1:
            yz_cartesian_product = cartesian(num_yz_classes)
        elif len(num_yz_classes) > 0:
            yz_cartesian_product = num_yz_classes[0]

        if len(num_z_classes) > 1:
            z_cartesian_product = cartesian(num_z_classes)
        elif len(num_z_classes) > 0:
            z_cartesian_product = num_z_classes[0]

        # start computing entropies

        # compute entropies in the XYZ subspace
        if len(xyz_cartesian_product) > 0:
            xyz_cmi = 0.
            xyz_entropy = 0.

            for i, entry in enumerate(xyz_cartesian_product):
                xyz_cont_entropy, xyz_disc_entropy = self._compute_entropies_for_discrete_entry(array, entry,
                                                                                                discrete_xyz_idx,
                                                                                                continuous_xyz_idx,
                                                                                                T, knn)
                xyz_cmi += xyz_cont_entropy
                xyz_entropy -= xyz_disc_entropy
        else:
            xyz_cmi = self._compute_continuous_entropy(array, knn)
            xyz_entropy = 0.

        h_xyz = xyz_cmi + xyz_entropy

        # compute entropies in the XZ subspace
        if len(xz_cartesian_product) > 0:
            xz_cmi = 0.
            xz_entropy = 0.

            for i, entry in enumerate(xz_cartesian_product):
                xz_cont_entropy, xz_disc_entropy = self._compute_entropies_for_discrete_entry(array[:, xz_indices], entry,
                                                                                              discrete_xz_idx,
                                                                                              continuous_xz_idx,
                                                                                              T, knn)
                xz_cmi += xz_cont_entropy
                xz_entropy -= xz_disc_entropy
        else:
            xz_cmi = self._compute_continuous_entropy(array[:, xz_indices], knn)
            xz_entropy = 0.

        h_xz = xz_cmi + xz_entropy

        # compute entropies in the YZ subspace
        if len(yz_cartesian_product) > 0:
            yz_cmi = 0.
            yz_entropy = 0.

            for i, entry in enumerate(yz_cartesian_product):
                yz_cont_entropy, yz_disc_entropy = self._compute_entropies_for_discrete_entry(array[:, yz_indices], entry,
                                                                                              discrete_yz_idx,
                                                                                              continuous_yz_idx,
                                                                                              T, knn)
                yz_cmi += yz_cont_entropy
                yz_entropy -= yz_disc_entropy
        else:
            yz_cmi = self._compute_continuous_entropy(array[:, yz_indices], knn)
            yz_entropy = 0.

        h_yz = yz_cmi + yz_entropy

        # compute entropies in the Z subspace
        if len(z_cartesian_product) > 0:
            z_cmi = 0.
            z_entropy = 0.

            for i, entry in enumerate(z_cartesian_product):
                z_cont_entropy, z_disc_entropy = self._compute_entropies_for_discrete_entry(array[:, z_indices],
                                                                                            entry,
                                                                                            discrete_z_idx,
                                                                                            continuous_z_idx,
                                                                                            T, knn)
                z_cmi += z_cont_entropy
                z_entropy -= z_disc_entropy
        else:
            z_cmi = self._compute_continuous_entropy(array[:, z_indices], knn)
            z_entropy = 0.

        h_z = z_cmi + z_entropy

        # put it all together for the CMI estimation
        val = h_xz + h_yz - h_xyz - h_z

        if self.max_with_0:
            if val < 0.:
                val = 0.

        entropies = (xz_cmi, yz_cmi, xyz_cmi, z_cmi, xz_entropy, yz_entropy, xyz_entropy, z_entropy)

        return val

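    # The ZMADG estimator above assembles the CMI from the entropy identity
    #     I(X;Y|Z) = H(X,Z) + H(Y,Z) - H(X,Y,Z) - H(Z),
    # where each mixed entropy H is decomposed over the realizations d of the
    # discrete dimensions as
    #     H = sum_d p(d) * H_cont(continuous dims | d) - sum_d p(d) * log p(d),
    # with p(d) estimated by relative frequency (see
    # _compute_entropies_for_discrete_entry above).
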
    def _get_p_value(self, val, array, xyz, T, dim,
                     data_type=None,
                     sig_override=None):
        """
        Returns the p-value from whichever significance function is specified
        for this test. If an override is used, then it will call a different
        function than the one specified by self.significance.

        Parameters
        ----------
        val : float
            Test statistic value.

        array : array-like
            data array with X, Y, Z in rows and observations in columns

        xyz : array of ints
            XYZ identifier array of shape (dim,).

        T : int
            Sample length

        dim : int
            Dimensionality, ie, number of features.

        data_type : array-like
            Binary data array of same shape as array which describes whether
            individual samples in a variable (or all samples) are continuous
            or discrete: 0s for continuous variables and 1s for discrete
            variables.

        sig_override : string
            Must be in 'analytic', 'shuffle_test', 'fixed_thres'

        Returns
        -------
        pval : float or numpy.nan
            P-value.
        """
        # Defaults to the self.significance member value
        use_sig = self.significance
        if sig_override is not None:
            use_sig = sig_override
        # Check if we are using the analytic significance
        if use_sig == 'analytic':
            pval = self.get_analytic_significance(value=val, T=T, dim=dim, xyz=xyz)
        # Check if we are using the shuffle significance
        elif use_sig == 'shuffle_test':
            pval = self.get_shuffle_significance(array=array,
                                                 xyz=xyz,
                                                 value=val,
                                                 data_type=data_type)
        # Check if we are using the fixed_thres significance
        elif use_sig == 'fixed_thres':
            # Determined outside then
            pval = None
        else:
            raise ValueError("%s not known." % self.significance)
        return pval

    def get_dependence_measure(self, array, xyz,
                               data_type=None):
        """Calls the appropriate function to estimate CMI.

        Parameters
        ----------
        array : array-like
            data array with X, Y, Z in rows and observations in columns

        xyz : array of ints
            XYZ identifier array of shape (dim,)

        data_type : array-like
            data array of same shape as array which describes whether variables
            are continuous or discrete: 0s for continuous variables and
            1s for discrete variables

        Returns
        -------
        val : float
            Conditional mutual information estimate.
        """
        # check that data is really mixed
        if data_type is None:
            raise ValueError("Type mask cannot be None for CMIknnMixed!")
        if np.sum(data_type) > data_type.size:
            raise ValueError("Type mask contains other values than 0 and 1!")

        if self.estimator == 'MS':
            return self.get_dependence_measure_MS(array,
                                                  xyz,
                                                  data_type)
        elif self.estimator == 'ZMADG':
            return self.get_dependence_measure_ZMADG(array,
                                                     xyz,
                                                     data_type)
        elif self.estimator == 'MSinf':
            return self.get_dependence_measure_MSinf(array,
                                                     xyz,
                                                     data_type)
        else:
            raise ValueError('No such estimator available!')

    # @jit(forceobj=True)
    def get_restricted_permutation(self, T, shuffle_neighbors, neighbors, order):
        """Returns a permutation of range(T) in which each sample index is
        mapped to one of its shuffle_neighbors nearest neighbors in Z,
        re-using already-drawn indices only when unavoidable.
        """
        restricted_permutation = np.zeros(T, dtype=np.int32)
        used = np.array([], dtype=np.int32)

        for sample_index in order:
            # neighbors_to_use = np.unique(neighbors[sample_index, :])
            neighbors_to_use = neighbors[sample_index]
            m = 0
            use = neighbors_to_use[m]
            # Bound m by the available neighbors, since the duplicate removal
            # in _generate_random_permutation can leave fewer than
            # shuffle_neighbors entries per sample.
            while ((use in used) and (m < min(shuffle_neighbors, len(neighbors_to_use)) - 1)):
                m += 1
                use = neighbors_to_use[m]

            restricted_permutation[sample_index] = use
            used = np.append(used, use)

        return restricted_permutation

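    # Sketch of the restricted-permutation idea (cf. the shuffle test in
    # Runge 2018, reference [4] above): instead of shuffling X freely, each
    # x_i is replaced by some x_j whose z_j lies among the shuffle_neighbors
    # nearest neighbors of z_i, preferring indices not drawn before. The
    # surrogate thus breaks the X-Y link while approximately preserving the
    # dependence of X on Z.
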
    # @jit(forceobj=True)
    def _generate_random_permutation(self, array, neighbors, x_indices, data_type):

        T, dim = array.shape
        # Generate random order in which to go through indices loop in
        # next step
        order = self.random_state.permutation(T).astype(np.int32)

        n = np.empty(neighbors.shape[0], dtype=object)

        for i in range(neighbors.shape[0]):
            v = np.unique(neighbors[i])
            # Shuffle neighbor indices for each sample index
            self.random_state.shuffle(v)
            n[i] = v

        # Select a series of neighbor indices that contains as few as
        # possible duplicates
        restricted_permutation = self.get_restricted_permutation(
            T=T,
            shuffle_neighbors=self.shuffle_neighbors,
            neighbors=n,
            order=order)

        array_shuffled = np.copy(array)
        data_type_shuffled = np.copy(data_type)

        for i in x_indices:
            array_shuffled[:, i] = array[restricted_permutation, i]
            data_type_shuffled[:, i] = data_type[restricted_permutation, i]

        return array_shuffled, data_type_shuffled

    # @jit(forceobj=True)
    def compute_perm_null_dist(self, array, xyz,
                               data_type=None):
        # max_neighbors = max(1, int(max_neighbor_ratio*T))
        array = self._transform_mixed_data(array.T, data_type.T).T

        # compute valid neighbors
        narray, nxyz, ndata_type, discrete_idx_list = self._transform_to_one_hot_mixed(array,
                                                                                       xyz,
                                                                                       data_type,
                                                                                       zero_inf=True)
        x_indices = np.where(nxyz == 0)[0]
        z_indices = np.where(nxyz == 2)[0]

        if self.verbosity > 2:
            print("            nearest-neighbor shuffle significance "
                  "test with n = %d and %d surrogates" % (
                      self.shuffle_neighbors, self.sig_samples))
        # Get nearest neighbors around each sample point in Z
        z_array = np.array(narray[:, z_indices])

        tree_xyz = spatial.cKDTree(z_array)
        neighbors = tree_xyz.query(z_array,
                                   k=self.shuffle_neighbors + 1,
                                   p=np.inf,
                                   workers=self.workers,
                                   distance_upper_bound=9999999,
                                   eps=0.)

        # remove all neighbors with infinite distance -> from another class
        # for those that are discrete
        valid_neighbors = np.ones(neighbors[1].shape)
        # fill valid neighbors with the point itself -> if the distance is
        # infinite, the neighbor will be the point itself
        valid_neighbors = np.multiply(valid_neighbors, np.expand_dims(np.arange(valid_neighbors.shape[0]), axis=-1))

        valid_neighbors[neighbors[0] != np.inf] = neighbors[1][neighbors[0] != np.inf]

        null_dist = np.zeros(self.sig_samples)

        for sam in range(self.sig_samples):
            # permute the un-encoded array using the valid neighbors list
            array_shuffled, data_type_shuffled = self._generate_random_permutation(array,
                                                                                   valid_neighbors,
                                                                                   x_indices=np.where(xyz == 0)[0],
                                                                                   data_type=data_type)

            # use array instead of narray to avoid double encoding
            null_dist[sam] = self.get_dependence_measure(array_shuffled.T,
                                                         xyz,
                                                         data_type=data_type_shuffled.T)
        return null_dist

    # @jit(forceobj=True)
    def get_shuffle_significance(self, array, xyz, value,
                                 return_null_dist=False,
                                 data_type=None):
        """Returns p-value for nearest-neighbor shuffle significance test.

        For non-empty Z, overwrites get_shuffle_significance from the parent
        class which is a block shuffle test, which does not preserve
        dependencies of X and Y with Z. Here the parameter shuffle_neighbors is
        used to permute only those values :math:`x_i` and :math:`x_j` for which
        :math:`z_j` is among the nearest neighbors of :math:`z_i`. If Z is
        empty, the block-shuffle test is used.

        Parameters
        ----------
        array : array-like
            data array with X, Y, Z in rows and observations in columns

        xyz : array of ints
            XYZ identifier array of shape (dim,).

        value : number
            Value of test statistic for unshuffled estimate.

        data_type : array-like
            data array of same shape as array which describes whether variables
            are continuous or discrete: 0s for continuous variables and
            1s for discrete variables

        Returns
        -------
        pval : float
            p-value
        """

        dim, T = array.shape
        array = array.T
        data_type = data_type.T

        z_indices = np.where(xyz == 2)[0]

        if len(z_indices) > 0 and self.shuffle_neighbors < T:
            null_dist = self.compute_perm_null_dist(array, xyz, data_type)
        else:
            null_dist = \
                self._get_shuffle_dist(array.T, xyz,
                                       sig_samples=self.sig_samples,
                                       sig_blocklength=self.sig_blocklength,
                                       data_type=data_type.T,
                                       verbosity=self.verbosity)

        pval = float(np.sum(null_dist >= value) + 1) / (self.sig_samples + 1)

        if return_null_dist:
            # Sort
            null_dist.sort()
            return pval, null_dist
        return pval

    def _get_shuffle_dist(self, array, xyz,
                          sig_samples, sig_blocklength=None,
                          data_type=None,
                          verbosity=0):
        """Returns shuffle distribution of test statistic.

        The rows in array corresponding to the X-variable are shuffled using
        a block-shuffle approach.

        Parameters
        ----------
        array : array-like
            data array with X, Y, Z in rows and observations in columns

        xyz : array of ints
            XYZ identifier array of shape (dim,).

        sig_samples : int, optional (default: 100)
            Number of samples for shuffle significance test.

        sig_blocklength : int, optional (default: None)
            Block length for block-shuffle significance test. If None, the
            block length is determined from the decay of the autocovariance as
            explained in [1]_.

        data_type : array-like
            data array of same shape as array which describes whether variables
            are continuous or discrete: 0s for continuous variables and
            1s for discrete variables

        verbosity : int, optional (default: 0)
            Level of verbosity.

        Returns
        -------
        null_dist : array of shape (sig_samples,)
            Contains the sorted test statistic values estimated from the
            shuffled arrays.
        """
        dim, T = array.shape

        x_indices = np.where(xyz == 0)[0]
        dim_x = len(x_indices)

        if sig_blocklength is None:
            sig_blocklength = self._get_block_length(array, xyz,
                                                     mode='significance')

        n_blks = int(math.floor(float(T)/sig_blocklength))
        # print 'n_blks ', n_blks
        if verbosity > 2:
            print("            Significance test with block-length = %d "
                  "..." % (sig_blocklength))

        array_shuffled = np.copy(array)
        data_type_shuffled = np.copy(data_type)
        block_starts = np.arange(0, T - sig_blocklength + 1, sig_blocklength)

        # Dividing the array up into n_blks of length sig_blocklength may
        # leave a tail. This tail is later randomly inserted
        tail = array[x_indices, n_blks*sig_blocklength:]
        tail_type = data_type_shuffled[x_indices, n_blks*sig_blocklength:]

        null_dist = np.zeros(sig_samples)
        for sam in range(sig_samples):

            blk_starts = self.random_state.permutation(block_starts)[:n_blks]

            x_shuffled = np.zeros((dim_x, n_blks*sig_blocklength),
                                  dtype=array.dtype)
            type_x_shuffled = np.zeros((dim_x, n_blks*sig_blocklength),
                                       dtype=array.dtype)

            for i, index in enumerate(x_indices):
                for blk in range(sig_blocklength):
                    x_shuffled[i, blk::sig_blocklength] = \
                        array[index, blk_starts + blk]

                    type_x_shuffled[i, blk::sig_blocklength] = \
                        data_type[index, blk_starts + blk]

            # Insert tail randomly somewhere
            if tail.shape[1] > 0:
                insert_tail_at = self.random_state.choice(block_starts)
                x_shuffled = np.insert(x_shuffled, insert_tail_at,
                                       tail.T, axis=1)
                type_x_shuffled = np.insert(type_x_shuffled, insert_tail_at,
                                            tail_type.T, axis=1)

            for i, index in enumerate(x_indices):
                array_shuffled[index] = x_shuffled[i]
                data_type_shuffled[index] = type_x_shuffled[i]

            null_dist[sam] = self.get_dependence_measure(array=array_shuffled,
                                                         xyz=xyz,
                                                         data_type=data_type_shuffled)

        return null_dist


if __name__ == '__main__':

    import tigramite
    from tigramite.data_processing import DataFrame
    import tigramite.data_processing as pp
    import numpy as np

    from tigramite.independence_tests.cmiknn import CMIknn

    random_state = np.random.default_rng(seed=None)
    cmi = CMIknnMixed(seed=None)

    T = 500
    dimz = 1

    # Continuous data
    z = random_state.standard_normal((T, dimz))
    x = random_state.standard_normal(T).reshape(T, 1)
    y = (5.*z[:, 0] + 0.*x[:, 0] + random_state.standard_normal(T)).reshape(T, 1)

    print(cmi.get_dependence_measure_raw(x=x, y=y, z=z,
                                         x_type=np.zeros(x.shape),
                                         y_type=np.zeros(y.shape),
                                         z_type=np.zeros(z.shape)))

    print(cmi.run_test_raw(x=x, y=y, z=z,
                           x_type=np.zeros(x.shape),
                           y_type=np.zeros(y.shape),
                           z_type=np.zeros(z.shape)))
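
    # A mixed-type sketch (assumed usage; values illustrative): discrete
    # samples are marked with 1s in the type masks, here a binary x
    # alongside the continuous y and z from above.
    x_disc = random_state.binomial(1, 0.5, size=(T, 1)).astype('float64')
    print(cmi.run_test_raw(x=x_disc, y=y, z=z,
                           x_type=np.ones(x_disc.shape),
                           y_type=np.zeros(y.shape),
                           z_type=np.zeros(z.shape)))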
|