brain-pasta 0.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- brain_pasta-0.0.0/PKG-INFO +11 -0
- brain_pasta-0.0.0/README.md +0 -0
- brain_pasta-0.0.0/brain_pasta.egg-info/PKG-INFO +11 -0
- brain_pasta-0.0.0/brain_pasta.egg-info/SOURCES.txt +9 -0
- brain_pasta-0.0.0/brain_pasta.egg-info/dependency_links.txt +1 -0
- brain_pasta-0.0.0/brain_pasta.egg-info/requires.txt +5 -0
- brain_pasta-0.0.0/brain_pasta.egg-info/top_level.txt +1 -0
- brain_pasta-0.0.0/pasta/__init__.py +1 -0
- brain_pasta-0.0.0/pasta/pasta.py +683 -0
- brain_pasta-0.0.0/setup.cfg +4 -0
- brain_pasta-0.0.0/setup.py +14 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: brain-pasta
|
|
3
|
+
Version: 0.0.0
|
|
4
|
+
Requires-Python: >=3.9
|
|
5
|
+
Requires-Dist: numpy<2.0,>=1.26
|
|
6
|
+
Requires-Dist: scipy>=1.13
|
|
7
|
+
Requires-Dist: scikit-learn>=1.6
|
|
8
|
+
Requires-Dist: scikit-learn-extra>=0.3
|
|
9
|
+
Requires-Dist: setuptools
|
|
10
|
+
Dynamic: requires-dist
|
|
11
|
+
Dynamic: requires-python
|
|
File without changes
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: brain-pasta
|
|
3
|
+
Version: 0.0.0
|
|
4
|
+
Requires-Python: >=3.9
|
|
5
|
+
Requires-Dist: numpy<2.0,>=1.26
|
|
6
|
+
Requires-Dist: scipy>=1.13
|
|
7
|
+
Requires-Dist: scikit-learn>=1.6
|
|
8
|
+
Requires-Dist: scikit-learn-extra>=0.3
|
|
9
|
+
Requires-Dist: setuptools
|
|
10
|
+
Dynamic: requires-dist
|
|
11
|
+
Dynamic: requires-python
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pasta
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .pasta import *
|
|
@@ -0,0 +1,683 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from scipy.spatial.distance import pdist, squareform
|
|
3
|
+
import scipy.stats as stats
|
|
4
|
+
from scipy.optimize import curve_fit
|
|
5
|
+
from sklearn.cluster import KMeans
|
|
6
|
+
from sklearn_extra.cluster import KMedoids
|
|
7
|
+
import copy
|
|
8
|
+
import warnings
|
|
9
|
+
from scipy.stats import pearsonr
|
|
10
|
+
from scipy.stats import t as t_dist
|
|
11
|
+
import setuptools
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def estimate_variogram(D, data, M:int, qd:float):
    '''
    Estimate the empirical variogram from the distance matrix between
    vertices and the data value at each vertex.

    The variogram is evaluated at M linearly spaced lag distances ranging
    from min_distance to qd * max_distance, where min_distance is the
    smallest strictly positive entry of D and max_distance is the largest
    entry of D.

    Parameters
    ----------
    D : ndarray (N, N)
        Distance matrix between all vertices.
    data : ndarray (N,)
        Data value at each vertex.
    M : int
        Number of lag distances at which the variogram is estimated. The
        kernel width is derived from the lag spacing, which divides by
        (M - 1), so M should be at least 2.
    qd : float
        Fraction of the maximum distance used as the largest lag.

    Returns
    -------
    v : ndarray (M,)
        Estimated variogram values, i.e., semivariance.
    h : ndarray (M,)
        Lag distances.

    Notes
    -----
    This is similar to variogram estimation in BrainSMASH but determining
    the max distance evaluated in a different way.
    '''
    D = np.asarray(D)
    data = np.asarray(data)

    Dmax = qd * np.max(D)
    Dmin = np.min(D[D > 0])

    # Upper triangle without diagonal: every unordered pair exactly once
    row, col = np.triu_indices_from(D, k=1)
    dval = D[row, col]

    # keep only the pairs falling within the distance range of analysis
    mask = dval <= Dmax
    dval = dval[mask]

    # The squared differences do not depend on the lag, so they are computed
    # once here instead of once per lag inside the loop.
    diff_sq = (data[row[mask]] - data[col[mask]]) ** 2

    h = np.linspace(Dmin, Dmax, M)  # linearly spaced lag distances
    delta = (Dmax - Dmin) / (M - 1) * 0.5  # half of the lag spacing
    sigma = 6 * delta
    v = np.zeros(M)
    # variogram estimation using gaussian smoothing kernel, same as BrainSMASH
    for i in range(M):
        w = np.exp(-((2.68 * np.abs(h[i] - dval)) ** 2) / (2 * sigma ** 2))
        v[i] = 0.5 * np.sum(w * diff_sq) / np.sum(w)

    return v, h
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def fit_variogram(h,v,D,PrecomputedVariance=None, nugget:bool=True):
    '''
    Fit a stable variogram model to an empirical variogram and evaluate the
    implied covariance on a distance matrix.

    Parameters
    ----------
    h : (M,) ndarray
        Empirical lag distances.
    v : (M,) ndarray
        Empirical semivariance evaluated at lag distances ``h``.
    D : (N, N) ndarray
        Pairwise distance matrix between spatial locations.
    PrecomputedVariance : float or None, optional
        Initial guess for the sill. It also sets the upper bounds of the
        sill (2x) and of the nugget (0.5x). If ``None``, the maximum value
        of ``v`` is used.
    nugget : bool, default=True
        Whether to include a nugget term in the fitted model.

    Returns
    -------
    c_para : (N, N) ndarray
        Covariance matrix derived from the fitted model, with the diagonal
        set to sill + nugget.
    b : (4,) ndarray
        Estimated stable model parameters in the order
        ``(sill, range, exponent, nugget)``; the nugget is 0 when
        ``nugget`` is False.
    f : callable
        Variogram function. ``f(h)`` returns the semivariance at lag
        distance ``h``.
    fcov : callable
        Covariance function. ``fcov(h)`` returns the covariance at lag
        distance ``h``.
    '''
    sill_guess = np.max(v) if PrecomputedVariance is None else PrecomputedVariance

    # start values and box constraints for (sill, range, exponent)
    start = np.asarray([sill_guess, np.min(h), 1.])
    lower = np.asarray([0., 0., 0.])
    # sill capped at twice the initial guess for stable inference
    upper = np.asarray([2*sill_guess, np.inf, 2.])

    if nugget:
        # extend all three vectors by the nugget; its upper bound keeps the
        # fit stable at extreme short-range autocorrelation
        start = np.append(start, 0.)
        lower = np.append(lower, 0.)
        upper = np.append(upper, 0.5 * sill_guess)
        b, _ = curve_fit(stable_variogram, h, v, p0=start, bounds=(lower, upper))
    else:
        fitted, _ = curve_fit(stable_variogram_no_nugget, h, v, p0=start, bounds=(lower, upper))
        b = np.append(fitted, 0.)

    def f(h):
        return stable_variogram(h, *b)

    def fcov(h):
        return stable_covariance_func(h, b)

    c_para = fcov(D)  # off-diagonal components of covariance matrix
    np.fill_diagonal(c_para, b[0] + b[3])  # diagonal set to sill + nugget

    return c_para, b, f, fcov
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def stable_variogram_no_nugget(h, b1, b2, b3):
    '''
    Stable variogram model without nugget, defined as:
    semivariance = sill * (1 - exp(-(h / range)**shape))

    Parameters
    ----------
    h : float or ndarray
        Lag distance to be evaluated.
    b1 : float
        Sill.
    b2 : float
        Range parameter.
    b3 : float
        Shape.

    Returns
    -------
    float or ndarray
        Semivariance at lag distance ``h``.
    '''
    scaled_lag = (h / b2) ** b3
    decay = np.exp(-scaled_lag)
    return b1 * (1 - decay)
|
|
153
|
+
|
|
154
|
+
def stable_variogram(h, b1, b2, b3, b4):
    '''
    Stable variogram model with nugget, defined as:
    semivariance = sill * (1 - exp(-(h / range)**shape)) + nugget

    Parameters
    ----------
    h : float or ndarray
        Lag distance to be evaluated.
    b1 : float
        Sill.
    b2 : float
        Range parameter.
    b3 : float
        Shape.
    b4 : float
        Nugget, added as a constant offset.

    Returns
    -------
    float or ndarray
        Semivariance at lag distance ``h``.
    '''
    scaled_lag = (h / b2) ** b3
    structured = b1 * (1 - np.exp(-scaled_lag))
    return structured + b4
|
|
176
|
+
|
|
177
|
+
def stable_covariance_func(h, b):
    '''
    Covariance implied by the stable variogram model for observations
    separated by distance h.

    For h > 0 this is
        (sill + nugget) - (sill * (1 - exp(-(h / range)**shape)) + nugget)
        = sill * exp(-(h / range)**shape).

    Entries with h <= 0 are multiplied by zero here; the value at h == 0
    (sill + nugget) is expected to be filled in by the caller.

    Parameters
    ----------
    h : float or ndarray
        Lag distance at which to compute covariance.
    b : ndarray
        Parameters of the stable model; only the first three entries
        (sill, range, shape) are used.

    Returns
    -------
    float or ndarray
        Covariance at distance h.
    '''
    sill, range_par, shape = b[:3]
    positive_lag = h > 0
    decayed = sill * np.exp(-(h / range_par) ** shape)
    return positive_lag * decayed
|
|
202
|
+
|
|
203
|
+
def parc_data(parc, c_para, b, D, coord, max_clusters, min_clusters, min_cluster_size, map_idx):
    '''
    Parcellate data depending on setting to account for nonstationarity.

    3 scenarios:
    1. parc is None: do not parcellate and return the covariance matrix
       c_para as is (returned as fc_para).
    2. parc is the string 'auto': determine the number of parcels from the
       estimated range and shape parameters of the stable variogram model
       (i.e., b[1] and b[2]), and parcellate data using spatial clustering.
    3. parc is a user-specified int array of shape (N,), each int indicating
       a parcel: return parc as is (returned as parc_out), and raise a
       warning if there is a risk of over-parcellation (more parcels than
       'auto' would use).

    Parameters
    ----------
    parc : None, 'auto', or ndarray (N,)
        Parcellation setting.
    c_para : ndarray (N, N)
        Covariance matrix estimated from PaSTA, i.e., without parcels.
    b : ndarray
        Stable variogram model parameters estimated from PaSTA.
    D : ndarray (N, N)
        Distance matrix of data.
    coord : ndarray (N, 3) or None
        Spatial coordinates of data. If None, spatial clustering is run on
        the distance matrix D with KMedoids; otherwise KMeans is run on
        coord.
    max_clusters : int
        Maximum number of parcels, set to avoid over-parcellation at weak
        autocorrelation, e.g., spatial independence.
    min_clusters : int
        Minimum number of parcels. Setting to 1 allows PaSTA-NS to collapse
        to PaSTA.
    min_cluster_size : int
        Lower bound on the average number of points per parcel, set to
        avoid too small parcels.
    map_idx : int
        Identifies the map in the over-parcellation warning message.

    Returns
    -------
    parc_out : None or ndarray (N,)
        None when parc is None, otherwise int labels, one per observation.
    n_parc : int
        Number of parcels.
    unique_parcs : None or ndarray
        Unique parcel labels.
    fc_para : ndarray (N, N)
        c_para itself when there is a single parcel, otherwise a matrix of
        zeros to be filled in by later steps (PaSTA-NS).
    '''
    if parc is None:  # no parcellation requested: covariance returned as is
        return None, 1, None, c_para

    # Data-driven number of parcels, based on the strength of
    # autocorrelation (i.e., the effective range of the variogram).
    n_obs = D.shape[0]
    range_len = b[1] * 2.996 ** (1/b[2])  # effective range
    # average number of points per parcel when parcel radius ~ effective range
    nPoints = max(np.sum(D < range_len) / n_obs - 1, min_cluster_size)
    n_clusters = int(max(min(np.floor(n_obs / nPoints), max_clusters), min_clusters))

    # Check for a string before comparing: `ndarray == 'auto'` is evaluated
    # elementwise, so using it directly in `if` fails for label arrays.
    if isinstance(parc, str) and parc == 'auto':
        if coord is not None:  # spatial clustering via KMeans on coordinates
            parc_out = KMeans(n_clusters).fit(coord).labels_
        else:  # spatial clustering via KMedoids on the distance matrix
            # NOTE(review): D is passed as a feature matrix here, not as a
            # precomputed metric; confirm this is intended.
            parc_out = KMedoids(n_clusters).fit(D).labels_
        unique_parcs = np.unique(parc_out)
        n_parc = len(unique_parcs)
    else:
        # user-specified parcellation is returned as is; warn when it has
        # more parcels than the data-derived estimate
        parc_out = parc
        unique_parcs = np.unique(parc_out)
        n_parc = len(unique_parcs)
        if n_parc > n_clusters:
            warnings.warn(f'data No.{map_idx}: specified number of parcs {n_parc} is larger than data-derived max number of parcs {n_clusters}, carefully trade off the ability for detecting nonstationarity and the parcel coverage for robust estimation.')

    if n_parc == 1:  # no subdivision: covariance returned as is
        fc_para = c_para
    else:  # subdivided: zeros, to be filled in by later steps (PaSTA-NS)
        fc_para = np.zeros_like(c_para)
    return parc_out, n_parc, unique_parcs, fc_para
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def effective_sample_size_estimation(x, y, coord=None, D=None, dim=None, M=None, qd=0.7, xparc=None, yparc=None, max_clusters=10, min_cluster_size=500, min_clusters=1, M_cluster=None, nugget=True):
    '''
    Main function that runs PaSTA and PaSTA-NS to compute effective sample size
    and autocorrelation-corrected p-values.

    Leaving xparc=None and yparc=None runs PaSTA, while setting them to
    'auto' or to a user-specified int array (N,) runs PaSTA-NS.

    Parameters
    ----------
    x, y : ndarray (N,)
        Spatial map data to evaluate association. Can contain missing values
        such as NaN and Inf; only observations finite in both maps are used.
    coord : ndarray (N, 3) or None
        Spatial coordinates for observations. When left as None, the
        function requires D to run PaSTA, and D and dim to run PaSTA-NS.
    D : ndarray (N, N) or None
        Distance matrix. When left as None, computed from coord.
    dim : int or None
        Spatial dimension of data. Taken from coord.shape[1] when D is
        computed from coord, or when dim is None and coord is given.
    M : int or None
        Number of lag distances to evaluate when estimating variogram —
        important hyperparameter that determines the quality of variogram
        estimation, large values preferred. When set to None, use
        3*sqrt(N) as default.
    qd : float (0, 1]
        Determine the coverage of lag distances evaluated in variogram,
        with maximum distance evaluated being qd*np.max(D). Default 0.7.
    xparc : None, 'auto', or ndarray (N,)
        Parcellation setting for map x. If ndarray, index should begin from
        0, i.e., 0 to Np - 1 if Np parcels specified.
    yparc : None, 'auto', or ndarray (N,)
        Parcellation setting for map y. If ndarray, index should begin from
        0, i.e., 0 to Np - 1 if Np parcels specified.
    max_clusters : int
        Maximum number of parcellations allowed in PaSTA-NS.
    min_clusters : int
        Minimum number of parcellations allowed in PaSTA-NS.
    min_cluster_size : int
        Minimum size of parcellations (# observations per parcel).
    M_cluster : int or None
        Number of lag distances to evaluate in PaSTA-NS parcels when estimating
        their variograms. When set to None, default to 3*sqrt(Np), where Np
        is the number of observations in each parcel.
    nugget : bool
        Whether to use a nugget in variogram models. Default True.

    Returns
    -------
    pef : float
        Significance p-value based on PaSTA/PaSTA-NS; NaN when run_status
        is false.
    rX : float
        Pearson correlation coefficient between x and y.
    nef : float
        Effective sample size estimated.
    run_status : bool
        True when nef > 2, false otherwise (e.g., data too smooth to infer
        significance).
    n_parc : ndarray (2,)
        [xn_parc, yn_parc], the number of parcels for each map.
    p_naive : float
        Significance with independence assumption and without controlling
        for autocorrelation.
    fc_para1 : ndarray (Nv, Nv)
        Covariance matrix for map x, with Nv the number of observations
        that are finite in both maps.
    fc_para2 : ndarray (Nv, Nv)
        Covariance matrix for map y.
    '''
    assert (coord is not None or D is not None), 'at least one of coord and D is required'
    assert ((coord is not None or dim is not None) or xparc is None and yparc is None), 'dim is required for PaSTA-NS when coord is not provided'

    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    valid = np.logical_and(np.isfinite(x), np.isfinite(y))
    x = stats.zscore(x[valid])
    y = stats.zscore(y[valid])

    def _subset_parc(parc):
        # Restrict user-supplied per-observation labels to the valid
        # observations; None and 'auto' carry no labels.
        if parc is None or isinstance(parc, str):
            return parc
        labels = np.asarray(parc)
        if labels.shape[0] == valid.shape[0]:
            return labels[valid]
        return labels  # length differs from N: left untouched

    xparc = _subset_parc(xparc)
    yparc = _subset_parc(yparc)

    # coord is subset whenever it is given (not only when D must be built
    # from it), because it is also used for spatial clustering.
    if coord is not None:
        coord = np.asarray(coord)[valid, :]
    if D is not None:
        D = np.asarray(D)[np.ix_(valid, valid)]
        if dim is None and coord is not None:
            dim = coord.shape[1]
    else:
        D = squareform(pdist(coord))
        dim = coord.shape[1]

    if M is None:
        M = 3 * np.ceil(np.sqrt(x.shape[0])).astype('int')

    PrecomputedVariance = None
    v1, h1 = estimate_variogram(D, x, M, qd)
    v2, h2 = estimate_variogram(D, y, M, qd)
    c_para1, b1, _, _ = fit_variogram(h1, v1, D, PrecomputedVariance, nugget)
    c_para2, b2, _, _ = fit_variogram(h2, v2, D, PrecomputedVariance, nugget)

    # map_idx (last argument) labels the map in over-parcellation warnings
    xparc, xn_parc, _, fc_para1 = parc_data(xparc, c_para1, b1, D, coord, max_clusters, min_clusters, min_cluster_size, 1)
    yparc, yn_parc, _, fc_para2 = parc_data(yparc, c_para2, b2, D, coord, max_clusters, min_clusters, min_cluster_size, 2)
    if xn_parc > 1:
        exponent1 = b1[2]
        fc_para1, pb1 = fit_covariance_blocks(x, D, xn_parc, xparc, M_cluster, qd, nugget, exponent1)
        fc_para1 = process_convolution_crossblocks(fc_para1, pb1, x, D, xn_parc, xparc, dim, exponent1)
    if yn_parc > 1:
        exponent2 = b2[2]
        fc_para2, pb2 = fit_covariance_blocks(y, D, yn_parc, yparc, M_cluster, qd, nugget, exponent2)
        fc_para2 = process_convolution_crossblocks(fc_para2, pb2, y, D, yn_parc, yparc, dim, exponent2)

    nef = cov2nef(fc_para1, fc_para2)
    run_status = nef > 2

    rX, p_naive = pearsonr(x, y)
    if run_status:
        pef = nef2p(rX, nef)
    else:
        pef = np.nan
    n_parc = np.asarray([xn_parc, yn_parc])
    return pef, rX, nef, run_status, n_parc, p_naive, fc_para1, fc_para2
|
|
384
|
+
|
|
385
|
+
def covariance_estimation(x, coord=None, D=None, dim=None, M=None, qd=0.7, xparc=None, max_clusters=10, min_cluster_size=500, min_clusters=1, M_cluster=None, nugget=True):
    '''
    Compute the covariance matrix for a single map x using PaSTA or PaSTA-NS.

    This can be particularly useful when pairwise association between a
    large number of maps needs to be evaluated. Computing the covariance
    matrix for each map separately and saving it for later use avoids
    repetitive covariance estimation in the
    effective_sample_size_estimation function.

    Statistical significance between two maps can be inferred by loading
    saved covariance matrices (cov1 and cov2) of two maps (x and y), and
    computing effective sample size and p-values following the steps below:

    get submatrix of covariance matrices for points that are valid in both
    maps - valid = np.isfinite(x) & np.isfinite(y),
    cov1 = cov1[np.ix_(valid, valid)],
    cov2 = cov2[np.ix_(valid, valid)],
    x = x[valid], y = y[valid]

    compute nef - cov2nef(cov1, cov2)

    compute test statistics such as Pearson correlation coefficient —
    rX, p_naive = pearsonr(x, y)

    compute significance p-value from test statistics and effective sample
    size: nef2p(rX, nef)

    Inputs are same as in effective_sample_size_estimation but with y and
    yparc removed.

    Returns
    -------
    covmat : ndarray (N, N)
        Covariance matrix of map x in shape (N, N), where rows and columns
        corresponding to invalid observations in x (e.g., NaN, Inf) are set
        to np.nan and need to be removed before computing nef.
    '''
    assert (coord is not None or D is not None), 'at least one of coord and D is required'
    assert ((coord is not None or dim is not None) or xparc is None), 'dim is required for PaSTA-NS when coord is not provided'

    x = np.asarray(x, dtype=float)
    nx = len(x)
    valid = np.isfinite(x)
    x = stats.zscore(x[valid])

    # User-supplied per-observation labels are restricted to the valid
    # observations so they stay aligned with x and D.
    if xparc is not None and not isinstance(xparc, str):
        xparc = np.asarray(xparc)
        if xparc.shape[0] == valid.shape[0]:
            xparc = xparc[valid]

    # coord is subset whenever it is given (not only when D must be built
    # from it), because it is also used for spatial clustering.
    if coord is not None:
        coord = np.asarray(coord)[valid, :]
    if D is not None:
        D = np.asarray(D)[np.ix_(valid, valid)]
        if dim is None and coord is not None:
            dim = coord.shape[1]
    else:
        D = squareform(pdist(coord))
        dim = coord.shape[1]

    if M is None:
        M = 3 * np.ceil(np.sqrt(x.shape[0])).astype('int')

    PrecomputedVariance = None
    v1, h1 = estimate_variogram(D, x, M, qd)
    c_para1, b1, _, _ = fit_variogram(h1, v1, D, PrecomputedVariance, nugget)

    xparc, xn_parc, _, fc_para1 = parc_data(xparc, c_para1, b1, D, coord, max_clusters, min_clusters, min_cluster_size, 1)
    if xn_parc > 1:
        exponent1 = b1[2]
        fc_para1, pb1 = fit_covariance_blocks(x, D, xn_parc, xparc, M_cluster, qd, nugget, exponent1)
        fc_para1 = process_convolution_crossblocks(fc_para1, pb1, x, D, xn_parc, xparc, dim, exponent1)

    # embed the valid-only covariance into an (N, N) matrix padded with NaN
    covmat = np.full((nx, nx), np.nan)
    covmat[np.ix_(valid, valid)] = fc_para1
    return covmat
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
def cov2nef(c_para1, c_para2):
    '''
    Compute effective sample size from covariance matrices c_para1 and
    c_para2.

    Is a computationally efficient implementation equivalent to::

        nef=real(1/(trace(B*fc_para1*B*fc_para2)/(trace(B*fc_para1)*trace(B*fc_para2)))+1);

    where B is the centering matrix. Both matrices are double-centered
    (row means and column means removed, grand mean added back), which
    equals B*C*B.

    Parameters
    ----------
    c_para1 : ndarray (N, N)
        Covariance matrix.
    c_para2 : ndarray (N, N)
        Covariance matrix.

    Returns
    -------
    nef : float
        Effective sample size.
    '''
    c1 = c_para1 - np.mean(c_para1, axis=0, keepdims=True) - np.mean(c_para1, axis=1, keepdims=True) + np.mean(c_para1)
    c2 = c_para2 - np.mean(c_para2, axis=0, keepdims=True) - np.mean(c_para2, axis=1, keepdims=True) + np.mean(c_para2)
    # trace(c1 @ c2) == sum_ij c1[i, j] * c2[j, i]; contracting directly
    # avoids forming the full N x N matrix product just to read its diagonal.
    num = np.einsum('ij,ji->', c1, c2)
    den = np.trace(c1) * np.trace(c2)
    nef = np.real(1 / (num / den) + 1)
    return nef
|
|
480
|
+
|
|
481
|
+
def nef2p(rX, nef):
    '''
    Infer statistical significance p-value from test statistics rX and
    effective sample size nef.

    The correlation is converted to a t statistic with nef - 2 degrees of
    freedom and a two-sided p-value is taken from the t distribution.

    Parameters
    ----------
    rX : float
        Test statistic (correlation coefficient).
    nef : float
        Effective sample size.

    Returns
    -------
    p : float
        Two-sided p-value. NaN when nef - 2 is not positive. A perfect
        correlation (|rX| == 1) yields an infinite t statistic and p = 0.
    '''
    df = max(0, nef - 2)
    if df == 0:
        return np.nan
    # Work on a NumPy value so that |rX| == 1 produces an infinite t
    # statistic instead of a ZeroDivisionError for plain Python floats.
    r = np.asarray(rX, dtype=float)
    with np.errstate(divide='ignore', invalid='ignore'):
        t = r * np.sqrt(df / (1 - r ** 2))
    p = 2 * t_dist.sf(np.abs(t), df)
    return p
|
|
504
|
+
|
|
505
|
+
def fit_covariance_blocks(x, D, n_clusters, point_cluster_idx, M_cluster, qd, nugget, exponent):
    '''
    Fit a variogram model for each parcel and compute the diagonal blocks
    of the nonstationary covariance matrix.

    Parameters
    ----------
    x : ndarray (N,)
        Spatial map data to evaluate association. All values are valid.
    D : ndarray (N, N)
        Distance matrix.
    n_clusters : int
        Number of parcels.
    point_cluster_idx : ndarray (N,)
        Int array specifying parcellation settings for map x, ranging from
        0 to NP-1 if NP parcels.
    M_cluster : int or None
        Number of lag distances to evaluate in each parcel when estimating
        its variogram. When set to None, default to 3*sqrt(Np), where Np is
        the number of observations in the parcel.
    qd : float (0, 1]
        Coverage of lag distances, passed on to the variogram estimation.
    nugget : bool
        Whether to use a nugget in variogram models.
    exponent : float
        Shape parameter estimated using the global stationary variogram
        model. Kept the same across parcels to obtain a valid
        nonstationary covariance expression (i.e., PSD matrix).

    Returns
    -------
    c_para : ndarray (N, N)
        Covariance matrix for map x, where within-parcel covariances are
        estimated but cross-parcel elements are set to 0.
    b : ndarray (n_clusters, 4)
        Stable variogram model parameters, one row per parcel.
    '''
    lags_from_size = M_cluster is None
    b = np.zeros(shape=(n_clusters,4))
    c_para = np.zeros(D.shape)  # cross-parcel entries stay zero
    for parcel in np.arange(n_clusters):
        members = point_cluster_idx == parcel
        block = np.ix_(members, members)
        raw = x[members]
        parcel_var = raw.var()
        standardized = stats.zscore(raw)
        D_block = D[block]
        if lags_from_size:
            M_cluster = 3 * np.ceil(np.sqrt(standardized.shape[0])).astype(int)
        v, h = estimate_variogram(D_block, standardized, M_cluster, qd)
        # the model is fitted on standardized data (variance guess of 1) ...
        pc_para, pb, _, _ = fit_variogram_fixed_exponent(h, v, D_block, exponent, 1, nugget)
        # ... and covariance, sill and nugget are rescaled by the parcel variance
        c_para[block] = pc_para * parcel_var
        pb[0] = pb[0] * parcel_var
        pb[-1] = pb[-1] * parcel_var
        b[parcel,:] = pb
    return c_para, b
|
|
559
|
+
|
|
560
|
+
def fit_variogram_fixed_exponent(h, v, D, exponent, PrecomputedVariance=None, nugget: bool = True):
    '''
    Same as fit_variogram, but for a stable model whose shape (exponent)
    parameter is predetermined; only sill, range and (optionally) nugget
    are fitted.

    Parameters
    ----------
    h : ndarray
        Empirical lag distances.
    v : ndarray
        Empirical semivariance evaluated at ``h``.
    D : ndarray
        Pairwise distance matrix on which the covariance is evaluated.
    exponent : float
        Fixed shape parameter of the stable model.
    PrecomputedVariance : float or None
        Initial guess for the sill; also sets the upper bounds of the sill
        (2x) and nugget (0.5x). If None, the maximum of ``v`` is used.
    nugget : bool
        Whether to include a nugget term in the fitted model.

    Returns
    -------
    c_para : ndarray
        Covariance matrix from the fitted model, with the diagonal set to
        sill + nugget.
    b : ndarray
        Model parameters in the order (sill, range, exponent, nugget).
    f : callable
        Variogram function of lag distance.
    fcov : callable
        Covariance function of lag distance.
    '''
    sill_guess = np.max(v) if PrecomputedVariance is None else PrecomputedVariance

    # start values and box constraints for (sill, range)
    start = np.asarray([sill_guess, np.min(h)])
    lower = np.asarray([0., 0.])
    upper = np.asarray([2*sill_guess, np.inf])

    if nugget:
        def model(h, b1, b2, b3):
            return stable_variogram_fixed_exp(h, b1, b2, b3, exponent)

        start = np.append(start, 0.)
        lower = np.append(lower, 0.)
        # nugget upper bound avoids inaccurate fitting when there is
        # no/very-short-range autocorrelation
        upper = np.append(upper, 0.5*sill_guess)
        fitted, _ = curve_fit(model, h, v, p0=start, bounds=(lower, upper))
        b = np.asarray([fitted[0], fitted[1], exponent, fitted[-1]])
    else:
        def model(h, b1, b2):
            return stable_variogram_fixed_exp_no_nugget(h, b1, b2, exponent)

        fitted, _ = curve_fit(model, h, v, p0=start, bounds=(lower, upper))
        b = np.array([fitted[0], fitted[1], exponent, 0.0])

    def f(h):
        return stable_variogram(h, *b)

    def fcov(h):
        return stable_covariance_func(h, b)

    c_para = fcov(D)
    np.fill_diagonal(c_para, b[0] + b[3])

    return c_para, b, f, fcov
|
|
602
|
+
|
|
603
|
+
def stable_variogram_fixed_exp_no_nugget(h, b1, b2, fixed_exp):
    '''
    Stable variogram with prespecified shape parameter, without nugget
    (i.e., nugget set to 0):
    semivariance = b1 * (1 - exp(-(h / b2)**fixed_exp))

    Parameters
    ----------
    h : float or ndarray
        Lag distance to be evaluated.
    b1 : float
        Sill.
    b2 : float
        Range parameter.
    fixed_exp : float
        Prespecified shape parameter.

    Returns
    -------
    float or ndarray
        Semivariance at lag distance ``h``.
    '''
    scaled_lag = (h / b2) ** fixed_exp
    decay = np.exp(-scaled_lag)
    return b1 * (1 - decay)
|
|
620
|
+
|
|
621
|
+
def stable_variogram_fixed_exp(h, b1, b2, b3, fixed_exp):
    '''
    Stable variogram with prespecified shape parameter, with nugget:
    semivariance = b1 * (1 - exp(-(h / b2)**fixed_exp)) + b3

    Parameters
    ----------
    h : float or ndarray
        Lag distance to be evaluated.
    b1 : float
        Sill.
    b2 : float
        Range parameter.
    b3 : float
        Nugget, added as a constant offset.
    fixed_exp : float
        Prespecified shape parameter.

    Returns
    -------
    float or ndarray
        Semivariance at lag distance ``h``.
    '''
    scaled_lag = (h / b2) ** fixed_exp
    structured = b1 * (1 - np.exp(-scaled_lag))
    return structured + b3
|
|
638
|
+
|
|
639
|
+
def process_convolution_crossblocks(c_para, b, x, D, n_clusters, point_cluster_idx, dim, exponent):
    '''
    Process convolution to infer the cross-parcel covariance of the
    nonstationary covariance matrix.

    For every pair of distinct parcels (i, j) the cross block is filled
    with a covariance built from the two parcels' range and sill
    parameters; the transposed block is written to the mirrored position.
    c_para is modified in place and also returned.

    Parameters
    ----------
    c_para : ndarray (N, N)
        Covariance matrix output from fit_covariance_blocks, where
        covariances are estimated for within-parcel pairs but not
        cross-parcel.
    b : ndarray (n_clusters, 4)
        Fitted stable variogram model parameters, one row per parcel
        (column 0 is the sill, column 1 the range).
    x : ndarray (N,)
        Map data (not used in the computation).
    D : ndarray (N, N)
        Distance matrix.
    n_clusters : int
        Number of parcels.
    point_cluster_idx : ndarray (N,)
        Int array starting from 0 indicating the membership of each
        point to parcels.
    dim : int
        Spatial dimension of the data.
    exponent : float
        Shape parameter fitted using the global stationary variogram,
        kept the same for a valid PSD covariance matrix.

    Returns
    -------
    c_para : ndarray (N, N)
        Nonstationary covariance matrix by process convolution.
    '''
    # membership mask of every parcel, computed once
    members = [point_cluster_idx == k for k in range(n_clusters)]
    for i in range(n_clusters - 1):
        phi_i = b[i,1]
        for j in range(i + 1, n_clusters):
            phi_j = b[j,1]
            upper_block = np.ix_(members[i], members[j])
            lower_block = np.ix_(members[j], members[i])
            sig = (phi_i ** 2 + phi_j ** 2) / 2  # mean of the squared ranges
            Qij = D[upper_block] ** 2 / sig
            amplitude = (phi_i * phi_j / sig) ** (dim/2) * np.sqrt(b[i,0] * b[j,0])
            cross = amplitude * np.exp(- np.sqrt(Qij) ** exponent)
            c_para[upper_block] = cross
            c_para[lower_block] = cross.T  # keep the matrix symmetric
    return c_para
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Packaging script for the brain-pasta distribution, built with setuptools.
from setuptools import setup, find_packages
setup(
    name='brain-pasta',
    version='0.0.0',
    python_requires=">=3.9",
    # find_packages() collects the package directories automatically
    packages=find_packages(),
    # runtime dependencies; numpy is pinned below 2.0
    install_requires=[
        'numpy>=1.26,<2.0',
        'scipy>=1.13',
        'scikit-learn>=1.6',
        'scikit-learn-extra>=0.3',
        # pasta/pasta.py imports setuptools at module import time
        'setuptools'
    ],
)
|