pcntoolkit 0.32.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pcntoolkit/__init__.py +4 -0
- pcntoolkit/configs.py +9 -0
- pcntoolkit/dataio/__init__.py +1 -0
- pcntoolkit/dataio/fileio.py +608 -0
- pcntoolkit/model/KnuOp.py +48 -0
- pcntoolkit/model/NP.py +88 -0
- pcntoolkit/model/NPR.py +86 -0
- pcntoolkit/model/SHASH.py +509 -0
- pcntoolkit/model/__init__.py +6 -0
- pcntoolkit/model/architecture.py +219 -0
- pcntoolkit/model/bayesreg.py +585 -0
- pcntoolkit/model/core.21290 +0 -0
- pcntoolkit/model/gp.py +489 -0
- pcntoolkit/model/hbr.py +1584 -0
- pcntoolkit/model/rfa.py +245 -0
- pcntoolkit/normative.py +1647 -0
- pcntoolkit/normative_NP.py +336 -0
- pcntoolkit/normative_model/__init__.py +6 -0
- pcntoolkit/normative_model/norm_base.py +62 -0
- pcntoolkit/normative_model/norm_blr.py +303 -0
- pcntoolkit/normative_model/norm_gpr.py +112 -0
- pcntoolkit/normative_model/norm_hbr.py +752 -0
- pcntoolkit/normative_model/norm_np.py +333 -0
- pcntoolkit/normative_model/norm_rfa.py +109 -0
- pcntoolkit/normative_model/norm_utils.py +29 -0
- pcntoolkit/normative_parallel.py +1420 -0
- pcntoolkit/regression_model/blr/warp.py +1 -0
- pcntoolkit/trendsurf.py +315 -0
- pcntoolkit/util/__init__.py +1 -0
- pcntoolkit/util/bspline.py +149 -0
- pcntoolkit/util/hbr_utils.py +242 -0
- pcntoolkit/util/utils.py +1698 -0
- pcntoolkit-0.32.0.dist-info/LICENSE +674 -0
- pcntoolkit-0.32.0.dist-info/METADATA +134 -0
- pcntoolkit-0.32.0.dist-info/RECORD +37 -0
- pcntoolkit-0.32.0.dist-info/WHEEL +4 -0
- pcntoolkit-0.32.0.dist-info/entry_points.txt +5 -0
pcntoolkit/util/utils.py
ADDED
|
@@ -0,0 +1,1698 @@
|
|
|
1
|
+
from __future__ import print_function
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import pickle
|
|
5
|
+
import re
|
|
6
|
+
import subprocess
|
|
7
|
+
import sys
|
|
8
|
+
from abc import ABCMeta, abstractmethod
|
|
9
|
+
from io import StringIO
|
|
10
|
+
from subprocess import call
|
|
11
|
+
|
|
12
|
+
import bspline
|
|
13
|
+
import matplotlib.pyplot as plt
|
|
14
|
+
import numpy as np
|
|
15
|
+
import pandas as pd
|
|
16
|
+
import pymc as pm
|
|
17
|
+
import scipy.special as spp
|
|
18
|
+
from bspline import splinelab
|
|
19
|
+
from scipy import stats
|
|
20
|
+
from scipy.stats import genextreme, norm, skewnorm
|
|
21
|
+
from six import with_metaclass
|
|
22
|
+
from sklearn.datasets import make_regression
|
|
23
|
+
from sklearn.metrics import roc_auc_score
|
|
24
|
+
|
|
25
|
+
try: # run as a package if installed
|
|
26
|
+
from pcntoolkit import configs
|
|
27
|
+
except ImportError:
|
|
28
|
+
pass
|
|
29
|
+
|
|
30
|
+
path = os.path.abspath(os.path.dirname(__file__))
|
|
31
|
+
rootpath = os.path.dirname(path) # parent directory
|
|
32
|
+
if rootpath not in sys.path:
|
|
33
|
+
sys.path.append(rootpath)
|
|
34
|
+
del path, rootpath
|
|
35
|
+
import configs
|
|
36
|
+
|
|
37
|
+
PICKLE_PROTOCOL = configs.PICKLE_PROTOCOL
|
|
38
|
+
|
|
39
|
+
# -----------------
|
|
40
|
+
# Utility functions
|
|
41
|
+
# -----------------
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def create_poly_basis(X, dimpoly):
    """
    Build a polynomial basis expansion of a covariate matrix.

    The output stacks, side by side, the element-wise powers ``X ** 1``,
    ``X ** 2``, ..., ``X ** dimpoly``. A 1D input is treated as a single
    column.

    Parameters
    ----------
    X : numpy.ndarray
        Covariates, one row per sample and one column per feature
        (or a 1D array holding a single feature).
    dimpoly : int
        Highest polynomial degree to include.

    Returns
    -------
    Phi : numpy.ndarray
        Floating point array with ``X.shape[0]`` rows and
        ``X.shape[1] * dimpoly`` columns. Columns are grouped by degree:
        first all features to the power 1, then to the power 2, and so on.

    Examples
    --------
    >>> X = np.array([[1, 2], [3, 4], [5, 6]])
    >>> create_poly_basis(X, 2)
    array([[ 1.,  2.,  1.,  4.],
           [ 3.,  4.,  9., 16.],
           [ 5.,  6., 25., 36.]])
    """
    if len(X.shape) == 1:
        X = X[:, np.newaxis]

    n_rows, n_cols = X.shape[0], X.shape[1]
    Phi = np.zeros((n_rows, n_cols * dimpoly))

    # each degree fills its own contiguous group of n_cols columns
    for power in range(1, dimpoly + 1):
        first = (power - 1) * n_cols
        Phi[:, first:first + n_cols] = X ** power

    return Phi
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def create_bspline_basis(xmin, xmax, p=3, nknots=5):
    """
    Construct a B-spline basis object over the interval [xmin, xmax].

    :param xmin: lower end of the knot range
    :param xmax: upper end of the knot range
    :param p: order of spline (3 = cubic)
    :param nknots: number of equally spaced knots (endpoints only
                   counted once)

    :returns: a ``bspline.Bspline`` object built from the padded knot vector
    """
    # evenly spaced knots, then padded by splinelab.augknt for order p
    interior_knots = np.linspace(xmin, xmax, nknots)
    padded_knots = splinelab.augknt(interior_knots, p)
    return bspline.Bspline(padded_knots, p)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def create_design_matrix(X, intercept=True, basis='bspline',
                         basis_column=0, site_ids=None, all_sites=None,
                         **kwargs):
    """
    Prepare a design matrix from a set of covariates suitable for
    running Bayesian linear regression. The design matrix consists of
    the user defined covariates, an optional intercept column, optional
    dummy coded site columns (fixed effects) and, optionally, a nonlinear
    basis expansion over one of the columns.

    :param X: matrix of covariates (numpy array or pandas DataFrame)
    :param intercept: if True, prepend a column of ones
    :param basis: type of basis expansion to use ('bspline' or 'poly');
                  any other value skips the expansion
    :param basis_column: which column of X to perform the expansion over
    :param site_ids: list of site ids (one per data point)
    :param all_sites: list of unique site ids
    :param xmin: lower knot limit for the B-spline basis (default 0)
    :param xmax: upper knot limit for the B-spline basis (default 100)
    :param kwargs: remaining keyword arguments are forwarded to the basis
                   constructor (e.g. ``p`` and ``nknots`` for 'bspline',
                   ``dimpoly`` for 'poly')

    :returns: the design matrix, with columns ordered as
              [intercept, X, site columns, basis expansion]
    :raises ValueError: if site_ids does not have one entry per row of X

    If site_ids is specified, it must have the same number of entries as
    there are rows in X. If all_sites is specified, it is used to create
    the site columns instead of the unique values of site_ids. This
    accommodates the scenario where not all the sites used to create the
    model are present in the test set (i.e. there will be some empty site
    columns). If only all_sites is given, all site columns are zero.
    """

    xmin = kwargs.pop('xmin', 0)
    xmax = kwargs.pop('xmax', 100)

    N = X.shape[0]

    if isinstance(X, pd.DataFrame):
        X = X.to_numpy()

    # add intercept column
    if intercept:
        Phi = np.concatenate((np.ones((N, 1)), X), axis=1)
    else:
        Phi = X

    # add dummy coded site columns
    site_cols = None
    if site_ids is not None:
        # make sure the data are in pandas format
        if not isinstance(site_ids, pd.Series):
            site_ids = pd.Series(data=site_ids)

        # validate before any array assignment, so that a mismatch is
        # reported with a meaningful message instead of a broadcast error
        if len(site_ids) != N:
            raise ValueError(
                'site cols must have the same number of rows as X')

        if all_sites is None:
            all_sites = sorted(site_ids.unique())

        # dummy code the sites
        site_cols = np.zeros((N, len(all_sites)))
        for i, s in enumerate(all_sites):
            site_cols[:, i] = (site_ids == s).to_numpy()
    elif all_sites is not None:
        # site ids are not specified, but all_sites are
        site_cols = np.zeros((N, len(all_sites)))

    if site_cols is not None:
        Phi = np.concatenate((Phi, site_cols), axis=1)

    # append the nonlinear basis expansion
    if basis == 'bspline':
        B = create_bspline_basis(xmin, xmax, **kwargs)
        Phi = np.concatenate(
            (Phi, np.array([B(i) for i in X[:, basis_column]])), axis=1)
    elif basis == 'poly':
        Phi = np.concatenate((Phi, create_poly_basis(
            X[:, basis_column], **kwargs)), axis=1)

    return Phi
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def squared_dist(x, z=None):
    """
    Pairwise squared Euclidean distances between the rows of two arrays.

    :param x: array with one vector per row (1D input is treated as a
              single column)
    :param z: second array of row vectors; defaults to x

    :returns: array whose entry (i, j) is sum((x[i] - z[j]) ** 2)
    :raises ValueError: if the row vectors of x and z differ in length
    """

    if z is None:
        z = x
    if len(x.shape) == 1:
        x = x[:, np.newaxis]
    if len(z.shape) == 1:
        z = z[:, np.newaxis]

    _, dim_x = x.shape
    _, dim_z = z.shape
    if dim_x != dim_z:
        raise ValueError("""
            Cannot compute distance: vectors have different length""")

    # shift both sets by the midpoint of their means for numerical stability
    centre = np.mean(np.vstack((np.mean(x, axis=0), np.mean(z, axis=0))),
                     axis=0)
    x = x - centre
    z = z - centre

    # ||x||^2 as a column and ||z||^2 as a row broadcast to the full grid
    x_sq = np.sum(x * x, axis=1)[:, np.newaxis]
    z_sq = np.sum(z * z, axis=1)[np.newaxis, :]

    return x_sq - 2 * x.dot(z.T) + z_sq
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def compute_pearsonr(A, B):
    """
    Manually computes the column-wise Pearson correlation between two
    matrices.

    Basic usage::

        compute_pearsonr(A, B)

    :param A: an N * M data array
    :param B: an N * M data array

    :returns Rho: M dimensional vector of correlation coefficients
                  (one per column)
    :returns pRho: M dimensional vector of two-sided p-values, obtained
                   from the Fisher r-to-z transform

    Notes::

        This function is useful when M is large and only the diagonal entries
        of the resulting correlation matrix are of interest. This function
        does not compute the full correlation matrix as an intermediate step

    """

    n_samples = A.shape[0]

    # mean centre each column
    A_centred = A - np.mean(A, axis=0)
    B_centred = B - np.mean(B, axis=0)

    # scale each column to unit length
    A_unit = A_centred / np.sqrt(np.sum(A_centred**2, axis=0))
    B_unit = B_centred / np.sqrt(np.sum(B_centred**2, axis=0))
    del (A_centred, B_centred)

    Rho = np.sum(A_unit * B_unit, axis=0)
    del (A_unit, B_unit)

    # Fisher r-to-z against a null correlation of zero
    z_scores = (np.arctanh(Rho) - np.arctanh(0)) * np.sqrt(n_samples - 3)
    standard_normal = stats.norm()
    pRho = 2*standard_normal.cdf(-np.abs(z_scores))

    return Rho, pRho
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def explained_var(ytrue, ypred):
    """
    Computes the explained variance of predicted values.

    Basic usage::

        exp_var = explained_var(ytrue, ypred)

    where

    :ytrue: n*p matrix of true values where n is the number of samples
            and p is the number of features.
    :ypred: n*p matrix of predicted values where n is the number of samples
            and p is the number of features.

    :returns exp_var: p dimensional vector of explained variances for each
                      feature, i.e. 1 - var(residuals) / var(ytrue).

    """

    residual_variance = (ytrue - ypred).var(axis=0)
    total_variance = ytrue.var(axis=0)

    return 1 - residual_variance / total_variance
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def compute_MSLL(ytrue, ypred, ypred_var, train_mean=None, train_var=None):
    """
    Computes the MSLL or MLL (not standardized) if 'train_mean' and
    'train_var' are None.

    Basic usage::

        MSLL = compute_MSLL(ytrue, ypred, ypred_var, train_mean, train_var)

    where

    :param ytrue     : n*p matrix of true values where n is the number of
                       samples and p is the number of features.
    :param ypred     : n*p matrix of predicted values where n is the number
                       of samples and p is the number of features.
    :param ypred_var : n*p matrix of summed noise variances and prediction
                       variances where n is the number of samples and p is
                       the number of features.

    :param train_mean: mean of the training data for each feature, either a
                       p dimensional vector or a 1*p row.

    :param train_var : variance of the training data for each feature,
                       either a p dimensional vector or a 1*p row.

    :returns loss    : p dimensional vector of MSLL or MLL for each feature.

    The standardized loss is only computed when both train_mean and
    train_var are given; otherwise the plain MLL is returned.
    """

    if train_mean is not None and train_var is not None:

        # Rely on broadcasting rather than np.repeat: repeating a plain
        # p-vector along axis 0 produces an n*p vector that cannot be
        # combined with the n*p matrices, whereas broadcasting handles both
        # the p-vector and the 1*p row layout.
        Y_train_mean = np.asarray(train_mean)
        Y_train_sig = np.asarray(train_var)

        # compute MSLL: model log loss minus the log loss of the trivial
        # model that predicts the training mean and variance
        loss = np.mean(0.5 * np.log(2 * np.pi * ypred_var) + (ytrue - ypred)**2 / (2 * ypred_var) -
                       0.5 * np.log(2 * np.pi * Y_train_sig) - (ytrue - Y_train_mean)**2 / (2 * Y_train_sig), axis=0)

    else:
        # compute MLL:
        loss = np.mean(0.5 * np.log(2 * np.pi * ypred_var) +
                       (ytrue - ypred)**2 / (2 * ypred_var), axis=0)

    return loss
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def calibration_descriptives(x):
    """
    Compute statistics useful to assess the calibration of normative models,
    including skew and kurtosis of the distribution, plus their standard
    deviation and standard errors (separately for each column in x)

    Basic usage::
        stats = calibration_descriptives(Z)

    where

    :param x : n*p matrix of statistics you wish to assess
    :returns stats :[skew, sdskew, kurtosis, sdkurtosis, semean, sesd]

    skew, kurtosis and sesd are computed per column. sdskew and sdkurtosis
    depend only on the number of rows. semean is derived from the variance
    of all entries of x pooled together.
    """

    n = np.shape(x)[0]
    m1 = np.mean(x, axis=0)

    # column-wise third and fourth central sums (np.sum instead of the
    # builtin sum, which iterates over the rows in Python)
    centred = x - m1
    m3 = np.sum(centred**3, axis=0)
    m4 = np.sum(centred**4, axis=0)

    s1 = np.std(x, axis=0)
    skew = n*m3/(n-1)/(n-2)/s1**3
    sdskew = np.sqrt(6*n*(n-1) / ((n-2)*(n+1)*(n+3)))
    kurtosis = (n * (n+1) * m4) / ((n-1) * (n-2) * (n-3) * s1**4) - (3 * (n-1)**2) / ((n-2) * (n-3))
    sdkurtosis = np.sqrt(4*(n**2-1) * sdskew**2 / ((n-3)*(n+5)))
    # NOTE(review): np.var(x) pools all columns, so semean is a single value
    # even for multi-column input -- confirm whether per-column
    # (axis=0) was intended before changing the return shape.
    semean = np.sqrt(np.var(x)/n)
    sesd = s1/np.sqrt(2*(n-1))
    cd = [skew, sdskew, kurtosis, sdkurtosis, semean, sesd]

    return cd
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
class WarpBase(with_metaclass(ABCMeta)):
    """
    Base class for likelihood warping following:
    Rios and Torab (2019) Compositionally-warped Gaussian processes
    https://www.sciencedirect.com/science/article/pii/S0893608019301856

    All Warps must define the following methods::

        Warp.get_n_params() - return number of parameters
        Warp.f() - warping function (Non-Gaussian field -> Gaussian)
        Warp.invf() - inverse warp
        Warp.df() - derivatives
        Warp.warp_predictions() - compute predictive distribution

    Subclasses set ``self.n_params`` in their constructor; the base class
    initialises it to NaN to mark the warp as not yet initialised.
    """

    def __init__(self):
        self.n_params = np.nan

    def get_n_params(self):
        """ Report the number of parameters required

        :raises AssertionError: if n_params has not been set by a subclass
        """

        # explicit raise instead of an assert statement so that the check is
        # not stripped when Python runs with -O (exception type unchanged)
        if np.isnan(self.n_params):
            raise AssertionError("Warp function not initialised")

        return self.n_params

    def warp_predictions(self, mu, s2, param, percentiles=(0.025, 0.975)):
        """
        Compute the warped predictions from a gaussian predictive
        distribution, specified by a mean (mu) and variance (s2)

        :param mu: Gaussian predictive mean
        :param s2: Predictive variance
        :param param: warping parameters
        :param percentiles: Desired percentiles of the warped likelihood

        :returns: * median - median of the predictive distribution
                  * pred_interval - predictive interval(s), one column per
                    requested percentile

        """

        # Compute percentiles of a standard Gaussian
        Z = norm.ppf(percentiles)

        # find the median (using mu = median)
        median = self.invf(mu, param)

        # compute the predictive intervals (non-stationary)
        pred_interval = np.zeros((len(mu), len(Z)))
        for i, z in enumerate(Z):
            pred_interval[:, i] = self.invf(mu + np.sqrt(s2)*z, param)

        return median, pred_interval

    @abstractmethod
    def f(self, x, param):
        """ Evaluate the warping function (mapping non-Gaussian response
            variables to Gaussian variables)
        """

    @abstractmethod
    def invf(self, y, param):
        """ Evaluate the inverse warping function (mapping Gaussian latent
            variables to non-Gaussian response variables)
        """

    @abstractmethod
    def df(self, x, param):
        """ Return the derivative of the warp, dw(x)/dx """
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
class WarpLog(WarpBase):
    """ Logarithmic warp

        y = log(x)

    This warp has no free parameters. The ``params`` arguments are accepted
    only so that the methods share the signature of the other warps, and
    are ignored.
    """

    def __init__(self):
        self.n_params = 0

    def f(self, x, params=None):
        """ Forward warp: natural logarithm of x """

        y = np.log(x)

        return y

    def invf(self, y, params=None):
        """ Inverse warp: exponential of y """

        x = np.exp(y)

        return x

    def df(self, x, params=None):
        """ Derivative of the forward warp, d log(x) / dx = 1 / x """

        df = 1/x

        return df
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
class WarpAffine(WarpBase):
    """ Affine warp

        y = a + b*x

    The raw parameter vector is (a, log(b)): the slope is stored on a log
    scale and exponentiated before use, which keeps it positive.
    """

    def __init__(self):
        self.n_params = 2

    def _get_params(self, param):
        """ Unpack the raw parameters into (offset, positive slope) """
        if len(param) == self.n_params:
            return param[0], np.exp(param[1])
        raise ValueError(
            'number of parameters must be ' + str(self.n_params))

    def f(self, x, params):
        """ Forward warp a + b*x """
        offset, slope = self._get_params(params)
        return offset + slope*x

    def invf(self, y, params):
        """ Inverse warp (y - a) / b """
        offset, slope = self._get_params(params)
        return (y - offset) / slope

    def df(self, x, params):
        """ Derivative of the forward warp: constant b, shaped like x """
        _, slope = self._get_params(params)
        return np.ones(x.shape)*slope
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
class WarpBoxCox(WarpBase):
    """ Box cox transform having a single parameter (lambda), i.e.

        y = (sign(x) * abs(x) ** lambda - 1) / lambda

    This follows the generalization in Bickel and Doksum (1981) JASA 76
    and allows x to assume negative values. The raw parameter is log(lambda);
    it is exponentiated before use.
    """

    def __init__(self):
        self.n_params = 1

    def _get_params(self, param):
        """ Map the raw parameter to lambda = exp(param) """
        return np.exp(param)

    def f(self, x, params):
        """ Forward warp; reduces to log(x) when lambda is zero """
        lam = self._get_params(params)
        if lam == 0:
            return np.log(x)
        return (np.sign(x) * np.abs(x) ** lam - 1) / lam

    def invf(self, y, params):
        """ Inverse warp; reduces to exp(y) when lambda is zero """
        lam = self._get_params(params)
        if lam == 0:
            return np.exp(y)
        shifted = lam * y + 1
        return np.sign(shifted) * np.abs(shifted) ** (1 / lam)

    def df(self, x, params):
        """ Derivative of the forward warp, abs(x) ** (lambda - 1) """
        lam = self._get_params(params)
        return np.abs(x) ** (lam - 1)
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
class WarpSinArcsinh(WarpBase):
    """ Sinh-arcsinh warp having two parameters (a, b) and defined by

        y = sinh(b * arcsinh(x) - a)

    Using the parametrisation of Rios et al, Neural Networks 118 (2017)
    where a controls skew and b controls kurtosis, such that:

    * a = 0 : symmetric
    * a > 0 : positive skew
    * a < 0 : negative skew
    * b = 1 : mesokurtic
    * b > 1 : leptokurtic
    * b < 1 : platykurtic

    where b > 0. However, it is more convenient to use an alternative
    parameterisation, given in Jones and Pewsey 2019 JRSS Significance 16
    https://doi.org/10.1111/j.1740-9713.2019.01245.x

    where:

        y = sinh(b * arcsinh(x) + epsilon * b)

    and a = -epsilon*b

    The raw parameter vector is (epsilon, log(b)).

    see also Jones and Pewsey 2009 Biometrika, 96 (4) for more details
    about the SHASH distribution
    https://www.jstor.org/stable/27798865
    """

    def __init__(self):
        self.n_params = 2

    def _get_params(self, param):
        """ Convert the raw (epsilon, log(b)) parameters into (a, b) """
        if len(param) != self.n_params:
            raise ValueError(
                'number of parameters must be ' + str(self.n_params))

        b = np.exp(param[1])
        a = -param[0]*b

        return a, b

    def f(self, x, params):
        """ Forward warp sinh(b * arcsinh(x) - a) """
        a, b = self._get_params(params)
        return np.sinh(b * np.arcsinh(x) - a)

    def invf(self, y, params):
        """ Inverse warp sinh((arcsinh(y) + a) / b) """
        a, b = self._get_params(params)
        return np.sinh((np.arcsinh(y)+a)/b)

    def df(self, x, params):
        """ Derivative of the forward warp with respect to x """
        a, b = self._get_params(params)
        numerator = b * np.cosh(b * np.arcsinh(x) - a)
        return (numerator)/np.sqrt(1 + x ** 2)
|
|
602
|
+
|
|
603
|
+
|
|
604
|
+
class WarpCompose(WarpBase):
    """ Composition of warps. These are passed in as an array of class
    names and initialised automatically. For example::

        W = WarpCompose(('WarpBoxCox', 'WarpAffine'))

    The warps are applied in the order given, i.e. the example computes
    WarpAffine.f(WarpBoxCox.f(x)). The parameter vector passed to f, invf
    and df is the concatenation of the parameters of the individual warps,
    in the same order.
    """

    def __init__(self, warpnames=None, debugwarp=False):
        """
        :param warpnames: sequence of warp class names defined in this
                          module (a single name may be given as a string)
        :param debugwarp: if True, print each step of the composition

        :raises ValueError: if warpnames is None
        :raises NameError: if a name does not refer to a warp class
        """

        if warpnames is None:
            raise ValueError("A list of warp functions is required")
        if isinstance(warpnames, str):
            # a bare string would otherwise be iterated character by character
            warpnames = [warpnames]
        self.debugwarp = debugwarp
        self.warps = []
        self.n_params = 0
        for wname in warpnames:
            warp = self._make_warp(wname)
            self.n_params += warp.get_n_params()
            self.warps.append(warp)

    @staticmethod
    def _make_warp(wname):
        """ Instantiate the warp class called wname.

        Looks the name up among this module's globals and requires it to be
        a WarpBase subclass, instead of evaluating the name as code.
        """
        warp_cls = globals().get(wname)
        if not (isinstance(warp_cls, type) and issubclass(warp_cls, WarpBase)):
            raise NameError("name '" + str(wname) + "' is not a warp class")
        return warp_cls()

    def _split_params(self, theta):
        """ Pair each warp with its slice of theta, in application order """
        pairs = []
        theta_offset = 0
        for warp in self.warps:
            n_params_c = warp.get_n_params()
            theta_c = [theta[c] for c in
                       range(theta_offset, theta_offset + n_params_c)]
            theta_offset += n_params_c
            pairs.append((warp, theta_c))
        return pairs

    def f(self, x, theta):
        """ Apply all warps in order to x """
        if self.debugwarp:
            print('begin composition')

        fw = x
        for ci, (warp, theta_c) in enumerate(self._split_params(theta)):
            if self.debugwarp:
                print('f:', ci, theta_c, warp)
            fw = warp.f(fw, theta_c)
        return fw

    def invf(self, x, theta):
        """ Invert the composition by applying the inverse warps in
        reverse order """
        if self.debugwarp:
            print('begin composition')

        finvw = x
        for warp, theta_c in reversed(self._split_params(theta)):
            if self.debugwarp:
                print('invf:', theta_c, warp)
            finvw = warp.invf(finvw, theta_c)

        return finvw

    def df(self, x, theta):
        """ Derivative of the composed warp with respect to x.

        Uses the chain rule: each warp's derivative is evaluated at the
        output of the warps before it, and the factors are multiplied.
        (Feeding one warp's derivative into the next warp's df, as was done
        previously, is only correct for a single warp.)
        """
        if self.debugwarp:
            print('begin composition')

        pairs = self._split_params(theta)
        last = len(pairs) - 1

        fw = x                       # input of the current warp
        dfw = np.ones(np.shape(x))   # derivative of the empty composition
        for ci, (warp, theta_c) in enumerate(pairs):
            if self.debugwarp:
                print('df:', ci, theta_c, warp)

            local_df = warp.df(fw, theta_c)
            dfw = local_df if ci == 0 else dfw * local_df

            # the warped value is only needed as input for the next warp
            if ci < last:
                fw = warp.f(fw, theta_c)

        return dfw
|
|
691
|
+
|
|
692
|
+
# -----------------------
|
|
693
|
+
# Functions for inference
|
|
694
|
+
# -----------------------
|
|
695
|
+
|
|
696
|
+
|
|
697
|
+
class CustomCV:
    """ Custom cross-validation approach. This class does not do much, it
        merely wraps user supplied train/test index lists in an object
        designed to be compatible with scikit-learn
        (e.g. sklearn.model_selection...)

        :param train: a list of indices of training splits (each itself a list)
        :param test: a list of indices of test splits (each itself a list)
        :param X: optional data array, only used to record the sample count
        :param y: unused

        ``split`` yields, for each fold:

        :returns tr: Indices for training set
        :returns te: Indices for test set

    """

    def __init__(self, train, test, X=None, y=None):
        self.train = train
        self.test = test
        # the number of folds is fixed by the training index list
        self.n_splits = len(train)
        self.N = None if X is None else X.shape[0]

    def split(self, X, y=None):
        """ Yield the stored (train, test) index pairs, one per fold """
        if self.N is None:
            self.N = X.shape[0]

        for fold in range(self.n_splits):
            yield self.train[fold], self.test[fold]
|
|
727
|
+
|
|
728
|
+
|
|
729
|
+
def bashwrap(processing_dir, python_path, script_command, job_name,
             bash_environment=None):
    """ This function wraps normative modelling into a bash script to run it
    on a torque cluster system.

    :param processing_dir: Full path to the processing dir
    :param python_path: Full path to the python distribution
    :param script_command: python command to execute
    :param job_name: Name for the bash script output by this function
    :param bash_environment: Text written verbatim at the top of the
                             script in place of the default header
                             ('#!/bin/bash' plus 'export OMP_NUM_THREADS=1')

    :returns: Full path of the .sh file containing the commands for
              normative modelling

    Side effects: changes the current working directory to processing_dir
    and makes the written script readable, writable and executable by its
    owner only.

    written by Thomas Wolfers
    """
    # local import so this function does not depend on the module header
    import shlex

    # change to processing dir
    os.chdir(processing_dir)
    # quote the path so that directories containing spaces or other shell
    # metacharacters still work inside the generated script
    output_changedir = ['cd ' + shlex.quote(processing_dir) + '\n']

    # sets bash environment if necessary
    if bash_environment is not None:
        bash_environment = [bash_environment]
        print("""Your own environment requires in any case:
              #!/bin/bash\n export and optionally OMP_NUM_THREADS=1\n""")
    else:
        bash_lines = '#!/bin/bash\n\n'
        bash_cores = 'export OMP_NUM_THREADS=1\n'
        bash_environment = [bash_lines + bash_cores]

    command = [python_path + ' ' + script_command + '\n']

    # writes bash file into processing dir
    bash_file_name = os.path.join(processing_dir, job_name + '.sh')
    with open(bash_file_name, 'w') as bash_file:
        bash_file.writelines(bash_environment + output_changedir + command)

    # changes permissions for bash.sh file
    os.chmod(bash_file_name, 0o700)

    return bash_file_name
|
|
775
|
+
|
|
776
|
+
|
|
777
|
+
def qsub(job_path, memory, duration, logdir=None):
    """This function submits a job.sh script to the torque cluster using the
    qsub command.

    Basic usage::

        qsub(job_path, memory, duration)

    :param job_path: Full path to the job.sh file.
    :param memory: Memory requirements written as string for example 4gb or 500mb.
    :param duration: The approximate duration of the job, a string with HH:MM:SS for example 01:01:01.
    :param logdir: Directory passed to qsub for both the error (-e) and
                   output (-o) logs; defaults to the user's home directory.

    :outputs: Submission of the job to the (torque) cluster.

    written by (primarily) T Wolfers, (adapted) SM Kia, (adapted) S Rutherford.
    """
    log_target = os.path.expanduser('~') if logdir is None else logdir

    # The command is run through the shell because it relies on a pipe; the
    # arguments are inserted as-is (not quoted).
    pieces = ['echo ', job_path, ' | qsub -N ', job_path, ' -l ',
              'mem=', memory, ',walltime=', duration,
              ' -e ', log_target, ' -o ', log_target]
    qsub_call = [''.join(pieces)]

    # submits job to cluster
    call(qsub_call, shell=True)
|
|
802
|
+
|
|
803
|
+
|
|
804
|
+
def _extreme_deviation_summary(NPM, perc):
    """ Per-row summary of the most extreme absolute deviations.

    For every row of NPM, take the absolute values, keep the largest
    ``round(n_columns * perc)`` of them, drop the top 10% of that selection
    and return the mean of what remains. The result is NaN for a row when
    the selection is empty.
    """
    t = NPM.shape[1]
    n_perc = int(round(t * perc))

    # ascending sort per row, then keep the n_perc largest values
    top = np.sort(np.abs(NPM), axis=1)[:, t - n_perc:]

    # trim the most extreme 10% of the selection before averaging
    n_keep = int(np.floor(0.90*top.shape[1]))
    return np.mean(top[:, 0:n_keep], axis=1)


def extreme_value_prob_fit(NPM, perc):
    """ Fit a generalised extreme value distribution to the per-subject
    summary of extreme deviations.

    :param NPM: 2D array of deviations, one row per subject
    :param perc: fraction of the columns (largest absolute values) that
                 enters the per-subject summary

    :returns: parameters as returned by scipy.stats.genextreme.fit
    """
    m = _extreme_deviation_summary(NPM, perc)
    params = genextreme.fit(m)
    return params


def extreme_value_prob(params, NPM, perc):
    """ Evaluate the fitted generalised extreme value CDF at the
    per-subject summary of extreme deviations.

    :param params: parameters returned by extreme_value_prob_fit
    :param NPM: 2D array of deviations, one row per subject
    :param perc: fraction of the columns (largest absolute values) that
                 enters the per-subject summary; should match the value
                 used for fitting

    :returns: array with one probability per row of NPM
    """
    m = _extreme_deviation_summary(NPM, perc)
    probs = genextreme.cdf(m, *params)
    return probs
|
|
832
|
+
|
|
833
|
+
|
|
834
|
+
def ravel_2D(a):
    """ Flatten all but the first axis of an array.

    :param a: array with at least one dimension

    :returns: 2D array of shape (a.shape[0], product of remaining axes);
              a 1D input of length n becomes an (n, 1) column
    """
    s = a.shape
    # np.prod of an empty tuple is the float 1.0, which reshape rejects;
    # casting to int makes 1D input work
    return np.reshape(a, [s[0], int(np.prod(s[1:]))])
|
|
837
|
+
|
|
838
|
+
|
|
839
|
+
def unravel_2D(a, s):
    """
    Reshape ``a`` to the shape ``s``.

    :param a: array-like to reshape.
    :param s: target shape.
    :return: ``a`` reshaped to ``s``.
    """
    restored = np.reshape(a, s)
    return restored
|
|
841
|
+
|
|
842
|
+
|
|
843
|
+
def threshold_NPM(NPMs, fdr_thr=0.05, npm_thr=0.1):
    """
    Compute voxels with significant NPMs.

    Each row of ``NPMs`` is converted to p-values with
    ``stats.norm.cdf(-|NPM|)`` and thresholded row-wise with ``FDR``.

    :param NPMs: 2D array, processed one row at a time.
    :param fdr_thr: alpha level passed to ``FDR`` (default 0.05).
    :param npm_thr: a column is flagged in ``m`` when the fraction of rows in
        which it passed the FDR threshold exceeds this value (default 0.1).
    :return: tuple ``(results, masks, m)`` where ``results`` is ``NPMs`` with
        entries that did not pass set to zero, ``masks`` is the boolean array
        of entries that passed, and ``m`` is the boolean per-column flag.
    """
    p_values = stats.norm.cdf(-np.abs(NPMs))
    results = np.zeros(NPMs.shape)
    masks = np.full(NPMs.shape, False, dtype=bool)
    for i in range(p_values.shape[0]):
        masks[i, :] = FDR(p_values[i, :], fdr_thr)
        # builtin int: the np.int alias was removed in NumPy 1.24
        results[i, :] = NPMs[i, :] * masks[i, :].astype(int)
    m = np.sum(masks, axis=0) / masks.shape[0] > npm_thr
    return results, masks, m
|
|
854
|
+
|
|
855
|
+
|
|
856
|
+
def FDR(p_values, alpha):
    """
    Compute the false discovery rate in all voxels for a subject.

    The p-values are flattened and ranked; the i-th smallest (1-based) is
    compared with ``(i / n) * alpha`` where ``n`` is the number of p-values.

    NOTE(review): each ranked p-value is tested against its own threshold;
    the Benjamini-Hochberg step-up rule (accept everything up to the largest
    passing rank) is not applied. Confirm this is intended.

    :param p_values: array of p-values of any shape.
    :param alpha: target level.
    :return: boolean array with the shape of ``p_values``; True where the
        p-value is at or below its rank-dependent threshold.
    """
    dim = np.shape(p_values)
    flat = np.reshape(p_values, -1)
    order = np.argsort(flat)
    n_tests = flat.shape[0]
    # builtin float: the np.float alias was removed in NumPy 1.24
    thresh = ((np.arange(n_tests) + 1) / float(n_tests)) * alpha
    h = np.empty(n_tests, dtype=bool)
    # scatter the sorted-order decisions back to the original positions
    h[order] = flat[order] <= thresh
    return np.reshape(h, dim)
|
|
869
|
+
|
|
870
|
+
|
|
871
|
+
def calibration_error(Y, m, s, cal_levels):
    """
    Accumulate the absolute gap between nominal and empirical coverage.

    For each level ``cl`` a symmetric interval ``m +/- z * s`` is built with
    ``z = |norm.ppf((1 - cl) / 2)|``. The number of entries of ``Y`` inside
    the interval (bounds included) divided by ``Y.shape[0]`` is compared
    with ``cl``.

    :param Y: array of observed values.
    :param m: interval centre(s), combined with ``Y`` by broadcasting.
    :param s: interval scale(s), combined with ``Y`` by broadcasting.
    :param cal_levels: iterable of nominal coverage levels.
    :return: sum over levels of ``|cl - empirical coverage|``; ``0`` when
        ``cal_levels`` is empty.
    """
    total_gap = 0
    for level in cal_levels:
        half_width = np.abs(norm.ppf((1 - level) / 2)) * s
        inside = np.logical_and(Y >= m - half_width, Y <= m + half_width)
        coverage = np.sum(inside) / Y.shape[0]
        total_gap += np.abs(level - coverage)
    return total_gap
|
|
880
|
+
|
|
881
|
+
|
|
882
|
+
def simulate_data(method='linear', n_samples=100, n_features=1, n_grps=1,
                  working_dir=None, plot=False, random_state=None, noise=None):
    """
    Simulates synthetic data for testing purposes, with options for linear, non-linear,
    or combined data generation methods, and various noise types.

    For every group ``2 * n_samples`` points are generated; the first half
    becomes training data and the second half test data. Targets (and the
    returned coefficients) are divided by 100. The global NumPy random state
    is seeded with ``random_state``.

    :param method: Method to simulate ('linear', 'non-linear', or 'combined').
    :param n_samples: Number of samples per group, either an int or a list for each group (default=100).
    :param n_features: Number of features to simulate (default=1).
    :param n_grps: Number of groups in the data (default=1).
    :param working_dir: Directory to save the data as pickled DataFrames; created
        (including parents) if it does not exist (default=None, nothing is saved).
    :param plot: Boolean flag to plot the simulated training data (default=False).
    :param random_state: Seed for random number generation (default=None).
    :param noise: Type of noise to add ('homoscedastic_gaussian', 'heteroscedastic_gaussian',
        'homoscedastic_nongaussian', 'heteroscedastic_nongaussian', default=None).
        Any other value adds no noise.

    :returns: Tuple of (X_train, Y_train, grp_id_train, X_test, Y_test, grp_id_test, coef)
    :raises ValueError: if ``method`` is not one of the supported methods.
    """

    np.random.seed(random_state)

    # accept NumPy integer scalars as well as builtin ints
    if isinstance(n_samples, (int, np.integer)):
        n_samples = [n_samples for _ in range(n_grps)]

    X_train, Y_train, X_test, Y_test = [], [], [], []
    grp_id_train, grp_id_test = [], []
    coef = []

    # NOTE: the order of the np.random calls below determines the output for
    # a given seed; do not reorder them.
    for i in range(n_grps):
        n_i = n_samples[i]
        bias = np.random.randint(-10, high=10)

        if method == 'linear':
            X_temp, Y_temp, coef_temp = make_regression(
                n_samples=n_i * 2, n_features=n_features, n_targets=1,
                noise=10 * np.random.rand(), bias=bias, n_informative=1, coef=True,
            )
        elif method == 'non-linear':
            X_temp = np.random.randint(-2, 6, [2 * n_i, n_features]) \
                + np.random.randn(2 * n_i, n_features)
            Y_temp = X_temp[:, 0] * 20 * np.random.rand() + np.random.randint(10, 100) \
                * np.sin(2 * np.random.rand() + 2 * np.pi / 5 * X_temp[:, 0])
            coef_temp = 0
        elif method == 'combined':
            X_temp = np.random.randint(-2, 6, [2 * n_i, n_features]) \
                + np.random.randn(2 * n_i, n_features)
            Y_temp = (X_temp[:, 0]**3) * np.random.uniform(0, 0.5) \
                + X_temp[:, 0] * 20 * np.random.rand() \
                + np.random.randint(10, 100)
            coef_temp = 0
        else:
            raise ValueError(
                "Unknown method. Please specify 'linear', 'non-linear', or 'combined'.")

        coef.append(coef_temp / 100)
        X_train.append(X_temp[:n_i])
        Y_train.append(Y_temp[:n_i] / 100)
        X_test.append(X_temp[n_i:])
        Y_test.append(Y_temp[n_i:] / 100)
        grp_id = np.repeat(i, n_i * 2)
        grp_id_train.append(grp_id[:n_i])
        grp_id_test.append(grp_id[n_i:])

        # drawn for every group, even when no noise is requested, so the
        # random stream does not depend on the noise setting
        t = np.random.randint(1, 5)
        # Add noise to the data
        if noise == 'homoscedastic_gaussian':
            Y_train[i] += np.random.normal(loc=0,
                                           scale=0.2, size=Y_train[i].shape[0]) / t
            Y_test[i] += np.random.normal(loc=0,
                                          scale=0.2, size=Y_test[i].shape[0]) / t

        elif noise == 'heteroscedastic_gaussian':
            Y_train[i] += np.random.normal(loc=0, scale=np.log(
                1 + np.exp(X_train[i][:, 0])), size=Y_train[i].shape[0])
            Y_test[i] += np.random.normal(loc=0, scale=np.log(
                1 + np.exp(X_test[i][:, 0])), size=Y_test[i].shape[0])

        elif noise == 'homoscedastic_nongaussian':
            Y_train[i] += skewnorm.rvs(a=10, loc=0,
                                       scale=0.2, size=Y_train[i].shape[0]) / t
            Y_test[i] += skewnorm.rvs(a=10, loc=0,
                                      scale=0.2, size=Y_test[i].shape[0]) / t

        elif noise == 'heteroscedastic_nongaussian':
            Y_train[i] += skewnorm.rvs(a=10, loc=0, scale=np.log(
                1 + np.exp(0.3 * X_train[i][:, 0])), size=Y_train[i].shape[0])
            Y_test[i] += skewnorm.rvs(a=10, loc=0, scale=np.log(
                1 + np.exp(0.3 * X_test[i][:, 0])), size=Y_test[i].shape[0])

    X_train = np.vstack(X_train)
    X_test = np.vstack(X_test)
    Y_train = np.concatenate(Y_train)
    Y_test = np.concatenate(Y_test)
    grp_id_train = np.expand_dims(np.concatenate(grp_id_train), axis=1)
    grp_id_test = np.expand_dims(np.concatenate(grp_id_test), axis=1)

    if plot:
        for i in range(n_features):
            plt.figure()
            for j in range(n_grps):
                plt.scatter(X_train[grp_id_train[:, 0] == j, i],
                            Y_train[grp_id_train[:, 0] == j], label='Group ' + str(j))
            plt.xlabel(f'X{i}')
            plt.ylabel('Y')
            plt.legend()
        plt.show()

    if working_dir:
        # makedirs also creates missing parent directories
        os.makedirs(working_dir, exist_ok=True)

        outputs = (
            ('trbefile.pkl', grp_id_train),
            ('tsbefile.pkl', grp_id_test),
            ('X_train.pkl', X_train),
            ('X_test.pkl', X_test),
            ('Y_train.pkl', Y_train),
            ('Y_test.pkl', Y_test),
        )
        for file_name, array in outputs:
            with open(os.path.join(working_dir, file_name), 'wb') as file:
                pickle.dump(pd.DataFrame(array), file,
                            protocol=pickle.HIGHEST_PROTOCOL)

    return X_train, Y_train, grp_id_train, X_test, Y_test, grp_id_test, coef
|
|
1012
|
+
|
|
1013
|
+
|
|
1014
|
+
def divergence_plot(nm, ylim=None):
    """
    Plot sampler diagnostics for the trace stored in ``nm.hbr``.

    When the model was configured with more than one chain and its type is
    not 'nn', a histogram of the ``r_hat`` column of ``pm.summary`` is drawn.
    Afterwards the trace is converted to a dataframe and its samples are
    plotted in two stacked panels, split by the trace's 'diverging' flag.

    :param nm: object exposing ``hbr.configs``, ``hbr.model_type`` and
        ``hbr.trace``.
    :param ylim: passed to ``plt.ylim`` (default None).
    """
    hbr = nm.hbr

    multi_chain = hbr.configs['n_chains'] > 1
    if multi_chain and hbr.model_type != 'nn':
        summary = pm.summary(hbr.trace).round(2)
        plt.figure()
        plt.hist(summary['r_hat'], 10)
        plt.title('Gelman-Rubin diagnostic for divergence')

    divergent = hbr.trace['diverging']
    tracedf = pm.trace_to_dataframe(hbr.trace)
    samples = tracedf.values

    _, ax = plt.subplots(2, 1, figsize=(15, 4), sharex=True, sharey=True)
    top_panel, bottom_panel = ax[0], ax[1]

    top_panel.plot(samples[divergent == 0].T, color='k', alpha=.05)
    top_panel.set_title('No Divergences', fontsize=10)

    bottom_panel.plot(samples[divergent == 1].T, color='C2', lw=.5, alpha=.5)
    bottom_panel.set_title('Divergences', fontsize=10)

    plt.ylim(ylim)
    # label the x axis with the trace variable names
    plt.xticks(range(tracedf.shape[1]), list(tracedf.columns))
    plt.xticks(rotation=90, fontsize=7)
    plt.tight_layout()
    plt.show()
|
|
1036
|
+
|
|
1037
|
+
|
|
1038
|
+
def load_freesurfer_measure(measure, data_path, subjects_list):
    """This is a utility function to load different Freesurfer measures in a pandas Dataframe.

    Inputs

    :param measure: a string that defines the type of Freesurfer measure we want to load. \
        The options include:

        * 'NumVert': Number of Vertices in each cortical area based on Destrieux atlas.
        * 'SurfArea': Surface area for each cortical area based on Destrieux atlas.
        * 'GrayVol': Gray matter volume in each cortical area based on Destrieux atlas.
        * 'ThickAvg': Average Cortical thickness in each cortical area based on Destrieux atlas.
        * 'ThickStd': STD of Cortical thickness in each cortical area based on Destrieux atlas.
        * 'MeanCurv': Integrated Rectified Mean Curvature in each cortical area based on Destrieux atlas.
        * 'GausCurv': Integrated Rectified Gaussian Curvature in each cortical area based on Destrieux atlas.
        * 'FoldInd': Folding Index in each cortical area based on Destrieux atlas.
        * 'CurvInd': Intrinsic Curvature Index in each cortical area based on Destrieux atlas.
        * 'brain': Brain Segmentation Statistics from aseg.stats file.
        * 'subcortical_volumes': Subcortical areas volume (divided by eTIV, or ICV when eTIV is absent).

        Any other value yields an empty dataframe and an empty list.

    :param data_path: a string that specifies the path to the main Freesurfer folder
        (with or without a trailing path separator).
    :param subjects_list: A Python list containing the list of subject names to load the data for. \
        The subject names should match the folder name for each subject's Freesurfer data folder.

    Outputs:
        - df: A pandas dataframe containing the subject names as Index and target Freesurfer measures.
        - missing_subs: A Python list of subject names that miss the target Freesurfer measures
          (any error while reading a subject's files puts the subject here).

    """
    cortical_measures = ['NumVert', 'SurfArea', 'GrayVol', 'ThickAvg',
                         'ThickStd', 'MeanCurv', 'GausCurv', 'FoldInd', 'CurvInd']

    def _stats_file(sub, file_name):
        # <data_path>/<sub>/stats/<file_name>, independent of a trailing separator
        return os.path.join(data_path, sub, 'stats', file_name)

    def _read_table(path):
        # whitespace-separated table; '#' lines are treated as comments
        return pd.read_csv(path, delimiter=r'\s+', comment='#', header=None)

    def _read_aseg_measures(sub):
        # parse the '# Measure ...' header lines of aseg.stats as CSV rows and
        # map column 1 (name, keeps its leading blank) to column 3 (value)
        s = StringIO()
        with open(_stats_file(sub, 'aseg.stats')) as f:
            for line in f:
                if line.startswith('# Measure'):
                    s.write(line)
        s.seek(0)  # "rewind" to the beginning of the StringIO object
        a = pd.read_csv(s, header=None)
        return dict(zip(a[1], a[3]))

    def _load_cortical(sub):
        # the measure's position in the list (+1) is its column in the stats table
        col = cortical_measures.index(measure) + 1
        data = dict()
        for prefix, file_name in (('L_', 'lh.aparc.a2009s.stats'),
                                  ('R_', 'rh.aparc.a2009s.stats')):
            a = _read_table(_stats_file(sub, file_name))
            data.update({prefix + key: value
                         for key, value in zip(a[0], a[col])})
        return data

    def _load_subcortical(sub):
        measures = _read_aseg_measures(sub)
        tiv = measures[' eTIV'] if ' eTIV' in measures else measures[' ICV']
        a = _read_table(_stats_file(sub, 'aseg.stats'))
        return {key: value / tiv for key, value in zip(a[4] + '_mm3', a[3])}

    if measure in cortical_measures:
        loader = _load_cortical
    elif measure == 'brain':
        loader = _read_aseg_measures
    elif measure == 'subcortical_volumes':
        loader = _load_subcortical
    else:
        loader = None

    frames = []
    missing_subs = []

    if loader is not None:
        for i, sub in enumerate(subjects_list):
            try:
                frames.append(pd.DataFrame(loader(sub), index=[sub]))
                print('%d / %d: %s is done!' % (i, len(subjects_list), sub))
            except Exception:
                # best effort: a subject that cannot be read is reported, not fatal
                missing_subs.append(sub)
                print('%d / %d: %s is missing!' % (i, len(subjects_list), sub))
                continue

    # concatenate once instead of growing the dataframe inside the loop
    df = pd.concat(frames) if frames else pd.DataFrame()

    return df, missing_subs
|
|
1154
|
+
|
|
1155
|
+
|
|
1156
|
+
class scaler:
    """
    Rescale data column-wise using standardization or (robust) minmax
    normalization. The fitted statistics are stored on the instance
    (``m``/``s`` for 'standardize', ``min``/``max`` otherwise).
    """

    def __init__(self, scaler_type='standardize', tail=0.05,
                 adjust_outliers=True):
        """
        A class for rescaling data using either standardization or minmax
        normalization.

        :param scaler_type: String that decides the type of scaler including
            1) 'standardize' for standardizing data, 2) 'minmax' for minmax normalization
            in range of [0,1], and 3) 'robminmax' for robust (to outliers) minmax
            normalization. The default is 'standardize'.
        :param tail: Is a decimal in range [0,1] that decides the tails of
            distribution for finding robust min and max in 'robminmax'
            normalization. The default is 0.05.
        :param adjust_outliers: Boolean that decides whether transformed values
            of the 'minmax' and 'robminmax' scalers that fall outside [0,1]
            are truncated to 0 or 1. The default is True.
        :raises ValueError: if ``scaler_type`` is not one of the three options.

        """

        self.scaler_type = scaler_type
        self.tail = tail
        self.adjust_outliers = adjust_outliers

        if self.scaler_type not in ['standardize', 'minmax', 'robminmax']:
            raise ValueError("Undifined scaler type!")

    def _robust_min_max(self, X):
        """Return per-column medians of the lowest and highest tail samples."""
        # Use at least one sample per tail: a tail size of 0 would make the
        # lower slice empty (NaN median) and turn the upper slice [-0:] into
        # the whole column.
        n_tail = max(1, int(np.round(X.shape[0] * self.tail)))
        lower = np.zeros([X.shape[1],])
        upper = np.zeros([X.shape[1],])
        for i in range(X.shape[1]):
            ordered = np.sort(X[:, i])
            lower[i] = np.median(ordered[0:n_tail])
            upper[i] = np.median(ordered[-n_tail:])
        return lower, upper

    def _adjust(self, X):
        """Truncate values to [0, 1] when ``adjust_outliers`` is set."""
        if self.adjust_outliers:
            # np.clip also handles scalar inputs, unlike masked assignment
            X = np.clip(X, 0, 1)
        return X

    def fit(self, X):
        """
        Estimate the scaling statistics from ``X`` (samples in rows).

        :param X: 2D array of shape (n_samples, n_features).
        """

        if self.scaler_type == 'standardize':
            # running mean/std so the statistics can be updated via extend()
            self.w = Welford()
            self.w.consume(X)
            self.m = self.w.mean
            self.s = self.w.std

        elif self.scaler_type == 'minmax':
            self.min = np.min(X, axis=0)
            self.max = np.max(X, axis=0)

        elif self.scaler_type == 'robminmax':
            self.min, self.max = self._robust_min_max(X)

    def extend(self, X):
        """
        Update previously fitted statistics with additional samples ``X``.

        For the minmax scalers the stored bounds can only widen.

        :param X: 2D array of shape (n_samples, n_features).
        """
        if self.scaler_type == 'standardize':
            self.w.consume(X)
            self.m = self.w.mean
            self.s = self.w.std

        elif self.scaler_type in ['minmax']:
            self.min = np.min(np.stack([self.min, np.min(X, axis=0)], axis=0), axis=0)
            self.max = np.max(np.stack([self.max, np.max(X, axis=0)], axis=0), axis=0)

        elif self.scaler_type in ['robminmax']:
            lower, upper = self._robust_min_max(X)
            self.min = np.minimum(self.min, lower)
            self.max = np.maximum(self.max, upper)

    def transform(self, X, index=None):
        """
        Rescale ``X`` with the fitted statistics.

        :param X: data to rescale.
        :param index: optional index into the fitted statistics, used when
            ``X`` holds only a subset of the fitted columns (default None:
            use all columns).
        :return: the rescaled data.
        """

        if self.scaler_type == 'standardize':
            if index is None:
                X = (X - self.m) / self.s
            else:
                X = (X - self.m[index]) / self.s[index]

        elif self.scaler_type in ['minmax', 'robminmax']:
            if index is None:
                X = (X - self.min) / (self.max - self.min)
            else:
                X = (X - self.min[index]) / (self.max[index] - self.min[index])

            X = self._adjust(X)

        return X

    def inverse_transform(self, X, index=None):
        """
        Map rescaled data back to the original scale.

        Values truncated by ``adjust_outliers`` are not recovered.

        :param X: rescaled data.
        :param index: optional index into the fitted statistics (see
            ``transform``).
        :return: data on the original scale.
        """

        if self.scaler_type == 'standardize':
            if index is None:
                X = X * self.s + self.m
            else:
                X = X * self.s[index] + self.m[index]

        elif self.scaler_type in ['minmax', 'robminmax']:
            if index is None:
                X = X * (self.max - self.min) + self.min
            else:
                X = X * (self.max[index] - self.min[index]) + self.min[index]
        return X

    def fit_transform(self, X):
        """
        Fit the scaler on ``X`` and return the rescaled ``X``.

        :param X: 2D array of shape (n_samples, n_features).
        :return: the rescaled data.
        """
        self.fit(X)
        return self.transform(X)
|
|
1292
|
+
|
|
1293
|
+
|
|
1294
|
+
def retrieve_freesurfer_eulernum(freesurfer_dir, subjects=None, save_path=None):
    """
    This function receives the freesurfer directory (including processed data
    for several subjects) and retrieves the Euler number from the log files. If
    the log file does not exist (or does not contain the Euler numbers), this
    function uses 'mris_euler_number' to recompute the Euler numbers (ENs).
    The function returns the ENs in a dataframe and the list of missing
    subjects (those for which computing the EN failed). If 'save_path' is
    specified then the results will be saved in a pickle file.

    Basic usage::

        ENs, missing_subjects = retrieve_freesurfer_eulernum(freesurfer_dir)

    where the arguments are defined below.

    :param freesurfer_dir: absolute path to the Freesurfer directory.
    :param subjects: List of subjects that we want to retrieve the ENs for.
     If it is 'None' (the default), the list of the subjects will be automatically
     retrieved from existing directories in the 'freesurfer_dir' (i.e. the ENs
     for all subjects will be retrieved).
    :param save_path: The path to save the results. If 'None' (default) the
     results are not saved on the disk.


    :outputs: * ENs - A dataframe of retrieved ENs with columns 'lh_en',
                'rh_en' and 'avg_en'; subjects without ENs are dropped.
              * missing_subjects - The list of missing subjects.

    Developed by S.M. Kia

    """

    def _euler_from_log(log_file):
        # Return (lh, rh) as strings from the last matching log line, or None
        # when the log has no such line.
        eno_line = None
        with open(log_file) as f:
            for line in f:
                # find the part that refers to the EC
                if re.search(r'orig\.nofix lheno', line):
                    eno_line = line
        if eno_line is None:
            return None
        fields = eno_line.split()
        return fields[3][0:-1], fields[6]  # [0:-1] removes the trailing comma

    def _euler_from_surface(surface_file):
        # Run mris_euler_number without a shell and parse the 13th
        # blank-separated token of the first output line (stdout and stderr
        # merged).
        res = subprocess.run(['mris_euler_number', surface_file],
                             stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        first_line = res.stdout.decode('utf-8-sig').split('\n')[0].strip()
        words = [item.strip() for item in first_line.split(' ')]
        return np.float32(words[12])

    if subjects is None:
        subjects = [temp for temp in os.listdir(freesurfer_dir)
                    if os.path.isdir(os.path.join(freesurfer_dir, temp))]

    df = pd.DataFrame(index=subjects, columns=['lh_en', 'rh_en', 'avg_en'])
    missing_subjects = []

    for s, sub in enumerate(subjects):
        sub_dir = os.path.join(freesurfer_dir, sub)
        log_file = os.path.join(sub_dir, 'scripts', 'recon-all.log')

        if not os.path.exists(sub_dir):
            missing_subjects.append(sub)
            print('%d: Subject %s is missing.' % (s, sub))
            continue

        has_log = os.path.exists(log_file)
        from_log = _euler_from_log(log_file) if has_log else None

        if from_log is not None:
            # NOTE: values read from the log are stored as strings, exactly
            # as found in the file; only the average is numeric.
            eno_l, eno_r = from_log
            euler = (float(eno_l) + float(eno_r)) / 2

            df.at[sub, 'lh_en'] = eno_l
            df.at[sub, 'rh_en'] = eno_r
            df.at[sub, 'avg_en'] = euler

            print('%d: Subject %s is successfully processed. EN = %f'
                  % (s, sub, df.at[sub, 'avg_en']))
            continue

        if has_log:
            # never reuse a value parsed for a previous subject
            print('%d: Subject %s has no Euler number in log file, running QC ...' % (s, sub))
        else:
            print('%d: Subject %s is missing log file, running QC ...' % (s, sub))

        try:
            eno_l = _euler_from_surface(
                os.path.join(sub_dir, 'surf', 'lh.orig.nofix'))
            eno_r = _euler_from_surface(
                os.path.join(sub_dir, 'surf', 'rh.orig.nofix'))

            df.at[sub, 'lh_en'] = eno_l
            df.at[sub, 'rh_en'] = eno_r
            df.at[sub, 'avg_en'] = (eno_r + eno_l) / 2

            print('%d: Subject %s is successfully processed. EN = %f'
                  % (s, sub, df.at[sub, 'avg_en']))

        except Exception as err:
            missing_subjects.append(sub)
            print('%d: QC is failed for subject %s: %s.' % (s, sub, type(err)))

    df = df.dropna()

    if save_path is not None:
        with open(save_path, 'wb') as file:
            pickle.dump({'ENs': df}, file)

    return df, missing_subjects
|
|
1407
|
+
|
|
1408
|
+
|
|
1409
|
+
def get_package_versions():
    """
    Collect the versions of Python and of the key packages.

    :return: dict with the keys 'Python', 'pytensor', 'PyMC' and
        'PCNtoolkit'. The value for a package that cannot be imported (or
        has no ``__version__``) is an empty string.
    """

    import importlib
    import platform

    versions = dict()
    versions['Python'] = platform.python_version()

    packages = (('pytensor', 'pytensor'),
                ('PyMC', 'pymc'),
                ('PCNtoolkit', 'pcntoolkit'))
    for label, module_name in packages:
        try:
            versions[label] = importlib.import_module(module_name).__version__
        except Exception:
            # best effort: report an empty version, but let
            # KeyboardInterrupt/SystemExit propagate
            versions[label] = ''

    return versions
|
|
1434
|
+
|
|
1435
|
+
|
|
1436
|
+
def z_to_abnormal_p(Z):
    """

    This function receives a matrix of z-scores (deviations) and transfers them
    to corresponding abnormal probabilities. For more information see Sec. 2.5
    in https://www.biorxiv.org/content/10.1101/2021.05.28.446120v1.full.pdf.

    The value returned for each entry is ``1 - 2 * norm.sf(|z|)``.

    :param Z: n by p matrix of z-scores (deviations in normative modeling) where
        n is the number of subjects and p is the number of features.
    :type Z: numpy.array

    :return: a matrix of same size as Z, with probability of each sample being
        an abnormal sample.
    :rtype: numpy.array

    """

    upper_tail = norm.sf(np.abs(Z))
    two_sided = upper_tail * 2
    return 1 - two_sided
|
|
1456
|
+
|
|
1457
|
+
|
|
1458
|
+
def anomaly_detection_auc(abn_p, labels, n_permutation=None):
    """
    This is a utility function for computing region-wise AUC scores for anomaly
    detection using normative model. If n_permutation is not None (e.g. 1000),
    it also computes permutation p_values for each region.

    :param abn_p: n by p matrix with the probability of each sample being
        an abnormal sample. This matrix can be computed using 'z_to_abnormal_p'
        function.
    :type abn_p: numpy.array
    :param labels: a vector of binary labels for n subjects, 0 for healthy and
        1 for patients. Any array-like is accepted.
    :type labels: numpy.array
    :param n_permutation: If not none the permutation significance test with
        n_permutation repetitions is performed for each feature. defaults to None.
    :type n_permutation: numpy.int
    :return: p by 1 matrix of AUCs and p_values for permutation test for each
        feature (i.e. brain region). The p_values are all zero when
        n_permutation is None.
    :rtype: numpy.array

    """

    # positional indexing below needs an ndarray (lists cannot be indexed
    # with an index array; pandas Series would index by label)
    labels = np.asarray(labels)

    n, p = abn_p.shape
    aucs = np.zeros([p])
    p_values = np.zeros([p])

    for i in range(p):
        aucs[i] = roc_auc_score(labels, abn_p[:, i])

        if n_permutation is not None:

            auc_perm = np.zeros([n_permutation])
            for j in range(n_permutation):
                rand_idx = np.random.permutation(len(labels))
                rand_labels = labels[rand_idx]
                auc_perm[j] = roc_auc_score(rand_labels, abn_p[:, i])

            # +1 in numerator and denominator counts the observed AUC itself
            p_values[i] = (np.sum(auc_perm > aucs[i]) + 1) / \
                (n_permutation + 1)
            print('Feature %d of %d is done: p_value=%f' %
                  (i, p, p_values[i]))

    return aucs, p_values
|
|
1501
|
+
|
|
1502
|
+
|
|
1503
|
+
def cartesian_product(arrays):
    """
    This is a utility function for creating dummy data (covariates). It computes the cartesian product of N 1D arrays.

    Example:
    a = cartesian_product([np.arange(0,5), np.arange(6,10)])

    :param arrays: a list of N input 1D numpy arrays with size d1,d2,dN.
    :return: A d1...dN by N matrix of cartesian product of N arrays. Its dtype
        is the common result type of all inputs.

    """

    arrays = [np.asarray(a) for a in arrays]
    la = len(arrays)
    # use the common dtype of all inputs; taking only the first array's
    # dtype would truncate e.g. floats in later arrays to integers
    dtype = np.result_type(*arrays)
    arr = np.empty([len(a) for a in arrays] + [la], dtype=dtype)
    for i, a in enumerate(np.ix_(*arrays)):
        arr[..., i] = a

    return arr.reshape(-1, la)
|
|
1522
|
+
|
|
1523
|
+
|
|
1524
|
+
def yes_or_no(question):
    """
    Utility function for getting yes/no action from the user.

    The prompt is repeated until the (lower-cased, stripped) reply starts
    with 'y' or 'n'.

    :param question: String for user query.

    :return: Boolean of True for 'yes' and False for 'no'.

    """

    prompt = question + ' (y/n): '
    while True:
        answer = str(input(prompt)).lower().strip()
        if answer.startswith('y'):
            return True
        if answer.startswith('n'):
            return False
|
|
1541
|
+
|
|
1542
|
+
|
|
1543
|
+
# ====== This is stuff used for the SHASH distributions, but using numpy (not pymc or pytensor) ===
|
|
1544
|
+
|
|
1545
|
+
def K(p, x):
    """
    Evaluate ``scipy.special.kv(p, x)`` and return the result as a numpy array.

    :param p: order passed to ``kv``.
    :param x: argument passed to ``kv``.
    :return: numpy array holding ``kv(p, x)``.
    """
    bessel_value = spp.kv(p, x)
    return np.array(bessel_value)
|
|
1547
|
+
|
|
1548
|
+
|
|
1549
|
+
def P(q):
    """
    The P function as given in Jones et al.

    Computed as ``exp(1/4) / sqrt(8*pi) * (K((q+1)/2, 1/4) + K((q-1)/2, 1/4))``.

    :param q: argument of the P function.
    :return: value of the P function at ``q``.

    """
    prefactor = np.exp(1 / 4) / np.sqrt(8 * np.pi)
    upper_order = K((q + 1) / 2, 1 / 4)
    lower_order = K((q - 1) / 2, 1 / 4)
    return (upper_order + lower_order) * prefactor
|
|
1561
|
+
|
|
1562
|
+
|
|
1563
|
+
def m(epsilon, delta, r):
    """
    The r'th uncentered moment. Given by Jones et al.

    :param epsilon: first parameter of the moment formula (appears as
        ``epsilon / delta`` in the exponent).
    :param delta: second parameter of the moment formula.
    :param r: non-negative integer order of the moment.
    :return: ``2**-r * sum_i comb(r, i) * (-1)**i * exp((r-2i)*epsilon/delta) * P((r-2i)/delta)``.
    """
    scale = 1 / np.power(2, r)
    total = 0
    for k in range(r + 1):
        shift = r - 2 * k
        weight = spp.comb(r, k)
        sign = np.power(-1, k)
        growth = np.exp(shift * epsilon / delta)
        total += weight * sign * growth * P(shift / delta)
    return scale * total
|
|
1576
|
+
|
|
1577
|
+
# ====== end stuff for SHASH
|
|
1578
|
+
|
|
1579
|
+
# Design matrix function
|
|
1580
|
+
|
|
1581
|
+
|
|
1582
|
+
def z_score(y, mean, std, skew=None, kurtosis=None, likelihood="Normal"):
    """
    Computes Z-score of some data given parameters and a likelihood type string.

    For "SHASHo", "SHASHo2" and "SHASHb" the data are first standardized (each
    variant in its own way) and then passed through
    ``sinh(arcsinh(.) * kurtosis - skew)``. Any other likelihood string,
    including the default "Normal", yields the plain ``(y - mean) / std``, and
    'skew' and 'kurtosis' are ignored.

    :param y: observed data
    :param mean: location parameter
    :param std: scale parameter
    :param skew: skew parameter, required for the SHASH likelihoods
    :param kurtosis: kurtosis parameter, required for the SHASH likelihoods
    :param likelihood: one of "Normal", "SHASHo", "SHASHo2", "SHASHb"
    :return: the Z-scores
    """
    if likelihood == "SHASHo":
        standardized = (y - mean) / std
    elif likelihood == "SHASHo2":
        # The scale is divided by kurtosis before standardizing.
        scaled_std = std / kurtosis
        standardized = (y - mean) / scaled_std
    elif likelihood == "SHASHb":
        # Rescale with the first two moments returned by m(skew, kurtosis, r).
        first_moment = m(skew, kurtosis, 1)
        spread = np.sqrt((m(skew, kurtosis, 2) - first_moment ** 2))
        centered = ((y - mean) / std)
        standardized = centered * spread + first_moment
    else:
        return (y - mean) / std
    return np.sinh(np.arcsinh(standardized) * kurtosis - skew)
|
|
1610
|
+
|
|
1611
|
+
|
|
1612
|
+
def expand_all(*args):
    """
    Give every one-dimensional argument a trailing axis of length one.

    Arguments whose ``shape`` has exactly one entry are passed through
    ``np.expand_dims(..., axis=1)``; all other arguments are returned
    unchanged (the same object).

    :param args: any number of objects exposing a ``shape`` attribute
    :return: list with one entry per argument, in the same order
    """
    return [
        np.expand_dims(item, axis=1) if len(item.shape) == 1 else item
        for item in args
    ]
|
|
1619
|
+
|
|
1620
|
+
|
|
1621
|
+
|
|
1622
|
+
|
|
1623
|
+
class Welford(object):
    """Implements Welford's algorithm for computing a running mean
    and standard deviation as described at:
        http://www.johndcook.com/standard_deviation.html
    Taken from: https://gist.github.com/alexalemi/2151722#file-welford-py
    Adapted to work with numpy arrays.

    can take single values or iterables. Calling the object with an iterable
    feeds its items one by one; anything else (including a zero-dimensional
    numpy array) is treated as a single observation.

    Because the observation counter is stored as a one-element array, the
    statistics for scalar observations come back as arrays of shape (1,).

    Properties:
        mean    - returns the mean
        std     - returns the std (zeros while fewer than two observations
                  have been seen)
        meanfull- returns the mean and std of the mean

    Usage::

        foo = Welford()
        foo(range(100))      # mean ~ 49.5,    std ~ 29.0115
        foo([1] * 1000)      # mean ~ 5.40909, std ~ 16.4437
        foo.mean             # array holding ~ 5.40909
        foo.std              # array holding ~ 16.4437
        foo.meanfull         # (mean, std / sqrt(k)); second entry ~ 0.49580
    """

    def __init__(self, lst=None):
        """
        :param lst: optional initial data; handled exactly like a call to the
            instance (``None`` leaves the accumulator empty).
        """
        self.k = np.array([0])  # number of observations seen so far
        self.M = np.array([0])  # running mean
        self.S = np.array([0])  # running sum of squared deviations from the mean

        self(lst)

    def update(self, x):
        """
        Fold a single observation into the running statistics.

        :param x: a number or a numpy array; ``None`` is ignored.
        """
        if x is None:
            return
        if self.k == 0 and isinstance(x, np.ndarray):
            # First array observation: give the accumulators its shape.
            self.M = np.zeros_like(x)
            self.S = np.zeros_like(x)
        self.k += 1
        newM = self.M + (x - self.M) * 1.0 / self.k
        # Uses both the old and the new mean (Welford's recurrence).
        newS = self.S + (x - self.M) * (x - newM)
        self.M, self.S = newM, newS

    def consume(self, lst):
        """
        Fold every item of an iterable into the running statistics.

        :param lst: iterable of observations, each passed to ``update``.
        """
        for x in iter(lst):
            self.update(x)

    def __call__(self, x):
        """
        Add data: iterables are consumed item by item, anything else is a
        single observation.

        :param x: a single value, an iterable of values, or ``None``.
        """
        if isinstance(x, np.ndarray) and x.ndim == 0:
            # Zero-dimensional arrays expose __iter__ but cannot be iterated;
            # treat them as the single value they hold.
            self.update(x)
        elif hasattr(x, "__iter__"):
            self.consume(x)
        else:
            self.update(x)

    @property
    def mean(self) -> np.ndarray:
        """Running mean of the observations seen so far."""
        return self.M

    @property
    def meanfull(self) -> tuple[np.ndarray, np.ndarray]:
        """
        Running mean together with ``std / sqrt(k)``.

        With no observations the second entry is 0 / 0, i.e. nan.
        """
        return self.mean, self.std / np.sqrt(self.k)

    @property
    def std(self) -> np.ndarray:
        """
        Sample standard deviation, ``sqrt(S / (k - 1))``.

        Returns zeros while fewer than two observations have been seen, so an
        empty accumulator no longer evaluates ``sqrt(0 / -1)`` (negative zero).
        """
        if self.k < 2:
            return np.zeros_like(self.M, dtype=float)
        return np.sqrt(self.S / (self.k - 1))

    def __repr__(self):
        return "<Welford: {} +- {}>".format(self.mean, self.std)
|