pcntoolkit 0.32.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1698 @@
1
+ from __future__ import print_function
2
+
3
+ import os
4
+ import pickle
5
+ import re
6
+ import subprocess
7
+ import sys
8
+ from abc import ABCMeta, abstractmethod
9
+ from io import StringIO
10
+ from subprocess import call
11
+
12
+ import bspline
13
+ import matplotlib.pyplot as plt
14
+ import numpy as np
15
+ import pandas as pd
16
+ import pymc as pm
17
+ import scipy.special as spp
18
+ from bspline import splinelab
19
+ from scipy import stats
20
+ from scipy.stats import genextreme, norm, skewnorm
21
+ from six import with_metaclass
22
+ from sklearn.datasets import make_regression
23
+ from sklearn.metrics import roc_auc_score
24
+
25
+ try: # run as a package if installed
26
+ from pcntoolkit import configs
27
+ except ImportError:
28
+ pass
29
+
30
+ path = os.path.abspath(os.path.dirname(__file__))
31
+ rootpath = os.path.dirname(path) # parent directory
32
+ if rootpath not in sys.path:
33
+ sys.path.append(rootpath)
34
+ del path, rootpath
35
+ import configs
36
+
37
+ PICKLE_PROTOCOL = configs.PICKLE_PROTOCOL
38
+
39
+ # -----------------
40
+ # Utility functions
41
+ # -----------------
42
+
43
+
44
def create_poly_basis(X, dimpoly):
    """
    Build a polynomial basis expansion of the input matrix.

    Every column of `X` is raised element-wise to each integer power from
    1 to `dimpoly`. The result is laid out in consecutive column groups:
    all features to the power 1, then all features to the power 2, and so
    on. A 1D input is treated as a single-column matrix.

    Parameters
    ----------
    X : numpy.ndarray
        Input data, one row per sample and one column per feature (or a 1D
        array holding a single feature).
    dimpoly : int
        Highest polynomial degree to include.

    Returns
    -------
    Phi : numpy.ndarray
        Float array of shape (n_samples, n_features * dimpoly).

    Examples
    --------
    >>> X = np.array([[1, 2], [3, 4], [5, 6]])
    >>> create_poly_basis(X, 2)
    array([[ 1.,  2.,  1.,  4.],
           [ 3.,  4.,  9., 16.],
           [ 5.,  6., 25., 36.]])
    """
    if X.ndim == 1:
        X = X[:, np.newaxis]

    n_features = X.shape[1]
    Phi = np.zeros((X.shape[0], n_features * dimpoly))

    # fill one group of n_features columns per polynomial degree
    for degree in range(1, dimpoly + 1):
        first = (degree - 1) * n_features
        Phi[:, first:first + n_features] = X ** degree

    return Phi
80
+
81
+
82
def create_bspline_basis(xmin, xmax, p=3, nknots=5):
    """
    Construct a B-spline basis object over the interval [xmin, xmax].

    :param xmin: lower end of the knot range
    :param xmax: upper end of the knot range
    :param p: order of spline (3 = cubic)
    :param nknots: number of knots (endpoints only counted once)

    :returns: a ``bspline.Bspline`` object built on the padded knot vector

    """
    # evenly spaced knots, including both end points
    base_knots = np.linspace(xmin, xmax, nknots)
    # pad the knot vector for a spline of order p
    padded_knots = splinelab.augknt(base_knots, p)
    return bspline.Bspline(padded_knots, p)
95
+
96
+
97
def create_design_matrix(X, intercept=True, basis='bspline',
                         basis_column=0, site_ids=None, all_sites=None,
                         **kwargs):
    """
    Prepare a design matrix from a set of covariates suitable for
    running Bayesian linear regression. This design matrix consists of
    a set of user defined covariates, optional site intercepts
    (fixed effects) and also optionally a nonlinear basis expansion over
    one of the columns.

    :param X: matrix of covariates (2D numpy array or pandas DataFrame)
    :param intercept: if True, prepend a column of ones
    :param basis: type of basis expansion to use ('bspline' or 'poly');
        any other value appends no basis expansion
    :param basis_column: which column to perform the expansion over
    :param site_ids: list of site ids (one per data point)
    :param all_sites: list of unique site ids
    :param xmin: lower limit of the B-spline knot range (default 0)
    :param xmax: upper limit of the B-spline knot range (default 100)
    :param p: order of spline (3 = cubic), forwarded to the B-spline basis
    :param nknots: number of knots (endpoints only counted once),
        forwarded to the B-spline basis

    Any remaining keyword arguments are forwarded unchanged to
    create_bspline_basis (basis='bspline') or create_poly_basis
    (basis='poly', which therefore needs ``dimpoly``).

    If site_ids is specified, this must have the same number of entries as
    there are rows in X. If all_sites is specified, these will be used to
    create the site identifiers in place of site_ids. This accommodates
    the scenario where not all the sites used to create the model are
    present in the test set (i.e. there will be some empty site columns).

    :returns Phi: the design matrix, with columns ordered as
        [intercept, covariates, site dummies, basis expansion]
    :raises ValueError: if site_ids does not have one entry per row of X

    """

    xmin = kwargs.pop('xmin', 0)
    xmax = kwargs.pop('xmax', 100)

    N = X.shape[0]

    if isinstance(X, pd.DataFrame):
        X = X.to_numpy()

    # add intercept column
    if intercept:
        Phi = np.concatenate((np.ones((N, 1)), X), axis=1)
    else:
        Phi = X

    # add dummy coded site columns
    site_cols = None
    if site_ids is not None:
        # make sure the data are in pandas format
        if not isinstance(site_ids, pd.Series):
            site_ids = pd.Series(data=site_ids)

        # validate before dummy coding, so that a mismatch is reported
        # clearly instead of as an array broadcasting error
        if site_ids.shape[0] != N:
            raise ValueError(
                'site cols must have the same number of rows as X')

        # make sure all_sites is defined
        if all_sites is None:
            all_sites = sorted(site_ids.unique())

        # dummy code the sites
        site_cols = np.zeros((N, len(all_sites)))
        for i, s in enumerate(all_sites):
            site_cols[:, i] = (site_ids == s).to_numpy()
    elif all_sites is not None:
        # site ids are not specified, but all_sites are: empty site columns
        site_cols = np.zeros((N, len(all_sites)))

    if site_cols is not None:
        Phi = np.concatenate((Phi, site_cols), axis=1)

    # append the basis expansion of the selected column
    if basis == 'bspline':
        B = create_bspline_basis(xmin, xmax, **kwargs)
        Phi = np.concatenate(
            (Phi, np.array([B(i) for i in X[:, basis_column]])), axis=1)
    elif basis == 'poly':
        Phi = np.concatenate((Phi, create_poly_basis(
            X[:, basis_column], **kwargs)), axis=1)

    return Phi
182
+
183
+
184
def squared_dist(x, z=None):
    """
    Pairwise squared Euclidean distances between the rows of two arrays.

    :param x: array with one vector per row (a 1D array is treated as a
        single column)
    :param z: second array laid out like x; defaults to x itself

    :returns dist: array whose (i, j) entry is sum((x[i] - z[j]) ** 2)
    :raises ValueError: if the vectors in x and z have different length

    """

    # do some basic checks
    if z is None:
        z = x
    if x.ndim == 1:
        x = x[:, np.newaxis]
    if z.ndim == 1:
        z = z[:, np.newaxis]

    nx, dx = x.shape
    nz, dz = z.shape
    if dx != dz:
        raise ValueError("""
            Cannot compute distance: vectors have different length""")

    # mean centre for numerical stability
    centre = np.mean(
        np.vstack((np.mean(x, axis=0), np.mean(z, axis=0))), axis=0)
    x = x - centre
    z = z - centre

    # squared norms, shaped so that broadcasting yields an (nx, nz) grid
    x_norms = np.sum(x * x, axis=1)[:, np.newaxis]
    z_norms = np.sum(z * z, axis=1)[np.newaxis, :]

    # ||x||^2 - 2 x.z + ||z||^2
    return (x_norms - 2 * x.dot(z.T)) + z_norms
215
+
216
+
217
def compute_pearsonr(A, B):
    """
    Manually computes the Pearson correlation between matching columns of
    two matrices.

    Basic usage::

        Rho, pRho = compute_pearsonr(A, B)

    :param A: a data array; correlations are computed down axis 0
    :param B: an array with the same shape as A

    :returns Rho: vector of correlation coefficients, one per column
    :returns pRho: vector of two-sided p-values, obtained from a Fisher
        r-to-z transform against a standard normal distribution

    Notes::

        This function is useful when the number of columns is large and
        only the diagonal entries of the resulting correlation matrix are
        of interest. This function does not compute the full correlation
        matrix as an intermediate step

    """

    n_obs = A.shape[0]

    # first mean centre
    A_centred = A - np.mean(A, axis=0)
    B_centred = B - np.mean(B, axis=0)
    # then scale every column to unit length
    A_unit = A_centred / np.sqrt(np.sum(A_centred**2, axis=0))
    B_unit = B_centred / np.sqrt(np.sum(B_centred**2, axis=0))
    del (A_centred, B_centred)

    Rho = np.sum(A_unit * B_unit, axis=0)
    del (A_unit, B_unit)

    # Fisher r-to-z, testing against a null correlation of zero
    z_stat = (np.arctanh(Rho) - np.arctanh(0)) * np.sqrt(n_obs - 3)
    std_normal = stats.norm()
    pRho = 2 * std_normal.cdf(-np.abs(z_stat))

    return Rho, pRho
260
+
261
+
262
def explained_var(ytrue, ypred):
    """
    Computes the explained variance of predicted values.

    Basic usage::

        exp_var = explained_var(ytrue, ypred)

    where

    :ytrue: n*p matrix of true values where n is the number of samples
            and p is the number of features.
    :ypred: n*p matrix of predicted values where n is the number of samples
            and p is the number of features.

    :returns exp_var: p dimensional vector of explained variances for each
        feature, i.e. 1 - var(residuals) / var(ytrue) per column.

    """

    residual_variance = (ytrue - ypred).var(axis=0)
    total_variance = ytrue.var(axis=0)

    return 1 - residual_variance / total_variance
284
+
285
+
286
def compute_MSLL(ytrue, ypred, ypred_var, train_mean=None, train_var=None):
    """
    Computes the MSLL, or the MLL (not standardized) if 'train_mean' or
    'train_var' is None.

    Basic usage::

        MSLL = compute_MSLL(ytrue, ypred, ypred_var, train_mean, train_var)

    where

    :param ytrue     : n*p matrix of true values where n is the number of
                       samples and p is the number of features.
    :param ypred     : n*p matrix of predicted values.
    :param ypred_var : n*p matrix of summed noise variances and prediction
                       variances.
    :param train_mean: mean values of the training data for each feature.
                       It is repeated n times along axis 0, so it needs a
                       leading axis of length one (e.g. shape 1*p).
    :param train_var : variances of the training data for each feature,
                       laid out like train_mean.

    :returns loss    : p dimensional vector of MSLL or MLL for each feature.

    """

    # negative log density of ytrue under the predictive Gaussian
    model_log_term = 0.5 * np.log(2 * np.pi * ypred_var)
    model_sq_term = (ytrue - ypred)**2 / (2 * ypred_var)

    if train_mean is None or train_var is None:
        # compute MLL:
        return np.mean(model_log_term + model_sq_term, axis=0)

    # tile the training moments to one row per sample
    n_samples = ytrue.shape[0]
    Y_train_mean = np.repeat(train_mean, n_samples, axis=0)
    Y_train_sig = np.repeat(train_var, n_samples, axis=0)

    # the same quantity for the trivial model built from training moments
    trivial_log_term = 0.5 * np.log(2 * np.pi * Y_train_sig)
    trivial_sq_term = (ytrue - Y_train_mean)**2 / (2 * Y_train_sig)

    # compute MSLL:
    return np.mean(model_log_term + model_sq_term -
                   trivial_log_term - trivial_sq_term, axis=0)
327
+
328
+
329
def calibration_descriptives(x):
    """
    Compute statistics useful to assess the calibration of normative models,
    including skew and kurtosis of the distribution, plus their standard
    deviation and standard errors (separately for each column in x)

    Basic usage::
        stats = calibration_descriptives(Z)

    where

    :param x : n*p matrix of statistics you wish to assess
    :returns stats :[skew, sdskew, kurtosis, sdkurtosis, semean, sesd]

    skew, kurtosis and sesd hold one value per column of x; sdskew and
    sdkurtosis depend only on the number of rows n and are scalars.

    """

    n = np.shape(x)[0]
    m1 = np.mean(x, axis=0)
    centred = x - m1

    # sums of powers of the centred data, per column (vectorised rather
    # than looping over the rows with the builtin sum)
    m3 = np.sum(centred**3, axis=0)
    m4 = np.sum(centred**4, axis=0)

    s1 = np.std(x, axis=0)
    skew = n*m3/(n-1)/(n-2)/s1**3
    sdskew = np.sqrt(6*n*(n-1) / ((n-2)*(n+1)*(n+3)))
    kurtosis = (n * (n+1) * m4) / ((n-1) * (n-2) * (n-3) * s1**4) - \
        (3 * (n-1)**2) / ((n-2) * (n-3))
    sdkurtosis = np.sqrt(4*(n**2-1) * sdskew**2 / ((n-3)*(n+5)))
    # NOTE(review): np.var(x) pools all columns into one variance, unlike
    # the other statistics which are per column - confirm whether
    # np.var(x, axis=0) was intended. Left as is to keep the output shape.
    semean = np.sqrt(np.var(x)/n)
    sesd = s1/np.sqrt(2*(n-1))
    cd = [skew, sdskew, kurtosis, sdkurtosis, semean, sesd]

    return cd
360
+
361
+
362
class WarpBase(metaclass=ABCMeta):
    """
    Base class for likelihood warping following:
    Rios and Torab (2019) Compositionally-warped Gaussian processes
    https://www.sciencedirect.com/science/article/pii/S0893608019301856

    All Warps must define the following methods::

        Warp.get_n_params() - return number of parameters
        Warp.f() - warping function (Non-Gaussian field -> Gaussian)
        Warp.invf() - inverse warp
        Warp.df() - derivatives
        Warp.warp_predictions() - compute predictive distribution

    Concrete subclasses implement f, invf and df and set ``n_params`` in
    their constructor.
    """

    def __init__(self):
        # NaN marks "not initialised"; see get_n_params
        self.n_params = np.nan

    def get_n_params(self):
        """ Report the number of parameters required """

        assert not np.isnan(self.n_params), \
            "Warp function not initialised"

        return self.n_params

    def warp_predictions(self, mu, s2, param, percentiles=(0.025, 0.975)):
        """
        Compute the warped predictions from a gaussian predictive
        distribution, specified by a mean (mu) and variance (s2)

        :param mu: Gaussian predictive mean
        :param s2: Predictive variance
        :param param: warping parameters
        :param percentiles: Desired percentiles of the warped likelihood

        :returns: * median - median of the predictive distribution
                  * pred_interval - predictive interval(s), an array with
                    one row per entry of mu and one column per percentile

        """

        # Compute percentiles of a standard Gaussian
        Z = norm.ppf(percentiles)

        # find the median (using mu = median)
        median = self.invf(mu, param)

        # compute the predictive intervals (non-stationary)
        pred_interval = np.zeros((len(mu), len(Z)))
        for i, z in enumerate(Z):
            pred_interval[:, i] = self.invf(mu + np.sqrt(s2)*z, param)

        return median, pred_interval

    @abstractmethod
    def f(self, x, param):
        """ Evaluate the warping function (mapping non-Gaussian response
            variables to Gaussian variables)
        """

    @abstractmethod
    def invf(self, y, param):
        """ Evaluate the inverse warping function (mapping Gaussian latent
            variables to non-Gaussian response variables)
        """

    @abstractmethod
    def df(self, x, param):
        """ Return the derivative of the warp, dw(x)/dx """
433
+
434
+
435
class WarpLog(WarpBase):
    """ Logarithmic warp

        y = log(x)

        This warp has no free parameters; the ``params`` arguments are
        accepted only for interface compatibility and are ignored.
    """

    def __init__(self):
        self.n_params = 0

    def f(self, x, params=None):
        """ Forward warp: natural logarithm of x """

        y = np.log(x)

        return y

    def invf(self, y, params=None):
        """ Inverse warp: exponential of y """

        x = np.exp(y)

        return x

    def df(self, x, params=None):
        """ Derivative of the forward warp, d log(x) / dx = 1 / x """

        df = 1/x

        return df
460
+
461
+
462
class WarpAffine(WarpBase):
    """ Affine warp

        y = a + b*x

        Two parameters are expected: the offset a, and the logarithm of
        the slope b (the slope is recovered with an exponential).
    """

    def __init__(self):
        self.n_params = 2

    def _get_params(self, param):
        """ Validate the parameter vector and return (a, b) """
        if len(param) != self.n_params:
            raise ValueError(
                'number of parameters must be ' + str(self.n_params))
        offset = param[0]
        slope = np.exp(param[1])
        return offset, slope

    def f(self, x, params):
        """ Forward warp, a + b*x """
        offset, slope = self._get_params(params)
        return offset + slope*x

    def invf(self, y, params):
        """ Inverse warp, (y - a) / b """
        offset, slope = self._get_params(params)
        return (y - offset) / slope

    def df(self, x, params):
        """ Derivative of the forward warp: the constant b, shaped like x """
        _, slope = self._get_params(params)
        return np.ones(x.shape)*slope
494
+
495
+
496
class WarpBoxCox(WarpBase):
    """ Box cox transform having a single parameter (lambda), i.e.

        y = (sign(x) * abs(x) ** lambda - 1) / lambda

        This follows the generalization in Bickel and Doksum (1981) JASA 76
        and allows x to assume negative values.

        The parameter is supplied on a log scale: lambda is obtained by
        exponentiating it.
    """

    def __init__(self):
        self.n_params = 1

    def _get_params(self, param):
        """ Map the log-scale parameter to lambda """
        return np.exp(param)

    def f(self, x, params):
        """ Forward warp (plain logarithm when lambda is zero) """
        lam = self._get_params(params)

        if lam == 0:
            return np.log(x)
        return (np.sign(x) * np.abs(x) ** lam - 1) / lam

    def invf(self, y, params):
        """ Inverse warp (plain exponential when lambda is zero) """
        lam = self._get_params(params)

        if lam == 0:
            return np.exp(y)

        shifted = lam * y + 1
        return np.sign(shifted) * np.abs(shifted) ** (1 / lam)

    def df(self, x, params):
        """ Derivative of the forward warp, abs(x) ** (lambda - 1) """
        lam = self._get_params(params)

        return np.abs(x) ** (lam - 1)
537
+
538
+
539
class WarpSinArcsinh(WarpBase):
    """ Sin-hyperbolic arcsin warp having two parameters (a, b) and defined by

        y = sinh(b * arcsinh(x) - a)

        Using the parametrisation of Rios et al, Neural Networks 118 (2017)
        where a controls skew and b controls kurtosis, such that:

        * a = 0 : symmetric
        * a > 0 : positive skew
        * a < 0 : negative skew
        * b = 1 : mesokurtic
        * b > 1 : leptokurtic
        * b < 1 : platykurtic

        where b > 0. However, it is more convenient to use an alternative
        parameterisation, given in Jones and Pewsey 2019 JRSS Significance 16
        https://doi.org/10.1111/j.1740-9713.2019.01245.x

        where:

        y = sinh(b * arcsinh(x) + epsilon * b)

        and a = -epsilon*b

        The parameter vector passed to the methods is (epsilon, log(b)).

        see also Jones and Pewsey 2009 Biometrika, 96 (4) for more details
        about the SHASH distribution
        https://www.jstor.org/stable/27798865
    """

    def __init__(self):
        self.n_params = 2

    def _get_params(self, param):
        """ Validate (epsilon, log b) and convert it to (a, b) """
        if len(param) != self.n_params:
            raise ValueError(
                'number of parameters must be ' + str(self.n_params))

        epsilon, log_b = param[0], param[1]
        b = np.exp(log_b)

        return -epsilon*b, b

    def f(self, x, params):
        """ Forward warp, sinh(b * arcsinh(x) - a) """
        a, b = self._get_params(params)

        return np.sinh(b * np.arcsinh(x) - a)

    def invf(self, y, params):
        """ Inverse warp, sinh((arcsinh(y) + a) / b) """
        a, b = self._get_params(params)

        return np.sinh((np.arcsinh(y)+a)/b)

    def df(self, x, params):
        """ Derivative of the forward warp with respect to x """
        a, b = self._get_params(params)

        numerator = b * np.cosh(b * np.arcsinh(x) - a)
        return numerator/np.sqrt(1 + x ** 2)
602
+
603
+
604
class WarpCompose(WarpBase):
    """ Composition of warps. These are passed in as an array and
        initialised automatically. For example::

            W = WarpCompose(('WarpBoxCox', 'WarpAffine'))

        The warps are applied in the order given, so the example computes
        WarpAffine.f(WarpBoxCox.f(x)). The parameter vector is the
        concatenation of the parameters of the individual warps, in the
        same order.
    """

    def __init__(self, warpnames=None, debugwarp=False):

        if warpnames is None:
            raise ValueError("A list of warp functions is required")
        self.debugwarp = debugwarp
        self.warps = []
        self.n_params = 0
        for wname in warpnames:
            # NOTE(review): eval executes arbitrary expressions; warp names
            # must come from trusted configuration only.
            warp = eval(wname + '()')
            self.n_params += warp.get_n_params()
            self.warps.append(warp)

    def _split_params(self, theta):
        """ Slice theta into one parameter list per warp, in warp order """
        chunks = []
        theta_offset = 0
        for warp in self.warps:
            n_params_c = warp.get_n_params()
            chunks.append([theta[c] for c in
                           range(theta_offset, theta_offset + n_params_c)])
            theta_offset += n_params_c
        return chunks

    def f(self, x, theta):
        """ Apply all warps in sequence, first warp first """
        if self.debugwarp:
            print('begin composition')

        fw = x
        pairs = zip(self.warps, self._split_params(theta))
        for ci, (warp, theta_c) in enumerate(pairs):
            if self.debugwarp:
                print('f:', ci, theta_c, warp)
            fw = warp.f(fw, theta_c)
        return fw

    def invf(self, x, theta):
        """ Invert the composition by applying the inverse warps in
            reverse order
        """
        if self.debugwarp:
            print('begin composition')

        finvw = x
        pairs = list(zip(self.warps, self._split_params(theta)))
        for warp, theta_c in reversed(pairs):
            if self.debugwarp:
                print('invf:', theta_c, warp)
            finvw = warp.invf(finvw, theta_c)

        return finvw

    def df(self, x, theta):
        """ Derivative of the composed warp with respect to x.

            By the chain rule this is the product of the derivatives of
            the individual warps, each evaluated at the output of the
            warps applied before it. For a single warp this reduces to
            that warp's own derivative.
        """
        if self.debugwarp:
            print('begin composition')

        fw = x        # input seen by the current warp
        dfw = None    # running product of the derivatives
        last = len(self.warps) - 1
        pairs = zip(self.warps, self._split_params(theta))
        for ci, (warp, theta_c) in enumerate(pairs):
            if self.debugwarp:
                print('df:', ci, theta_c, warp)

            step = warp.df(fw, theta_c)
            dfw = step if dfw is None else dfw * step

            # push the input through this warp for the next factor
            if ci < last:
                fw = warp.f(fw, theta_c)

        if dfw is None:
            # no warps: the composition is the identity
            dfw = np.ones(np.shape(x))

        return dfw
691
+
692
+ # -----------------------
693
+ # Functions for inference
694
+ # -----------------------
695
+
696
+
697
class CustomCV:
    """ Custom cross-validation approach. This class does not do much, it
        merely wraps precomputed splits behind an interface designed to be
        compatible with scikit-learn (e.g. sklearn.model_selection...)

        :param train: a list of indices of training splits (each itself a list)
        :param test: a list of indices of test splits (each itself a list)
        :param X: optional data array; only its number of rows is recorded
        :param y: unused

        ``split`` yields, for every split in turn:

        :returns tr: Indices for training set
        :returns te: Indices for test set

    """

    def __init__(self, train, test, X=None, y=None):
        self.train = train
        self.test = test
        self.n_splits = len(train)
        # number of samples, if already known
        self.N = None if X is None else X.shape[0]

    def split(self, X, y=None):
        """ Generate the stored (train, test) index pairs in order """
        if self.N is None:
            self.N = X.shape[0]

        for fold in range(0, self.n_splits):
            yield self.train[fold], self.test[fold]
727
+
728
+
729
def bashwrap(processing_dir, python_path, script_command, job_name,
             bash_environment=None):
    """ This function wraps normative modelling into a bash script to run it
    on a torque cluster system.

    The script consists of the environment header, a ``cd`` into the
    processing directory and the python command. Note that this function
    also changes the working directory of the calling process to
    processing_dir.

    :param processing_dir: Full path to the processing dir
    :param python_path: Full path to the python distribution
    :param script_command: python command to execute
    :param job_name: Name for the bash script output by this function
    :param bash_environment: A string of environment specific commands that
        replaces the default header

    :returns: Full path of the .sh file containing the commands for
        normative modelling

    written by Thomas Wolfers
    """

    # change to processing dir
    os.chdir(processing_dir)

    # sets bash environment if necessary
    if bash_environment is None:
        shebang = '#!/bin/bash\n\n'
        cores = 'export OMP_NUM_THREADS=1\n'
        script_lines = [shebang + cores]
    else:
        script_lines = [bash_environment]
        print("""Your own environment requires in any case:
              #!/bin/bash\n export and optionally OMP_NUM_THREADS=1\n""")

    script_lines.append('cd ' + processing_dir + '\n')
    script_lines.append(python_path + ' ' + script_command + '\n')

    # writes bash file into processing dir
    bash_file_name = os.path.join(processing_dir, job_name + '.sh')
    with open(bash_file_name, 'w') as bash_file:
        bash_file.writelines(script_lines)

    # the script is readable, writable and executable by the owner only
    os.chmod(bash_file_name, 0o700)

    return bash_file_name
775
+
776
+
777
def qsub(job_path, memory, duration, logdir=None):
    """This function submits a job.sh script to the torque cluster using the qsub command.

    Basic usage::

        qsub(job_path, memory, duration, logdir)

    :param job_path: Full path to the job.sh file. It is also used as the job name.
    :param memory: Memory requirements written as string for example 4gb or 500mb.
    :param duration: The approximate duration of the job, a string with HH:MM:SS for example 01:01:01.
    :param logdir: Directory for the error and output logs; defaults to the home directory.

    :outputs: Submission of the job to the (torque) cluster.

    written by (primarily) T Wolfers, (adapted) SM Kia, (adapted) S Rutherford.
    """
    if logdir is None:
        logdir = os.path.expanduser('~')

    # resource request handed to qsub via -l
    resources = 'mem=' + memory + ',walltime=' + duration

    # the job path is piped into qsub through the shell
    pieces = ['echo', job_path, '|', 'qsub', '-N', job_path,
              '-l', resources, '-e', logdir, '-o', logdir]
    qsub_call = [' '.join(pieces)]

    # submits job to cluster
    call(qsub_call, shell=True)
802
+
803
+
804
def extreme_value_prob_fit(NPM, perc):
    """ Fit a generalized extreme value distribution to per-row summaries
    of the largest absolute deviations.

    For every row of NPM the absolute values are sorted, the top `perc`
    fraction is kept, the largest 10% of that selection is discarded and
    the remainder is averaged. The distribution is fitted to these means.

    :param NPM: 2D array, one row per subject
    :param perc: fraction of the largest absolute values to consider

    :returns params: the parameters returned by ``genextreme.fit``
    """
    n_rows = NPM.shape[0]
    n_cols = NPM.shape[1]
    n_top = int(round(n_cols * perc))

    def _trimmed_top_mean(row):
        # the n_top largest absolute values, in ascending order
        top = np.sort(np.abs(row))[n_cols - n_top:]
        # drop the most extreme tenth before averaging
        keep = int(np.floor(0.90*top.shape[0]))
        return np.mean(top[0:keep])

    m = np.array([_trimmed_top_mean(NPM[i, :]) for i in range(n_rows)],
                 dtype=float)
    return genextreme.fit(m)
817
+
818
+
819
def extreme_value_prob(params, NPM, perc):
    """ Evaluate a fitted generalized extreme value distribution on
    per-row summaries of the largest absolute deviations.

    For every row of NPM the absolute values are sorted, the top `perc`
    fraction is kept, the largest 10% of that selection is discarded and
    the remainder is averaged.

    :param params: parameters of the distribution, unpacked into
        ``genextreme.cdf``
    :param NPM: 2D array, one row per subject
    :param perc: fraction of the largest absolute values to consider

    :returns probs: cumulative probability of each row's summary value
    """
    n_rows = NPM.shape[0]
    n_cols = NPM.shape[1]
    n_top = int(round(n_cols * perc))

    summaries = []
    for i in range(n_rows):
        ranked = np.sort(np.abs(NPM[i, :]))
        top = ranked[n_cols - n_top:]
        # drop the most extreme tenth before averaging
        keep = int(np.floor(0.90*top.shape[0]))
        summaries.append(np.mean(top[0:keep]))

    m = np.array(summaries, dtype=float)
    return genextreme.cdf(m, *params)
832
+
833
+
834
def ravel_2D(a):
    """ Flatten all trailing axes of an array into a single axis.

    :param a: array with at least one axis
    :returns: 2D array of shape (a.shape[0], product of remaining axes);
        a 1D input becomes a single column
    """
    s = a.shape
    # np.prod of an empty shape is the float 1.0, which np.reshape
    # rejects, so cast to int to support 1D input
    n_cols = int(np.prod(s[1:]))
    return np.reshape(a, [s[0], n_cols])
837
+
838
+
839
def unravel_2D(a, s):
    """ Reshape an array to the given shape.

    :param a: array to reshape
    :param s: target shape
    :returns: `a` reshaped to `s`
    """
    restored = np.reshape(a, s)
    return restored
841
+
842
+
843
def threshold_NPM(NPMs, fdr_thr=0.05, npm_thr=0.1):
    """ Compute voxels with significant NPMs.

    :param NPMs: 2D array of deviations, one row per subject
    :param fdr_thr: significance level handed to FDR for every row
    :param npm_thr: minimum fraction of rows in which a column must be
        significant to be selected in `m`

    :returns results: NPMs with all non-significant entries set to zero
    :returns masks: boolean array shaped like NPMs, True where significant
    :returns m: boolean vector, True for columns that are significant in
        more than `npm_thr` of the rows
    """
    p_values = stats.norm.cdf(-np.abs(NPMs))
    results = np.zeros(NPMs.shape)
    masks = np.full(NPMs.shape, False, dtype=bool)
    for i in range(p_values.shape[0]):
        masks[i, :] = FDR(p_values[i, :], fdr_thr)
        # builtin int: the np.int alias was removed in NumPy 1.24
        results[i, :] = NPMs[i, :] * masks[i, :].astype(int)
    m = np.sum(masks, axis=0)/masks.shape[0] > npm_thr
    return results, masks, m
854
+
855
+
856
def FDR(p_values, alpha):
    """ Compute the false discovery rate in all voxels for a subject.

    The p-values are ranked and the i-th smallest of the n values is
    flagged when it does not exceed (i / n) * alpha.

    :param p_values: array of p-values (any shape)
    :param alpha: target false discovery rate
    :returns h: boolean array shaped like p_values, True where flagged
    """
    dim = np.shape(p_values)
    # int() because np.prod of an empty shape is the float 1.0
    p_values = np.reshape(p_values, [int(np.prod(dim)),])

    order = np.argsort(p_values)
    sorted_p_values = p_values[order]
    testNum = len(p_values)

    # rank-dependent thresholds; builtin float, since the np.float alias
    # was removed in NumPy 1.24
    thresh = ((np.arange(testNum) + 1)/float(testNum)) * alpha

    # NOTE(review): each p-value is compared with its own threshold only;
    # a step-up Benjamini-Hochberg procedure would also flag every
    # p-value ranked below the largest one passing. Confirm the intent.
    h = sorted_p_values <= thresh

    # undo the sort so that h lines up with the input order
    h = h[np.argsort(order)]
    return np.reshape(h, dim)
869
+
870
+
871
def calibration_error(Y, m, s, cal_levels):
    """ Sum of absolute calibration errors over a set of coverage levels.

    For each level, a symmetric interval m +/- z*s is formed with z the
    standard normal quantile matching that level, and the fraction of
    observations falling inside it is compared with the level.

    :param Y: observed values
    :param m: predictive means
    :param s: predictive standard deviations
    :param cal_levels: iterable of nominal coverage levels (e.g. 0.95)

    :returns ce: accumulated absolute difference between nominal and
        observed coverage
    """
    ce = 0
    for level in cal_levels:
        z = np.abs(norm.ppf((1-level)/2))
        upper = m + z * s
        lower = m - z * s
        inside = np.logical_and(Y >= lower, Y <= upper)
        observed = np.sum(inside)/Y.shape[0]
        ce = ce + np.abs(level - observed)
    return ce
880
+
881
+
882
def simulate_data(method='linear', n_samples=100, n_features=1, n_grps=1,
                  working_dir=None, plot=False, random_state=None, noise=None):
    """
    Simulates synthetic data for testing purposes, with options for linear, non-linear,
    or combined data generation methods, and various noise types.

    For every group, twice `n_samples` points are generated; the first half
    becomes training data and the second half test data. Responses are
    divided by 100. The global NumPy random generator is seeded with
    `random_state`.

    :param method: Method to simulate ('linear', 'non-linear', or 'combined').
    :param n_samples: Number of samples per group, either an int or a list for each group (default=100).
    :param n_features: Number of features to simulate (default=1).
    :param n_grps: Number of groups in the data (default=1).
    :param working_dir: Directory to save the data as pickled DataFrames; created
                        (including parent directories) if missing (default=None).
    :param plot: Boolean flag to plot the simulated training data (default=False).
    :param random_state: Seed for random number generation (default=None).
    :param noise: Type of noise to add ('homoscedastic_gaussian', 'heteroscedastic_gaussian',
                  'homoscedastic_nongaussian', 'heteroscedastic_nongaussian', default=None).

    :returns: Tuple of (X_train, Y_train, grp_id_train, X_test, Y_test, grp_id_test, coef)
    :raises ValueError: if `method` is not one of the supported methods.
    """

    np.random.seed(random_state)

    # accept NumPy integers as well as builtin ints for a common group size
    if isinstance(n_samples, (int, np.integer)):
        n_samples = [n_samples for _ in range(n_grps)]

    X_train, Y_train, X_test, Y_test = [], [], [], []
    grp_id_train, grp_id_test = [], []
    coef = []

    # NOTE: the order of the random draws below determines the output for
    # a given seed; keep it unchanged.
    for i in range(n_grps):
        bias = np.random.randint(-10, high=10)

        if method == 'linear':
            X_temp, Y_temp, coef_temp = make_regression(
                n_samples=n_samples[i] * 2, n_features=n_features, n_targets=1,
                noise=10 * np.random.rand(), bias=bias, n_informative=1, coef=True,
            )
        elif method == 'non-linear':
            X_temp = np.random.randint(-2, 6, [2 * n_samples[i], n_features]) \
                + np.random.randn(2 * n_samples[i], n_features)
            Y_temp = X_temp[:, 0] * 20 * np.random.rand() + np.random.randint(10, 100) \
                * np.sin(2 * np.random.rand() + 2 * np.pi / 5 * X_temp[:, 0])
            coef_temp = 0
        elif method == 'combined':
            X_temp = np.random.randint(-2, 6, [2 * n_samples[i], n_features]) \
                + np.random.randn(2 * n_samples[i], n_features)
            Y_temp = (X_temp[:, 0]**3) * np.random.uniform(0, 0.5) \
                + X_temp[:, 0] * 20 * np.random.rand() \
                + np.random.randint(10, 100)
            coef_temp = 0
        else:
            raise ValueError(
                "Unknown method. Please specify 'linear', 'non-linear', or 'combined'.")

        # first half of the samples for training, second half for testing
        coef.append(coef_temp / 100)
        X_train.append(X_temp[:n_samples[i]])
        Y_train.append(Y_temp[:n_samples[i]] / 100)
        X_test.append(X_temp[n_samples[i]:])
        Y_test.append(Y_temp[n_samples[i]:] / 100)
        grp_id = np.repeat(i, n_samples[i] * 2)
        grp_id_train.append(grp_id[:n_samples[i]])
        grp_id_test.append(grp_id[n_samples[i]:])

        # random divisor for the homoscedastic noise of this group
        t = np.random.randint(1, 5)
        # Add noise to the data
        if noise == 'homoscedastic_gaussian':
            Y_train[i] += np.random.normal(loc=0,
                                           scale=0.2, size=Y_train[i].shape[0]) / t
            Y_test[i] += np.random.normal(loc=0,
                                          scale=0.2, size=Y_test[i].shape[0]) / t

        elif noise == 'heteroscedastic_gaussian':
            # noise scale is a softplus of the first covariate
            Y_train[i] += np.random.normal(loc=0, scale=np.log(
                1 + np.exp(X_train[i][:, 0])), size=Y_train[i].shape[0])
            Y_test[i] += np.random.normal(loc=0, scale=np.log(
                1 + np.exp(X_test[i][:, 0])), size=Y_test[i].shape[0])

        elif noise == 'homoscedastic_nongaussian':
            Y_train[i] += skewnorm.rvs(a=10, loc=0,
                                       scale=0.2, size=Y_train[i].shape[0]) / t
            Y_test[i] += skewnorm.rvs(a=10, loc=0,
                                      scale=0.2, size=Y_test[i].shape[0]) / t

        elif noise == 'heteroscedastic_nongaussian':
            Y_train[i] += skewnorm.rvs(a=10, loc=0, scale=np.log(
                1 + np.exp(0.3 * X_train[i][:, 0])), size=Y_train[i].shape[0])
            Y_test[i] += skewnorm.rvs(a=10, loc=0, scale=np.log(
                1 + np.exp(0.3 * X_test[i][:, 0])), size=Y_test[i].shape[0])

    # stack the groups
    X_train = np.vstack(X_train)
    X_test = np.vstack(X_test)
    Y_train = np.concatenate(Y_train)
    Y_test = np.concatenate(Y_test)
    grp_id_train = np.expand_dims(np.concatenate(grp_id_train), axis=1)
    grp_id_test = np.expand_dims(np.concatenate(grp_id_test), axis=1)

    if plot:
        for i in range(n_features):
            plt.figure()
            for j in range(n_grps):
                plt.scatter(X_train[grp_id_train[:, 0] == j, i],
                            Y_train[grp_id_train[:, 0] == j], label='Group ' + str(j))
            plt.xlabel(f'X{i}')
            plt.ylabel('Y')
            plt.legend()
        plt.show()

    if working_dir:
        # also creates missing parent directories
        os.makedirs(working_dir, exist_ok=True)

        outputs = (
            ('trbefile.pkl', grp_id_train),
            ('tsbefile.pkl', grp_id_test),
            ('X_train.pkl', X_train),
            ('X_test.pkl', X_test),
            ('Y_train.pkl', Y_train),
            ('Y_test.pkl', Y_test),
        )
        for file_name, data in outputs:
            with open(os.path.join(working_dir, file_name), 'wb') as file:
                pickle.dump(pd.DataFrame(data), file,
                            protocol=pickle.HIGHEST_PROTOCOL)

    return X_train, Y_train, grp_id_train, X_test, Y_test, grp_id_test, coef
1012
+
1013
+
1014
def divergence_plot(nm, ylim=None):
    """
    Draw sampler diagnostic plots for a fitted HBR model.

    If more than one chain was configured and the model type is not 'nn',
    a histogram of the ``r_hat`` column of ``pm.summary`` is shown first.
    After that the trace, converted to a dataframe, is drawn in two stacked
    panels: the upper one for samples whose 'diverging' flag is 0 and the
    lower one for samples whose flag is 1.

    :param nm: object exposing ``nm.hbr.configs``, ``nm.hbr.model_type`` and
        ``nm.hbr.trace``.
    :param ylim: value handed to ``plt.ylim``; defaults to None.
    """
    hbr = nm.hbr

    if hbr.configs['n_chains'] > 1 and hbr.model_type != 'nn':
        summary = pm.summary(hbr.trace).round(2)
        plt.figure()
        plt.hist(summary['r_hat'], 10)
        plt.title('Gelman-Rubin diagnostic for divergence')

    diverging = hbr.trace['diverging']
    trace_frame = pm.trace_to_dataframe(hbr.trace)
    samples = trace_frame.values

    _, (ax_clean, ax_divergent) = plt.subplots(
        2, 1, figsize=(15, 4), sharex=True, sharey=True)

    # One line per sample, parameters along the x axis.
    ax_clean.plot(samples[diverging == 0].T, color='k', alpha=.05)
    ax_clean.set_title('No Divergences', fontsize=10)

    ax_divergent.plot(samples[diverging == 1].T, color='C2', lw=.5, alpha=.5)
    ax_divergent.set_title('Divergences', fontsize=10)

    plt.ylim(ylim)
    # Label every x position with the corresponding trace column name.
    plt.xticks(range(trace_frame.shape[1]), list(trace_frame.columns))
    plt.xticks(rotation=90, fontsize=7)
    plt.tight_layout()
    plt.show()
1036
+
1037
+
1038
def _fs_read_stats_table(stats_file):
    """Read the whitespace-delimited table of a Freesurfer stats file, skipping '#' lines."""
    return pd.read_csv(stats_file, delimiter=r'\s+', comment='#', header=None)


def _fs_read_aseg_measures(aseg_file):
    """Return {column 1: column 3} of the '# Measure' header lines of an aseg.stats file."""
    buffer = StringIO()
    with open(aseg_file) as f:
        buffer.writelines(line for line in f if line.startswith('# Measure'))
    buffer.seek(0)  # rewind so that pandas reads from the start
    table = pd.read_csv(buffer, header=None)
    return dict(zip(table[1], table[3]))


def _fs_cortical_values(stats_dir, col):
    """Collect column `col` of both hemispheres' aparc.a2009s tables, keys prefixed 'L_' / 'R_'."""
    values = dict()
    for prefix, file_name in (('L_', 'lh.aparc.a2009s.stats'),
                              ('R_', 'rh.aparc.a2009s.stats')):
        table = _fs_read_stats_table(os.path.join(stats_dir, file_name))
        values.update({prefix + name: value
                       for name, value in zip(table[0], table[col])})
    return values


def _fs_subcortical_values(stats_dir):
    """Return the aseg.stats table volumes divided by eTIV (or ICV when eTIV is absent)."""
    aseg_file = os.path.join(stats_dir, 'aseg.stats')
    measures = _fs_read_aseg_measures(aseg_file)
    # Keys keep the leading blank left over from splitting the header on ','.
    tiv = measures[' eTIV'] if ' eTIV' in measures else measures[' ICV']
    table = _fs_read_stats_table(aseg_file)
    return {name: volume / tiv
            for name, volume in zip(table[4] + '_mm3', table[3])}


def load_freesurfer_measure(measure, data_path, subjects_list):
    """This is a utility function to load different Freesurfer measures in a pandas Dataframe.

    Inputs

    :param measure: a string that defines the type of Freesurfer measure we want to load. \
        The options include:

        * 'NumVert': Number of Vertices in each cortical area based on Destrieux atlas.
        * 'SurfArea': Surface area for each cortical area based on Destrieux atlas.
        * 'GrayVol': Gray matter volume in each cortical area based on Destrieux atlas.
        * 'ThickAvg': Average cortical thickness in each cortical area based on Destrieux atlas.
        * 'ThickStd': STD of cortical thickness in each cortical area based on Destrieux atlas.
        * 'MeanCurv': Integrated Rectified Mean Curvature in each cortical area based on Destrieux atlas.
        * 'GausCurv': Integrated Rectified Gaussian Curvature in each cortical area based on Destrieux atlas.
        * 'FoldInd': Folding Index in each cortical area based on Destrieux atlas.
        * 'CurvInd': Intrinsic Curvature Index in each cortical area based on Destrieux atlas.
        * 'brain': Brain Segmentation Statistics from aseg.stats file.
        * 'subcortical_volumes': Subcortical areas volume (divided by eTIV, or ICV if eTIV is absent).

        Any other value yields an empty dataframe and an empty list.

    :param data_path: a string that specifies the path to the main Freesurfer folder
        (with or without a trailing path separator).
    :param subjects_list: A Python list containing the list of subject names to load the data for. \
        The subject names should match the folder name for each subject's Freesurfer data folder.

    Outputs:
        - df: A pandas dataframe containing the subject names as Index and target Freesurfer measures.
        - missing_subs: A Python list of subject names for which loading the target measure failed.

    """

    cortical_measures = ['NumVert', 'SurfArea', 'GrayVol', 'ThickAvg',
                         'ThickStd', 'MeanCurv', 'GausCurv', 'FoldInd', 'CurvInd']

    if measure in cortical_measures:
        # The measures appear in this order after the structure-name column.
        col = cortical_measures.index(measure) + 1

        def load_subject(stats_dir):
            return _fs_cortical_values(stats_dir, col)
    elif measure == 'brain':
        def load_subject(stats_dir):
            return _fs_read_aseg_measures(os.path.join(stats_dir, 'aseg.stats'))
    elif measure == 'subcortical_volumes':
        load_subject = _fs_subcortical_values
    else:
        return pd.DataFrame(), []

    frames = []
    missing_subs = []

    for i, sub in enumerate(subjects_list):
        try:
            # os.path.join makes the result independent of a trailing
            # separator on data_path.
            stats_dir = os.path.join(data_path, sub, 'stats')
            frames.append(pd.DataFrame(load_subject(stats_dir), index=[sub]))
        except Exception:
            # Best effort: any failure to read or parse a subject marks it
            # as missing instead of aborting the whole run.
            missing_subs.append(sub)
            print('%d / %d: %s is missing!' % (i, len(subjects_list), sub))
        else:
            print('%d / %d: %s is done!' % (i, len(subjects_list), sub))

    # Concatenate once instead of growing the dataframe inside the loop.
    df = pd.concat(frames) if frames else pd.DataFrame()

    return df, missing_subs
1154
+
1155
+
1156
class scaler:
    """
    Column-wise rescaling of data by standardization or (robust) min-max
    normalization.

    The fitted statistics are kept on the instance: ``m`` and ``s`` (plus the
    running ``Welford`` accumulator ``w``) for 'standardize', ``min`` and
    ``max`` for 'minmax' and 'robminmax'.
    """

    def __init__(self, scaler_type='standardize', tail=0.05,
                 adjust_outliers=True):
        """
        A class for rescaling data using either standardization or minmax
        normalization.

        :param scaler_type: String that decides the type of scaler including
            1) 'standardize' for standardizing data, 2) 'minmax' for minmax normalization
            in range of [0,1], and 3) 'robminmax' for robust (to outliers) minmax
            normalization. The default is 'standardize'.
        :param tail: Is a decimal in range [0,1] that decides the tails of
            distribution for finding robust min and max in 'robminmax'
            normalization. The default is 0.05.
        :param adjust_outliers: Boolean that decides whether values that fall
            outside [0,1] after 'minmax'/'robminmax' normalization are
            truncated to 0 or 1. The default is True.
        :raises ValueError: if `scaler_type` is not one of the three options.

        """

        self.scaler_type = scaler_type
        self.tail = tail
        self.adjust_outliers = adjust_outliers

        if self.scaler_type not in ['standardize', 'minmax', 'robminmax']:
            raise ValueError("Undifined scaler type!")

    def _robust_bounds(self, X):
        """
        Return, per column of the 2D array X, the median of the lowest and of
        the highest `tail` fraction of the sorted values.
        """
        # At least one sample per tail: with zero samples the lower slice
        # would be empty (NaN median) and the upper slice `[-0:]` would cover
        # the whole column.
        n_tail = max(1, int(np.round(X.shape[0] * self.tail)))
        ordered = np.sort(X, axis=0)
        low = np.asarray(np.median(ordered[:n_tail], axis=0), dtype=float)
        high = np.asarray(np.median(ordered[-n_tail:], axis=0), dtype=float)
        return low, high

    def fit(self, X):
        """
        Estimate the scaling statistics from X (samples in rows).

        :param X: data to fit on; must be a 2D array for 'robminmax'.
        """

        if self.scaler_type == 'standardize':
            self.w = Welford()
            self.w.consume(X)
            self.m = self.w.mean
            self.s = self.w.std

        elif self.scaler_type == 'minmax':
            self.min = np.min(X, axis=0)
            self.max = np.max(X, axis=0)

        elif self.scaler_type == 'robminmax':
            self.min, self.max = self._robust_bounds(X)

    def extend(self, X):
        """
        Update the statistics of an already fitted scaler with a new batch X.

        For 'standardize' the running accumulator consumes the batch; for
        'minmax'/'robminmax' the stored bounds are widened by the bounds of
        the batch.
        """
        if self.scaler_type == 'standardize':
            self.w.consume(X)
            self.m = self.w.mean
            self.s = self.w.std

        elif self.scaler_type in ['minmax']:
            self.min = np.minimum(self.min, np.min(X, axis=0))
            self.max = np.maximum(self.max, np.max(X, axis=0))

        elif self.scaler_type in ['robminmax']:
            low, high = self._robust_bounds(X)
            self.min = np.minimum(self.min, low)
            self.max = np.maximum(self.max, high)

    def transform(self, X, index=None):
        """
        Rescale X with the fitted statistics.

        :param X: data to rescale.
        :param index: optional index into the fitted statistics, used when X
            holds only a subset of the fitted columns.
        :return: the rescaled data (a new array; X itself is not modified).
        """

        if self.scaler_type == 'standardize':
            if index is None:
                X = (X - self.m) / self.s
            else:
                X = (X - self.m[index]) / self.s[index]

        elif self.scaler_type in ['minmax', 'robminmax']:
            if index is None:
                X = (X - self.min) / (self.max - self.min)
            else:
                X = (X - self.min[index]) / (self.max[index] - self.min[index])

            if self.adjust_outliers:
                # Truncate values that fall outside the fitted range.
                X[X < 0] = 0
                X[X > 1] = 1

        return X

    def inverse_transform(self, X, index=None):
        """
        Map rescaled data back to the original scale (truncated outliers are
        not recovered).

        :param X: rescaled data.
        :param index: optional index into the fitted statistics.
        :return: data on the original scale.
        """

        if self.scaler_type == 'standardize':
            if index is None:
                X = X * self.s + self.m
            else:
                X = X * self.s[index] + self.m[index]

        elif self.scaler_type in ['minmax', 'robminmax']:
            if index is None:
                X = X * (self.max - self.min) + self.min
            else:
                X = X * (self.max[index] - self.min[index]) + self.min[index]
        return X

    def fit_transform(self, X):
        """
        Fit the scaler on X and return the rescaled X.

        :param X: data to fit on and rescale.
        :return: the rescaled data.
        """
        self.fit(X)
        return self.transform(X)
1292
+
1293
+
1294
def _en_from_recon_log(log_file):
    """
    Return the (left, right) Euler numbers, as strings, from the last line of
    a recon-all log that matches 'orig.nofix lheno'.

    :raises ValueError: if the log contains no such line.
    """
    eno_line = None
    with open(log_file) as f:
        for line in f:
            # find the part that refers to the EC
            if re.search('orig.nofix lheno', line):
                eno_line = line
    if eno_line is None:
        raise ValueError('no Euler number entry in the log file')
    fields = eno_line.split()
    return fields[3][0:-1], fields[6]  # [0:-1] removes the trailing comma


def _en_from_surface(surface_file):
    """
    Run 'mris_euler_number' on a surface file and return the 13th
    blank-separated token of the first output line as np.float32.
    """
    # Argument list instead of a shell string: paths with blanks or shell
    # metacharacters are passed through untouched, and no temporary files are
    # left behind. stderr is merged into stdout, as '2>&1' did.
    res = subprocess.run(['mris_euler_number', surface_file],
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    first_line = res.stdout.decode('utf-8-sig').split('\n', 1)[0].strip()
    words = [item.strip() for item in first_line.split(' ')]
    return np.float32(words[12])


def retrieve_freesurfer_eulernum(freesurfer_dir, subjects=None, save_path=None):
    """
    This function receives the freesurfer directory (including processed data
    for several subjects) and retrieves the Euler number from the log files. If
    the log file does not exist, this function uses 'mris_euler_number' to recompute
    the Euler numbers (ENs). The function returns the ENs in a dataframe and
    the list of missing subjects (those for which retrieving the EN failed). If
    'save_path' is specified then the results will be saved in a pickle file.

    Basic usage::

        ENs, missing_subjects = retrieve_freesurfer_eulernum(freesurfer_dir)

    where the arguments are defined below.

    :param freesurfer_dir: absolute path to the Freesurfer directory.
    :param subjects: List of subjects that we want to retrieve the ENs for.
        If it is 'None' (the default), the list of the subjects will be automatically
        retrieved from existing directories in the 'freesurfer_dir' (i.e. the ENs
        for all subjects will be retrieved).
    :param save_path: The path to save the results. If 'None' (default) the
        results are not saved on the disk.


    :outputs: * ENs - A dataframe of retrieved ENs with columns 'lh_en',
                'rh_en' and 'avg_en'; subjects without ENs are dropped.
              * missing_subjects - The list of missing subjects.

    Developed by S.M. Kia

    """

    if subjects is None:
        subjects = [temp for temp in os.listdir(freesurfer_dir)
                    if os.path.isdir(os.path.join(freesurfer_dir, temp))]

    df = pd.DataFrame(index=subjects, columns=['lh_en', 'rh_en', 'avg_en'])
    missing_subjects = []

    for s, sub in enumerate(subjects):
        sub_dir = os.path.join(freesurfer_dir, sub)
        log_file = os.path.join(sub_dir, 'scripts', 'recon-all.log')

        if not os.path.exists(sub_dir):
            missing_subjects.append(sub)
            print('%d: Subject %s is missing.' % (s, sub))
            continue

        if os.path.exists(log_file):
            try:
                # Parsed afresh for every subject, so a log without the
                # Euler-number line can never reuse a previous subject's value.
                eno_l, eno_r = _en_from_recon_log(log_file)
                euler = (float(eno_l) + float(eno_r)) / 2
            except (IndexError, ValueError) as err:
                missing_subjects.append(sub)
                print('%d: Euler number could not be read from the log file '
                      'of subject %s: %s.' % (s, sub, err))
                continue

            # NOTE: values read from the log are stored as strings, values
            # recomputed below as np.float32.
            df.at[sub, 'lh_en'] = eno_l
            df.at[sub, 'rh_en'] = eno_r
            df.at[sub, 'avg_en'] = euler

            print('%d: Subject %s is successfully processed. EN = %f'
                  % (s, sub, df.at[sub, 'avg_en']))
        else:
            print('%d: Subject %s is missing log file, running QC ...' % (s, sub))
            try:
                eno_l = _en_from_surface(
                    os.path.join(sub_dir, 'surf', 'lh.orig.nofix'))
                eno_r = _en_from_surface(
                    os.path.join(sub_dir, 'surf', 'rh.orig.nofix'))

                df.at[sub, 'lh_en'] = eno_l
                df.at[sub, 'rh_en'] = eno_r
                df.at[sub, 'avg_en'] = (eno_r + eno_l) / 2

                print('%d: Subject %s is successfully processed. EN = %f'
                      % (s, sub, df.at[sub, 'avg_en']))

            except Exception as err:
                # Best effort: a missing executable or unexpected output
                # marks the subject as missing instead of aborting.
                missing_subjects.append(sub)
                print('%d: QC is failed for subject %s: %s.' % (s, sub, type(err)))

    df = df.dropna()

    if save_path is not None:
        with open(save_path, 'wb') as file:
            pickle.dump({'ENs': df}, file)

    return df, missing_subjects
1407
+
1408
+
1409
def get_package_versions():
    """
    Collect the versions of Python and of the main packages in use.

    :return: a dictionary with the keys 'Python', 'pytensor', 'PyMC' and
        'PCNtoolkit'. The value is the version string, or an empty string
        when the package cannot be imported or exposes no ``__version__``.
    """

    import importlib
    import platform

    versions = dict()
    versions['Python'] = platform.python_version()

    packages = (('pytensor', 'pytensor'),
                ('PyMC', 'pymc'),
                ('PCNtoolkit', 'pcntoolkit'))

    for label, module_name in packages:
        try:
            versions[label] = importlib.import_module(module_name).__version__
        except Exception:
            # Any import-time failure is reported as "version unknown";
            # KeyboardInterrupt and SystemExit are deliberately not caught.
            versions[label] = ''

    return versions
1434
+
1435
+
1436
def z_to_abnormal_p(Z):
    """
    Convert z-scores (deviations) into abnormal probabilities, computed as
    one minus the two-sided tail probability of the standard normal
    distribution at |Z|. For more information see Sec. 2.5 in
    https://www.biorxiv.org/content/10.1101/2021.05.28.446120v1.full.pdf.

    :param Z: n by p matrix of z-scores (deviations in normative modeling)
        where n is the number of subjects and p is the number of features.
    :type Z: numpy.array

    :return: a matrix of the same size as Z holding the probability of each
        sample being an abnormal sample.
    :rtype: numpy.array

    """

    one_sided_tail = norm.sf(np.abs(Z))
    two_sided_tail = one_sided_tail * 2

    return 1 - two_sided_tail
1456
+
1457
+
1458
def anomaly_detection_auc(abn_p, labels, n_permutation=None):
    """
    Compute feature-wise (region-wise) AUC scores for anomaly detection with
    a normative model. When `n_permutation` is given (e.g. 1000), a
    permutation p-value is additionally computed for every feature by
    shuffling the labels.

    :param abn_p: n by p matrix with the probability of each sample being
        an abnormal sample. This matrix can be computed using the
        'z_to_abnormal_p' function.
    :type abn_p: numpy.array
    :param labels: a vector of binary labels for n subjects, 0 for healthy
        and 1 for patients.
    :type labels: numpy.array
    :param n_permutation: if not None, a permutation significance test with
        n_permutation repetitions is performed for each feature. Defaults to
        None, in which case all returned p-values are zero.
    :type n_permutation: numpy.int
    :return: the AUCs and the permutation p-values, one entry per feature
        (i.e. brain region).
    :rtype: numpy.array

    """

    _, n_features = abn_p.shape
    aucs = np.zeros([n_features])
    p_values = np.zeros([n_features])
    with_permutations = n_permutation is not None

    for feat in range(n_features):
        scores = abn_p[:, feat]
        aucs[feat] = roc_auc_score(labels, scores)

        if not with_permutations:
            continue

        # Null distribution of the AUC under randomly shuffled labels.
        null_aucs = np.zeros([n_permutation])
        for rep in range(n_permutation):
            shuffled_labels = labels[np.random.permutation(len(labels))]
            null_aucs[rep] = roc_auc_score(shuffled_labels, scores)

        n_exceeding = np.sum(null_aucs > aucs[feat])
        p_values[feat] = (n_exceeding + 1) / (n_permutation + 1)
        print('Feature %d of %d is done: p_value=%f' %
              (feat, n_features, p_values[feat]))

    return aucs, p_values
1501
+
1502
+
1503
def cartesian_product(arrays):
    """
    Utility for creating dummy data (covariates): computes the cartesian
    product of N 1D arrays.

    Example:
        a = cartesian_product([np.arange(0, 5), np.arange(6, 10)])

    :param arrays: a list of N input 1D numpy arrays with size d1,d2,dN.
    :return: A (d1*...*dN) by N matrix holding the cartesian product of the
        N arrays; its dtype is taken from the first array.

    """

    n_arrays = len(arrays)
    grid_shape = [len(values) for values in arrays]
    out = np.empty(grid_shape + [n_arrays], dtype=np.result_type(arrays[0]))

    # np.ix_ yields one broadcastable view per input; writing it into the
    # last axis fills column `col` of every combination.
    for col, open_mesh in enumerate(np.ix_(*arrays)):
        out[..., col] = open_mesh

    return out.reshape(-1, n_arrays)
1522
+
1523
+
1524
def yes_or_no(question):
    """
    Ask the user a yes/no question and keep asking until the reply starts
    with 'y' or 'n' (case-insensitive, surrounding blanks ignored).

    :param question: String for user query.

    :return: Boolean of True for 'yes' and False for 'no'.

    """

    verdicts = {'y': True, 'n': False}
    prompt = question + ' (y/n): '

    while True:
        reply = str(input(prompt)).lower().strip()
        first_letter = reply[:1]
        if first_letter in verdicts:
            return verdicts[first_letter]
1541
+
1542
+
1543
+ # ====== This is stuff used for the SHASH distributions, but using numpy (not pymc or pytensor) ===
1544
+
1545
def K(p, x):
    """
    Evaluate ``scipy.special.kv`` (modified Bessel function of the second
    kind) for order `p` at `x` and return the result as a numpy array.
    """
    bessel_values = spp.kv(p, x)
    return np.array(bessel_values)
1547
+
1548
+
1549
def P(q):
    """
    The P function as given in Jones et al.

    Computed as exp(1/4) / sqrt(8 * pi) times the sum of the Bessel terms
    K((q + 1) / 2, 1/4) and K((q - 1) / 2, 1/4).

    :param q: argument of the P function.
    :return: the value of P at q.

    """
    scale = np.exp(1 / 4) / np.sqrt(8 * np.pi)
    upper_term = K((q + 1) / 2, 1 / 4)
    lower_term = K((q - 1) / 2, 1 / 4)
    return (upper_term + lower_term) * scale
1561
+
1562
+
1563
def m(epsilon, delta, r):
    """
    The r'th uncentered moment, as given by Jones et al.

    :param epsilon: first distribution parameter; enters through
        exp((r - 2i) * epsilon / delta).
    :param delta: second distribution parameter; also scales the argument of P.
    :param r: non-negative integer order of the moment.
    :return: the moment, 2**-r times an alternating binomial sum of
        exp(...) * P(...) terms.
    """
    def term(i):
        # Exponent shared by the exponential factor and the argument of P.
        power = r - 2 * i
        binom = spp.comb(r, i)
        sign = np.power(-1, i)
        growth = np.exp(power * epsilon / delta)
        return binom * sign * growth * P(power / delta)

    total = sum(term(i) for i in range(r + 1))
    return (1 / np.power(2, r)) * total
1576
+
1577
+ # ====== end stuff for SHASH
1578
+
1579
+ # Design matrix function
1580
+
1581
+
1582
def z_score(y, mean, std, skew=None, kurtosis=None, likelihood="Normal"):
    """
    Compute the Z-score of some data given parameters and a likelihood type
    string. For the SHASH likelihoods ("SHASHo", "SHASHo2", "SHASHb") the
    standardized data is passed through sinh(arcsinh(.) * kurtosis - skew);
    for any other likelihood 'skew' and 'kurtosis' are ignored and the plain
    (y - mean) / std is returned.

    :param y: the data.
    :param mean: location parameter.
    :param std: scale parameter.
    :param skew: skew parameter (SHASH likelihoods only).
    :param kurtosis: kurtosis parameter (SHASH likelihoods only).
    :param likelihood: likelihood type string, "Normal" by default.
    :return: the Z-scores.
    """
    if likelihood not in ("SHASHo", "SHASHo2", "SHASHb"):
        return (y - mean) / std

    if likelihood == "SHASHo":
        standardized = (y - mean) / std
    elif likelihood == "SHASHo2":
        # The scale is divided by the kurtosis parameter first.
        scaled_std = std / kurtosis
        standardized = (y - mean) / scaled_std
    else:
        # "SHASHb": rescale by the mean and std derived from the moments m(.).
        true_mean = m(skew, kurtosis, 1)
        true_std = np.sqrt((m(skew, kurtosis, 2) - true_mean ** 2))
        centered = ((y - mean) / std)
        standardized = centered * true_std + true_mean

    return np.sinh(np.arcsinh(standardized) * kurtosis - skew)
1610
+
1611
+
1612
def expand_all(*args):
    """
    Return the arguments as a list in which every one-dimensional array has
    been given a trailing axis of length one; all other arrays are passed
    through unchanged.

    :param args: arrays exposing a ``shape`` attribute.
    :return: list with one entry per argument.
    """
    return [np.expand_dims(arr, axis=1) if len(arr.shape) == 1 else arr
            for arr in args]
1619
+
1620
+
1621
+
1622
+
1623
class Welford(object):
    """Running mean and standard deviation via Welford's algorithm, as
    described at:
    http://www.johndcook.com/standard_deviation.html
    Taken from: https://gist.github.com/alexalemi/2151722#file-welford-py
    Adapted to work with numpy arrays.

    Accepts single values or iterables; iterating over a 2D array consumes
    it row by row, so the statistics are kept per column.

    Properties:
        mean    - returns the mean
        std     - returns the (sample) standard deviation
        meanfull- returns the mean and the std of the mean

    Usage:
        >>> foo = Welford()
        >>> foo(range(100))      # feed an iterable ...
        >>> foo(3.0)             # ... or a single value
        >>> foo.mean, foo.std
        >>> foo.meanfull
    """

    def __init__(self, lst=None):
        # Number of samples seen, running mean and running sum of squared
        # deviations.
        self.k = np.array([0])
        self.M = np.array([0])
        self.S = np.array([0])

        self.__call__(lst)

    def update(self, x):
        """Fold one sample into the running statistics; None is ignored."""
        if self.k == 0 and isinstance(x, np.ndarray):
            # First array sample: shape the accumulators like the sample.
            self.M = np.zeros_like(x)
            self.S = np.zeros_like(x)
        if x is None:
            return

        self.k += 1
        deviation = x - self.M
        updated_mean = self.M + deviation * 1.0 / self.k
        updated_sq = self.S + deviation * (x - updated_mean)
        self.M = updated_mean
        self.S = updated_sq

    def consume(self, lst):
        """Fold every element of an iterable into the running statistics."""
        for sample in iter(lst):
            self.update(sample)

    def __call__(self, x):
        """Dispatch to `consume` for iterables and to `update` otherwise."""
        if not hasattr(x, "__iter__"):
            self.update(x)
            return
        self.consume(x)

    @property
    def mean(self) -> np.ndarray:
        """Running mean."""
        return self.M

    @property
    def meanfull(self) -> tuple[np.ndarray, np.ndarray]:
        """Running mean together with its standard error, std / sqrt(k)."""
        standard_error = self.std / np.sqrt(self.k)
        return self.mean, standard_error

    @property
    def std(self) -> np.ndarray:
        """Sample standard deviation; zeros after a single sample."""
        if self.k == 1:
            return np.zeros_like(self.M)
        variance = self.S / (self.k - 1)
        return np.sqrt(variance)

    def __repr__(self):
        return "<Welford: {} +- {}>".format(self.mean, self.std)