pcntoolkit 0.32.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1647 @@
1
+ #!/opt/conda/bin/python
2
+
3
+ # ------------------------------------------------------------------------------
4
+ # Usage:
5
+ # python normative.py -m [maskfile] -k [number of CV folds] -c <covariates>
6
+ # -t [test covariates] -r [test responses] <infile>
7
+ #
8
+ # Either the -k switch or -t switch should be specified, but not both.
9
+ # If -t is selected, a set of responses should be provided with the -r switch
10
+ #
11
+ # Written by A. Marquand
12
+ # ------------------------------------------------------------------------------
13
+
14
+ from __future__ import division, print_function
15
+
16
+ import argparse
17
+ import glob
18
+ import os
19
+ import pickle
20
+ import sys
21
+ from pathlib import Path
22
+
23
+ import numpy as np
24
+ from sklearn.model_selection import KFold
25
+
26
+ try:
27
+ import nutpie
28
+ except ImportError:
29
+ # warnings.warn("Nutpie not installed. For fitting HBR models with the nutpie backend, install it with `conda install nutpie numba`")
30
+ pass
31
+
32
+
33
+ try: # run as a package if installed
34
+ from pcntoolkit import configs
35
+ from pcntoolkit.dataio import fileio
36
+ from pcntoolkit.normative_model.norm_utils import norm_init
37
+ from pcntoolkit.util.utils import (
38
+ CustomCV,
39
+ compute_MSLL,
40
+ compute_pearsonr,
41
+ explained_var,
42
+ get_package_versions,
43
+ scaler,
44
+ )
45
+ except ImportError:
46
+ pass
47
+
48
+ path = os.path.abspath(os.path.dirname(__file__))
49
+ if path not in sys.path:
50
+ sys.path.append(path)
51
+ # sys.path.append(os.path.join(path,'normative_model'))
52
+ del path
53
+
54
+ import configs
55
+ from dataio import fileio
56
+ from normative_model.norm_utils import norm_init
57
+ from util.utils import (
58
+ CustomCV,
59
+ compute_MSLL,
60
+ compute_pearsonr,
61
+ explained_var,
62
+ get_package_versions,
63
+ scaler,
64
+ )
65
+
66
+ PICKLE_PROTOCOL = configs.PICKLE_PROTOCOL
67
+
68
+
69
def load_response_vars(datafile, maskfile=None, vol=True):
    """
    Load response variables from file.

    Loads the data and masks it if necessary. Ascii data is returned as a
    numpy array; neuroimaging data is reshaped into a 2D array
    (subjects x variables) and a mask is created if necessary.

    :param datafile: File containing the response variables
    :param maskfile: Mask file (nifti only)
    :param vol: If True, load the data as a 4D volume (nifti only)
    :returns Y: Response variables
    :returns volmask: Mask volume (nifti only, otherwise None)
    """

    # Non-nifti path: plain load, transposing cifti data so that rows
    # index subjects.
    if fileio.file_type(datafile) != 'nifti':
        Y = fileio.load(datafile)
        if fileio.file_type(datafile) == 'cifti':
            Y = Y.T
        return Y, None

    # Nifti path: load the volume, derive/apply the mask and vectorize.
    dat = fileio.load_nifti(datafile, vol=vol)
    volmask = fileio.create_mask(dat, mask=maskfile)
    return fileio.vol2vec(dat, volmask).T, volmask
94
+
95
+
96
def get_args(*args):
    """
    Parse command line arguments for normative modeling.

    :param args: command line arguments (a 1-tuple containing the argv list)
    :returns respfile: response variables for the normative model
    :returns maskfile: mask used to apply to the data (nifti only)
    :returns covfile: covariates used to predict the response variable
    :returns cvfolds: Number of cross-validation folds
    :returns testcov: Test covariates
    :returns testresp: Test responses
    :returns func: Function to call
    :returns alg: Algorithm for normative model
    :returns configparam: Parameters controlling the estimation algorithm
    :returns kw_args: Additional keyword arguments (always string-valued)
    """
    args = args[0][0]
    # parse arguments
    parser = argparse.ArgumentParser(description="Normative Modeling")
    parser.add_argument("respfile", help="Response variables for the normative model")
    parser.add_argument("-f", help="Function to call", dest="func", default="estimate")
    parser.add_argument("-m", help="mask file", dest="maskfile", default=None)
    parser.add_argument("-c", help="covariates file", dest="covfile", default=None)
    parser.add_argument("-k", help="cross-validation folds", dest="cvfolds", default=None)
    parser.add_argument("-t", help="covariates (test data)", dest="testcov", default=None)
    parser.add_argument("-r", help="responses (test data)", dest="testresp", default=None)
    parser.add_argument("-a", help="algorithm", dest="alg", default="gpr")
    parser.add_argument("-x", help="algorithm specific config options", dest="configparam", default=None)
    parsed_args, keyword_args = parser.parse_known_args(args)

    # Process required arguments
    wdir = os.path.realpath(os.path.curdir)
    respfile = os.path.join(wdir, parsed_args.respfile)
    if parsed_args.covfile is None:
        raise ValueError("No covariates specified")
    covfile = parsed_args.covfile

    # Process optional arguments
    if parsed_args.maskfile is None:
        maskfile = None
    else:
        maskfile = os.path.join(wdir, parsed_args.maskfile)
    if parsed_args.testcov is None and parsed_args.cvfolds is not None:
        testcov = None
        testresp = None
        cvfolds = int(parsed_args.cvfolds)
        print("Running under " + str(cvfolds) + " fold cross-validation.")
    else:
        print("Test covariates specified")
        testcov = parsed_args.testcov
        cvfolds = None
        if parsed_args.testresp is None:
            testresp = None
            print("No test response variables specified")
        else:
            testresp = parsed_args.testresp
        if parsed_args.cvfolds is not None:
            print("Ignoring cross-validation specification (test data given)")

    # Process additional keyword arguments. These are always added as strings.
    # NOTE: the original implementation built this dict via exec() on
    # command-line text, which allowed arbitrary code execution and silently
    # truncated values containing '='. A plain dict assignment with a
    # single-split preserves the full value and removes the injection risk.
    kw_args = {}
    for kw in keyword_args:
        key, value = kw.split('=', 1)
        kw_args[key] = str(value)

    return respfile, maskfile, covfile, cvfolds, \
        testcov, testresp, parsed_args.func, parsed_args.alg, \
        parsed_args.configparam, kw_args
167
+
168
+
169
def evaluate(Y, Yhat, S2=None, mY=None, sY=None, nlZ=None, nm=None, Xz_tr=None, alg=None,
             metrics=('Rho', 'RMSE', 'SMSE', 'EXPV', 'MSLL')):
    ''' Compute error metrics
    This function will compute error metrics based on a set of predictions Yhat
    and a set of true response variables Y, namely:

    * Rho: Pearson correlation
    * RMSE: root mean squared error
    * SMSE: standardized mean squared error
    * EXPV: explained variance

    If the predictive variance is also specified the log loss will be computed
    (which also takes into account the predictive variance). If the mean and
    standard deviation are also specified these will be used to standardize
    this, yielding the mean standardized log loss

    :param Y: N x P array of true response variables
    :param Yhat: N x P array of predicted response variables
    :param S2: predictive variance
    :param mY: mean of the training set
    :param sY: standard deviation of the training set
    :param nlZ: negative log likelihood (for 'NLL'/'BIC')
    :param nm: normative model object (for 'BIC')
    :param Xz_tr: training covariates (for 'BIC')
    :param alg: algorithm name (for 'BIC')
    :param metrics: iterable of metric names to compute

    :returns metrics: evaluation metrics

    '''

    # Work on a private copy. The original implementation used a mutable
    # default list and remove()d entries from it in place, which permanently
    # dropped 'MSLL'/'SMSE' from the default for all subsequent calls and
    # also mutated any list passed in by the caller.
    metrics = list(metrics)

    feature_num = Y.shape[1]

    # Remove metrics that cannot be computed with only a single data point
    if Y.shape[0] == 1:
        if 'MSLL' in metrics:
            metrics.remove('MSLL')
        if 'SMSE' in metrics:
            metrics.remove('SMSE')

    # find and remove bad variables from the response variables
    # (keep only columns that are finite somewhere and non-constant)
    nz = np.where(np.bitwise_and(np.isfinite(Y).any(axis=0),
                                 np.var(Y, axis=0) != 0))[0]

    MSE = np.mean((Y - Yhat)**2, axis=0)

    results = dict()

    if 'RMSE' in metrics:
        RMSE = np.sqrt(MSE)
        results['RMSE'] = RMSE

    if 'Rho' in metrics:
        Rho = np.zeros(feature_num)
        pRho = np.ones(feature_num)
        Rho[nz], pRho[nz] = compute_pearsonr(Y[:, nz], Yhat[:, nz])
        results['Rho'] = Rho
        results['pRho'] = pRho

    if 'SMSE' in metrics:
        # MSE normalized by the variance of the true responses
        SMSE = np.zeros_like(MSE)
        SMSE[nz] = MSE[nz] / np.var(Y[:, nz], axis=0)
        results['SMSE'] = SMSE

    if 'EXPV' in metrics:
        EXPV = np.zeros(feature_num)
        EXPV[nz] = explained_var(Y[:, nz], Yhat[:, nz])
        results['EXPV'] = EXPV

    if 'MSLL' in metrics:
        # MSLL needs the predictive variance plus training mean/std for
        # standardization; silently skipped otherwise.
        if ((S2 is not None) and (mY is not None) and (sY is not None)):
            MSLL = np.zeros(feature_num)
            MSLL[nz] = compute_MSLL(Y[:, nz], Yhat[:, nz], S2[:, nz],
                                    mY.reshape(-1, 1).T,
                                    (sY**2).reshape(-1, 1).T)
            results['MSLL'] = MSLL

    if 'NLL' in metrics:
        results['NLL'] = nlZ

    if 'BIC' in metrics:
        # BIC = k*log(n) + 2*nlZ, only when the regressor exposes its
        # hyperparameters (k = number of hyperparameters)
        if hasattr(getattr(nm, alg), 'hyp'):
            n = Xz_tr.shape[0]
            k = len(getattr(nm, alg).hyp)
            BIC = k * np.log(n) + 2 * nlZ
            results['BIC'] = BIC

    return results
252
+
253
+
254
def save_results(respfile, Yhat, S2, maskvol, Z=None, Y=None, outputsuffix=None,
                 results=None, save_path=''):
    """
    Writes the results of the normative model to disk.

    Parameters:
        respfile (str): The response variables file.
        Yhat (np.array): The predicted response variables.
        S2 (np.array): The predictive variance.
        maskvol (np.array): The mask volume.
        Z (np.array, optional): The latent variable. Defaults to None.
        Y (np.array, optional): The observed response variables. Defaults to None.
        outputsuffix (str, optional): The suffix to append to the output files. Defaults to None.
        results (dict, optional): The results of the normative model. Defaults to None.
        save_path (str, optional): The directory to save the results to. Defaults to ''.

    Returns:
        None
    """

    print("Writing outputs ...")

    # Decide on the output format: neuroimaging response files act as an
    # example header for the outputs, anything else just reuses its own
    # extension (or pickle when no response file was given).
    if respfile is None:
        exfile = None
        file_ext = '.pkl'
    else:
        exfile = respfile if fileio.file_type(respfile) in ('cifti', 'nifti') else None
        file_ext = fileio.file_extension(respfile)

    ext = file_ext if outputsuffix is None else str(outputsuffix) + file_ext

    def _write(data, basename):
        # one-line helper: write one output array next to the others
        fileio.save(data, os.path.join(save_path, basename),
                    example=exfile, mask=maskvol)

    _write(Yhat, 'yhat' + ext)
    _write(S2, 'ys2' + ext)
    if Z is not None:
        _write(Z, 'Z' + ext)
    if Y is not None:
        _write(Y, 'Y' + ext)

    if results is not None:
        for metric in list(results.keys()):
            # NLL/BIC are per-model summaries, so for volumetric outputs
            # they are pickled instead of being written as images.
            if metric in ('NLL', 'BIC') and file_ext == '.nii.gz':
                _write(results[metric], metric + str(outputsuffix) + '.pkl')
            else:
                _write(results[metric], metric + ext)
309
+
310
+
311
def estimate(covfile, respfile, **kwargs):
    """ Estimate a normative model

    This will estimate a model in one of two settings according to
    the particular parameters specified (see below)

    * under k-fold cross-validation.
      requires respfile, covfile and cvfolds>=2
    * estimating a training dataset then applying to a second test dataset.
      requires respfile, covfile, testcov and testresp.
    * estimating on a training dataset output of forward maps mean and se.
      requires respfile, covfile and testcov

    The models are estimated on the basis of data stored on disk in ascii or
    neuroimaging data formats (nifti or cifti). Ascii data should be in
    tab or space delimited format with the number of subjects in rows and the
    number of variables in columns. Neuroimaging data will be reshaped
    into the appropriate format

    Basic usage::

        estimate(covfile, respfile, [extra_arguments])

    where the variables are defined below. Note that either the cvfolds
    parameter or (testcov, testresp) should be specified, but not both.

    :param respfile: response variables for the normative model
    :param covfile: covariates used to predict the response variable
    :param maskfile: mask used to apply to the data (nifti only)
    :param cvfolds: Number of cross-validation folds
    :param testcov: Test covariates
    :param testresp: Test responses
    :param alg: Algorithm for normative model
    :param configparam: Parameters controlling the estimation algorithm
    :param saveoutput: Save the output to disk? Otherwise returned as arrays
    :param outputsuffix: Text string to add to the output filenames
    :param inscaler: Scaling approach for input covariates, could be 'None' (Default),
                     'standardize', 'minmax', or 'robminmax'.
    :param outscaler: Scaling approach for output responses, could be 'None' (Default),
                      'standardize', 'minmax', or 'robminmax'.

    All outputs are written to disk in the same format as the input. These are:

    :outputs: * yhat - predictive mean
              * ys2 - predictive variance
              * nm - normative model
              * Z - deviance scores
              * Rho - Pearson correlation between true and predicted responses
              * pRho - parametric p-value for this correlation
              * rmse - root mean squared error between true/predicted responses
              * smse - standardised mean squared error

    The outputsuffix may be useful to estimate multiple normative models in the
    same directory (e.g. for custom cross-validation schemes)
    """

    # parse keyword arguments
    maskfile = kwargs.pop('maskfile', None)
    cvfolds = kwargs.pop('cvfolds', None)
    testcov = kwargs.pop('testcov', None)
    testresp = kwargs.pop('testresp', None)
    alg = kwargs.pop('alg', 'gpr')
    outputsuffix = kwargs.pop('outputsuffix', 'estimate')
    # Making sure there is only one
    outputsuffix = "_" + outputsuffix.replace("_", "")
    # '_' is in the outputsuffix to
    # avoid file name parsing problem.
    inscaler = kwargs.pop('inscaler', 'None')
    print(f"inscaler: {inscaler}")
    outscaler = kwargs.pop('outscaler', 'None')
    print(f"outscaler: {outscaler}")
    # NOTE: warp is get() not pop() -- downstream regressors also read it
    warp = kwargs.get('warp', None)

    # convert from strings if necessary (these may arrive as 'True'/'False'
    # when invoked via the command line)
    saveoutput = kwargs.pop('saveoutput', 'True')
    if type(saveoutput) is str:
        saveoutput = saveoutput == 'True'
    savemodel = kwargs.pop('savemodel', 'False')
    if type(savemodel) is str:
        savemodel = savemodel == 'True'

    if savemodel and not os.path.isdir('Models'):
        os.mkdir('Models')

    # which output metrics to compute
    metrics = ['Rho', 'RMSE', 'SMSE', 'EXPV', 'MSLL', 'NLL', 'BIC']

    # load data
    print("Processing data in " + respfile)
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]
    Nmod = Y.shape[1]

    if (testcov is not None) and (cvfolds is None):  # a separate test dataset

        run_cv = False
        cvfolds = 1
        Xte = fileio.load(testcov)
        if len(Xte.shape) == 1:
            Xte = Xte[:, np.newaxis]
        if testresp is not None:
            Yte, testmask = load_response_vars(testresp, maskfile)
            if len(Yte.shape) == 1:
                Yte = Yte[:, np.newaxis]
        else:
            # no test responses: forward-map mode, dummy zeros keep shapes
            sub_te = Xte.shape[0]
            Yte = np.zeros([sub_te, Nmod])

        # treat as a single train-test split: test rows are appended after
        # the training rows and selected via CustomCV
        testids = range(X.shape[0], X.shape[0]+Xte.shape[0])
        splits = CustomCV((range(0, X.shape[0]),), (testids,))

        Y = np.concatenate((Y, Yte), axis=0)
        X = np.concatenate((X, Xte), axis=0)

    else:
        run_cv = True
        # we are running under cross-validation
        splits = KFold(n_splits=cvfolds, shuffle=True)
        testids = range(0, X.shape[0])
        if alg == 'hbr':
            trbefile = kwargs.get('trbefile', None)
            if trbefile is not None:
                be = fileio.load(trbefile)
                if len(be.shape) == 1:
                    be = be[:, np.newaxis]
            else:
                print('No batch-effects file! Initilizing all as zeros!')
                be = np.zeros([X.shape[0], 1])

    # find and remove bad variables from the response variables
    # note: the covariates are assumed to have already been checked
    nz = np.where(np.bitwise_and(np.isfinite(Y).any(axis=0),
                                 np.var(Y, axis=0) != 0))[0]

    # run cross-validation loop
    Yhat = np.zeros_like(Y)
    S2 = np.zeros_like(Y)
    Z = np.zeros_like(Y)
    # negative log likelihood per (response variable, fold)
    nlZ = np.zeros((Nmod, cvfolds))

    scaler_resp = []
    scaler_cov = []
    mean_resp = []  # this is just for computing MSLL
    std_resp = []  # this is just for computing MSLL

    if warp is not None:
        Ywarp = np.zeros_like(Yhat)

        # for warping we need to compute metrics separately for each fold
        results_folds = dict()
        for m in metrics:
            results_folds[m] = np.zeros((Nmod, cvfolds))

    for idx in enumerate(splits.split(X)):

        # idx is (fold_number, (train_indices, test_indices))
        fold = idx[0]
        tr = idx[1][0]
        ts = idx[1][1]

        # standardize responses and covariates, ignoring invalid entries
        iy_tr, jy_tr = np.ix_(tr, nz)
        iy_ts, jy_ts = np.ix_(ts, nz)
        mY = np.mean(Y[iy_tr, jy_tr], axis=0)
        sY = np.std(Y[iy_tr, jy_tr], axis=0)
        mean_resp.append(mY)
        std_resp.append(sY)

        if inscaler in ['standardize', 'minmax', 'robminmax']:
            # scaler is fit on the training fold only, then applied to test
            X_scaler = scaler(inscaler)
            Xz_tr = X_scaler.fit_transform(X[tr, :])
            Xz_ts = X_scaler.transform(X[ts, :])
            scaler_cov.append(X_scaler)
        else:
            Xz_tr = X[tr, :]
            Xz_ts = X[ts, :]

        if outscaler in ['standardize', 'minmax', 'robminmax']:
            Y_scaler = scaler(outscaler)
            Yz_tr = Y_scaler.fit_transform(Y[iy_tr, jy_tr])
            scaler_resp.append(Y_scaler)
        else:
            Yz_tr = Y[iy_tr, jy_tr]

        if (run_cv == True and alg == 'hbr'):
            # HBR regressors read batch effects from files, so the per-fold
            # slices are written to temp files and passed through kwargs
            fileio.save(be[tr, :], 'be_kfold_tr_tempfile.pkl')
            fileio.save(be[ts, :], 'be_kfold_ts_tempfile.pkl')
            kwargs['trbefile'] = 'be_kfold_tr_tempfile.pkl'
            kwargs['tsbefile'] = 'be_kfold_ts_tempfile.pkl'

        # estimate the models for all response variables
        for i in range(0, len(nz)):
            print("Estimating model ", i+1, "of", len(nz))
            nm = norm_init(Xz_tr, Yz_tr[:, i], alg=alg, **kwargs)

            try:
                nm = nm.estimate(Xz_tr, Yz_tr[:, i], **kwargs)
                yhat, s2 = nm.predict(Xz_ts, Xz_tr, Yz_tr[:, i], **kwargs)

                if savemodel:
                    nm.save('Models/NM_' + str(fold) + '_' + str(nz[i]) +
                            outputsuffix + '.pkl')

                # map predictions back to the original response scale
                if outscaler == 'standardize':
                    Yhat[ts, nz[i]] = Y_scaler.inverse_transform(yhat, index=i)
                    S2[ts, nz[i]] = s2 * sY[i]**2
                elif outscaler in ['minmax', 'robminmax']:
                    Yhat[ts, nz[i]] = Y_scaler.inverse_transform(yhat, index=i)
                    S2[ts, nz[i]] = s2 * (Y_scaler.max[i] - Y_scaler.min[i])**2
                else:
                    Yhat[ts, nz[i]] = yhat
                    S2[ts, nz[i]] = s2

                nlZ[nz[i], fold] = nm.neg_log_lik

                if (run_cv or testresp is not None):
                    if warp is not None:
                        # TODO: Warping for scaled data
                        if outscaler is not None and outscaler != 'None':
                            raise ValueError(
                                "outscaler not yet supported warping")
                        warp_param = nm.blr.hyp[1:nm.blr.warp.get_n_params()+1]
                        Ywarp[ts, nz[i]] = nm.blr.warp.f(
                            Y[ts, nz[i]], warp_param)
                        Ytest = Ywarp[ts, nz[i]]

                        # Save warped mean of the training data (for MSLL)
                        yw = nm.blr.warp.f(Y[tr, nz[i]], warp_param)

                        # create arrays for evaluation
                        Yhati = Yhat[ts, nz[i]]
                        Yhati = Yhati[:, np.newaxis]
                        S2i = S2[ts, nz[i]]
                        S2i = S2i[:, np.newaxis]

                        # evaluate and save results per fold (warped metrics
                        # cannot be aggregated across folds afterwards)
                        mf = evaluate(Ytest[:, np.newaxis], Yhati, S2=S2i,
                                      mY=np.mean(yw), sY=np.std(yw),
                                      nlZ=nm.neg_log_lik, nm=nm, Xz_tr=Xz_tr,
                                      alg=alg, metrics=metrics)
                        for k in metrics:
                            results_folds[k][nz[i]][fold] = mf[k]
                    else:
                        Ytest = Y[ts, nz[i]]

                    if alg == 'hbr':
                        # HBR computes deviance scores from MCMC samples
                        if outscaler in ['standardize', 'minmax', 'robminmax']:
                            Ytestz = Y_scaler.transform(
                                Ytest.reshape(-1, 1), index=i)
                        else:
                            Ytestz = Ytest.reshape(-1, 1)
                        Z[ts, nz[i]] = nm.get_mcmc_zscores(
                            Xz_ts, Ytestz, **kwargs)
                    else:
                        # standard Gaussian deviance score
                        Z[ts, nz[i]] = (Ytest - Yhat[ts, nz[i]]) / \
                            np.sqrt(S2[ts, nz[i]])

            except Exception as e:
                # a failed model gets NaN outputs but does not abort the run
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                print("Model ", i+1, "of", len(nz),
                      "FAILED!..skipping and writing NaN to outputs")
                print("Exception:")
                print(e)
                print(exc_type, fname, exc_tb.tb_lineno)

                Yhat[ts, nz[i]] = float('nan')
                S2[ts, nz[i]] = float('nan')
                nlZ[nz[i], fold] = float('nan')
                if testcov is None:
                    Z[ts, nz[i]] = float('nan')
                else:
                    if testresp is not None:
                        Z[ts, nz[i]] = float('nan')

    if savemodel:
        print('Saving model meta-data...')
        v = get_package_versions()
        with open('Models/meta_data.md', 'wb') as file:
            pickle.dump({'valid_voxels': nz, 'fold_num': cvfolds,
                         'mean_resp': mean_resp, 'std_resp': std_resp,
                         'scaler_cov': scaler_cov, 'scaler_resp': scaler_resp,
                         'regressor': alg, 'inscaler': inscaler,
                         'outscaler': outscaler, 'versions': v},
                        file, protocol=PICKLE_PROTOCOL)

    # compute performance metrics
    if (run_cv or testresp is not None):
        print("Evaluating the model ...")
        if warp is None:
            results = evaluate(Y[testids, :], Yhat[testids, :],
                               S2=S2[testids, :], mY=mean_resp[0],
                               sY=std_resp[0], nlZ=nlZ, nm=nm, Xz_tr=Xz_tr, alg=alg,
                               metrics=metrics)
        else:
            # for warped data we just aggregate across folds
            results = dict()
            for m in ['Rho', 'RMSE', 'SMSE', 'EXPV', 'MSLL']:
                results[m] = np.mean(results_folds[m], axis=1)
            results['NLL'] = results_folds['NLL']
            results['BIC'] = results_folds['BIC']

    # Set writing options
    if saveoutput:
        if (run_cv or testresp is not None):
            save_results(respfile, Yhat[testids, :], S2[testids, :], maskvol,
                         Z=Z[testids, :], results=results,
                         outputsuffix=outputsuffix)

        else:
            save_results(respfile, Yhat[testids, :], S2[testids, :], maskvol,
                         outputsuffix=outputsuffix)

    else:
        if (run_cv or testresp is not None):
            output = (Yhat[testids, :], S2[testids, :], nm, Z[testids, :],
                      results)
        else:
            output = (Yhat[testids, :], S2[testids, :], nm)

        return output
636
+
637
+
638
def fit(covfile, respfile, **kwargs):
    """
    Fits a normative model to the data.

    Parameters:
        covfile (str): The path to the covariates file.
        respfile (str): The path to the response variables file.
        maskfile (str, optional): The path to the mask file. Defaults to None.
        alg (str, optional): The algorithm to use. Defaults to 'gpr'.
        savemodel (bool, optional): Whether to save the model. Defaults to True.
        outputsuffix (str, optional): The suffix to append to the output files. Defaults to 'fit'.
        inscaler (str, optional): The scaler to use for the input data. Defaults to 'None'.
        outscaler (str, optional): The scaler to use for the output data. Defaults to 'None'.

    Returns:
        The last fitted normative model object.
    """

    # parse keyword arguments
    maskfile = kwargs.pop('maskfile', None)
    alg = kwargs.pop('alg', 'gpr')
    # Accept both string ('True'/'False') and boolean values, consistent
    # with estimate(). The previous `== 'True'` comparison silently turned
    # a boolean True into False.
    savemodel = kwargs.pop('savemodel', 'True')
    if type(savemodel) is str:
        savemodel = savemodel == 'True'
    outputsuffix = kwargs.pop('outputsuffix', 'fit')
    # single leading '_' to avoid file-name parsing problems
    outputsuffix = "_" + outputsuffix.replace("_", "")
    inscaler = kwargs.pop('inscaler', 'None')
    outscaler = kwargs.pop('outscaler', 'None')
    print(f"inscaler: {inscaler}")
    print(f"outscaler: {outscaler}")

    if savemodel and not os.path.isdir('Models'):
        os.mkdir('Models')

    # load data
    print("Processing data in " + respfile)
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]

    scaler_resp = []
    scaler_cov = []
    mean_resp = []  # this is just for computing MSLL
    std_resp = []  # this is just for computing MSLL

    # standardize responses and covariates, ignoring invalid entries
    mY = np.mean(Y, axis=0)
    sY = np.std(Y, axis=0)
    mean_resp.append(mY)
    std_resp.append(sY)

    if inscaler in ['standardize', 'minmax', 'robminmax']:
        X_scaler = scaler(inscaler)
        Xz = X_scaler.fit_transform(X)
        scaler_cov.append(X_scaler)
    else:
        Xz = X

    if outscaler in ['standardize', 'minmax', 'robminmax']:
        Y_scaler = scaler(outscaler)
        Yz = Y_scaler.fit_transform(Y)
        scaler_resp.append(Y_scaler)
    else:
        Yz = Y

    # estimate a model for each response variable
    for i in range(Y.shape[1]):
        print("Estimating model ", i+1, "of", Y.shape[1])
        nm = norm_init(Xz, Yz[:, i], alg=alg, **kwargs)
        nm = nm.estimate(Xz, Yz[:, i], **kwargs)

        if savemodel:
            nm.save('Models/NM_' + str(0) + '_' + str(i) + outputsuffix +
                    '.pkl')

    if savemodel:
        print('Saving model meta-data...')
        v = get_package_versions()
        with open('Models/meta_data.md', 'wb') as file:
            pickle.dump({'mean_resp': mean_resp, 'std_resp': std_resp,
                         'scaler_cov': scaler_cov, 'scaler_resp': scaler_resp,
                         'regressor': alg, 'inscaler': inscaler,
                         'outscaler': outscaler, 'versions': v},
                        file, protocol=PICKLE_PROTOCOL)

    return nm
726
+
727
+
728
def predict(covfile, respfile, maskfile=None, **kwargs):
    '''
    Make predictions on the basis of a pre-estimated normative model
    If only the covariates are specified then only predicted mean and variance
    will be returned. If the test responses are also specified then quantities
    That depend on those will also be returned (Z scores and error metrics)

    Basic usage::

        predict(covfile, [extra_arguments])

    where the variables are defined below.

    :param covfile: test covariates used to predict the response variable
    :param respfile: test response variables for the normative model
    :param maskfile: mask used to apply to the data (nifti only)
    :param model_path: Directory containing the normative model and metadata.
        When using parallel prediction, do not pass the model path. It will be
        automatically decided.
    :param outputsuffix: Text string to add to the output filenames
    :param batch_size: batch size (for use with normative_parallel)
    :param job_id: batch id, 'None' when non-parallel module is used.
    :param fold: which cross-validation fold to use (default = 0)
    :param models: list of model IDs to predict (if not specified all are computed)
    :param return_y: return the (transformed) response variable (default = False)

    All outputs are written to disk in the same format as the input. These are:

    :outputs: * Yhat - predictive mean
              * S2 - predictive variance
              * Z - Z scores
              * Y - response variable (if return_y is True)
    '''

    model_path = kwargs.pop('model_path', 'Models')
    job_id = kwargs.pop('job_id', None)
    batch_size = kwargs.pop('batch_size', None)
    outputsuffix = kwargs.pop('outputsuffix', 'predict')
    # single leading '_' to avoid file-name parsing problems
    outputsuffix = "_" + outputsuffix.replace("_", "")
    inputsuffix = kwargs.pop('inputsuffix', 'estimate')
    inputsuffix = "_" + inputsuffix.replace("_", "")
    alg = kwargs.pop('alg')
    models = kwargs.pop('models', None)
    fold = kwargs.pop('fold', 0)
    return_y = kwargs.pop('return_y', False)

    if alg == 'gpr':
        raise ValueError("gpr is not supported with predict()")

    if respfile is not None and not os.path.exists(respfile):
        print("Response file does not exist. Only returning predictions")
        respfile = None
    if not os.path.isdir(model_path):
        print('Models directory does not exist!')
        return
    else:
        # load scalers and training statistics stored at fit/estimate time;
        # `meta_data` is reduced to a flag after its contents are unpacked
        if os.path.exists(os.path.join(model_path, 'meta_data.md')):
            with open(os.path.join(model_path, 'meta_data.md'), 'rb') as file:
                meta_data = pickle.load(file)
            inscaler = meta_data['inscaler']
            outscaler = meta_data['outscaler']
            mY = meta_data['mean_resp']
            sY = meta_data['std_resp']
            scaler_cov = meta_data['scaler_cov']
            scaler_resp = meta_data['scaler_resp']
            meta_data = True
        else:
            print("No meta-data file is found!")
            inscaler = 'None'
            outscaler = 'None'
            meta_data = False

    if batch_size is not None:
        batch_size = int(batch_size)

    if job_id is not None:
        # job ids are 1-based on the command line; convert to 0-based index
        job_id = int(job_id) - 1
        parallel = True
    else:
        parallel = False
        job_id = 0

    # load data
    print("Loading data ...")
    X = fileio.load(covfile)
    if len(X.shape) == 1:
        X = X[:, np.newaxis]
    if respfile is not None:
        Y, maskvol = load_response_vars(respfile, maskfile)
        if len(Y.shape) == 1:
            Y = Y[:, np.newaxis]

    sample_num = X.shape[0]
    if models is not None:
        feature_num = len(models)
    else:
        # count the saved models on disk for this fold
        feature_num = len(glob.glob(os.path.join(model_path, 'NM_' + str(fold) + '_' +
                                                 '*' + inputsuffix + '.pkl')))
        models = range(feature_num)

    Yhat = np.zeros([sample_num, feature_num])
    S2 = np.zeros([sample_num, feature_num])
    Z = np.zeros([sample_num, feature_num])

    # apply the scalers that were fit during estimation
    if inscaler in ['standardize', 'minmax', 'robminmax']:
        Xz = scaler_cov[job_id].transform(X)
    else:
        Xz = X
    if respfile is not None:
        if outscaler in ['standardize', 'minmax', 'robminmax']:
            Yz = scaler_resp[job_id].transform(Y)
        else:
            Yz = Y

    for i, m in enumerate(models):
        print("Prediction by model ", i+1, "of", feature_num)
        nm = norm_init(Xz)
        nm = nm.load(os.path.join(model_path, 'NM_' + str(fold) + '_' +
                                  str(m) + inputsuffix + '.pkl'))
        if (alg != 'hbr' or nm.configs['transferred'] == False):
            yhat, s2 = nm.predict(Xz, **kwargs)
        else:  # only for hbr and in the transfer scenario
            tsbefile = kwargs.get('tsbefile')
            batch_effects_test = fileio.load(tsbefile)
            yhat, s2 = nm.predict_on_new_sites(Xz, batch_effects_test)

        # map predictions back to the original response scale
        if outscaler == 'standardize':
            Yhat[:, i] = scaler_resp[job_id].inverse_transform(yhat, index=i)
            S2[:, i] = s2.squeeze() * scaler_resp[job_id].s[i]**2
        elif outscaler in ['minmax', 'robminmax']:
            Yhat[:, i] = scaler_resp[job_id].inverse_transform(yhat, index=i)
            S2[:, i] = s2 * (scaler_resp[job_id].max[i] -
                             scaler_resp[job_id].min[i])**2
        else:
            Yhat[:, i] = yhat.squeeze()
            S2[:, i] = s2.squeeze()
        if respfile is not None:
            if alg == 'hbr':
                # Z scores for HBR must be computed independently for each model
                Z[:, i] = nm.get_mcmc_zscores(Xz, Yz[:, i:i+1], **kwargs)
            else:
                Z[:, i] = np.squeeze((Yz[:, i:i+1] - Yhat[:, i:i+1]) / np.sqrt(S2[:, i:i+1]))

    if respfile is None:
        save_results(None, Yhat, S2, None, outputsuffix=outputsuffix)

        return (Yhat, S2)

    else:
        # restrict the responses (and training stats) to the selected models
        if models is not None and len(Y.shape) > 1:
            Y = Y[:, models]
            # TODO: Needs simplification
            if meta_data:
                if type(mY) is list:  # This happens when non-parallel or when using meta data from batches
                    mY = mY[0][models]
                    sY = sY[0][models]
                else:  # This happens when parallel on collected metadata
                    mY = mY[models]
                    sY = sY[models]

        if len(Y.shape) == 1:
            Y = Y[:, np.newaxis]

        # warp the targets?
        # NOTE(review): `nm` here is the last model loaded in the loop above;
        # this assumes all models share the same warp configuration — confirm
        if alg == 'blr' and nm.blr.warp is not None:
            warp = True
            Yw = np.zeros_like(Y)
            for i, m in enumerate(models):
                nm = norm_init(Xz)
                nm = nm.load(os.path.join(model_path, 'NM_0_' +
                                          str(m) + inputsuffix + '.pkl'))

                warp_param = nm.blr.hyp[1:nm.blr.warp.get_n_params()+1]
                Yw[:, i] = nm.blr.warp.f(Y[:, i], warp_param)
            Y = Yw
        else:
            warp = False

        if alg != 'hbr':
            # For HBR the Z scores are already computed
            Z = (Y - Yhat) / np.sqrt(S2)

        print("Evaluating the model ...")
        if meta_data and not warp:

            results = evaluate(Y, Yhat, S2=S2, mY=mY, sY=sY)
        else:
            # without training statistics (or with warping) MSLL cannot be
            # standardized, so only the basic metrics are computed
            results = evaluate(Y, Yhat, S2=S2,
                               metrics=['Rho', 'RMSE', 'SMSE', 'EXPV'])

        print("Evaluations Writing outputs ...")

        if return_y:
            save_results(respfile, Yhat, S2, maskvol, Z=Z, Y=Y,
                         outputsuffix=outputsuffix, results=results)
            return (Yhat, S2, Z, Y)
        else:
            save_results(respfile, Yhat, S2, maskvol, Z=Z,
                         outputsuffix=outputsuffix, results=results)
            return (Yhat, S2, Z)
929
+
930
+
931
def transfer(covfile, respfile, testcov=None, testresp=None, maskfile=None,
             **kwargs):
    '''
    Transfer learning on the basis of a pre-estimated normative model by using
    the posterior distribution over the parameters as an informed prior for
    new data. Currently only supported for HBR (and, via adaptation, BLR).

    Basic usage::

        transfer(covfile, respfile, trbefile, model_path, output_path, inputsuffix [extra_arguments])

    where the variables are defined below.

    :param covfile: transfer covariates used to predict the response variable
    :param respfile: transfer response variables for the normative model
    :param maskfile: mask used to apply to the data (nifti only)
    :param trbefile: Training batch effects file
    :param testcov: Test covariates
    :param testresp: Test responses
    :param model_path: Directory containing the normative model and metadata
    :param output_path: Address to output directory to save the transferred models
    :param inputsuffix: The suffix for the input models (default='estimate')
    :param batch_size: batch size (for use with normative_parallel)
    :param job_id: batch id

    All outputs are written to disk in the same format as the input. These are:

    :outputs: * Yhat - predictive mean
              * S2 - predictive variance
              * Z - Z scores
    '''
    alg = kwargs.pop('alg').lower()

    # ---- argument validation (prints and returns None rather than raising) ----
    if alg != 'hbr' and alg != 'blr':
        print('Model transfer function is only possible for HBR and BLR models.')
        return
    # testing should not be obligatory for HBR,
    # but should be for BLR (since it doesn't produce transfer models)
    elif ('model_path' not in list(kwargs.keys())) or \
            ('trbefile' not in list(kwargs.keys())):
        print('InputError: model_path or trbefile are missing.')
        return
    # hbr has one additional mandatory arguments
    elif alg == 'hbr':
        if ('output_path' not in list(kwargs.keys())):
            print('InputError: output_path is missing.')
            return
        else:
            output_path = kwargs.pop('output_path', None)
            if not os.path.isdir(output_path):
                os.mkdir(output_path)

    # for hbr, testing is not mandatory, for blr's predict/transfer it is. This will be an architectural choice.
    # or (testresp==None)
    # NOTE(review): '== None' comparison; 'is None' is the preferred idiom.
    elif alg == 'blr':
        if (testcov == None) or \
                ('tsbefile' not in list(kwargs.keys())):
            print('InputError: Some mandatory arguments for blr are missing.')
            return
    # general arguments
    log_path = kwargs.pop('log_path', None)
    model_path = kwargs.pop('model_path')
    # suffixes are normalised to a single leading underscore with no internal ones
    outputsuffix = kwargs.pop('outputsuffix', 'transfer')
    outputsuffix = "_" + outputsuffix.replace("_", "")
    inputsuffix = kwargs.pop('inputsuffix', 'estimate')
    inputsuffix = "_" + inputsuffix.replace("_", "")
    tsbefile = kwargs.pop('tsbefile', None)
    trbefile = kwargs.pop('trbefile', None)
    job_id = kwargs.pop('job_id', None)
    batch_size = kwargs.pop('batch_size', None)
    fold = kwargs.pop('fold', 0)  # This is almost always 0 in the transfer scenario.

    # for PCNonline automated parallel jobs loop; accepts the string 'True'/'False'
    count_jobsdone = kwargs.pop('count_jobsdone', 'False')
    if type(count_jobsdone) is str:
        count_jobsdone = count_jobsdone == 'True'

    if batch_size is not None:
        batch_size = int(batch_size)

    # job ids are 1-based on the command line; 0-based internally.
    # 'parallel' switches the scaler indexing scheme below.
    if job_id is not None:
        job_id = int(job_id) - 1
        parallel = True
    else:
        parallel = False
        job_id = 0

    if not os.path.isdir(model_path):
        print('Models directory does not exist!')
        return
    else:
        if os.path.exists(os.path.join(model_path, 'meta_data.md')):
            # NOTE(review): pickle.load on model metadata — assumes the model
            # directory is trusted (pickle is unsafe on untrusted input).
            with open(os.path.join(model_path, 'meta_data.md'), 'rb') as file:
                my_meta_data = pickle.load(file)
            inscaler = my_meta_data['inscaler']
            outscaler = my_meta_data['outscaler']
            scaler_cov = my_meta_data['scaler_cov']
            scaler_resp = my_meta_data['scaler_resp']
            meta_data = True
        else:
            print("No meta-data file is found!")
            inscaler = 'None'
            outscaler = 'None'
            meta_data = False

    # load adaptation data
    print("Loading data ...")
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]

    # the stored scalers are re-fitted ('extend') with the adaptation data and
    # then applied; when parallel, scalers are indexed per job then per fold
    if inscaler in ['standardize', 'minmax', 'robminmax']:
        if parallel:
            scaler_cov[job_id][fold].extend(X)
            X = scaler_cov[job_id][fold].transform(X)
        else:
            scaler_cov[fold].extend(X)
            X = scaler_cov[fold].transform(X)

    if outscaler in ['standardize', 'minmax', 'robminmax']:
        if parallel:
            scaler_resp[job_id][fold].extend(Y)
            Y = scaler_resp[job_id][fold].transform(Y)
        else:
            scaler_resp[fold].extend(Y)
            Y = scaler_resp[fold].transform(Y)

    feature_num = Y.shape[1]

    # mean and std of training data only used for calculating the MSLL
    mY = np.mean(Y, axis=0)
    sY = np.std(Y, axis=0)


    batch_effects_train = fileio.load(trbefile)

    # load test data
    if testcov is not None:
        # we have a separate test dataset
        Xte = fileio.load(testcov)
        if len(Xte.shape) == 1:
            Xte = Xte[:, np.newaxis]
        ts_sample_num = Xte.shape[0]

        if inscaler in ['standardize', 'minmax', 'robminmax']:
            if parallel:
                Xte = scaler_cov[job_id][fold].transform(Xte)
            else:
                Xte = scaler_cov[fold].transform(Xte)

        if testresp is not None:
            Yte, testmask = load_response_vars(testresp, maskfile)
            if len(Yte.shape) == 1:
                Yte = Yte[:, np.newaxis]
            if outscaler in ['standardize', 'minmax', 'robminmax']:
                if parallel:
                    Yte = scaler_resp[job_id][fold].transform(Yte)
                else:
                    Yte = scaler_resp[fold].transform(Yte)

        else:
            # no test responses: placeholder zeros so shapes line up downstream
            Yte = np.zeros([ts_sample_num, feature_num])

        if tsbefile is not None:
            batch_effects_test = fileio.load(tsbefile)
        else:
            # assumes two batch-effect columns when none are provided —
            # TODO confirm this matches the trained model's batch effects
            batch_effects_test = np.zeros([Xte.shape[0], 2])
    else:
        ts_sample_num = 0

    Yhat = np.zeros([ts_sample_num, feature_num])
    S2 = np.zeros([ts_sample_num, feature_num])
    Z = np.zeros([ts_sample_num, feature_num])

    # persist the updated scalers/statistics alongside the transferred models
    if meta_data:
        my_meta_data['mean_resp'] = mY
        my_meta_data['std_resp'] = sY
        if inscaler not in ['None']:
            my_meta_data['scaler_cov'] = scaler_cov
        if outscaler not in ['None']:
            my_meta_data['scaler_resp'] = scaler_resp
        if parallel:
            # 'Models' is the temporary folder used by normative_parallel
            pickle.dump(my_meta_data, open(os.path.join('Models', 'meta_data.md'), 'wb'))
        else:
            pickle.dump(my_meta_data, open(os.path.join(output_path, 'meta_data.md'), 'wb'))

    # estimate the models for all subjects
    for i in range(feature_num):

        if alg == 'hbr':
            print("Using HBR transform...")
            nm = norm_init(X)
            if batch_size is not None:  # when using normative_parallel
                print("Transferring model ", job_id*batch_size+i)
                nm = nm.load(os.path.join(model_path, 'NM_0_' +
                                          str(job_id*batch_size+i) + inputsuffix +
                                          '.pkl'))
            else:
                print("Transferring model ", i+1, "of", feature_num)
                nm = nm.load(os.path.join(model_path, 'NM_0_' + str(i) +
                                          inputsuffix + '.pkl'))

            nm = nm.transfer(X, Y[:, i], batch_effects_train)

            if batch_size is not None:
                nm.save(os.path.join(output_path, 'NM_0_' +
                                     str(job_id*batch_size+i) + outputsuffix + '.pkl'))
            else:
                nm.save(os.path.join(output_path, 'NM_0_' +
                                     str(i) + outputsuffix + '.pkl'))

            if testcov is not None:
                yhat, s2 = nm.predict_on_new_sites(Xte, batch_effects_test)
                if testresp is not None:
                    Z[:, i] = nm.get_mcmc_zscores(Xte, Yte[:, i:i+1], **kwargs)

        # We basically use normative.predict script here.
        if alg == 'blr':
            print("Using BLR transform...")
            print("Transferring model ", i+1, "of", feature_num)
            nm = norm_init(X)
            nm = nm.load(os.path.join(model_path, 'NM_' + str(fold) + '_' +
                                      str(i) + inputsuffix + '.pkl'))

            # translate the syntax to what blr understands
            # first strip existing blr keyword arguments to avoid redundancy
            adapt_cov = kwargs.pop('adaptcovfile', None)
            adapt_res = kwargs.pop('adaptrespfile', None)
            adapt_vg = kwargs.pop('adaptvargroupfile', None)
            test_vg = kwargs.pop('testvargroupfile', None)
            if adapt_cov is not None or adapt_res is not None \
                    or adapt_vg is not None or test_vg is not None:
                print(
                    "Warning: redundant batch effect parameterisation. Using HBR syntax")

            yhat, s2 = nm.predict(Xte, X, Y[:, i],
                                  adaptcov=X,
                                  adaptresp=Y[:, i],
                                  adaptvargroup=batch_effects_train,
                                  testvargroup=batch_effects_test,
                                  **kwargs)

        # map predictions back to the original response scale
        if testcov is not None:
            if outscaler == 'standardize':
                if parallel:
                    Yhat[:, i] = scaler_resp[job_id][fold].inverse_transform(
                        yhat.squeeze(), index=i)
                    S2[:, i] = s2.squeeze() * scaler_resp[job_id][fold].s[i]**2
                else:
                    Yhat[:, i] = scaler_resp[fold].inverse_transform(
                        yhat.squeeze(), index=i)
                    S2[:, i] = s2.squeeze() * scaler_resp[fold].s[i]**2

            elif outscaler in ['minmax', 'robminmax']:
                if parallel:
                    Yhat[:, i] = scaler_resp[job_id][fold].inverse_transform(yhat, index=i)
                    S2[:, i] = s2 * (scaler_resp[job_id][fold].max[i] -
                                     scaler_resp[job_id][fold].min[i])**2
                else:
                    Yhat[:, i] = scaler_resp[fold].inverse_transform(yhat, index=i)
                    S2[:, i] = s2 * (scaler_resp[fold].max[i] -
                                     scaler_resp[fold].min[i])**2
            else:
                Yhat[:, i] = yhat.squeeze()
                S2[:, i] = s2.squeeze()

    if testresp is None:
        save_results(respfile, Yhat, S2, maskvol, outputsuffix=outputsuffix)
        return (Yhat, S2)
    else:
        # warp the targets?
        if alg == 'blr' and nm.blr.warp is not None:
            warp = True
            Yw = np.zeros_like(Yte)
            for i in range(feature_num):
                nm = norm_init(Xte)
                nm = nm.load(os.path.join(model_path, 'NM_' + str(fold) + '_' +
                                          str(i) + inputsuffix + '.pkl'))

                # hyp[0] is noise; the next get_n_params() entries parameterise the warp
                warp_param = nm.blr.hyp[1:nm.blr.warp.get_n_params()+1]
                Yw[:, i] = nm.blr.warp.f(Yte[:, i], warp_param)
            Yte = Yw
        else:
            warp = False
        # For HBR the Z scores are already computed
        if alg != 'hbr':
            Z = (Yte - Yhat) / np.sqrt(S2)

        print("Evaluating the model ...")
        if meta_data and not warp:
            results = evaluate(Yte, Yhat, S2=S2, mY=mY, sY=sY)
        else:
            # without training statistics (or with warped targets) MSLL cannot
            # be computed, so restrict the metric set
            results = evaluate(Yte, Yhat, S2=S2,
                               metrics=['Rho', 'RMSE', 'SMSE', 'EXPV'])

        save_results(respfile, Yhat, S2, maskvol, Z=Z, results=results,
                     outputsuffix=outputsuffix)

        # Creates a file for every job successfully completed (for tracking failed jobs).
        if count_jobsdone == True:
            done_path = os.path.join(log_path, str(job_id)+".jobsdone")
            Path(done_path).touch()

        return (Yhat, S2, Z)

    # NOTE(review): unreachable — both branches of the if/else above return.
    # Creates a file for every job successfully completed (for tracking failed jobs).
    if count_jobsdone == True:
        done_path = os.path.join(log_path, str(job_id)+".jobsdone")
        Path(done_path).touch()
1243
+
1244
+
1245
def extend(covfile, respfile, maskfile=None, **kwargs):
    '''
    This function extends an existing HBR model with data from new sites/scanners.

    Basic usage::

        extend(covfile, respfile, trbefile, model_path, output_path, inputsuffix [extra_arguments])

    where the variables are defined below.

    :param covfile: covariates for new data
    :param respfile: response variables for new data
    :param maskfile: mask used to apply to the data (nifti only)
    :param model_path: Directory containing the normative model and metadata
    :param trbefile: file address to batch effects file for new data
    :param batch_size: batch size (for use with normative_parallel)
    :param job_id: batch id
    :param output_path: the path for saving the extended model
    :param inputsuffix: The suffix for the input models (default='extend')
    :param informative_prior: use initial model prior or learn from scratch (default is False).
    :param generation_factor: generation factor refers to the number of samples generated for each
        combination of covariates and batch effects. Default is 10.


    All outputs are written to disk in the same format as the input.

    '''

    alg = kwargs.pop('alg')
    if alg != 'hbr':
        print('Model extention is only possible for HBR models.')
        return
    elif ('model_path' not in list(kwargs.keys())) or \
            ('output_path' not in list(kwargs.keys())) or \
            ('trbefile' not in list(kwargs.keys())):
        print('InputError: Please specify model_path, output_path, and trbefile.')
        return
    else:
        model_path = kwargs.pop('model_path')
        output_path = kwargs.pop('output_path')
        trbefile = kwargs.pop('trbefile')

    # suffixes are normalised to a single leading underscore
    outputsuffix = kwargs.pop('outputsuffix', 'extend')
    outputsuffix = "_" + outputsuffix.replace("_", "")
    inputsuffix = kwargs.pop('inputsuffix', 'extend')
    inputsuffix = "_" + inputsuffix.replace("_", "")
    # NOTE(review): string comparison — only the literal string 'True' enables
    # the informative prior; a boolean True would be silently treated as False.
    informative_prior = kwargs.pop('informative_prior', 'False') == 'True'
    generation_factor = int(kwargs.pop('generation_factor', '10'))
    job_id = kwargs.pop('job_id', None)
    batch_size = kwargs.pop('batch_size', None)
    fold = kwargs.pop('fold', 0)  # This is almost always 0 in the extend scenario.



    if batch_size is not None:
        batch_size = int(batch_size)

    # job ids are 1-based on the command line; 0-based internally
    if job_id is not None:
        job_id = int(job_id) - 1
        parallel = True
    else:
        parallel = False
        job_id = 0

    if not os.path.isdir(model_path):
        print('Models directory does not exist!')
        return
    else:
        if os.path.exists(os.path.join(model_path, 'meta_data.md')):
            # NOTE(review): pickle.load — assumes the model directory is trusted
            with open(os.path.join(model_path, 'meta_data.md'), 'rb') as file:
                my_meta_data = pickle.load(file)
            inscaler = my_meta_data['inscaler']
            outscaler = my_meta_data['outscaler']
            scaler_cov = my_meta_data['scaler_cov']
            scaler_resp = my_meta_data['scaler_resp']
            meta_data = True
        else:
            print("No meta-data file is found!")
            inscaler = 'None'
            outscaler = 'None'
            meta_data = False

    if not os.path.isdir(output_path):
        os.mkdir(output_path)

    # load data
    print("Loading data ...")
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    batch_effects_train = fileio.load(trbefile)

    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]

    # re-fit ('extend') the stored scalers with the new data before applying;
    # when parallel, scalers are indexed per job then per fold
    if inscaler in ['standardize', 'minmax', 'robminmax']:
        if parallel:
            scaler_cov[job_id][fold].extend(X)
            X = scaler_cov[job_id][fold].transform(X)
        else:
            scaler_cov[fold].extend(X)
            X = scaler_cov[fold].transform(X)

    if outscaler in ['standardize', 'minmax', 'robminmax']:
        if parallel:
            scaler_resp[job_id][fold].extend(Y)
            Y = scaler_resp[job_id][fold].transform(Y)
        else:
            scaler_resp[fold].extend(Y)
            Y = scaler_resp[fold].transform(Y)

    feature_num = Y.shape[1]

    # persist the updated scalers alongside the extended models
    if meta_data:
        if inscaler not in ['None']:
            my_meta_data['scaler_cov'] = scaler_cov
        if outscaler not in ['None']:
            my_meta_data['scaler_resp'] = scaler_resp
        if parallel:
            # 'Models' is the temporary folder used by normative_parallel
            pickle.dump(my_meta_data, open(os.path.join('Models', 'meta_data.md'), 'wb'))
        else:
            pickle.dump(my_meta_data, open(os.path.join(output_path, 'meta_data.md'), 'wb'))


    # estimate the models for all subjects
    for i in range(feature_num):

        nm = norm_init(X)
        # NOTE(review): this branch assumes batch_size is set whenever job_id
        # is (job_id*batch_size would fail otherwise) — TODO confirm callers.
        if parallel:  # when using normative_parallel
            print("Extending model ", job_id*batch_size+i)
            nm = nm.load(os.path.join(model_path, 'NM_0_' +
                                      str(job_id*batch_size+i) + inputsuffix +
                                      '.pkl'))
        else:
            print("Extending model ", i+1, "of", feature_num)
            nm = nm.load(os.path.join(model_path, 'NM_0_' + str(i) +
                                      inputsuffix + '.pkl'))

        nm = nm.extend(X, Y[:, i:i+1], batch_effects_train,
                       samples=generation_factor,
                       informative_prior=informative_prior)

        if parallel:  # The model is saved into both output_path and temporary parallel folders
            nm.save(os.path.join(output_path, 'NM_0_' +
                                 str(job_id*batch_size+i) + outputsuffix + '.pkl'))
            nm.save(os.path.join('Models', 'NM_0_' +
                                 str(i) + outputsuffix + '.pkl'))
        else:
            nm.save(os.path.join(output_path, 'NM_0_' +
                                 str(i) + outputsuffix + '.pkl'))
1396
+
1397
+
1398
def tune(covfile, respfile, maskfile=None, **kwargs):
    '''
    This function tunes an existing HBR model with real data.

    Basic usage::

        tune(covfile, respfile [extra_arguments])

    where the variables are defined below.

    :param covfile: covariates for new data
    :param respfile: response variables for new data
    :param maskfile: mask used to apply to the data (nifti only)
    :param model_path: Directory containing the normative model and metadata
    :param trbefile: file address to batch effects file for new data
    :param batch_size: batch size (for use with normative_parallel)
    :param job_id: batch id
    :param output_path: the path for saving the tuned model
    :param informative_prior: use initial model prior or learn from scratch (default is False).
    :param generation_factor: see below


    generation factor refers to the number of samples generated for each
    combination of covariates and batch effects. Default is 10.


    All outputs are written to disk in the same format as the input.

    '''

    alg = kwargs.pop('alg')
    if alg != 'hbr':
        print('Model extention is only possible for HBR models.')
        return
    elif ('model_path' not in list(kwargs.keys())) or \
            ('output_path' not in list(kwargs.keys())) or \
            ('trbefile' not in list(kwargs.keys())):
        print('InputError: Some mandatory arguments are missing.')
        return
    else:
        model_path = kwargs.pop('model_path')
        output_path = kwargs.pop('output_path')
        trbefile = kwargs.pop('trbefile')

    # suffixes are normalised to a single leading underscore
    outputsuffix = kwargs.pop('outputsuffix', 'tuned')
    outputsuffix = "_" + outputsuffix.replace("_", "")
    inputsuffix = kwargs.pop('inputsuffix', 'estimate')
    inputsuffix = "_" + inputsuffix.replace("_", "")
    # NOTE(review): string comparison — only the literal 'True' enables the
    # informative prior; a boolean True would be treated as False.
    informative_prior = kwargs.pop('informative_prior', 'False') == 'True'
    generation_factor = int(kwargs.pop('generation_factor', '10'))
    job_id = kwargs.pop('job_id', None)
    batch_size = kwargs.pop('batch_size', None)
    if batch_size is not None:
        batch_size = int(batch_size)
        # NOTE(review): int(job_id) raises if batch_size is given without job_id
        job_id = int(job_id) - 1

    if not os.path.isdir(model_path):
        print('Models directory does not exist!')
        return
    else:
        if os.path.exists(os.path.join(model_path, 'meta_data.md')):
            # tuning is only supported on unscaled data
            with open(os.path.join(model_path, 'meta_data.md'), 'rb') as file:
                meta_data = pickle.load(file)
            if (meta_data['inscaler'] != 'None' or
                    meta_data['outscaler'] != 'None'):
                print('Models extention on scaled data is not possible!')
                return

    if not os.path.isdir(output_path):
        os.mkdir(output_path)

    # load data
    print("Loading data ...")
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    batch_effects_train = fileio.load(trbefile)

    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]
    feature_num = Y.shape[1]

    # estimate the models for all subjects
    for i in range(feature_num):

        nm = norm_init(X)
        if batch_size is not None:  # when using normative_parallel
            print("Tuning model ", job_id*batch_size+i)
            nm = nm.load(os.path.join(model_path, 'NM_0_' +
                                      str(job_id*batch_size+i) + inputsuffix +
                                      '.pkl'))
        else:
            print("Tuning model ", i+1, "of", feature_num)
            nm = nm.load(os.path.join(model_path, 'NM_0_' + str(i) +
                                      inputsuffix + '.pkl'))

        nm = nm.tune(X, Y[:, i:i+1], batch_effects_train,
                     samples=generation_factor,
                     informative_prior=informative_prior)

        if batch_size is not None:
            # saved under the global model index and the batch-local index
            # (temporary 'Models' folder used by normative_parallel)
            nm.save(os.path.join(output_path, 'NM_0_' +
                                 str(job_id*batch_size+i) + outputsuffix + '.pkl'))
            nm.save(os.path.join('Models', 'NM_0_' +
                                 str(i) + outputsuffix + '.pkl'))
        else:
            nm.save(os.path.join(output_path, 'NM_0_' +
                                 str(i) + outputsuffix + '.pkl'))
1507
+
1508
+
1509
def merge(covfile=None, respfile=None, **kwargs):
    '''
    This function merges two existing HBR models (e.g. trained on data from
    different sites/scanners) into a single model.

    Basic usage::

        merge(model_path1, model_path2 [extra_arguments])

    where the variables are defined below.

    :param covfile: Not required. Always set to None.
    :param respfile: Not required. Always set to None.
    :param model_path1: Directory containing the model and metadata (1st model)
    :param model_path2: Directory containing the model and metadata (2nd model)
    :param batch_size: batch size (for use with normative_parallel)
    :param job_id: batch id
    :param output_path: the path for saving the merged model
    :param generation_factor: see below

    The generation factor refers to the number of samples generated for each
    combination of covariates and batch effects. Default is 10.


    All outputs are written to disk in the same format as the input.

    '''

    # merging is implemented by sampling from both posteriors, which is only
    # available for HBR models
    alg = kwargs.pop('alg')
    if alg != 'hbr':
        print('Merging models is only possible for HBR models.')
        return
    elif ('model_path1' not in list(kwargs.keys())) or \
            ('model_path2' not in list(kwargs.keys())) or \
            ('output_path' not in list(kwargs.keys())):
        print('InputError: Some mandatory arguments are missing.')
        return
    else:
        model_path1 = kwargs.pop('model_path1')
        model_path2 = kwargs.pop('model_path2')
        output_path = kwargs.pop('output_path')

    # suffixes are normalised to a single leading underscore
    outputsuffix = kwargs.pop('outputsuffix', 'merge')
    outputsuffix = "_" + outputsuffix.replace("_", "")
    inputsuffix = kwargs.pop('inputsuffix', 'estimate')
    inputsuffix = "_" + inputsuffix.replace("_", "")
    generation_factor = int(kwargs.pop('generation_factor', '10'))
    job_id = kwargs.pop('job_id', None)
    batch_size = kwargs.pop('batch_size', None)
    if batch_size is not None:
        batch_size = int(batch_size)
        # job ids are 1-based on the command line; 0-based internally
        job_id = int(job_id) - 1

    if (not os.path.isdir(model_path1)) or (not os.path.isdir(model_path2)):
        print('Models directory does not exist!')
        return
    else:
        if batch_size is None:
            # non-parallel: infer the number of models from the metadata and
            # check that both models were trained on the same features
            with open(os.path.join(model_path1, 'meta_data.md'), 'rb') as file:
                meta_data1 = pickle.load(file)
            with open(os.path.join(model_path2, 'meta_data.md'), 'rb') as file:
                meta_data2 = pickle.load(file)
            if meta_data1['valid_voxels'].shape[0] != meta_data2['valid_voxels'].shape[0]:
                print('Two models are trained on different features!')
                return
            else:
                feature_num = meta_data1['valid_voxels'].shape[0]
        else:
            feature_num = batch_size

    if not os.path.isdir(output_path):
        os.mkdir(output_path)

    # merging the models, one response variable at a time
    for i in range(feature_num):

        # the random covariates passed to norm_init are placeholders; the
        # model state is replaced by load() below
        nm1 = norm_init(np.random.rand(100, 10))
        nm2 = norm_init(np.random.rand(100, 10))
        if batch_size is not None:  # when using normative_parallel
            print("Merging model ", job_id*batch_size+i)
            nm1 = nm1.load(os.path.join(model_path1, 'NM_0_' +
                                        str(job_id*batch_size+i) + inputsuffix +
                                        '.pkl'))
            nm2 = nm2.load(os.path.join(model_path2, 'NM_0_' +
                                        str(job_id*batch_size+i) + inputsuffix +
                                        '.pkl'))
        else:
            print("Merging model ", i+1, "of", feature_num)
            nm1 = nm1.load(os.path.join(model_path1, 'NM_0_' + str(i) +
                                        inputsuffix + '.pkl'))
            # fix: was 'nm2 = nm1.load(...)'; load the second model through
            # nm2 for consistency with the parallel branch above
            nm2 = nm2.load(os.path.join(model_path2, 'NM_0_' + str(i) +
                                        inputsuffix + '.pkl'))

        nm_merged = nm1.merge(nm2, samples=generation_factor)

        if batch_size is not None:
            # saved under the global model index and the batch-local index
            # (temporary 'Models' folder used by normative_parallel)
            nm_merged.save(os.path.join(output_path, 'NM_0_' +
                                        str(job_id*batch_size+i) + outputsuffix + '.pkl'))
            nm_merged.save(os.path.join('Models', 'NM_0_' +
                                        str(i) + outputsuffix + '.pkl'))
        else:
            nm_merged.save(os.path.join(output_path, 'NM_0_' +
                                        str(i) + outputsuffix + '.pkl'))
1611
+
1612
+
1613
def main(*args):
    """ Parse command-line arguments and dispatch to the requested function.

    The target function name (e.g. 'estimate', 'predict', 'transfer') is
    returned by get_args() and resolved from the module namespace. Extra
    keyword arguments from the command line are passed through as strings.
    """

    np.seterr(invalid='ignore')

    rfile, mfile, cfile, cv, tcfile, trfile, func, alg, cfg, kw = get_args(
        args)

    # Resolve the target function from the module namespace instead of
    # exec()-ing a constructed source string. This avoids arbitrary code
    # execution via the 'func' argument and breakage when a keyword value
    # contains a quote character. Values are coerced to str to match the
    # previous behaviour (they were embedded as quoted string literals).
    try:
        target = globals()[func]
    except KeyError:
        raise ValueError('Unknown function: ' + str(func))

    extra_kwargs = {k: str(v) for k, v in kw.items()}

    # Executing the target function
    target(cfile, rfile,
           maskfile=mfile,
           cvfolds=cv,
           testcov=tcfile,
           testresp=trfile,
           alg=alg,
           configparam=cfg,
           **extra_kwargs)
1640
+
1641
def entrypoint():
    """Console-script entry point: forward command-line arguments to main()."""
    cli_args = sys.argv[1:]
    main(cli_args)
1643
+
1644
+
1645
# For running from the command line: delegate argument parsing and dispatch
# to main()
if __name__ == "__main__":
    main(sys.argv[1:])