pcntoolkit 0.32.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pcntoolkit/__init__.py +4 -0
- pcntoolkit/configs.py +9 -0
- pcntoolkit/dataio/__init__.py +1 -0
- pcntoolkit/dataio/fileio.py +608 -0
- pcntoolkit/model/KnuOp.py +48 -0
- pcntoolkit/model/NP.py +88 -0
- pcntoolkit/model/NPR.py +86 -0
- pcntoolkit/model/SHASH.py +509 -0
- pcntoolkit/model/__init__.py +6 -0
- pcntoolkit/model/architecture.py +219 -0
- pcntoolkit/model/bayesreg.py +585 -0
- pcntoolkit/model/core.21290 +0 -0
- pcntoolkit/model/gp.py +489 -0
- pcntoolkit/model/hbr.py +1584 -0
- pcntoolkit/model/rfa.py +245 -0
- pcntoolkit/normative.py +1647 -0
- pcntoolkit/normative_NP.py +336 -0
- pcntoolkit/normative_model/__init__.py +6 -0
- pcntoolkit/normative_model/norm_base.py +62 -0
- pcntoolkit/normative_model/norm_blr.py +303 -0
- pcntoolkit/normative_model/norm_gpr.py +112 -0
- pcntoolkit/normative_model/norm_hbr.py +752 -0
- pcntoolkit/normative_model/norm_np.py +333 -0
- pcntoolkit/normative_model/norm_rfa.py +109 -0
- pcntoolkit/normative_model/norm_utils.py +29 -0
- pcntoolkit/normative_parallel.py +1420 -0
- pcntoolkit/regression_model/blr/warp.py +1 -0
- pcntoolkit/trendsurf.py +315 -0
- pcntoolkit/util/__init__.py +1 -0
- pcntoolkit/util/bspline.py +149 -0
- pcntoolkit/util/hbr_utils.py +242 -0
- pcntoolkit/util/utils.py +1698 -0
- pcntoolkit-0.32.0.dist-info/LICENSE +674 -0
- pcntoolkit-0.32.0.dist-info/METADATA +134 -0
- pcntoolkit-0.32.0.dist-info/RECORD +37 -0
- pcntoolkit-0.32.0.dist-info/WHEEL +4 -0
- pcntoolkit-0.32.0.dist-info/entry_points.txt +5 -0
|
@@ -0,0 +1,752 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Created on Thu Jul 25 17:01:24 2019
|
|
5
|
+
|
|
6
|
+
@author: seykia
|
|
7
|
+
@author: augub
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import division, print_function
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
import sys
|
|
14
|
+
from sys import exit
|
|
15
|
+
|
|
16
|
+
import arviz as az
|
|
17
|
+
import numpy as np
|
|
18
|
+
import xarray
|
|
19
|
+
from scipy import special as spp
|
|
20
|
+
|
|
21
|
+
try:
|
|
22
|
+
from pcntoolkit.dataio import fileio
|
|
23
|
+
from pcntoolkit.model.hbr import HBR
|
|
24
|
+
from pcntoolkit.normative_model.norm_base import NormBase
|
|
25
|
+
except ImportError:
|
|
26
|
+
pass
|
|
27
|
+
|
|
28
|
+
path = os.path.abspath(os.path.dirname(__file__))
|
|
29
|
+
if path not in sys.path:
|
|
30
|
+
sys.path.append(path)
|
|
31
|
+
del path
|
|
32
|
+
import dataio.fileio as fileio
|
|
33
|
+
from model.hbr import HBR
|
|
34
|
+
from norm_base import NormBase
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class NormHBR(NormBase):
|
|
38
|
+
"""HBR multi-batch normative modelling class. By default, this function
|
|
39
|
+
estimates a linear model with random intercept, random slope, and random
|
|
40
|
+
homoscedastic noise.
|
|
41
|
+
|
|
42
|
+
:param X: [N×P] array of clinical covariates
|
|
43
|
+
:param y: [N×1] array of neuroimaging measures
|
|
44
|
+
:param trbefile: the address to the batch effects file for the training set.
|
|
45
|
+
the batch effect array should be a [N×M] array where M is the number of
|
|
46
|
+
the type of batch effects. For example when the site and gender is modeled
|
|
47
|
+
as batch effects M=2. Each column in the batch effect array contains the
|
|
48
|
+
batch ID (starting from 0) for each sample. If not specified (default=None)
|
|
49
|
+
then all samples assumed to be from the same batch (i.e., the batch effect
|
|
50
|
+
is not modelled).
|
|
51
|
+
:param tsbefile: Similar to trbefile for the test set.
|
|
52
|
+
:param model_type: Specifies the type of the model from 'linear', 'polynomial',
|
|
53
|
+
'bspline', and 'nn' (default is 'linear').
|
|
54
|
+
:param likelihood: specifies the type of likelihood among 'Normal' 'SHASHb','SHASHo',
|
|
55
|
+
and 'SHASHo2' (default is 'Normal').
|
|
56
|
+
:param linear_mu: Boolean (default='True') to decide whether the mean (mu) is
|
|
57
|
+
parametrized on a linear function (thus changes with covariates) or it is fixed.
|
|
58
|
+
:param linear_sigma: Boolean (default='False') to decide whether the variance (sigma) is
|
|
59
|
+
parametrized on a linear function (heteroscedastic noise) or it is fixed for
|
|
60
|
+
each batch (homoscedastic noise).
|
|
61
|
+
:param linear_epsilon: Boolean (default='False') to decide the parametrization
|
|
62
|
+
of epsilon for the SHASH likelihood that controls its skewness.
|
|
63
|
+
If True, epsilon is parametrized on a linear function
|
|
64
|
+
(thus changes with covariates) otherwise it is fixed for each batch.
|
|
65
|
+
:param linear_delta: Boolean (default='False') to decide the parametrization
|
|
66
|
+
of delta for the SHASH likelihood that controls its kurtosis.
|
|
67
|
+
If True, delta is parametrized on a linear function
|
|
68
|
+
(thus changes with covariates) otherwise it is fixed for each batch.
|
|
69
|
+
:param random_intercept_{parameter}: if parameters mu (default='True'),
|
|
70
|
+
sigma (default='False'), epsilon (default='False'), and delta (default='False')
|
|
71
|
+
are parametrized on a linear function, then this boolean decides
|
|
72
|
+
whether the intercept can vary across batches.
|
|
73
|
+
:param random_slope_{parameter}: if parameters mu (default='True'),
|
|
74
|
+
sigma (default='False'), epsilon (default='False'), and delta (default='False')
|
|
75
|
+
are parametrized on a linear function, then this boolean decides
|
|
76
|
+
whether the slope can vary across batches.
|
|
77
|
+
:param centered_intercept_{parameter}: if parameters mu (default='False'),
|
|
78
|
+
sigma (default='False'), epsilon (default='False'), and delta (default='False')
|
|
79
|
+
are parametrized on a linear function, then this boolean decides
|
|
80
|
+
whether the parameters of intercept are estimated in a centered or
|
|
81
|
+
non-centered manner (default). While centered estimation runs faster
|
|
82
|
+
it may cause some problems for the sampler (the funnel of hell).
|
|
83
|
+
:param centered_slope_{parameter}: if parameters mu (default='False'),
|
|
84
|
+
sigma (default='False'), epsilon (default='False'), and delta (default='False')
|
|
85
|
+
are parametrized on a linear function, then this boolean decides
|
|
86
|
+
whether the parameters of slope are estimated in a centered or
|
|
87
|
+
non-centered manner (default). While centered estimation runs faster
|
|
88
|
+
it may cause some problems for the sampler (the funnel of hell).
|
|
89
|
+
:param sampler: specifies the type of PyMC sampler (Defauls is 'NUTS').
|
|
90
|
+
:param n_samples: The number of samples to draw (Default is '1000'). Please
|
|
91
|
+
note that this parameter must be specified in a string format ('1000' and
|
|
92
|
+
not 1000).
|
|
93
|
+
:param n_tuning: String that specifies the number of iterations to adjust
|
|
94
|
+
the samplers's step sizes, scalings or similar (defauls is '500').
|
|
95
|
+
:param n_chains: String that specifies the number of chains to sample. Defauls
|
|
96
|
+
is '1' for faster estimation, but note that sampling independent chains
|
|
97
|
+
is important for some convergence checks.
|
|
98
|
+
:param cores: String that specifies the number of chains to run in parallel.
|
|
99
|
+
(defauls is '1').
|
|
100
|
+
:param init: Initialization method to use for auto-assigned NUTS samplers. The
|
|
101
|
+
default in this class is 'jitter+adapt_diag_grad'. For comparison, 'jitter+adapt_diag' starts with an identity mass matrix
|
|
102
|
+
and then adapt a diagonal based on the variance of the tuning samples
|
|
103
|
+
while adding a uniform jitter in [-1, 1] to the starting point in each chain.
|
|
104
|
+
:param target_accept: String that of a float in [0, 1] that regulates the
|
|
105
|
+
step size such that we approximate this acceptance rate. The defauls is '0.8'
|
|
106
|
+
but higher values like 0.9 or 0.95 often work better for problematic posteriors.
|
|
107
|
+
:param order: String that defines the order of bspline or polynomial model.
|
|
108
|
+
The defauls is '3'.
|
|
109
|
+
:param nknots: String that defines the numbers of interior knots for the bspline model.
|
|
110
|
+
The defauls is '3'. Two knots will be added to this number for boundries. So final
|
|
111
|
+
number of knots will be nknots+2. Higher values increase the model complexity with negative
|
|
112
|
+
effect on the speed of estimations.
|
|
113
|
+
:param nn_hidden_layers_num: String the specifies the number of hidden layers
|
|
114
|
+
in neural network model. It can be either '1' or '2'. The default is set to '2'.
|
|
115
|
+
:param nn_hidden_neuron_num: String that specifies the number of neurons in
|
|
116
|
+
the hidden layers. The defauls is set to '2'.
|
|
117
|
+
|
|
118
|
+
Written by S.de Boer and S.M. Kia
|
|
119
|
+
|
|
120
|
+
"""
|
|
121
|
+
|
|
122
|
+
def __init__(self, **kwargs):
    """
    Build the configuration dictionary from keyword options and create the HBR model.

    Every option is expected as a string: booleans are compared against the
    literal 'True', and numeric options are converted with int() / float().

    :param kwargs: Configuration keywords (see the class docstring). The
        deprecated keywords '{p}_linear', 'random_noise' and 'random_slope'
        are still accepted and mapped onto their replacements. When both a
        deprecated keyword and its replacement ('linear_mu', 'random_sigma',
        'random_slope_mu') are passed, the replacement wins.
    :raises ValueError: If 'model_type' is not one of 'linear', 'polynomial',
        'bspline' or 'nn', or if more than two hidden layers are requested.
    """

    def flag(key, default):
        # Boolean options are passed around as the strings 'True' / 'False'.
        return kwargs.get(key, default) == "True"

    cfg = dict()
    self.configs = cfg

    # inputs
    cfg["trbefile"] = kwargs.get("trbefile", None)
    cfg["tsbefile"] = kwargs.get("tsbefile", None)
    # Model settings
    cfg["type"] = kwargs.get("model_type", "linear")
    cfg["random_noise"] = flag("random_noise", "True")
    cfg["likelihood"] = kwargs.get("likelihood", "Normal")
    # sampler settings
    cfg["nuts_sampler"] = kwargs.get("nuts_sampler", "pymc")
    cfg["n_samples"] = int(kwargs.get("n_samples", "1000"))
    cfg["n_tuning"] = int(kwargs.get("n_tuning", "500"))
    cfg["n_chains"] = int(kwargs.get("n_chains", "1"))
    cfg["sampler"] = kwargs.get("sampler", "NUTS")
    cfg["target_accept"] = float(kwargs.get("target_accept", "0.8"))
    cfg["init"] = kwargs.get("init", "jitter+adapt_diag_grad")
    cfg["cores"] = int(kwargs.get("cores", "1"))
    cfg["remove_datapoints_from_posterior"] = flag(
        "remove_datapoints_from_posterior", "True"
    )
    # model transfer setting
    cfg["freedom"] = int(kwargs.get("freedom", "1"))
    cfg["transferred"] = False
    # deprecated settings
    cfg["skewed_likelihood"] = flag("skewed_likelihood", "False")
    # misc
    cfg["pred_type"] = kwargs.get("pred_type", "single")

    # Options that only exist for a particular model type.
    model_type = cfg["type"]
    if model_type == "bspline":
        cfg["order"] = int(kwargs.get("order", "3"))
        cfg["nknots"] = int(kwargs.get("nknots", "3"))
    elif model_type == "polynomial":
        cfg["order"] = int(kwargs.get("order", "3"))
    elif model_type == "nn":
        cfg["nn_hidden_neuron_num"] = int(kwargs.get("nn_hidden_neuron_num", "2"))
        cfg["nn_hidden_layers_num"] = int(kwargs.get("nn_hidden_layers_num", "2"))
        if cfg["nn_hidden_layers_num"] > 2:
            raise ValueError(
                f"Using {cfg['nn_hidden_layers_num']} layers was not implemented. "
                "The number of layers has to be less than 3."
            )
    elif model_type != "linear":
        raise ValueError(
            "Unknown model type, please specify from 'linear', "
            "'polynomial', 'bspline', or 'nn'."
        )

    # NOTE(review): the nesting below is reconstructed; the deprecation and
    # default sections are kept inside this branch because they read
    # 'linear_sigma', which is only defined here — confirm against upstream.
    if model_type in ["bspline", "polynomial", "linear"]:
        for p in ["mu", "sigma", "epsilon", "delta"]:
            cfg[f"linear_{p}"] = flag(f"linear_{p}", "False")

            # Deprecations (remove in later version)
            if f"{p}_linear" in kwargs:
                print(
                    f"The keyword '{p}_linear' is deprecated. It is now automatically replaced with 'linear_{p}'"
                )
                cfg[f"linear_{p}"] = flag(f"{p}_linear", "False")
            # End Deprecations

            for c in ["centered", "random"]:
                cfg[f"{c}_{p}"] = flag(f"{c}_{p}", "False")
                for sp in ["slope", "intercept"]:
                    cfg[f"{c}_{sp}_{p}"] = flag(f"{c}_{sp}_{p}", "False")

        # Deprecations (remove in later version)
        if cfg["linear_sigma"]:
            if "random_noise" in kwargs:
                print(
                    "The keyword 'random_noise' is deprecated. It is now automatically replaced with 'random_intercept_sigma', because sigma is linear"
                )
                cfg["random_intercept_sigma"] = flag("random_noise", "True")
        elif "random_noise" in kwargs:
            print(
                "The keyword 'random_noise' is deprecated. It is now automatically replaced with 'random_sigma', because sigma is fixed"
            )
            cfg["random_sigma"] = flag("random_noise", "True")
        if "random_slope" in kwargs:
            print(
                "The keyword 'random_slope' is deprecated. It is now automatically replaced with 'random_slope_mu'"
            )
            cfg["random_slope_mu"] = flag("random_slope", "True")
        # End Deprecations

        # Default parameters. These options default to True rather than the
        # False used in the loop above. A value that was just derived from a
        # deprecated keyword is only replaced when the new-style keyword was
        # passed explicitly; previously it was always overwritten, so the
        # deprecated keywords had no effect.
        if "linear_mu" in kwargs or "mu_linear" not in kwargs:
            cfg["linear_mu"] = flag("linear_mu", "True")
        cfg["random_mu"] = flag("random_mu", "True")
        cfg["random_intercept_mu"] = flag("random_intercept_mu", "True")
        if "random_slope_mu" in kwargs or "random_slope" not in kwargs:
            cfg["random_slope_mu"] = flag("random_slope_mu", "True")
        sigma_from_noise = "random_noise" in kwargs and not cfg["linear_sigma"]
        if "random_sigma" in kwargs or not sigma_from_noise:
            cfg["random_sigma"] = flag("random_sigma", "True")
        cfg["centered_sigma"] = flag("centered_sigma", "True")
        # End default parameters

    self.hbr = HBR(self.configs)
|
|
247
|
+
|
|
248
|
+
@property
def n_params(self):
    """Return the constant 1, independent of the configured model.

    NOTE(review): this looks like a placeholder rather than a real count of
    model parameters — confirm against how the base class uses it.
    """
    return 1
|
|
251
|
+
|
|
252
|
+
@property
def neg_log_lik(self):
    """Return the constant -1; no likelihood is actually evaluated here.

    NOTE(review): presumably a sentinel meaning "not available" — confirm
    against the callers of this property.
    """
    return -1
|
|
255
|
+
|
|
256
|
+
def estimate(self, X, y, **kwargs):
    """
    Fit the HBR model by sampling from its posterior.

    Batch effects are loaded from the file named by 'trbefile' when that
    keyword is given. Without it, all samples are put into one batch (a single
    column of zeros). For each batch-effect column, the mapping from every
    distinct value to its rank among the unique values is stored in
    ``self.batch_effects_maps``.

    :param X: Data matrix; ``X.shape[0]`` gives the number of samples.
    :param y: Target values.
    :param kwargs: Keyword arguments which may include:
        - 'trbefile': File containing the batch effects for the training data. Optional.
    :return: The instance of the NormHBR object.
    """
    befile = kwargs.get("trbefile", None)
    if befile is None:
        print("Could not find batch-effects file! Initilizing all as zeros ...")
        be_train = np.zeros([X.shape[0], 1])
    else:
        be_train = fileio.load(befile)

    # One value -> index dictionary per batch-effect column.
    maps = []
    for col in range(be_train.shape[1]):
        levels = np.unique(be_train[:, col])
        maps.append({level: index for index, level in enumerate(levels)})
    self.batch_effects_maps = maps

    self.hbr.estimate(X, y, be_train)
    return self
|
|
285
|
+
|
|
286
|
+
def predict(self, Xs, X=None, Y=None, **kwargs):
    """
    Predict target values and variances for test data.

    Batch effects for the test set are loaded from 'tsbefile' when given;
    otherwise a single column of zeros is used. The prediction type is taken
    from ``self.configs["pred_type"]``, and all keyword arguments (including
    'tsbefile') are forwarded unchanged to ``self.hbr.predict``.

    :param Xs: Test data matrix.
    :param X: Accepted for interface compatibility; not used by this method.
    :param Y: Accepted for interface compatibility; not used by this method.
    :param kwargs: Keyword arguments which may include:
        - 'tsbefile': File containing the batch effects for the test data. Optional.
    :return: A tuple (yhat, s2) as returned by ``self.hbr.predict``, each with
        ``squeeze()`` applied.
    """
    befile = kwargs.get("tsbefile", None)
    if befile is None:
        print("Could not find batch-effects file! Initilizing all as zeros ...")
        be_test = np.zeros([Xs.shape[0], 1])
    else:
        be_test = fileio.load(befile)

    mode = self.configs["pred_type"]

    prediction = self.hbr.predict(
        X=Xs,
        batch_effects=be_test,
        batch_effects_maps=self.batch_effects_maps,
        pred=mode,
        **kwargs,
    )
    yhat, s2 = prediction
    return yhat.squeeze(), s2.squeeze()
|
|
326
|
+
|
|
327
|
+
def transfer(self, X, y, batch_effects):
    """
    Transfer the model to new data and mark this object as transferred.

    Delegates to ``self.hbr.transfer`` with the given data and then sets
    ``self.configs["transferred"]`` to True. Unlike ``estimate``, the batch
    effects are passed in directly as an array; no file is loaded here.

    :param X: Data matrix.
    :param y: Target values.
    :param batch_effects: Batch effects corresponding to X.
    :return: The instance of the NormHBR object.
    """
    self.hbr.transfer(X, y, batch_effects)
    self.configs["transferred"] = True
    return self
|
|
344
|
+
|
|
345
|
+
def predict_on_new_sites(self, X, batch_effects):
    """
    Predict the target values for the given test data on new sites.

    Calls ``self.hbr.predict`` with the supplied batch effects and the stored
    ``self.batch_effects_maps``. In contrast to ``predict``, the results are
    returned as-is (no ``squeeze()``), and the configured prediction type is
    not passed along.

    :param X: Test data matrix for the new sites.
    :param batch_effects: Batch effects for the new sites.
    :return: A tuple (yhat, s2) exactly as returned by ``self.hbr.predict``.
    """

    yhat, s2 = self.hbr.predict(
        X,
        batch_effects=batch_effects,
        batch_effects_maps=self.batch_effects_maps
    )

    return yhat, s2
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
def extend(
    self,
    X,
    y,
    batch_effects,
    X_dummy_ranges=[[0.1, 0.9, 0.01]],
    merge_batch_dim=0,
    samples=10,
    informative_prior=False
):
    """
    Extend the model to new batches using synthetic data from the current model.

    Dummy inputs are created from 'X' via ``self.hbr.create_dummy_inputs`` and
    responses for them are generated with ``self.hbr.generate``. The new
    data's batch IDs in column 'merge_batch_dim' are shifted past the largest
    dummy batch ID, the dummy and new data are concatenated (dummy first),
    ``self.batch_effects_maps`` is rebuilt from the merged batch effects, and
    the model is re-fitted on the merged data.

    Note that the shift is written into the caller's 'batch_effects' array
    in place.

    :param X: Data matrix for the new sites.
    :param y: Target values for the new sites.
    :param batch_effects: Batch effects for the new sites (modified in place).
    :param X_dummy_ranges: Not read by this method; kept in the signature.
    :param merge_batch_dim: Batch-effect column whose IDs are shifted. Default is 0.
    :param samples: Forwarded to ``self.hbr.generate``. Default is 10.
    :param informative_prior: If True the merged data is passed to
        ``self.hbr.transfer``, otherwise to ``self.hbr.estimate``. Default is False.
    :return: The instance of the NormHBR object.
    """
    synth_X, synth_be = self.hbr.create_dummy_inputs(X)
    synth_X, synth_be, synth_y = self.hbr.generate(
        synth_X,
        synth_be,
        samples,
        batch_effects_maps=self.batch_effects_maps,
    )

    # Move the new batch IDs to the range directly above the existing ones.
    top_id = np.max(synth_be[:, merge_batch_dim])
    batch_effects[:, merge_batch_dim] = (
        batch_effects[:, merge_batch_dim] + top_id + 1
    )

    merged_X = np.concatenate((synth_X, X))
    merged_y = np.concatenate((synth_y, y))
    merged_be = np.concatenate((synth_be, batch_effects))

    maps = []
    for col in range(merged_be.shape[1]):
        levels = np.unique(merged_be[:, col])
        maps.append({level: index for index, level in enumerate(levels)})
    self.batch_effects_maps = maps

    fit = self.hbr.transfer if informative_prior else self.hbr.estimate
    fit(merged_X, merged_y, merged_be)

    return self
|
|
421
|
+
|
|
422
|
+
def tune(
    self,
    X,
    y,
    batch_effects,
    X_dummy_ranges=[[0.1, 0.9, 0.01]],
    merge_batch_dim=0,
    samples=10,
    informative_prior=False,
):
    """
    Re-fit the model on new data plus synthetic data for the remaining batches.

    Dummy inputs are created from 'X_dummy_ranges'; every dummy row whose
    batch ID in column 'merge_batch_dim' also occurs in 'batch_effects' is
    dropped, responses are generated for the remaining rows, and the model is
    fitted on the dummy data followed by the new data. A warning is printed
    on every call because this routine is untested.

    :param X: Data matrix.
    :param y: Target values.
    :param batch_effects: Batch effects corresponding to X.
    :param X_dummy_ranges: Passed to ``self.hbr.create_dummy_inputs``.
    :param merge_batch_dim: Batch-effect column used to match batch IDs. Default is 0.
    :param samples: Forwarded to ``self.hbr.generate``. Default is 10.
    :param informative_prior: If True ``self.hbr.adapt`` is used for fitting,
        otherwise ``self.hbr.estimate``. Default is False.
    :return: The instance of the NormHBR object.
    """

    # TODO need to check if this is correct

    print(
        "The 'tune' function is being called, but it is currently in development and its behavior is not tested. It is unclear if the desired behavior is achieved. Any output following this should be treated as unreliable."
    )

    ids_in_new_data = list(np.unique(batch_effects[:, merge_batch_dim]))

    synth_X, synth_be = self.hbr.create_dummy_inputs(X_dummy_ranges)

    # Drop the synthetic rows of every batch that the new data covers.
    for batch_id in ids_in_new_data:
        keep = synth_be[:, merge_batch_dim] != batch_id
        synth_X = synth_X[keep, :]
        synth_be = synth_be[keep, :]

    synth_X, synth_be, synth_y = self.hbr.generate(synth_X, synth_be, samples)

    fit = self.hbr.adapt if informative_prior else self.hbr.estimate
    fit(
        np.concatenate((synth_X, X)),
        np.concatenate((synth_y, y)),
        np.concatenate((synth_be, batch_effects)),
    )

    return self
|
|
472
|
+
|
|
473
|
+
def merge(
    self, nm, X_dummy_ranges=[[0.1, 0.9, 0.01]], merge_batch_dim=0, samples=10
):
    """
    Merge this model with another one by re-fitting on data generated from both.

    Dummy inputs are created for both models from 'X_dummy_ranges' and
    responses are generated from each model. The batch IDs of the second
    model in column 'merge_batch_dim' are shifted past the largest ID of the
    first model, and this object's HBR model is then estimated on the
    concatenated synthetic data (this model's data first).

    :param nm: The other NormHBR object.
    :param X_dummy_ranges: Ranges for generating the dummy data. Default is [[0.1, 0.9, 0.01]].
    :param merge_batch_dim: Dimension for merging the batch effects. Default is 0.
    :param samples: Forwarded to the ``generate`` calls of both models. Default is 10.
    :return: The instance of the NormHBR object.
    """
    own_X, own_be = self.hbr.create_dummy_inputs(X_dummy_ranges)
    other_X, other_be = nm.hbr.create_dummy_inputs(X_dummy_ranges)

    own_X, own_be, own_y = self.hbr.generate(own_X, own_be, samples)
    other_X, other_be, other_y = nm.hbr.generate(other_X, other_be, samples)

    # Renumber the other model's batches so they follow this model's batches.
    top_id = np.max(own_be[:, merge_batch_dim])
    other_be[:, merge_batch_dim] = other_be[:, merge_batch_dim] + top_id + 1

    merged_X = np.concatenate((own_X, other_X))
    merged_y = np.concatenate((own_y, other_y))
    merged_be = np.concatenate((own_be, other_be))
    self.hbr.estimate(merged_X, merged_y, merged_be)

    return self
|
|
512
|
+
|
|
513
|
+
def generate(self, X, batch_effects, samples=10):
    """
    Generate data from the underlying HBR model.

    Thin wrapper around ``self.hbr.generate``; note that, unlike ``extend``,
    it does not pass ``self.batch_effects_maps`` along.

    :param X: Covariates forwarded to ``self.hbr.generate``.
    :param batch_effects: Batch effects forwarded to ``self.hbr.generate``.
    :param samples: Forwarded to ``self.hbr.generate``. Default is 10.
    :return: The tuple (X, batch_effects, generated_samples) returned by
        ``self.hbr.generate``.
    """
    X, batch_effects, generated_samples = self.hbr.generate(
        X, batch_effects, samples
    )
    return X, batch_effects, generated_samples
|
|
518
|
+
|
|
519
|
+
def get_mcmc_quantiles(self, X, batch_effects=None, z_scores=None):
|
|
520
|
+
"""
|
|
521
|
+
Computes quantiles of an estimated normative model.
|
|
522
|
+
|
|
523
|
+
Args:
|
|
524
|
+
X ([N*p]ndarray): covariates for which the quantiles are computed (must be scaled if scaler is set)
|
|
525
|
+
batch_effects (ndarray): the batch effects corresponding to X
|
|
526
|
+
z_scores (ndarray): Use this to determine which quantiles will be computed. The resulting quantiles will have the z-scores given in this list.
|
|
527
|
+
"""
|
|
528
|
+
# Set batch effects to zero if none are provided
|
|
529
|
+
if batch_effects is None:
|
|
530
|
+
batch_effects = np.zeros([X.shape[0], 1])
|
|
531
|
+
|
|
532
|
+
# Set the z_scores for which the quantiles are computed
|
|
533
|
+
if z_scores is None:
|
|
534
|
+
z_scores = np.arange(-3, 4)
|
|
535
|
+
elif len(z_scores.shape) == 2:
|
|
536
|
+
if not z_scores.shape[0] == X.shape[0]:
|
|
537
|
+
raise ValueError("The number of columns in z_scores must match the number of columns in X")
|
|
538
|
+
z_scores = z_scores.T
|
|
539
|
+
|
|
540
|
+
# Determine the variables to predict
|
|
541
|
+
match self.configs["likelihood"]:
|
|
542
|
+
case "Normal":
|
|
543
|
+
var_names = ["mu_samples", "sigma_samples", "sigma_plus_samples"]
|
|
544
|
+
case "SHASHo" | "SHASHo2" | "SHASHb":
|
|
545
|
+
var_names = [
|
|
546
|
+
"mu_samples",
|
|
547
|
+
"sigma_samples",
|
|
548
|
+
"sigma_plus_samples",
|
|
549
|
+
"epsilon_samples",
|
|
550
|
+
"delta_samples",
|
|
551
|
+
"delta_plus_samples",
|
|
552
|
+
]
|
|
553
|
+
case _:
|
|
554
|
+
exit("Unknown likelihood: " + self.configs["likelihood"])
|
|
555
|
+
|
|
556
|
+
# Delete the posterior predictive if it already exists
|
|
557
|
+
if "posterior_predictive" in self.hbr.idata.groups():
|
|
558
|
+
del self.hbr.idata.posterior_predictive
|
|
559
|
+
|
|
560
|
+
self.hbr.predict(
|
|
561
|
+
X=X,
|
|
562
|
+
batch_effects=batch_effects,
|
|
563
|
+
batch_effects_maps=self.batch_effects_maps,
|
|
564
|
+
pred="single",
|
|
565
|
+
var_names=var_names + ["y_like"],
|
|
566
|
+
)
|
|
567
|
+
|
|
568
|
+
# Extract the relevant samples from the idata
|
|
569
|
+
post_pred = az.extract(
|
|
570
|
+
self.hbr.idata, "posterior_predictive", var_names=var_names
|
|
571
|
+
)
|
|
572
|
+
|
|
573
|
+
# Remove superfluous var_nammes
|
|
574
|
+
var_names.remove("sigma_samples")
|
|
575
|
+
if "delta_samples" in var_names:
|
|
576
|
+
var_names.remove("delta_samples")
|
|
577
|
+
|
|
578
|
+
# Separate the samples into a list so that they can be unpacked
|
|
579
|
+
array_of_vars = list(map(lambda x: post_pred[x], var_names))
|
|
580
|
+
|
|
581
|
+
# Create an array to hold the quantiles
|
|
582
|
+
len_synth_data, n_mcmc_samples = post_pred["mu_samples"].shape
|
|
583
|
+
quantiles = np.zeros(
|
|
584
|
+
(z_scores.shape[0], len_synth_data, n_mcmc_samples))
|
|
585
|
+
|
|
586
|
+
# Compute the quantile iteratively for each z-score
|
|
587
|
+
|
|
588
|
+
for i, j in enumerate(z_scores):
|
|
589
|
+
if len(z_scores.shape) == 1:
|
|
590
|
+
zs = np.full((len_synth_data, n_mcmc_samples), j, dtype=float)
|
|
591
|
+
else:
|
|
592
|
+
zs = np.repeat(j[:,None], n_mcmc_samples, axis=1)
|
|
593
|
+
quantiles[i] = xarray.apply_ufunc(
|
|
594
|
+
quantile,
|
|
595
|
+
*array_of_vars,
|
|
596
|
+
kwargs={"zs": zs, "likelihood": self.configs["likelihood"]},
|
|
597
|
+
)
|
|
598
|
+
return quantiles.mean(axis=-1)
|
|
599
|
+
|
|
600
|
+
def get_mcmc_zscores(self, X, y, **kwargs):
    """
    Computes z-scores of data given an estimated model.

    The posterior predictive samples of the likelihood parameters are drawn
    for 'X', the z-score of 'y' is evaluated for every MCMC sample, and the
    result is averaged over the MCMC samples.

    Args:
        X ([N*p]ndarray): covariates
        y ([N*1]ndarray): response variables
        **kwargs: may contain 'tsbefile', the file with the batch effects of X.
            Without it a single column of zeros is used.

    Returns:
        ndarray of z-scores averaged over the last (MCMC sample) axis.
    """
    likelihood = self.configs["likelihood"]
    print(likelihood)

    befile = kwargs.get("tsbefile", None)
    if befile is None:
        # Set batch effects to zero if none are provided
        print("Could not find batch-effects file! Initializing all as zeros ...")
        be_test = np.zeros([X.shape[0], 1])
    else:
        be_test = fileio.load(befile)

    # Determine the variables to predict
    if likelihood == "Normal":
        var_names = ["mu_samples", "sigma_samples", "sigma_plus_samples"]
    elif likelihood.startswith("SHASH"):
        var_names = [
            "mu_samples",
            "sigma_samples",
            "sigma_plus_samples",
            "epsilon_samples",
            "delta_samples",
            "delta_plus_samples",
        ]
    else:
        exit("Unknown likelihood: " + self.configs["likelihood"])

    # Start from a clean slate: drop a previous posterior predictive group
    idata = self.hbr.idata
    if "posterior_predictive" in idata.groups():
        del self.hbr.idata.posterior_predictive

    # Forward pass that stores the posterior predictive in the idata
    self.hbr.predict(
        X=X,
        batch_effects=be_test,
        batch_effects_maps=self.batch_effects_maps,
        pred="single",
        var_names=var_names + ["y_like"],
    )

    # Extract the relevant samples from the idata
    post_pred = az.extract(
        self.hbr.idata, "posterior_predictive", var_names=var_names
    )

    # The untransformed sigma/delta samples are not arguments of z_score
    var_names.remove("sigma_samples")
    if "delta_samples" in var_names:
        var_names.remove("delta_samples")
    params = [post_pred[name] for name in var_names]

    # Unpacking fails unless mu_samples is two-dimensional (as in the
    # original code); the two values themselves are not used.
    _n_rows, _n_draws = post_pred["mu_samples"].shape

    scores = xarray.apply_ufunc(
        z_score,
        *params,
        kwargs={"y": y, "likelihood": self.configs["likelihood"]},
    )
    return scores.mean(axis=-1).values
|
|
669
|
+
|
|
670
|
+
|
|
671
|
+
def S_inv(x, e, d):
    """
    Inverse sinh-arcsinh transform: sinh((arcsinh(x) + e) / d).

    :param x: value(s) to transform
    :param e: shift added in arcsinh space (callers in this file pass epsilon)
    :param d: divisor applied in arcsinh space (callers in this file pass delta)
    :return: the transformed value(s)
    """
    shifted = np.arcsinh(x) + e
    return np.sinh(shifted / d)
|
|
673
|
+
|
|
674
|
+
|
|
675
|
+
def K(p, x):
    """
    Computes the values of spp.kv(p, x) for only the unique values of p.

    The expensive spp.kv call is made once per distinct order and the
    results are scattered back to the layout of p.

    :param p: order(s) at which to evaluate spp.kv; scalar, sequence or array
    :param x: argument forwarded unchanged to spp.kv
    :return: numpy array with the same shape as p (0-d for a scalar p)
    """
    # Coerce up front: plain Python scalars and lists have no `.shape`, which
    # previously made the final reshape fail with AttributeError.
    p = np.asarray(p)
    ps, idxs = np.unique(p, return_inverse=True)
    # idxs maps every element of p to its slot in the unique-values array.
    return spp.kv(ps, x)[idxs].reshape(p.shape)
|
|
682
|
+
|
|
683
|
+
|
|
684
|
+
def P(q):
    """
    The P function as given in Jones et al.

    :param q: value at which P is evaluated; K is called with the orders
        (q + 1) / 2 and (q - 1) / 2, both at argument 1 / 4
    :return: exp(1/4) / sqrt(8 * pi) times the sum of the two K values
    """
    scale = np.exp(1 / 4) / np.sqrt(8 * np.pi)
    upper_order = (q + 1) / 2
    lower_order = (q - 1) / 2
    k_sum = K(upper_order, 1 / 4) + K(lower_order, 1 / 4)
    return k_sum * scale
|
|
695
|
+
|
|
696
|
+
|
|
697
|
+
def m(epsilon, delta, r):
    """
    The r'th uncentered moment. Given by Jones et al.

    Evaluates
        2**-r * sum_{i=0..r} comb(r, i) * (-1)**i
                * exp((r - 2i) * epsilon / delta) * P((r - 2i) / delta)

    :param epsilon: epsilon parameter(s) of the distribution
    :param delta: delta parameter(s) of the distribution
    :param r: order of the moment; used with range(), so it must be an integer
    :return: the r'th uncentered moment
    """
    scale = 1 / np.power(2, r)
    total = 0
    for i in range(r + 1):
        shift = r - 2 * i
        # Signed binomial weight of the i'th term.
        weight = spp.comb(r, i) * np.power(-1, i)
        total += weight * np.exp(shift * epsilon / delta) * P(shift / delta)
    return scale * total
|
|
710
|
+
|
|
711
|
+
|
|
712
|
+
def quantile(mu, sigma, epsilon=None, delta=None, zs=0, likelihood="Normal"):
    """Get the zs'th quantiles given likelihood parameters

    :param mu: location parameter(s)
    :param sigma: scale parameter(s)
    :param epsilon: epsilon parameter(s); only read by the SHASH likelihoods
    :param delta: delta parameter(s); only read by the SHASH likelihoods
    :param zs: z-score(s) for which the quantile is requested
    :param likelihood: one of "Normal", "SHASHo", "SHASHo2" or "SHASHb"
    :return: the quantile(s) on the scale of the data
    :raises ValueError: if likelihood is not one of the supported names
    """
    if likelihood == "Normal":
        return zs * sigma + mu
    if likelihood == "SHASHo":
        return S_inv(zs, epsilon, delta) * sigma + mu
    if likelihood == "SHASHo2":
        # SHASHo2 rescales sigma by delta before applying it.
        sigma_d = sigma / delta
        return S_inv(zs, epsilon, delta) * sigma_d + mu
    if likelihood == "SHASHb":
        # Standardise the transformed value with the first two moments
        # returned by m() before applying mu and sigma.
        true_mu = m(epsilon, delta, 1)
        true_sigma = np.sqrt(m(epsilon, delta, 2) - true_mu**2)
        SHASH_c = (S_inv(zs, epsilon, delta) - true_mu) / true_sigma
        return SHASH_c * sigma + mu
    # Previously an unknown "SHASH*" name fell through every branch and failed
    # with UnboundLocalError, while other unknown names called the interactive
    # exit() helper and terminated the interpreter. Both now raise one clear,
    # catchable error.
    raise ValueError(f"Unsupported likelihood: {likelihood!r}")
|
|
730
|
+
|
|
731
|
+
|
|
732
|
+
def z_score(mu, sigma, epsilon=None, delta=None, y=None, likelihood="Normal"):
    """Get the z-scores of Y, given likelihood parameters

    :param mu: location parameter(s)
    :param sigma: scale parameter(s)
    :param epsilon: epsilon parameter(s); only read by the SHASH likelihoods
    :param delta: delta parameter(s); only read by the SHASH likelihoods
    :param y: observed value(s) to convert to z-scores
    :param likelihood: one of "Normal", "SHASHo", "SHASHo2" or "SHASHb"
    :return: the z-score(s) of y
    :raises ValueError: if likelihood is not one of the supported names
    """
    if likelihood == "Normal":
        return (y - mu) / sigma

    if likelihood == "SHASHo":
        SHASH = (y - mu) / sigma
    elif likelihood == "SHASHo2":
        # SHASHo2 rescales sigma by delta before standardising.
        sigma_d = sigma / delta
        SHASH = (y - mu) / sigma_d
    elif likelihood == "SHASHb":
        # Undo the standardisation by the first two moments returned by m().
        true_mu = m(epsilon, delta, 1)
        true_sigma = np.sqrt(m(epsilon, delta, 2) - true_mu**2)
        SHASH_c = (y - mu) / sigma
        SHASH = SHASH_c * true_sigma + true_mu
    else:
        # Previously an unknown "SHASH*" name fell through every branch and
        # failed with UnboundLocalError, while other unknown names called the
        # interactive exit() helper and terminated the interpreter. Both now
        # raise one clear, catchable error.
        raise ValueError(f"Unsupported likelihood: {likelihood!r}")

    # All SHASH variants share the same final sinh-arcsinh step.
    return np.sinh(np.arcsinh(SHASH) * delta - epsilon)
|