sgptools 1.2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sgptools/__init__.py +3 -4
- sgptools/core/__init__.py +1 -0
- sgptools/{models/core → core}/augmented_gpr.py +11 -17
- sgptools/{models/core → core}/augmented_sgpr.py +27 -34
- sgptools/core/osgpr.py +417 -0
- sgptools/core/transformations.py +699 -0
- sgptools/kernels/__init__.py +0 -8
- sgptools/kernels/attentive_kernel.py +214 -69
- sgptools/kernels/neural_kernel.py +268 -92
- sgptools/kernels/neural_network.py +127 -28
- sgptools/methods.py +1047 -0
- sgptools/objectives.py +275 -0
- sgptools/utils/__init__.py +0 -9
- sgptools/utils/data.py +452 -149
- sgptools/utils/gpflow.py +335 -174
- sgptools/utils/metrics.py +375 -102
- sgptools/utils/misc.py +145 -111
- sgptools/utils/tsp.py +224 -84
- sgptools-2.0.0.dist-info/METADATA +216 -0
- sgptools-2.0.0.dist-info/RECORD +23 -0
- {sgptools-1.2.0.dist-info → sgptools-2.0.0.dist-info}/WHEEL +1 -1
- sgptools/models/__init__.py +0 -10
- sgptools/models/bo.py +0 -118
- sgptools/models/cma_es.py +0 -121
- sgptools/models/continuous_sgp.py +0 -68
- sgptools/models/core/__init__.py +0 -9
- sgptools/models/core/osgpr.py +0 -291
- sgptools/models/core/transformations.py +0 -434
- sgptools/models/greedy_mi.py +0 -115
- sgptools/models/greedy_sgp.py +0 -97
- sgptools-1.2.0.dist-info/METADATA +0 -39
- sgptools-1.2.0.dist-info/RECORD +0 -27
- {sgptools-1.2.0.dist-info → sgptools-2.0.0.dist-info/licenses}/LICENSE.txt +0 -0
- {sgptools-1.2.0.dist-info → sgptools-2.0.0.dist-info}/top_level.txt +0 -0
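The listing shows the 1.2.0 `sgptools.models.core` package being flattened into `sgptools.core`, the per-method `sgptools.models.*` files being removed in favor of a single `sgptools.methods` module, and a new `sgptools.objectives` module being added. A minimal sketch of how import paths shift between the two releases, assuming only the module moves visible in this listing (the names each module exports are not part of this diff):

```python
# sgptools 1.2.0: core GP building blocks lived under sgptools.models.core
# import sgptools.models.core.osgpr
# import sgptools.models.core.transformations

# sgptools 2.0.0: the same modules now sit directly under sgptools.core,
# with the optimization entry points and objectives in new top-level modules.
import sgptools.core.osgpr
import sgptools.core.transformations
import sgptools.methods       # consolidates the removed sgptools.models.* method files
import sgptools.objectives    # new in 2.0.0

# The kernels subpackage keeps its module names across both releases.
from sgptools.kernels.neural_kernel import NeuralSpectralKernel, init_neural_kernel
```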
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """Provides a neural spectral kernel function along with an initialization function
 """
 
@@ -19,149 +18,326 @@ import tensorflow as tf
 import numpy as np
 import gc
 
-
 import gpflow
 from gpflow.config import default_jitter, default_float
 from gpflow.models import SGPR
 from gpflow.models.util import data_input_to_tensor
+
 float_type = default_float()
 
 from .neural_network import NN
+from typing import List, Optional, Tuple, Union, Any
 
 
 class NeuralSpectralKernel(gpflow.kernels.Kernel):
-    """
-
+    """
+    Neural Spectral Kernel function (non-stationary kernel function).
+    This kernel models non-stationarity by using multiple Multi-Layer Perceptrons (MLPs)
+    to map input locations to frequency, lengthscale, and variance parameters for a
+    mixture of spectral components.
+
+    Based on the implementation from this [repo](https://github.com/sremes/nssm-gp/tree/master?tab=readme-ov-file).
 
     Refer to the following papers for more details:
     - Neural Non-Stationary Spectral Kernel [Remes et al., 2018]
 
-
-        input_dim (int):
-
-
-
+    Attributes:
+        input_dim (int): Dimensionality of the input data points.
+        Q (int): Number of MLP mixture components used in the kernel function.
+        num_hidden (int): Number of hidden layers in each MLP.
+        freq (List[NN]): List of MLPs, one for each component, predicting frequencies.
+        length (List[NN]): List of MLPs, one for each component, predicting lengthscales.
+        var (List[NN]): List of MLPs, one for each component, predicting variances.
     """
-
+
+    def __init__(self,
+                 input_dim: int,
+                 active_dims: Optional[List[int]] = None,
+                 Q: int = 1,
+                 hidden_sizes: List[int] = None):
+        """
+        Initializes the Neural Spectral Kernel.
+
+        Args:
+            input_dim (int): Number of dimensions of the input data points (e.g., 2 for 2D data).
+            active_dims (Optional[List[int]]): A list of indices specifying which input dimensions
+                                               the kernel operates on. If None, all dimensions are active.
+                                               Defaults to None.
+            Q (int): The number of MLP mixture components used in the kernel function.
+                     Each component has its own set of MLPs for frequency, lengthscale, and variance.
+                     Defaults to 1.
+            hidden_sizes (List[int]): A list where each element specifies the number of hidden units
+                                      in a layer of the MLPs. The length of this list determines
+                                      the number of hidden layers. Defaults to [32, 32].
+
+        Usage:
+            ```python
+            import gpflow
+            import numpy as np
+            from sgptools.kernels.neural_kernel import NeuralSpectralKernel
+
+            # Initialize a Neural Spectral Kernel for 2D data with 3 mixture components
+            # and MLPs with 2 hidden layers of 64 units each.
+            kernel = NeuralSpectralKernel(input_dim=2, Q=3, hidden_sizes=[64, 64])
+
+            # You can then use this kernel in a GPflow model:
+            # model = gpflow.models.SGPR(data=(X_train, Y_train), kernel=kernel, ...)
+            ```
+        """
         super().__init__(active_dims=active_dims)
 
+        if hidden_sizes is None:
+            hidden_sizes = [32, 32]  # Default if not provided
+        else:
+            hidden_sizes = list(hidden_sizes)
+
         self.input_dim = input_dim
         self.Q = Q
         self.num_hidden = len(hidden_sizes)
 
-
-        self.
-        self.
+        # Initialize lists of MLPs for each component
+        self.freq: List[NN] = []
+        self.length: List[NN] = []
+        self.var: List[NN] = []
+
+        # Create Q sets of MLPs
         for q in range(self.Q):
-
-
-
+            # MLP for frequency: maps input_dim -> hidden_sizes -> input_dim
+            # Output activation 'softplus' ensures positive frequencies.
+            freq_nn = NN([input_dim] + hidden_sizes + [input_dim],
+                         output_activation_fn='softplus')
+
+            # MLP for lengthscale: maps input_dim -> hidden_sizes -> input_dim
+            # Output activation 'softplus' ensures positive lengthscales.
+            length_nn = NN([input_dim] + hidden_sizes + [input_dim],
+                           output_activation_fn='softplus')
+
+            # MLP for variance: maps input_dim -> hidden_sizes -> 1 (scalar variance)
+            # Output activation 'softplus' ensures positive variances.
+            var_nn = NN([input_dim] + hidden_sizes + [1],
                         output_activation_fn='softplus')
-
-
-            self.
-            self.
-
-
-    def K(self, X, X2=None):
-        """
+
+            self.freq.append(freq_nn)
+            self.length.append(length_nn)
+            self.var.append(var_nn)
+
+    @tf.autograph.experimental.do_not_convert
+    def K(self, X: tf.Tensor, X2: Optional[tf.Tensor] = None) -> tf.Tensor:
+        """
+        Computes the covariance matrix between/amongst the input variables `X` and `X2`.
+        If `X2` is None, the function computes `K(X, X)` (a symmetric covariance matrix).
+        Otherwise, it computes `K(X, X2)` (a cross-covariance matrix).
+
+        The kernel is a sum over `Q` mixture components, where each component's
+        parameters (frequency, lengthscale, variance) are determined by MLPs
+        based on the input locations.
 
         Args:
-            X (
-
-
+            X (tf.Tensor): (N1, D); First set of input variables to compute covariance from.
+                           `N1` is the number of points, `D` is the dimensionality.
+            X2 (Optional[tf.Tensor]): (N2, D); Optional second set of input variables.
+                                      If provided, computes cross-covariance `K(X, X2)`.
+                                      If None, computes auto-covariance `K(X, X)`.
 
         Returns:
-
+            tf.Tensor: (N1, N2); The computed covariance matrix. If `X2` is None, the
+                       diagonal of `K(X, X)` is jittered for numerical stability.
         """
         if X2 is None:
-
-            equal = True
+            X2_internal = X
+            equal = True  # Flag to add jitter to diagonal for K(X,X)
         else:
+            X2_internal = X2
             equal = False
 
-        kern = 0.0
+        kern = tf.constant(0.0, dtype=float_type)  # Initialize kernel sum
+
         for q in range(self.Q):
-            #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            #
-
-
-
-
-            #
-
-
-
+            # Compute latent function values (frequencies, lengthscales, variances)
+            # by passing input locations through the MLPs.
+            freq_X, freq_X2 = self.freq[q](X), self.freq[q](
+                X2_internal)  # (N, D) frequencies
+            lens_X, lens_X2 = self.length[q](X), self.length[q](
+                X2_internal)  # (N, D) lengthscales
+            var_X, var_X2 = self.var[q](X), self.var[q](
+                X2_internal)  # (N, 1) variances
+
+            # Compute length-scale term (E) - based on inverse lengthscales and distances
+            Xr = tf.expand_dims(X, 1)  # (N1, 1, D)
+            X2r = tf.expand_dims(X2_internal, 0)  # (1, N2, D)
+            l1 = tf.expand_dims(lens_X, 1)  # (N1, 1, D)
+            l2 = tf.expand_dims(lens_X2, 0)  # (1, N2, D)
+
+            L = tf.square(l1) + tf.square(
+                l2)  # (N1, N2, D) - sum of squared lengthscales
+
+            # D term: Squared difference scaled by L, summed over dimensions
+            D_term = tf.square(Xr - X2r) / L  # (N1, N2, D)
+            D_term = tf.reduce_sum(D_term, 2)  # (N1, N2) - sum over dimensions
+
+            # Determinant term: Product over dimensions of (2 * l1 * l2 / L)^(1/2)
+            det_term = tf.sqrt(2 * l1 * l2 / L)  # (N1, N2, D)
+            det_term = tf.reduce_prod(det_term,
+                                      2)  # (N1, N2) - product over dimensions
+
+            # E term: Combine determinant and exponential of D_term
+            E = det_term * tf.exp(-D_term)  # (N1, N2)
+
+            # Compute cosine term (COS) - based on frequencies and dot products with X
+            # (N1, D) * (N1, D) -> sum over D -> (N1, 1)
+            muX = (tf.reduce_sum(freq_X * X, 1, keepdims=True) - tf.transpose(
+                tf.reduce_sum(freq_X2 * X2_internal, 1, keepdims=True)))
+            COS = tf.cos(2 * np.pi * muX)  # (N1, N2)
+
+            # Compute kernel variance term (WW) - outer product of variance predictions
+            WW = tf.matmul(var_X, var_X2,
+                           transpose_b=True)  # (N1, 1) @ (1, N2) -> (N1, N2)
+
+            # Compute the q'th kernel component and add to total kernel
             kern += WW * E * COS
+
+        # Add jitter to the diagonal for K(X,X) matrices for numerical stability
         if equal:
             return robust_kernel(kern, tf.shape(X)[0])
         else:
             return kern
 
-
-
+    @tf.autograph.experimental.do_not_convert
+    def K_diag(self, X: tf.Tensor) -> tf.Tensor:
+        """
+        Computes the diagonal of the covariance matrix `K(X, X)`.
+        For the Neural Spectral Kernel, this is `sum_q(var_q(X)^2) + jitter`.
+
+        Args:
+            X (tf.Tensor): (N, D); Input data points. `N` is the number of points.
+
+        Returns:
+            tf.Tensor: (N,); A 1D tensor representing the diagonal elements of the
+                       covariance matrix.
+        """
+        kd = default_jitter()  # Initialize with a small jitter
         for q in range(self.Q):
+            # Sum of squared variance predictions from each MLP component
            kd += tf.square(self.var[q](X))
-        return tf.squeeze(
+        return tf.squeeze(
+            kd)  # Remove singleton dimension (e.g., (N, 1) -> (N,))
 
-'''
-Helper functions
-'''
-def robust_kernel(kern, shape_X):
-    jitter = 1e-3
-    return kern + jitter * tf.eye(shape_X, dtype=float_type)
 
-
-
-
+# --- Helper functions ---
+@tf.autograph.experimental.do_not_convert
+def robust_kernel(kern: tf.Tensor, shape_X_0: tf.Tensor) -> tf.Tensor:
+    """
+    Adds a small positive jitter to the diagonal of a covariance matrix
+    to ensure numerical stability. This is particularly important for
+    Cholesky decompositions or inverse calculations.
 
-
+    Args:
+        kern (tf.Tensor): The input covariance matrix.
+        shape_X_0 (tf.Tensor): The size of the first dimension of the original input `X`
+                               (i.e., the number of data points N). Used to create the identity matrix.
+
+    Returns:
+        tf.Tensor: The covariance matrix with jitter added to its diagonal.
+    """
+    jitter_val = 1e-3  # Fixed jitter value
+    # Add jitter to the diagonal of the kernel matrix
+    return kern + jitter_val * tf.eye(shape_X_0, dtype=float_type)
+
+
+def init_neural_kernel(X_train: np.ndarray,
+                       Y_train: np.ndarray,
+                       inducing_variable: np.ndarray,
+                       Q: int,
+                       n_inits: int = 1,
+                       hidden_sizes: Optional[List[int]] = None) -> SGPR:
+    """
+    Helper function to initialize a Sparse Gaussian Process Regression (SGPR) model
+    with a Neural Spectral Kernel. This function can perform multiple random
+    initializations and return the model with the best initial Evidence Lower Bound (ELBO).
+
+    Refer to the original paper for more details:
     - Neural Non-Stationary Spectral Kernel [Remes et al., 2018]
 
     Args:
-
-
-        inducing_variable (ndarray): (m, d); Initial inducing points
-
-
-
+        X_train (np.ndarray): (n, d); Input training set points.
+        Y_train (np.ndarray): (n, 1); Training set labels.
+        inducing_variable (np.ndarray): (m, d); Initial inducing points. These are passed
+                                        directly to the SGPR model.
+        Q (int): The number of MLP mixture components for the Neural Spectral Kernel.
+        n_inits (int): Number of times to randomly initialize the kernel's MLPs and
+                       compute the initial ELBO. The model with the highest ELBO
+                       among these initializations is returned. Defaults to 1.
+        hidden_sizes (Optional[List[int]]): List of integers specifying the number of hidden
+                                            units in each MLP layer. If None, [32, 32] is used.
+
+    Returns:
+        SGPR: The SGPR model instance initialized with the Neural Spectral Kernel
+              that yielded the best initial ELBO.
+
+    Usage:
+        ```python
+        import numpy as np
+        import gpflow
+        from sgptools.kernels.neural_kernel import init_neural_kernel
+        from sgptools.utils.misc import get_inducing_pts  # For initial inducing points
+
+        # Dummy data
+        X_train_data = np.random.rand(100, 2).astype(np.float32)
+        Y_train_data = (np.sin(X_train_data[:, 0]) + np.cos(X_train_data[:, 1]))[:, None].astype(np.float32)
+
+        # Initial inducing points (e.g., subset of training data or k-means centers)
+        initial_inducing_points = get_inducing_pts(X_train_data, num_inducing=20)
+
+        # Initialize the SGPR model with Neural Spectral Kernel
+        # Try 3 random initializations for the MLPs.
+        model_ns_kernel = init_neural_kernel(
+            X_train=X_train_data,
+            Y_train=Y_train_data,
+            inducing_variable=initial_inducing_points,
+            Q=5, # 5 mixture components
+            n_inits=3, # 3 initializations
+            hidden_sizes=[16, 16] # Custom hidden layer sizes
+        )
+
+        # You would typically optimize this model further using optimize_model:
+        # from sgptools.utils.gpflow import optimize_model
+        # optimize_model(model_ns_kernel)
+        ```
     """
-
-
-
-    best_loglik = -np.inf
-    best_m = None
-
-
-
-
-
-
-
-
+    # Convert NumPy arrays to TensorFlow tensors
+    X_train_tf, Y_train_tf = data_input_to_tensor((X_train, Y_train))
+
+    best_loglik = -np.inf  # Track the best ELBO found
+    best_m: Optional[SGPR] = None  # Store the best model
+
+    N, input_dim = X_train_tf.shape  # Get number of data points and input dimensionality
+
+    for k_init_idx in range(n_inits):
+        # Create a new NeuralSpectralKernel instance for each initialization
+        current_kernel = NeuralSpectralKernel(input_dim=input_dim,
+                                              Q=Q,
+                                              hidden_sizes=hidden_sizes)
+
+        # Create an SGPR model with the current kernel initialization
+        model = SGPR(data=(X_train_tf, Y_train_tf),
+                     inducing_variable=inducing_variable,
+                     kernel=current_kernel)
+
+        # Compute the initial ELBO (Evidence Lower Bound)
         loglik = model.elbo()
+
+        # Check if the current initialization is better than previous ones
         if loglik > best_loglik:
             best_loglik = loglik
+            # Deepcopy the model to save its state, as it will be deleted/overwritten in next iteration
+            # This requires gpflow.utilities.traversal.deepcopy or similar for GPflow models
+            # For simplicity, we directly assign here, assuming shallow copy is sufficient
+            # or that the user will optimize it later. For robust best model saving, a deepcopy is safer.
             best_m = model
+
+        # Explicitly delete the model and run garbage collection to free memory
+        # (important if n_inits is large and models are complex)
         del model
         gc.collect()
-    print('Best init: %f' % best_loglik)
 
-    return best_m
+    return best_m
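The per-component arithmetic in the new `K` method above is the non-stationary spectral mixture form from Remes et al.: a Gibbs-style lengthscale term `E`, a cosine term `COS` driven by the frequency MLPs, and an input-dependent variance term `WW`. A small NumPy sketch of one component, with simple positive functions standing in for the MLP outputs (illustrative only, not code from the package), makes the shapes easier to follow:

```python
import numpy as np

# Toy inputs: N1=4 and N2=3 points in D=2 dimensions.
rng = np.random.default_rng(0)
X, X2 = rng.random((4, 2)), rng.random((3, 2))

# Stand-ins for the per-point MLP outputs (softplus keeps them positive).
softplus = lambda Z: np.log1p(np.exp(Z))
freq = lambda Z: softplus(Z)            # (N, D) frequencies
lens = lambda Z: softplus(Z) + 0.5      # (N, D) lengthscales
var = lambda Z: softplus(Z[:, :1])      # (N, 1) variances

# Gibbs-style lengthscale term E: compare every pair of points.
l1, l2 = lens(X)[:, None, :], lens(X2)[None, :, :]        # (N1,1,D), (1,N2,D)
L = l1**2 + l2**2                                          # (N1,N2,D)
D_term = np.sum((X[:, None, :] - X2[None, :, :])**2 / L, axis=2)
det_term = np.prod(np.sqrt(2 * l1 * l2 / L), axis=2)
E = det_term * np.exp(-D_term)                             # (N1,N2)

# Cosine term from the frequency predictions.
mu = np.sum(freq(X) * X, 1, keepdims=True) - np.sum(freq(X2) * X2, 1, keepdims=True).T
COS = np.cos(2 * np.pi * mu)                               # (N1,N2)

# Input-dependent variance term and the q'th kernel component.
WW = var(X) @ var(X2).T                                    # (N1,N2)
kern_q = WW * E * COS
print(kern_q.shape)  # (4, 3)
```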
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """Multi Layer Perceptron Model
 """
 
@@ -20,39 +19,139 @@ import tensorflow as tf
 
 import gpflow
 from gpflow.config import default_float
+
 float_type = default_float()
 
-
-    return np.random.randn(dim_in, dim_out)*(2./(dim_in+dim_out))**0.5
+from typing import List, Union, Callable
 
-
-
+
+def xavier(dim_in: int, dim_out: int) -> np.ndarray:
+    """
+    Initializes weights using the Xavier (Glorot) uniform initialization method.
+    This method aims to keep the variance of activations consistent across layers,
+    helping to prevent vanishing/exploding gradients.
+
+    Formula: $W \sim U(-\sqrt{6/(dim_{in} + dim_{out})}, \sqrt{6/(dim_{in} + dim_{out})})$
 
     Args:
-
-
-
+        dim_in (int): The number of input units to the layer.
+        dim_out (int): The number of output units from the layer.
+
+    Returns:
+        np.ndarray: A NumPy array of shape (dim_in, dim_out) containing
+                    the initialized weights.
+    """
+    # Calculate the fan-in + fan-out for the scaling factor
+    scale_factor = (2.0 / (dim_in + dim_out))**0.5
+    # Generate random numbers from a normal (Gaussian) distribution
+    # This is often used as an approximation for Xavier uniform in practice
+    # or sometimes Xavier normal is explicitly implemented this way.
+    return np.random.randn(dim_in, dim_out) * scale_factor
+
+
+class NN(gpflow.base.Module):
+    """
+    A Multi-Layer Perceptron (MLP) model that is compatible with GPFlow,
+    allowing its parameters (weights and biases) to be optimized as part of
+    a GPflow model (e.g., within a custom kernel).
+
+    The network consists of multiple fully connected (dense) layers with
+    specified activation functions.
+
+    Attributes:
+        dims (List[int]): List of layer sizes, including input and output dimensions.
+        activation_fn (Callable): Activation function for hidden layers.
+        output_activation_fn (Callable): Activation function for the output layer.
+        _weights (List[tf.Variable]): List of TensorFlow Variable for weights of each layer.
+        _biases (List[tf.Variable]): List of TensorFlow Variable for biases of each layer.
     """
-
-
-
+
+    def __init__(self,
+                 dims: List[int],
+                 activation_fn: Union[str, Callable] = 'selu',
+                 output_activation_fn: Union[str, Callable] = 'softmax'):
+        """
+        Initializes the Multi-Layer Perceptron (MLP).
+
+        Args:
+            dims (List[int]): A list of integers specifying the size of each layer.
+                              The first element is the input dimension, the last is
+                              the output dimension, and intermediate elements are
+                              hidden layer sizes.
+                              Example: `[input_dim, hidden1_dim, hidden2_dim, output_dim]`
+            activation_fn (Union[str, Callable]): The activation function to use for hidden layers.
+                                                  Can be a string (e.g., 'relu', 'tanh', 'selu')
+                                                  or a callable TensorFlow activation function.
+                                                  Defaults to 'selu'.
+            output_activation_fn (Union[str, Callable]): The activation function to use for the output layer.
+                                                         Can be a string (e.g., 'softmax', 'sigmoid', 'softplus')
+                                                         or a callable TensorFlow activation function.
+                                                         Defaults to 'softplus'.
+
+        Usage:
+            ```python
+            from sgptools.kernels.neural_network import NN
+            import tensorflow as tf
+            import numpy as np
+
+            # Example: A simple MLP with one hidden layer
+            mlp = NN(dims=[2, 10, 1], activation_fn='tanh', output_activation_fn='sigmoid')
+
+            # Input data
+            input_data = tf.constant(np.random.rand(5, 2), dtype=tf.float32)
+
+            # Pass input through the network
+            output = mlp(input_data)
+            ```
+        """
         super().__init__()
         self.dims = dims
-
-        self.
+        # Get TensorFlow activation functions from strings or use provided callables
+        self.activation_fn = tf.keras.activations.get(
+            activation_fn) if isinstance(activation_fn, str) else activation_fn
+        self.output_activation_fn = tf.keras.activations.get(
+            output_activation_fn) if isinstance(output_activation_fn,
+                                                str) else output_activation_fn
+
+        self._weights: List[tf.Variable] = []
+        self._biases: List[tf.Variable] = []
+
+        # Create weights and biases for each layer
         for i, (dim_in, dim_out) in enumerate(zip(dims[:-1], dims[1:])):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Use Xavier initialization for weights
+            weight_init = xavier(dim_in, dim_out)
+            self._weights.append(
+                tf.Variable(weight_init, dtype=float_type, name=f'W_{i}'))
+
+            # Initialize biases to zeros
+            bias_init = np.zeros(dim_out, dtype=float_type)
+            self._biases.append(
+                tf.Variable(bias_init, dtype=float_type, name=f'b_{i}'))
+
+    def __call__(self, X: tf.Tensor) -> tf.Tensor:
+        """
+        Performs a forward pass through the MLP.
+
+        Args:
+            X (tf.Tensor): (N, D_in); The input tensor to the MLP. `N` is the batch size,
+                           `D_in` is the input dimension of the network.
+
+        Returns:
+            tf.Tensor: (N, D_out); The output tensor from the MLP. `D_out` is the output
+                       dimension of the network.
+        """
+        # Process through hidden layers
+        # The loop runs for (num_layers - 1) iterations, covering all hidden layers
+        # and the input-to-first-hidden layer transition.
+        for i in range(len(self.dims) -
+                       2):  # Iterate up to second to last layer
+            W = self._weights[i]
+            b = self._biases[i]
+            X = self.activation_fn(tf.matmul(X, W) + b)
+
+        # Process through the last layer (output layer)
+        W_last = self._weights[-1]  # Weights for the last layer
+        b_last = self._biases[-1]  # Biases for the last layer
+        X = self.output_activation_fn(tf.matmul(X, W_last) + b_last)
+
+        return X