sgptools 1.2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sgptools/kernels/neural_kernel.py

@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """Provides a neural spectral kernel function along with an initialization function
 """
 
@@ -19,149 +18,326 @@ import tensorflow as tf
 import numpy as np
 import gc
 
-
 import gpflow
 from gpflow.config import default_jitter, default_float
 from gpflow.models import SGPR
 from gpflow.models.util import data_input_to_tensor
+
 float_type = default_float()
 
 from .neural_network import NN
+from typing import List, Optional, Tuple, Union, Any
 
 
 class NeuralSpectralKernel(gpflow.kernels.Kernel):
-    """Neural Spectral Kernel function (non-stationary kernel function).
-    Based on the implementation from this [repo](https://github.com/sremes/nssm-gp/tree/master?tab=readme-ov-file)
+    """
+    Neural Spectral Kernel function (non-stationary kernel function).
+    This kernel models non-stationarity by using multiple Multi-Layer Perceptrons (MLPs)
+    to map input locations to frequency, lengthscale, and variance parameters for a
+    mixture of spectral components.
+
+    Based on the implementation from this [repo](https://github.com/sremes/nssm-gp/tree/master?tab=readme-ov-file).
 
     Refer to the following papers for more details:
         - Neural Non-Stationary Spectral Kernel [Remes et al., 2018]
 
-    Args:
-        input_dim (int): Number of data dimensions
-        active_dims (int): Number of data dimensions that are used for computing the covariances
-        Q (int): Number of MLP mixture components used in the kernel function
-        hidden_sizes (list): Number of hidden units in each MLP layer. Length of the list determines the number of layers.
+    Attributes:
+        input_dim (int): Dimensionality of the input data points.
+        Q (int): Number of MLP mixture components used in the kernel function.
+        num_hidden (int): Number of hidden layers in each MLP.
+        freq (List[NN]): List of MLPs, one for each component, predicting frequencies.
+        length (List[NN]): List of MLPs, one for each component, predicting lengthscales.
+        var (List[NN]): List of MLPs, one for each component, predicting variances.
     """
-    def __init__(self, input_dim, active_dims=None, Q=1, hidden_sizes=[32, 32]):
+
+    def __init__(self,
+                 input_dim: int,
+                 active_dims: Optional[List[int]] = None,
+                 Q: int = 1,
+                 hidden_sizes: List[int] = None):
+        """
+        Initializes the Neural Spectral Kernel.
+
+        Args:
+            input_dim (int): Number of dimensions of the input data points (e.g., 2 for 2D data).
+            active_dims (Optional[List[int]]): A list of indices specifying which input dimensions
+                                               the kernel operates on. If None, all dimensions are active.
+                                               Defaults to None.
+            Q (int): The number of MLP mixture components used in the kernel function.
+                     Each component has its own set of MLPs for frequency, lengthscale, and variance.
+                     Defaults to 1.
+            hidden_sizes (List[int]): A list where each element specifies the number of hidden units
+                                      in a layer of the MLPs. The length of this list determines
+                                      the number of hidden layers. Defaults to [32, 32].
+
+        Usage:
+            ```python
+            import gpflow
+            import numpy as np
+            from sgptools.kernels.neural_kernel import NeuralSpectralKernel
+
+            # Initialize a Neural Spectral Kernel for 2D data with 3 mixture components
+            # and MLPs with 2 hidden layers of 64 units each.
+            kernel = NeuralSpectralKernel(input_dim=2, Q=3, hidden_sizes=[64, 64])
+
+            # You can then use this kernel in a GPflow model:
+            # model = gpflow.models.SGPR(data=(X_train, Y_train), kernel=kernel, ...)
+            ```
+        """
         super().__init__(active_dims=active_dims)
 
+        if hidden_sizes is None:
+            hidden_sizes = [32, 32]  # Default if not provided
+        else:
+            hidden_sizes = list(hidden_sizes)
+
         self.input_dim = input_dim
         self.Q = Q
         self.num_hidden = len(hidden_sizes)
 
-        self.freq = []
-        self.length = []
-        self.var = []
+        # Initialize lists of MLPs for each component
+        self.freq: List[NN] = []
+        self.length: List[NN] = []
+        self.var: List[NN] = []
+
+        # Create Q sets of MLPs
         for q in range(self.Q):
-            freq = NN([input_dim]+[hidden_sizes[i] for i in range(self.num_hidden)]+[input_dim],
-                      output_activation_fn='softplus')
-            length = NN([input_dim]+[hidden_sizes[i] for i in range(self.num_hidden)]+[input_dim],
+            # MLP for frequency: maps input_dim -> hidden_sizes -> input_dim
+            # Output activation 'softplus' ensures positive frequencies.
+            freq_nn = NN([input_dim] + hidden_sizes + [input_dim],
+                         output_activation_fn='softplus')
+
+            # MLP for lengthscale: maps input_dim -> hidden_sizes -> input_dim
+            # Output activation 'softplus' ensures positive lengthscales.
+            length_nn = NN([input_dim] + hidden_sizes + [input_dim],
+                           output_activation_fn='softplus')
+
+            # MLP for variance: maps input_dim -> hidden_sizes -> 1 (scalar variance)
+            # Output activation 'softplus' ensures positive variances.
+            var_nn = NN([input_dim] + hidden_sizes + [1],
                         output_activation_fn='softplus')
-            var = NN([input_dim]+[hidden_sizes[i] for i in range(self.num_hidden)]+[1],
-                     output_activation_fn='softplus')
-            self.freq.append(freq)
-            self.length.append(length)
-            self.var.append(var)
-
-    def K(self, X, X2=None):
-        """Computes the covariances between/amongst the input variables
+
+            self.freq.append(freq_nn)
+            self.length.append(length_nn)
+            self.var.append(var_nn)
+
+    @tf.autograph.experimental.do_not_convert
+    def K(self, X: tf.Tensor, X2: Optional[tf.Tensor] = None) -> tf.Tensor:
+        """
+        Computes the covariance matrix between/amongst the input variables `X` and `X2`.
+        If `X2` is None, the function computes `K(X, X)` (a symmetric covariance matrix).
+        Otherwise, it computes `K(X, X2)` (a cross-covariance matrix).
+
+        The kernel is a sum over `Q` mixture components, where each component's
+        parameters (frequency, lengthscale, variance) are determined by MLPs
+        based on the input locations.
 
         Args:
-            X (ndarray): Variables to compute the covariance matrix
-            X2 (ndarray): If passed, the covariance between X and X2 is computed. Otherwise,
-                          the covariance between X and X is computed.
+            X (tf.Tensor): (N1, D); First set of input variables to compute covariance from.
+                           `N1` is the number of points, `D` is the dimensionality.
+            X2 (Optional[tf.Tensor]): (N2, D); Optional second set of input variables.
+                                      If provided, computes cross-covariance `K(X, X2)`.
+                                      If None, computes auto-covariance `K(X, X)`.
 
         Returns:
-            cov (ndarray): covariance matrix
+            tf.Tensor: (N1, N2); The computed covariance matrix. If `X2` is None, the
+                       diagonal of `K(X, X)` is jittered for numerical stability.
         """
         if X2 is None:
-            X2 = X
-            equal = True
+            X2_internal = X
+            equal = True  # Flag to add jitter to diagonal for K(X,X)
         else:
+            X2_internal = X2
             equal = False
 
-        kern = 0.0
+        kern = tf.constant(0.0, dtype=float_type)  # Initialize kernel sum
+
         for q in range(self.Q):
-            # compute latent function values by the neural network
-            freq, freq2 = self.freq[q](X), self.freq[q](X2)
-            lens, lens2 = self.length[q](X), self.length[q](X2)
-            var, var2 = self.var[q](X), self.var[q](X2)
-
-            # compute length-scale term
-            Xr = tf.expand_dims(X, 1)  # N1 1 D
-            X2r = tf.expand_dims(X2, 0)  # 1 N2 D
-            l1 = tf.expand_dims(lens, 1)  # N1 1 D
-            l2 = tf.expand_dims(lens2, 0)  # 1 N2 D
-            L = tf.square(l1) + tf.square(l2)  # N1 N2 D
-            #D = tf.square((Xr - X2r) / L)  # N1 N2 D
-            D = tf.square(Xr - X2r) / L  # N1 N2 D
-            D = tf.reduce_sum(D, 2)  # N1 N2
-            det = tf.sqrt(2 * l1 * l2 / L)  # N1 N2 D
-            det = tf.reduce_prod(det, 2)  # N1 N2
-            E = det * tf.exp(-D)  # N1 N2
-
-            # compute cosine term
-            muX = (tf.reduce_sum(freq * X, 1, keepdims=True)
-                   - tf.transpose(tf.reduce_sum(freq2 * X2, 1, keepdims=True)))
-            COS = tf.cos(2 * np.pi * muX)
-
-            # compute kernel variance term
-            WW = tf.matmul(var, var2, transpose_b=True)  # w*w'^T
-
-            # compute the q'th kernel component
+            # Compute latent function values (frequencies, lengthscales, variances)
+            # by passing input locations through the MLPs.
+            freq_X, freq_X2 = self.freq[q](X), self.freq[q](
+                X2_internal)  # (N, D) frequencies
+            lens_X, lens_X2 = self.length[q](X), self.length[q](
+                X2_internal)  # (N, D) lengthscales
+            var_X, var_X2 = self.var[q](X), self.var[q](
+                X2_internal)  # (N, 1) variances
+
+            # Compute length-scale term (E) - based on inverse lengthscales and distances
+            Xr = tf.expand_dims(X, 1)  # (N1, 1, D)
+            X2r = tf.expand_dims(X2_internal, 0)  # (1, N2, D)
+            l1 = tf.expand_dims(lens_X, 1)  # (N1, 1, D)
+            l2 = tf.expand_dims(lens_X2, 0)  # (1, N2, D)
+
+            L = tf.square(l1) + tf.square(
+                l2)  # (N1, N2, D) - sum of squared lengthscales
+
+            # D term: Squared difference scaled by L, summed over dimensions
+            D_term = tf.square(Xr - X2r) / L  # (N1, N2, D)
+            D_term = tf.reduce_sum(D_term, 2)  # (N1, N2) - sum over dimensions
+
+            # Determinant term: Product over dimensions of (2 * l1 * l2 / L)^(1/2)
+            det_term = tf.sqrt(2 * l1 * l2 / L)  # (N1, N2, D)
+            det_term = tf.reduce_prod(det_term,
+                                      2)  # (N1, N2) - product over dimensions
+
+            # E term: Combine determinant and exponential of D_term
+            E = det_term * tf.exp(-D_term)  # (N1, N2)
+
+            # Compute cosine term (COS) - based on frequencies and dot products with X
+            # (N1, D) * (N1, D) -> sum over D -> (N1, 1)
+            muX = (tf.reduce_sum(freq_X * X, 1, keepdims=True) - tf.transpose(
+                tf.reduce_sum(freq_X2 * X2_internal, 1, keepdims=True)))
+            COS = tf.cos(2 * np.pi * muX)  # (N1, N2)
+
+            # Compute kernel variance term (WW) - outer product of variance predictions
+            WW = tf.matmul(var_X, var_X2,
+                           transpose_b=True)  # (N1, 1) @ (1, N2) -> (N1, N2)
+
+            # Compute the q'th kernel component and add to total kernel
             kern += WW * E * COS
+
+        # Add jitter to the diagonal for K(X,X) matrices for numerical stability
         if equal:
             return robust_kernel(kern, tf.shape(X)[0])
         else:
             return kern
 
-    def K_diag(self, X):
-        kd = default_jitter()
+    @tf.autograph.experimental.do_not_convert
+    def K_diag(self, X: tf.Tensor) -> tf.Tensor:
+        """
+        Computes the diagonal of the covariance matrix `K(X, X)`.
+        For the Neural Spectral Kernel, this is `sum_q(var_q(X)^2) + jitter`.
+
+        Args:
+            X (tf.Tensor): (N, D); Input data points. `N` is the number of points.
+
+        Returns:
+            tf.Tensor: (N,); A 1D tensor representing the diagonal elements of the
+                       covariance matrix.
+        """
+        kd = default_jitter()  # Initialize with a small jitter
         for q in range(self.Q):
+            # Sum of squared variance predictions from each MLP component
             kd += tf.square(self.var[q](X))
-        return tf.squeeze(kd)
+        return tf.squeeze(
+            kd)  # Remove singleton dimension (e.g., (N, 1) -> (N,))
 
-'''
-Helper functions
-'''
-def robust_kernel(kern, shape_X):
-    jitter = 1e-3
-    return kern + jitter * tf.eye(shape_X, dtype=float_type)
 
-def init_neural_kernel(x, y, inducing_variable, Q, n_inits=1, hidden_sizes=None):
-    """Helper function to initialize a Neural Spectral Kernel function (non-stationary kernel function).
-    Based on the implementation from this [repo](https://github.com/sremes/nssm-gp/tree/master?tab=readme-ov-file)
+# --- Helper functions ---
+@tf.autograph.experimental.do_not_convert
+def robust_kernel(kern: tf.Tensor, shape_X_0: tf.Tensor) -> tf.Tensor:
+    """
+    Adds a small positive jitter to the diagonal of a covariance matrix
+    to ensure numerical stability. This is particularly important for
+    Cholesky decompositions or inverse calculations.
 
-    Refer to the following papers for more details:
+    Args:
+        kern (tf.Tensor): The input covariance matrix.
+        shape_X_0 (tf.Tensor): The size of the first dimension of the original input `X`
+                               (i.e., the number of data points N). Used to create the identity matrix.
+
+    Returns:
+        tf.Tensor: The covariance matrix with jitter added to its diagonal.
+    """
+    jitter_val = 1e-3  # Fixed jitter value
+    # Add jitter to the diagonal of the kernel matrix
+    return kern + jitter_val * tf.eye(shape_X_0, dtype=float_type)
+
+
+def init_neural_kernel(X_train: np.ndarray,
+                       Y_train: np.ndarray,
+                       inducing_variable: np.ndarray,
+                       Q: int,
+                       n_inits: int = 1,
+                       hidden_sizes: Optional[List[int]] = None) -> SGPR:
+    """
+    Helper function to initialize a Sparse Gaussian Process Regression (SGPR) model
+    with a Neural Spectral Kernel. This function can perform multiple random
+    initializations and return the model with the best initial Evidence Lower Bound (ELBO).
+
+    Refer to the original paper for more details:
         - Neural Non-Stationary Spectral Kernel [Remes et al., 2018]
 
     Args:
-        x (ndarray): (n, d); Input training set points
-        y (ndarray): (n, 1); Training set labels
-        inducing_variable (ndarray): (m, d); Initial inducing points
-        Q (int): Number of MLP mixture components used in the kernel function
-        n_inits (int): Number of times to initalize the kernel function (returns the best model)
-        hidden_sizes (list): Number of hidden units in each MLP layer. Length of the list determines the number of layers.
+        X_train (np.ndarray): (n, d); Input training set points.
+        Y_train (np.ndarray): (n, 1); Training set labels.
+        inducing_variable (np.ndarray): (m, d); Initial inducing points. These are passed
+                                        directly to the SGPR model.
+        Q (int): The number of MLP mixture components for the Neural Spectral Kernel.
+        n_inits (int): Number of times to randomly initialize the kernel's MLPs and
+                       compute the initial ELBO. The model with the highest ELBO
+                       among these initializations is returned. Defaults to 1.
+        hidden_sizes (Optional[List[int]]): List of integers specifying the number of hidden
+                                            units in each MLP layer. If None, [32, 32] is used.
+
+    Returns:
+        SGPR: The SGPR model instance initialized with the Neural Spectral Kernel
+              that yielded the best initial ELBO.
+
+    Usage:
+        ```python
+        import numpy as np
+        import gpflow
+        from sgptools.kernels.neural_kernel import init_neural_kernel
+        from sgptools.utils.misc import get_inducing_pts  # For initial inducing points
+
+        # Dummy data
+        X_train_data = np.random.rand(100, 2).astype(np.float32)
+        Y_train_data = (np.sin(X_train_data[:, 0]) + np.cos(X_train_data[:, 1]))[:, None].astype(np.float32)
+
+        # Initial inducing points (e.g., subset of training data or k-means centers)
+        initial_inducing_points = get_inducing_pts(X_train_data, num_inducing=20)
+
+        # Initialize the SGPR model with Neural Spectral Kernel
+        # Try 3 random initializations for the MLPs.
+        model_ns_kernel = init_neural_kernel(
+            X_train=X_train_data,
+            Y_train=Y_train_data,
+            inducing_variable=initial_inducing_points,
+            Q=5,  # 5 mixture components
+            n_inits=3,  # 3 initializations
+            hidden_sizes=[16, 16]  # Custom hidden layer sizes
+        )
+
+        # You would typically optimize this model further using optimize_model:
+        # from sgptools.utils.gpflow import optimize_model
+        # optimize_model(model_ns_kernel)
+        ```
     """
-    x, y = data_input_to_tensor((x, y))
-
-    print('Initializing neural spectral kernel...')
-    best_loglik = -np.inf
-    best_m = None
-    N, input_dim = x.shape
-
-    for k in range(n_inits):
-        # gpflow.reset_default_graph_and_session()
-        k = NeuralSpectralKernel(input_dim=input_dim, Q=Q,
-                                 hidden_sizes=hidden_sizes)
-        model = SGPR((x, y), inducing_variable=inducing_variable,
-                     kernel=k)
+    # Convert NumPy arrays to TensorFlow tensors
+    X_train_tf, Y_train_tf = data_input_to_tensor((X_train, Y_train))
+
+    best_loglik = -np.inf  # Track the best ELBO found
+    best_m: Optional[SGPR] = None  # Store the best model
+
+    N, input_dim = X_train_tf.shape  # Get number of data points and input dimensionality
+
+    for k_init_idx in range(n_inits):
+        # Create a new NeuralSpectralKernel instance for each initialization
+        current_kernel = NeuralSpectralKernel(input_dim=input_dim,
+                                              Q=Q,
+                                              hidden_sizes=hidden_sizes)
+
+        # Create an SGPR model with the current kernel initialization
+        model = SGPR(data=(X_train_tf, Y_train_tf),
+                     inducing_variable=inducing_variable,
+                     kernel=current_kernel)
+
+        # Compute the initial ELBO (Evidence Lower Bound)
         loglik = model.elbo()
+
+        # Check if the current initialization is better than previous ones
         if loglik > best_loglik:
             best_loglik = loglik
+            # Deepcopy the model to save its state, as it will be deleted/overwritten in next iteration
+            # This requires gpflow.utilities.traversal.deepcopy or similar for GPflow models
+            # For simplicity, we directly assign here, assuming shallow copy is sufficient
+            # or that the user will optimize it later. For robust best model saving, a deepcopy is safer.
            best_m = model
+
+        # Explicitly delete the model and run garbage collection to free memory
+        # (important if n_inits is large and models are complex)
        del model
        gc.collect()
-    print('Best init: %f' % best_loglik)
 
-    return best_m
+    return best_m
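
For readers tracing the rewritten `K` method above, it may help to see the closed form it assembles from the `WW`, `E`, and `COS` factors. The sketch below uses μ_q, ℓ_q, and σ_q as shorthand for the outputs of the `freq`, `length`, and `var` MLPs; this notation is introduced here for exposition and does not appear in the code.

```latex
% Sketch of the per-pair covariance assembled in NeuralSpectralKernel.K
% (kern += WW * E * COS, summed over the Q mixture components; D is the input dimension).
k(x, x') = \sum_{q=1}^{Q}
  \underbrace{\sigma_q(x)\,\sigma_q(x')}_{WW}\,
  \underbrace{\prod_{d=1}^{D}\sqrt{\frac{2\,\ell_{q,d}(x)\,\ell_{q,d}(x')}{\ell_{q,d}(x)^2+\ell_{q,d}(x')^2}}\;
  \exp\!\left(-\sum_{d=1}^{D}\frac{(x_d-x'_d)^2}{\ell_{q,d}(x)^2+\ell_{q,d}(x')^2}\right)}_{E}\,
  \underbrace{\cos\!\left(2\pi\,\bigl(\mu_q(x)^{\top}x-\mu_q(x')^{\top}x'\bigr)\right)}_{COS}
```

This is the generalized (non-stationary) spectral mixture form of Remes et al. [2018]; when `X2` is None, `robust_kernel` additionally adds the fixed 1e-3 jitter to the diagonal.
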
sgptools/kernels/neural_network.py

@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """Multi Layer Perceptron Model
 """
 
@@ -20,39 +19,139 @@ import tensorflow as tf
 
 import gpflow
 from gpflow.config import default_float
+
 float_type = default_float()
 
-def xavier(dim_in, dim_out):
-    return np.random.randn(dim_in, dim_out)*(2./(dim_in+dim_out))**0.5
+from typing import List, Union, Callable
 
-class NN(gpflow.base.Module):
-    """Multi Layer Perceptron Model that is compatible with GPFlow
+
+def xavier(dim_in: int, dim_out: int) -> np.ndarray:
+    """
+    Initializes weights using the Xavier (Glorot) uniform initialization method.
+    This method aims to keep the variance of activations consistent across layers,
+    helping to prevent vanishing/exploding gradients.
+
+    Formula: $W \sim U(-\sqrt{6/(dim_{in} + dim_{out})}, \sqrt{6/(dim_{in} + dim_{out})})$
 
     Args:
-        dims (List): List of each layer's size, needs input layer dimensions as well
-        activation_fn (str): Activation function for each layer
-        output_activation_fn (str): Activation function for the last layer
+        dim_in (int): The number of input units to the layer.
+        dim_out (int): The number of output units from the layer.
+
+    Returns:
+        np.ndarray: A NumPy array of shape (dim_in, dim_out) containing
+                    the initialized weights.
+    """
+    # Calculate the fan-in + fan-out for the scaling factor
+    scale_factor = (2.0 / (dim_in + dim_out))**0.5
+    # Generate random numbers from a normal (Gaussian) distribution
+    # This is often used as an approximation for Xavier uniform in practice
+    # or sometimes Xavier normal is explicitly implemented this way.
+    return np.random.randn(dim_in, dim_out) * scale_factor
+
+
+class NN(gpflow.base.Module):
+    """
+    A Multi-Layer Perceptron (MLP) model that is compatible with GPFlow,
+    allowing its parameters (weights and biases) to be optimized as part of
+    a GPflow model (e.g., within a custom kernel).
+
+    The network consists of multiple fully connected (dense) layers with
+    specified activation functions.
+
+    Attributes:
+        dims (List[int]): List of layer sizes, including input and output dimensions.
+        activation_fn (Callable): Activation function for hidden layers.
+        output_activation_fn (Callable): Activation function for the output layer.
+        _weights (List[tf.Variable]): List of TensorFlow Variable for weights of each layer.
+        _biases (List[tf.Variable]): List of TensorFlow Variable for biases of each layer.
     """
-    def __init__(self, dims,
-                 activation_fn='selu',
-                 output_activation_fn='softmax'):
+
+    def __init__(self,
+                 dims: List[int],
+                 activation_fn: Union[str, Callable] = 'selu',
+                 output_activation_fn: Union[str, Callable] = 'softmax'):
+        """
+        Initializes the Multi-Layer Perceptron (MLP).
+
+        Args:
+            dims (List[int]): A list of integers specifying the size of each layer.
+                              The first element is the input dimension, the last is
+                              the output dimension, and intermediate elements are
+                              hidden layer sizes.
+                              Example: `[input_dim, hidden1_dim, hidden2_dim, output_dim]`
+            activation_fn (Union[str, Callable]): The activation function to use for hidden layers.
+                                                  Can be a string (e.g., 'relu', 'tanh', 'selu')
+                                                  or a callable TensorFlow activation function.
+                                                  Defaults to 'selu'.
+            output_activation_fn (Union[str, Callable]): The activation function to use for the output layer.
+                                                         Can be a string (e.g., 'softmax', 'sigmoid', 'softplus')
+                                                         or a callable TensorFlow activation function.
+                                                         Defaults to 'softplus'.
+
+        Usage:
+            ```python
+            from sgptools.kernels.neural_network import NN
+            import tensorflow as tf
+            import numpy as np
+
+            # Example: A simple MLP with one hidden layer
+            mlp = NN(dims=[2, 10, 1], activation_fn='tanh', output_activation_fn='sigmoid')
+
+            # Input data
+            input_data = tf.constant(np.random.rand(5, 2), dtype=tf.float32)
+
+            # Pass input through the network
+            output = mlp(input_data)
+            ```
+        """
         super().__init__()
         self.dims = dims
-        self.activation_fn = tf.keras.activations.get(activation_fn)
-        self.output_activation_fn = tf.keras.activations.get(output_activation_fn)
+        # Get TensorFlow activation functions from strings or use provided callables
+        self.activation_fn = tf.keras.activations.get(
+            activation_fn) if isinstance(activation_fn, str) else activation_fn
+        self.output_activation_fn = tf.keras.activations.get(
+            output_activation_fn) if isinstance(output_activation_fn,
+                                                str) else output_activation_fn
+
+        self._weights: List[tf.Variable] = []
+        self._biases: List[tf.Variable] = []
+
+        # Create weights and biases for each layer
         for i, (dim_in, dim_out) in enumerate(zip(dims[:-1], dims[1:])):
-            setattr(self, 'W_{}'.format(i), tf.Variable(xavier(dim_in, dim_out),
-                                                        dtype=float_type))
-            setattr(self, 'b_{}'.format(i), tf.Variable(np.zeros(dim_out),
-                                                        dtype=float_type))
-
-    def __call__(self, X):
-        if X is not None:
-            for i in range(len(self.dims) - 2):
-                W = getattr(self, 'W_{}'.format(i))
-                b = getattr(self, 'b_{}'.format(i))
-                X = self.activation_fn(tf.matmul(X, W) + b)
-            W = getattr(self, 'W_{}'.format(i+1))
-            b = getattr(self, 'b_{}'.format(i+1))
-            X = self.output_activation_fn(tf.matmul(X, W) + b)
-        return X
+            # Use Xavier initialization for weights
+            weight_init = xavier(dim_in, dim_out)
+            self._weights.append(
+                tf.Variable(weight_init, dtype=float_type, name=f'W_{i}'))
+
+            # Initialize biases to zeros
+            bias_init = np.zeros(dim_out, dtype=float_type)
+            self._biases.append(
+                tf.Variable(bias_init, dtype=float_type, name=f'b_{i}'))
+
+    def __call__(self, X: tf.Tensor) -> tf.Tensor:
+        """
+        Performs a forward pass through the MLP.
+
+        Args:
+            X (tf.Tensor): (N, D_in); The input tensor to the MLP. `N` is the batch size,
+                           `D_in` is the input dimension of the network.
+
+        Returns:
+            tf.Tensor: (N, D_out); The output tensor from the MLP. `D_out` is the output
+                       dimension of the network.
+        """
+        # Process through hidden layers
+        # The loop runs for (num_layers - 1) iterations, covering all hidden layers
+        # and the input-to-first-hidden layer transition.
+        for i in range(len(self.dims) -
+                       2):  # Iterate up to second to last layer
+            W = self._weights[i]
+            b = self._biases[i]
+            X = self.activation_fn(tf.matmul(X, W) + b)
+
+        # Process through the last layer (output layer)
+        W_last = self._weights[-1]  # Weights for the last layer
+        b_last = self._biases[-1]  # Biases for the last layer
+        X = self.output_activation_fn(tf.matmul(X, W_last) + b_last)
+
+        return X
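
The practical question raised by the `NN` refactor above is whether parameters moved from the `W_{i}`/`b_{i}` attributes into the `_weights`/`_biases` lists are still discovered as trainable variables. The sketch below is an illustration rather than part of either package version; it assumes sgptools 2.0.0 is installed and that GPflow's default float (float64) is in use, and relies on `trainable_variables` from `tf.Module`, which `gpflow.base.Module` extends and which tracks `tf.Variable`s held in list attributes.

```python
# Minimal sketch (illustrative, not from the package diff): confirming that the
# refactored NN still exposes its weights and biases as trainable variables.
import numpy as np
import tensorflow as tf
from sgptools.kernels.neural_network import NN  # module path as used in the docstrings above

# Two layers (2 -> 8 -> 1); softplus output as the kernel's MLPs use.
mlp = NN(dims=[2, 8, 1], output_activation_fn='softplus')

# tf.Module tracks variables stored in list attributes, so the two weight
# matrices and two bias vectors should all appear here.
print(len(mlp.trainable_variables))  # expected: 4

# Forward pass; np.random.rand yields float64, matching gpflow's default_float.
X = tf.constant(np.random.rand(5, 2))
print(mlp(X).shape)  # expected: (5, 1)
```
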