quantmllibrary-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. quantml/__init__.py +74 -0
  2. quantml/autograd.py +154 -0
  3. quantml/cli/__init__.py +10 -0
  4. quantml/cli/run_experiment.py +385 -0
  5. quantml/config/__init__.py +28 -0
  6. quantml/config/config.py +259 -0
  7. quantml/data/__init__.py +33 -0
  8. quantml/data/cache.py +149 -0
  9. quantml/data/feature_store.py +234 -0
  10. quantml/data/futures.py +254 -0
  11. quantml/data/loaders.py +236 -0
  12. quantml/data/memory_optimizer.py +234 -0
  13. quantml/data/validators.py +390 -0
  14. quantml/experiments/__init__.py +23 -0
  15. quantml/experiments/logger.py +208 -0
  16. quantml/experiments/results.py +158 -0
  17. quantml/experiments/tracker.py +223 -0
  18. quantml/features/__init__.py +25 -0
  19. quantml/features/base.py +104 -0
  20. quantml/features/gap_features.py +124 -0
  21. quantml/features/registry.py +138 -0
  22. quantml/features/volatility_features.py +140 -0
  23. quantml/features/volume_features.py +142 -0
  24. quantml/functional.py +37 -0
  25. quantml/models/__init__.py +27 -0
  26. quantml/models/attention.py +258 -0
  27. quantml/models/dropout.py +130 -0
  28. quantml/models/gru.py +319 -0
  29. quantml/models/linear.py +112 -0
  30. quantml/models/lstm.py +353 -0
  31. quantml/models/mlp.py +286 -0
  32. quantml/models/normalization.py +289 -0
  33. quantml/models/rnn.py +154 -0
  34. quantml/models/tcn.py +238 -0
  35. quantml/online.py +209 -0
  36. quantml/ops.py +1707 -0
  37. quantml/optim/__init__.py +42 -0
  38. quantml/optim/adafactor.py +206 -0
  39. quantml/optim/adagrad.py +157 -0
  40. quantml/optim/adam.py +267 -0
  41. quantml/optim/lookahead.py +97 -0
  42. quantml/optim/quant_optimizer.py +228 -0
  43. quantml/optim/radam.py +192 -0
  44. quantml/optim/rmsprop.py +203 -0
  45. quantml/optim/schedulers.py +286 -0
  46. quantml/optim/sgd.py +181 -0
  47. quantml/py.typed +0 -0
  48. quantml/streaming.py +175 -0
  49. quantml/tensor.py +462 -0
  50. quantml/time_series.py +447 -0
  51. quantml/training/__init__.py +135 -0
  52. quantml/training/alpha_eval.py +203 -0
  53. quantml/training/backtest.py +280 -0
  54. quantml/training/backtest_analysis.py +168 -0
  55. quantml/training/cv.py +106 -0
  56. quantml/training/data_loader.py +177 -0
  57. quantml/training/ensemble.py +84 -0
  58. quantml/training/feature_importance.py +135 -0
  59. quantml/training/features.py +364 -0
  60. quantml/training/futures_backtest.py +266 -0
  61. quantml/training/gradient_clipping.py +206 -0
  62. quantml/training/losses.py +248 -0
  63. quantml/training/lr_finder.py +127 -0
  64. quantml/training/metrics.py +376 -0
  65. quantml/training/regularization.py +89 -0
  66. quantml/training/trainer.py +239 -0
  67. quantml/training/walk_forward.py +190 -0
  68. quantml/utils/__init__.py +51 -0
  69. quantml/utils/gradient_check.py +274 -0
  70. quantml/utils/logging.py +181 -0
  71. quantml/utils/ops_cpu.py +231 -0
  72. quantml/utils/profiling.py +364 -0
  73. quantml/utils/reproducibility.py +220 -0
  74. quantml/utils/serialization.py +335 -0
  75. quantmllibrary-0.1.0.dist-info/METADATA +536 -0
  76. quantmllibrary-0.1.0.dist-info/RECORD +79 -0
  77. quantmllibrary-0.1.0.dist-info/WHEEL +5 -0
  78. quantmllibrary-0.1.0.dist-info/licenses/LICENSE +22 -0
  79. quantmllibrary-0.1.0.dist-info/top_level.txt +1 -0
quantml/models/lstm.py ADDED
@@ -0,0 +1,353 @@
+"""
+Long Short-Term Memory (LSTM) implementation.
+
+LSTM networks are a type of recurrent neural network that can learn
+long-term dependencies through gating mechanisms.
+"""
+
+from typing import Optional, Tuple, List
+import math
+from quantml.tensor import Tensor
+from quantml import ops
+
+
+class LSTMCell:
+    """
+    A single LSTM cell.
+
+    Implements the LSTM equations:
+        f_t = sigmoid(x_t @ W_xf + h_{t-1} @ W_hf + b_f)  # forget gate
+        i_t = sigmoid(x_t @ W_xi + h_{t-1} @ W_hi + b_i)  # input gate
+        g_t = tanh(x_t @ W_xg + h_{t-1} @ W_hg + b_g)     # cell candidate
+        o_t = sigmoid(x_t @ W_xo + h_{t-1} @ W_ho + b_o)  # output gate
+        c_t = f_t * c_{t-1} + i_t * g_t                   # cell state
+        h_t = o_t * tanh(c_t)                             # hidden state
+
+    Attributes:
+        input_size: Size of input features
+        hidden_size: Size of hidden state
+
+    Examples:
+        >>> cell = LSTMCell(10, 20)
+        >>> x = Tensor([[1.0] * 10])
+        >>> h, c = cell.forward(x)
+    """
+
+    def __init__(self, input_size: int, hidden_size: int, bias: bool = True):
+        """
+        Initialize LSTM cell.
+
+        Args:
+            input_size: Number of input features
+            hidden_size: Size of hidden state
+            bias: Whether to include bias terms
+        """
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.bias = bias
+
+        # Initialize weights using Xavier/Glorot initialization
+        # Combined weights for all 4 gates: [i, f, g, o]
+        # Input-to-hidden weights: (4 * hidden_size, input_size)
+        combined_size = 4 * hidden_size
+        limit_ih = math.sqrt(6.0 / (input_size + hidden_size))
+
+        weight_ih_data = [
+            [(2.0 * limit_ih * ((i * input_size + j) % 1000) / 1000 - limit_ih)
+             for j in range(input_size)]
+            for i in range(combined_size)
+        ]
+        self.weight_ih = Tensor(weight_ih_data, requires_grad=True)
+
+        # Hidden-to-hidden weights: (4 * hidden_size, hidden_size)
+        limit_hh = math.sqrt(6.0 / (hidden_size + hidden_size))
+        weight_hh_data = [
+            [(2.0 * limit_hh * ((i * hidden_size + j) % 1000) / 1000 - limit_hh)
+             for j in range(hidden_size)]
+            for i in range(combined_size)
+        ]
+        self.weight_hh = Tensor(weight_hh_data, requires_grad=True)
+
+        # Biases for all 4 gates
+        if bias:
+            # Initialize forget gate bias to 1.0 for better gradient flow
+            bias_data = [[0.0] for _ in range(combined_size)]
+            # Forget gate bias starts at index hidden_size
+            for i in range(hidden_size, 2 * hidden_size):
+                bias_data[i][0] = 1.0
+            self.bias_ih = Tensor(bias_data, requires_grad=True)
+            self.bias_hh = Tensor([[0.0] for _ in range(combined_size)], requires_grad=True)
+        else:
+            self.bias_ih = None
+            self.bias_hh = None
+
+        # Hidden and cell state
+        self.hidden = None
+        self.cell = None
+
+    def forward(
+        self,
+        x: Tensor,
+        hidden: Optional[Tuple[Tensor, Tensor]] = None
+    ) -> Tuple[Tensor, Tensor]:
+        """
+        Forward pass through LSTM cell.
+
+        Args:
+            x: Input tensor (batch_size x input_size)
+            hidden: Optional tuple of (h_{t-1}, c_{t-1})
+
+        Returns:
+            Tuple of (h_t, c_t) - new hidden and cell states
+        """
+        # Ensure 2D input
+        x_data = x.data if isinstance(x.data[0], list) else [x.data]
+        x_2d = Tensor(x_data)
+        batch_size = len(x_data)
+
+        # Initialize hidden state if not provided
+        if hidden is None:
+            if self.hidden is None or self.cell is None:
+                h_prev = Tensor([[0.0] * self.hidden_size for _ in range(batch_size)])
+                c_prev = Tensor([[0.0] * self.hidden_size for _ in range(batch_size)])
+            else:
+                h_prev = self.hidden
+                c_prev = self.cell
+        else:
+            h_prev, c_prev = hidden
+
+        # Compute gates: x @ W_ih^T + h @ W_hh^T + b
+        # W_ih is (4*hidden, input), W_hh is (4*hidden, hidden)
+        W_ih_T = self._transpose(self.weight_ih)
+        W_hh_T = self._transpose(self.weight_hh)
+
+        gates = ops.matmul(x_2d, W_ih_T)
+        gates = ops.add(gates, ops.matmul(h_prev, W_hh_T))
+
+        if self.bias and self.bias_ih is not None and self.bias_hh is not None:
+            # Reshape biases for broadcasting
+            bias = ops.add(self.bias_ih, self.bias_hh)
+            gates = ops.add(gates, self._transpose(bias))
+
+        # Split gates: each is (batch_size x hidden_size)
+        # Order: input, forget, cell candidate, output
+        i_gate, f_gate, g_gate, o_gate = self._split_gates(gates)
+
+        # Apply activations
+        i_t = ops.sigmoid(i_gate)
+        f_t = ops.sigmoid(f_gate)
+        g_t = ops.tanh(g_gate)
+        o_t = ops.sigmoid(o_gate)
+
+        # Cell state update: c_t = f_t * c_{t-1} + i_t * g_t
+        c_t = ops.add(ops.mul(f_t, c_prev), ops.mul(i_t, g_t))
+
+        # Hidden state: h_t = o_t * tanh(c_t)
+        h_t = ops.mul(o_t, ops.tanh(c_t))
+
+        # Store for next step
+        self.hidden = h_t
+        self.cell = c_t
+
+        return h_t, c_t
+
+    def _transpose(self, t: Tensor) -> Tensor:
+        """Transpose a 2D tensor."""
+        if not isinstance(t.data[0], list):
+            data = [t.data]
+        else:
+            data = t.data
+
+        transposed = [[data[j][i] for j in range(len(data))]
+                      for i in range(len(data[0]))]
+        return Tensor(transposed, requires_grad=t.requires_grad)
+
+    def _split_gates(self, gates: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
+        """Split combined gates into individual gate tensors."""
+        h = self.hidden_size
+        data = gates.data
+
+        if isinstance(data[0], list):
+            i_data = [[row[j] for j in range(0, h)] for row in data]
+            f_data = [[row[j] for j in range(h, 2*h)] for row in data]
+            g_data = [[row[j] for j in range(2*h, 3*h)] for row in data]
+            o_data = [[row[j] for j in range(3*h, 4*h)] for row in data]
+        else:
+            i_data = [data[j] for j in range(0, h)]
+            f_data = [data[j] for j in range(h, 2*h)]
+            g_data = [data[j] for j in range(2*h, 3*h)]
+            o_data = [data[j] for j in range(3*h, 4*h)]
+
+        return (
+            Tensor(i_data, requires_grad=gates.requires_grad),
+            Tensor(f_data, requires_grad=gates.requires_grad),
+            Tensor(g_data, requires_grad=gates.requires_grad),
+            Tensor(o_data, requires_grad=gates.requires_grad)
+        )
+
+    def reset_hidden(self) -> None:
+        """Reset hidden and cell states."""
+        self.hidden = None
+        self.cell = None
+
+    def parameters(self) -> List[Tensor]:
+        """Get all trainable parameters."""
+        params = [self.weight_ih, self.weight_hh]
+        if self.bias and self.bias_ih is not None:
+            params.extend([self.bias_ih, self.bias_hh])
+        return params
+
+    def zero_grad(self) -> None:
+        """Clear gradients for all parameters."""
+        for p in self.parameters():
+            p.zero_grad()
+
+
+class LSTM:
+    """
+    Multi-layer LSTM module.
+
+    Stacks multiple LSTM layers for deeper sequence processing.
+
+    Attributes:
+        input_size: Size of input features
+        hidden_size: Size of hidden state
+        num_layers: Number of stacked LSTM layers
+
+    Examples:
+        >>> lstm = LSTM(10, 20, num_layers=2)
+        >>> x = Tensor([[[1.0] * 10] * 5])  # batch x seq x features
+        >>> outputs, (h_n, c_n) = lstm.forward(x)
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        hidden_size: int,
+        num_layers: int = 1,
+        bias: bool = True,
+        batch_first: bool = True
+    ):
+        """
+        Initialize multi-layer LSTM.
+
+        Args:
+            input_size: Number of input features
+            hidden_size: Size of hidden state
+            num_layers: Number of stacked LSTM layers
+            bias: Whether to include bias terms
+            batch_first: If True, input shape is (batch, seq, features)
+        """
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.batch_first = batch_first
+
+        # Create LSTM cells for each layer
+        self.cells: List[LSTMCell] = []
+        for i in range(num_layers):
+            cell_input_size = input_size if i == 0 else hidden_size
+            self.cells.append(LSTMCell(cell_input_size, hidden_size, bias))
+
+    def forward(
+        self,
+        x: Tensor,
+        hidden: Optional[Tuple[Tensor, Tensor]] = None
+    ) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
+        """
+        Forward pass through LSTM.
+
+        Args:
+            x: Input tensor (batch x seq x features) if batch_first
+            hidden: Optional initial (h_0, c_0) for all layers
+
+        Returns:
+            Tuple of (outputs, (h_n, c_n)) where:
+                - outputs: All hidden states for each timestep
+                - h_n: Final hidden state for each layer
+                - c_n: Final cell state for each layer
+        """
+        # Ensure proper shape
+        data = x.data
+        if not isinstance(data[0], list):
+            data = [[data]]
+        elif not isinstance(data[0][0], list):
+            data = [data]
+
+        batch_size = len(data)
+        seq_len = len(data[0])
+
+        # Initialize hidden states for all layers
+        if hidden is None:
+            h_layers = [None] * self.num_layers
+            c_layers = [None] * self.num_layers
+        else:
+            # Split initial states by layer
+            h_0, c_0 = hidden
+            h_layers = [None] * self.num_layers  # TODO: proper splitting
+            c_layers = [None] * self.num_layers
+
+        # Process sequence
+        outputs = []
+
+        for t in range(seq_len):
+            # Get input for this timestep
+            x_t_data = [[data[b][t][f] for f in range(len(data[b][t]))]
+                        for b in range(batch_size)]
+            x_t = Tensor(x_t_data)
+
+            # Process through each layer
+            layer_input = x_t
+            for layer_idx, cell in enumerate(self.cells):
+                h_prev = h_layers[layer_idx]
+                c_prev = c_layers[layer_idx]
+                hidden_tuple = (h_prev, c_prev) if h_prev is not None else None
+
+                h_t, c_t = cell.forward(layer_input, hidden_tuple)
+
+                h_layers[layer_idx] = h_t
+                c_layers[layer_idx] = c_t
+                layer_input = h_t
+
+            # Store output from last layer
+            outputs.append(layer_input)
+
+        # Stack outputs: (batch, seq, hidden)
+        output_data = [
+            [[float(outputs[t].data[b][h])
+              for h in range(self.hidden_size)]
+             for t in range(seq_len)]
+            for b in range(batch_size)
+        ]
+
+        # Stack final states
+        h_n_data = [[float(h_layers[l].data[b][h])
+                     for h in range(self.hidden_size)]
+                    for l in range(self.num_layers)
+                    for b in range(batch_size)]
+        c_n_data = [[float(c_layers[l].data[b][h])
+                     for h in range(self.hidden_size)]
+                    for l in range(self.num_layers)
+                    for b in range(batch_size)]
+
+        return (
+            Tensor(output_data),
+            (Tensor(h_n_data), Tensor(c_n_data))
+        )
+
+    def reset_hidden(self) -> None:
+        """Reset hidden states for all layers."""
+        for cell in self.cells:
+            cell.reset_hidden()
+
+    def parameters(self) -> List[Tensor]:
+        """Get all trainable parameters."""
+        params = []
+        for cell in self.cells:
+            params.extend(cell.parameters())
+        return params
+
+    def zero_grad(self) -> None:
+        """Clear gradients for all parameters."""
+        for cell in self.cells:
+            cell.zero_grad()
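Usage note (not part of the wheel): a minimal sketch of how the LSTMCell and LSTM classes above are driven, assuming the Tensor constructor accepts nested Python lists as in the docstring examples.

    # Sketch only; sizes and values are illustrative, not from the package.
    from quantml.tensor import Tensor
    from quantml.models.lstm import LSTM, LSTMCell

    # Single cell: one timestep for a batch of 2 samples with 4 features each.
    cell = LSTMCell(input_size=4, hidden_size=8)
    x_step = Tensor([[0.1, 0.2, 0.3, 0.4],
                     [0.5, 0.6, 0.7, 0.8]])
    h_t, c_t = cell.forward(x_step)            # each is (batch, hidden_size)

    # Two stacked layers over a sequence: batch=1, seq_len=3, features=4.
    lstm = LSTM(input_size=4, hidden_size=8, num_layers=2)
    x_seq = Tensor([[[0.1, 0.2, 0.3, 0.4],
                     [0.2, 0.3, 0.4, 0.5],
                     [0.3, 0.4, 0.5, 0.6]]])
    outputs, (h_n, c_n) = lstm.forward(x_seq)  # outputs: (batch, seq, hidden)

    # Clear the cells' stored state before feeding an unrelated sequence.
    lstm.reset_hidden()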
quantml/models/mlp.py ADDED
@@ -0,0 +1,286 @@
+"""
+Multi-Layer Perceptron (MLP) implementation.
+
+Provides a flexible MLP builder for creating feedforward neural networks
+with configurable layers, activations, and regularization.
+"""
+
+from typing import List, Optional, Union, Callable
+import math
+from quantml.tensor import Tensor
+from quantml import ops
+from quantml.models.linear import Linear
+
+
+class MLP:
+    """
+    Multi-Layer Perceptron (feedforward neural network).
+
+    A flexible MLP builder that creates a sequence of linear layers
+    with configurable activations and dropout.
+
+    Attributes:
+        layers: List of Linear layers
+        activations: Activation function(s) between layers
+        dropout_rate: Dropout probability
+
+    Examples:
+        >>> # Simple 3-layer MLP
+        >>> mlp = MLP([10, 64, 32, 1], activation='relu')
+        >>> x = Tensor([[1.0] * 10])
+        >>> y = mlp.forward(x)
+
+        >>> # MLP with different activations per layer
+        >>> mlp = MLP([10, 64, 1], activation=['relu', None])
+    """
+
+    def __init__(
+        self,
+        layer_sizes: List[int],
+        activation: Union[str, List[Optional[str]], Callable, None] = 'relu',
+        dropout: float = 0.0,
+        bias: bool = True,
+        final_activation: Optional[str] = None
+    ):
+        """
+        Initialize MLP.
+
+        Args:
+            layer_sizes: List of layer sizes [input_size, hidden1, ..., output_size]
+            activation: Activation function(s). Can be:
+                - String: 'relu', 'tanh', 'sigmoid', 'leaky_relu', 'gelu', 'swish', None
+                - List of strings (one per layer transition)
+                - Callable function
+            dropout: Dropout probability (0.0 = no dropout)
+            bias: Whether to include bias in linear layers
+            final_activation: Optional activation for the final layer
+
+        Raises:
+            ValueError: If layer_sizes has fewer than 2 elements
+        """
+        if len(layer_sizes) < 2:
+            raise ValueError("layer_sizes must have at least 2 elements (input and output)")
+
+        self.layer_sizes = layer_sizes
+        self.dropout_rate = dropout
+        self.training = True
+
+        # Build layers
+        self.layers: List[Linear] = []
+        for i in range(len(layer_sizes) - 1):
+            in_size = layer_sizes[i]
+            out_size = layer_sizes[i + 1]
+            self.layers.append(Linear(in_size, out_size, bias=bias))
+
+        # Parse activations
+        num_transitions = len(layer_sizes) - 1
+        if isinstance(activation, list):
+            if len(activation) != num_transitions:
+                raise ValueError(f"activation list must have {num_transitions} elements")
+            self.activations = activation
+        else:
+            # Same activation for all hidden layers, final_activation for last
+            self.activations = [activation] * (num_transitions - 1) + [final_activation]
+
+        # Convert string activations to functions
+        self.activation_fns = [self._get_activation_fn(a) for a in self.activations]
+
+    def _get_activation_fn(self, name: Optional[str]) -> Optional[Callable]:
+        """Get activation function by name."""
+        if name is None:
+            return None
+        if callable(name):
+            return name
+
+        activations = {
+            'relu': ops.relu,
+            'tanh': ops.tanh,
+            'sigmoid': ops.sigmoid,
+            'leaky_relu': ops.leaky_relu,
+            'gelu': ops.gelu,
+            'swish': ops.swish,
+            'softmax': ops.softmax,
+        }
+
+        if name.lower() not in activations:
+            raise ValueError(f"Unknown activation: {name}. Available: {list(activations.keys())}")
+
+        return activations[name.lower()]
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Forward pass through MLP.
+
+        Args:
+            x: Input tensor (batch_size x input_size)
+
+        Returns:
+            Output tensor (batch_size x output_size)
+        """
+        out = x
+
+        for i, (layer, activation_fn) in enumerate(zip(self.layers, self.activation_fns)):
+            # Linear transformation
+            out = layer.forward(out)
+
+            # Apply activation
+            if activation_fn is not None:
+                out = activation_fn(out)
+
+            # Apply dropout (except on last layer)
+            if self.training and self.dropout_rate > 0 and i < len(self.layers) - 1:
+                out = self._apply_dropout(out)
+
+        return out
+
+    def _apply_dropout(self, x: Tensor) -> Tensor:
+        """Apply dropout during training."""
+        if not self.training or self.dropout_rate <= 0:
+            return x
+
+        # Simple dropout: randomly zero out elements
+        import random
+
+        data = x.data
+        if isinstance(data[0], list):
+            dropped = [
+                [val if random.random() > self.dropout_rate
+                 else 0.0 for val in row]
+                for row in data
+            ]
+            # Scale by 1/(1-p) to maintain expected value
+            scale = 1.0 / (1.0 - self.dropout_rate)
+            dropped = [[val * scale for val in row] for row in dropped]
+        else:
+            dropped = [
+                val if random.random() > self.dropout_rate else 0.0
+                for val in data
+            ]
+            scale = 1.0 / (1.0 - self.dropout_rate)
+            dropped = [val * scale for val in dropped]
+
+        return Tensor(dropped, requires_grad=x.requires_grad)
+
+    def train(self, mode: bool = True) -> 'MLP':
+        """
+        Set training mode.
+
+        Args:
+            mode: If True, enables training mode (dropout active)
+
+        Returns:
+            self
+        """
+        self.training = mode
+        return self
+
+    def eval(self) -> 'MLP':
+        """
+        Set evaluation mode (disables dropout).
+
+        Returns:
+            self
+        """
+        return self.train(False)
+
+    def parameters(self) -> List[Tensor]:
+        """Get all trainable parameters."""
+        params = []
+        for layer in self.layers:
+            params.extend(layer.parameters())
+        return params
+
+    def zero_grad(self) -> None:
+        """Clear gradients for all parameters."""
+        for layer in self.layers:
+            layer.zero_grad()
+
+    def __repr__(self) -> str:
+        """String representation."""
+        layers_str = " -> ".join(str(s) for s in self.layer_sizes)
+        activations_str = ", ".join(
+            str(a) if a is not None else "None" for a in self.activations
+        )
+        return f"MLP({layers_str}, activations=[{activations_str}], dropout={self.dropout_rate})"
+
+
+class ResidualMLP(MLP):
+    """
+    MLP with residual connections.
+
+    Adds skip connections between layers where dimensions match,
+    improving gradient flow for deeper networks.
+
+    Examples:
+        >>> mlp = ResidualMLP([10, 64, 64, 64, 1], activation='relu')
+        >>> x = Tensor([[1.0] * 10])
+        >>> y = mlp.forward(x)
+    """
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Forward pass with residual connections.
+
+        Adds residual connections where input and output dimensions match.
+        """
+        out = x
+
+        for i, (layer, activation_fn) in enumerate(zip(self.layers, self.activation_fns)):
+            # Store for residual
+            residual = out
+
+            # Linear transformation
+            out = layer.forward(out)
+
+            # Apply activation
+            if activation_fn is not None:
+                out = activation_fn(out)
+
+            # Residual connection if dimensions match
+            if i > 0 and i < len(self.layers) - 1:
+                # Check if dimensions match
+                out_shape = out.shape if hasattr(out, 'shape') else len(out.data[0]) if isinstance(out.data[0], list) else len(out.data)
+                res_shape = residual.shape if hasattr(residual, 'shape') else len(residual.data[0]) if isinstance(residual.data[0], list) else len(residual.data)
+
+                if out_shape == res_shape:
+                    out = ops.add(out, residual)
+
+            # Apply dropout (except on last layer)
+            if self.training and self.dropout_rate > 0 and i < len(self.layers) - 1:
+                out = self._apply_dropout(out)
+
+        return out
+
+
+def create_mlp(
+    input_size: int,
+    output_size: int,
+    hidden_sizes: List[int],
+    activation: str = 'relu',
+    output_activation: Optional[str] = None,
+    dropout: float = 0.0
+) -> MLP:
+    """
+    Convenience function to create an MLP.
+
+    Args:
+        input_size: Input feature dimension
+        output_size: Output dimension
+        hidden_sizes: List of hidden layer sizes
+        activation: Activation for hidden layers
+        output_activation: Activation for output layer
+        dropout: Dropout probability
+
+    Returns:
+        Configured MLP instance
+
+    Examples:
+        >>> mlp = create_mlp(10, 1, [64, 32], activation='relu')
+    """
+    layer_sizes = [input_size] + hidden_sizes + [output_size]
+    return MLP(
+        layer_sizes,
+        activation=activation,
+        final_activation=output_activation,
+        dropout=dropout
+    )
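Usage note (not part of the wheel): a minimal sketch of the MLP, ResidualMLP, and create_mlp APIs above, assuming the Tensor constructor accepts nested Python lists as in the docstring examples.

    # Sketch only; sizes and values are illustrative, not from the package.
    from quantml.tensor import Tensor
    from quantml.models.mlp import MLP, ResidualMLP, create_mlp

    # Two equivalent ways to build a 10 -> 64 -> 32 -> 1 network with ReLU hidden layers.
    mlp = MLP([10, 64, 32, 1], activation='relu', dropout=0.1)
    same = create_mlp(input_size=10, output_size=1, hidden_sizes=[64, 32],
                      activation='relu', dropout=0.1)

    x = Tensor([[1.0] * 10])
    y_train = mlp.forward(x)   # dropout is applied while mlp.training is True

    mlp.eval()                 # disable dropout for inference
    y_eval = mlp.forward(x)

    # Deeper variant with skip connections where adjacent widths match (64 -> 64).
    res = ResidualMLP([10, 64, 64, 64, 1], activation='relu')
    y_res = res.forward(x)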