quantmllibrary-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quantml/__init__.py +74 -0
- quantml/autograd.py +154 -0
- quantml/cli/__init__.py +10 -0
- quantml/cli/run_experiment.py +385 -0
- quantml/config/__init__.py +28 -0
- quantml/config/config.py +259 -0
- quantml/data/__init__.py +33 -0
- quantml/data/cache.py +149 -0
- quantml/data/feature_store.py +234 -0
- quantml/data/futures.py +254 -0
- quantml/data/loaders.py +236 -0
- quantml/data/memory_optimizer.py +234 -0
- quantml/data/validators.py +390 -0
- quantml/experiments/__init__.py +23 -0
- quantml/experiments/logger.py +208 -0
- quantml/experiments/results.py +158 -0
- quantml/experiments/tracker.py +223 -0
- quantml/features/__init__.py +25 -0
- quantml/features/base.py +104 -0
- quantml/features/gap_features.py +124 -0
- quantml/features/registry.py +138 -0
- quantml/features/volatility_features.py +140 -0
- quantml/features/volume_features.py +142 -0
- quantml/functional.py +37 -0
- quantml/models/__init__.py +27 -0
- quantml/models/attention.py +258 -0
- quantml/models/dropout.py +130 -0
- quantml/models/gru.py +319 -0
- quantml/models/linear.py +112 -0
- quantml/models/lstm.py +353 -0
- quantml/models/mlp.py +286 -0
- quantml/models/normalization.py +289 -0
- quantml/models/rnn.py +154 -0
- quantml/models/tcn.py +238 -0
- quantml/online.py +209 -0
- quantml/ops.py +1707 -0
- quantml/optim/__init__.py +42 -0
- quantml/optim/adafactor.py +206 -0
- quantml/optim/adagrad.py +157 -0
- quantml/optim/adam.py +267 -0
- quantml/optim/lookahead.py +97 -0
- quantml/optim/quant_optimizer.py +228 -0
- quantml/optim/radam.py +192 -0
- quantml/optim/rmsprop.py +203 -0
- quantml/optim/schedulers.py +286 -0
- quantml/optim/sgd.py +181 -0
- quantml/py.typed +0 -0
- quantml/streaming.py +175 -0
- quantml/tensor.py +462 -0
- quantml/time_series.py +447 -0
- quantml/training/__init__.py +135 -0
- quantml/training/alpha_eval.py +203 -0
- quantml/training/backtest.py +280 -0
- quantml/training/backtest_analysis.py +168 -0
- quantml/training/cv.py +106 -0
- quantml/training/data_loader.py +177 -0
- quantml/training/ensemble.py +84 -0
- quantml/training/feature_importance.py +135 -0
- quantml/training/features.py +364 -0
- quantml/training/futures_backtest.py +266 -0
- quantml/training/gradient_clipping.py +206 -0
- quantml/training/losses.py +248 -0
- quantml/training/lr_finder.py +127 -0
- quantml/training/metrics.py +376 -0
- quantml/training/regularization.py +89 -0
- quantml/training/trainer.py +239 -0
- quantml/training/walk_forward.py +190 -0
- quantml/utils/__init__.py +51 -0
- quantml/utils/gradient_check.py +274 -0
- quantml/utils/logging.py +181 -0
- quantml/utils/ops_cpu.py +231 -0
- quantml/utils/profiling.py +364 -0
- quantml/utils/reproducibility.py +220 -0
- quantml/utils/serialization.py +335 -0
- quantmllibrary-0.1.0.dist-info/METADATA +536 -0
- quantmllibrary-0.1.0.dist-info/RECORD +79 -0
- quantmllibrary-0.1.0.dist-info/WHEEL +5 -0
- quantmllibrary-0.1.0.dist-info/licenses/LICENSE +22 -0
- quantmllibrary-0.1.0.dist-info/top_level.txt +1 -0
quantml/models/lstm.py
ADDED
@@ -0,0 +1,353 @@
"""
Long Short-Term Memory (LSTM) implementation.

LSTM networks are a type of recurrent neural network that can learn
long-term dependencies through gating mechanisms.
"""

from typing import Optional, Tuple, List
import math
from quantml.tensor import Tensor
from quantml import ops


class LSTMCell:
    """
    A single LSTM cell.

    Implements the LSTM equations:
        f_t = sigmoid(x_t @ W_xf + h_{t-1} @ W_hf + b_f)  # forget gate
        i_t = sigmoid(x_t @ W_xi + h_{t-1} @ W_hi + b_i)  # input gate
        g_t = tanh(x_t @ W_xg + h_{t-1} @ W_hg + b_g)     # cell candidate
        o_t = sigmoid(x_t @ W_xo + h_{t-1} @ W_ho + b_o)  # output gate
        c_t = f_t * c_{t-1} + i_t * g_t                   # cell state
        h_t = o_t * tanh(c_t)                             # hidden state

    Attributes:
        input_size: Size of input features
        hidden_size: Size of hidden state

    Examples:
        >>> cell = LSTMCell(10, 20)
        >>> x = Tensor([[1.0] * 10])
        >>> h, c = cell.forward(x)
    """

    def __init__(self, input_size: int, hidden_size: int, bias: bool = True):
        """
        Initialize LSTM cell.

        Args:
            input_size: Number of input features
            hidden_size: Size of hidden state
            bias: Whether to include bias terms
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias

        # Initialize weights using Xavier/Glorot initialization
        # Combined weights for all 4 gates: [i, f, g, o]
        # Input-to-hidden weights: (4 * hidden_size, input_size)
        combined_size = 4 * hidden_size
        limit_ih = math.sqrt(6.0 / (input_size + hidden_size))

        weight_ih_data = [
            [(2.0 * limit_ih * ((i * input_size + j) % 1000) / 1000 - limit_ih)
             for j in range(input_size)]
            for i in range(combined_size)
        ]
        self.weight_ih = Tensor(weight_ih_data, requires_grad=True)

        # Hidden-to-hidden weights: (4 * hidden_size, hidden_size)
        limit_hh = math.sqrt(6.0 / (hidden_size + hidden_size))
        weight_hh_data = [
            [(2.0 * limit_hh * ((i * hidden_size + j) % 1000) / 1000 - limit_hh)
             for j in range(hidden_size)]
            for i in range(combined_size)
        ]
        self.weight_hh = Tensor(weight_hh_data, requires_grad=True)

        # Biases for all 4 gates
        if bias:
            # Initialize forget gate bias to 1.0 for better gradient flow
            bias_data = [[0.0] for _ in range(combined_size)]
            # Forget gate bias starts at index hidden_size
            for i in range(hidden_size, 2 * hidden_size):
                bias_data[i][0] = 1.0
            self.bias_ih = Tensor(bias_data, requires_grad=True)
            self.bias_hh = Tensor([[0.0] for _ in range(combined_size)], requires_grad=True)
        else:
            self.bias_ih = None
            self.bias_hh = None

        # Hidden and cell state
        self.hidden = None
        self.cell = None

    def forward(
        self,
        x: Tensor,
        hidden: Optional[Tuple[Tensor, Tensor]] = None
    ) -> Tuple[Tensor, Tensor]:
        """
        Forward pass through LSTM cell.

        Args:
            x: Input tensor (batch_size x input_size)
            hidden: Optional tuple of (h_{t-1}, c_{t-1})

        Returns:
            Tuple of (h_t, c_t) - new hidden and cell states
        """
        # Ensure 2D input
        x_data = x.data if isinstance(x.data[0], list) else [x.data]
        x_2d = Tensor(x_data)
        batch_size = len(x_data)

        # Initialize hidden state if not provided
        if hidden is None:
            if self.hidden is None or self.cell is None:
                h_prev = Tensor([[0.0] * self.hidden_size for _ in range(batch_size)])
                c_prev = Tensor([[0.0] * self.hidden_size for _ in range(batch_size)])
            else:
                h_prev = self.hidden
                c_prev = self.cell
        else:
            h_prev, c_prev = hidden

        # Compute gates: x @ W_ih^T + h @ W_hh^T + b
        # W_ih is (4*hidden, input), W_hh is (4*hidden, hidden)
        W_ih_T = self._transpose(self.weight_ih)
        W_hh_T = self._transpose(self.weight_hh)

        gates = ops.matmul(x_2d, W_ih_T)
        gates = ops.add(gates, ops.matmul(h_prev, W_hh_T))

        if self.bias and self.bias_ih is not None and self.bias_hh is not None:
            # Reshape biases for broadcasting
            bias = ops.add(self.bias_ih, self.bias_hh)
            gates = ops.add(gates, self._transpose(bias))

        # Split gates: each is (batch_size x hidden_size)
        # Order: input, forget, cell candidate, output
        i_gate, f_gate, g_gate, o_gate = self._split_gates(gates)

        # Apply activations
        i_t = ops.sigmoid(i_gate)
        f_t = ops.sigmoid(f_gate)
        g_t = ops.tanh(g_gate)
        o_t = ops.sigmoid(o_gate)

        # Cell state update: c_t = f_t * c_{t-1} + i_t * g_t
        c_t = ops.add(ops.mul(f_t, c_prev), ops.mul(i_t, g_t))

        # Hidden state: h_t = o_t * tanh(c_t)
        h_t = ops.mul(o_t, ops.tanh(c_t))

        # Store for next step
        self.hidden = h_t
        self.cell = c_t

        return h_t, c_t

    def _transpose(self, t: Tensor) -> Tensor:
        """Transpose a 2D tensor."""
        if not isinstance(t.data[0], list):
            data = [t.data]
        else:
            data = t.data

        transposed = [[data[j][i] for j in range(len(data))]
                      for i in range(len(data[0]))]
        return Tensor(transposed, requires_grad=t.requires_grad)

    def _split_gates(self, gates: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
        """Split combined gates into individual gate tensors."""
        h = self.hidden_size
        data = gates.data

        if isinstance(data[0], list):
            i_data = [[row[j] for j in range(0, h)] for row in data]
            f_data = [[row[j] for j in range(h, 2*h)] for row in data]
            g_data = [[row[j] for j in range(2*h, 3*h)] for row in data]
            o_data = [[row[j] for j in range(3*h, 4*h)] for row in data]
        else:
            i_data = [data[j] for j in range(0, h)]
            f_data = [data[j] for j in range(h, 2*h)]
            g_data = [data[j] for j in range(2*h, 3*h)]
            o_data = [data[j] for j in range(3*h, 4*h)]

        return (
            Tensor(i_data, requires_grad=gates.requires_grad),
            Tensor(f_data, requires_grad=gates.requires_grad),
            Tensor(g_data, requires_grad=gates.requires_grad),
            Tensor(o_data, requires_grad=gates.requires_grad)
        )

    def reset_hidden(self) -> None:
        """Reset hidden and cell states."""
        self.hidden = None
        self.cell = None

    def parameters(self) -> List[Tensor]:
        """Get all trainable parameters."""
        params = [self.weight_ih, self.weight_hh]
        if self.bias and self.bias_ih is not None:
            params.extend([self.bias_ih, self.bias_hh])
        return params

    def zero_grad(self) -> None:
        """Clear gradients for all parameters."""
        for p in self.parameters():
            p.zero_grad()


class LSTM:
    """
    Multi-layer LSTM module.

    Stacks multiple LSTM layers for deeper sequence processing.

    Attributes:
        input_size: Size of input features
        hidden_size: Size of hidden state
        num_layers: Number of stacked LSTM layers

    Examples:
        >>> lstm = LSTM(10, 20, num_layers=2)
        >>> x = Tensor([[[1.0] * 10] * 5])  # batch x seq x features
        >>> outputs, (h_n, c_n) = lstm.forward(x)
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        num_layers: int = 1,
        bias: bool = True,
        batch_first: bool = True
    ):
        """
        Initialize multi-layer LSTM.

        Args:
            input_size: Number of input features
            hidden_size: Size of hidden state
            num_layers: Number of stacked LSTM layers
            bias: Whether to include bias terms
            batch_first: If True, input shape is (batch, seq, features)
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_first = batch_first

        # Create LSTM cells for each layer
        self.cells: List[LSTMCell] = []
        for i in range(num_layers):
            cell_input_size = input_size if i == 0 else hidden_size
            self.cells.append(LSTMCell(cell_input_size, hidden_size, bias))

    def forward(
        self,
        x: Tensor,
        hidden: Optional[Tuple[Tensor, Tensor]] = None
    ) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
        """
        Forward pass through LSTM.

        Args:
            x: Input tensor (batch x seq x features) if batch_first
            hidden: Optional initial (h_0, c_0) for all layers

        Returns:
            Tuple of (outputs, (h_n, c_n)) where:
                - outputs: All hidden states for each timestep
                - h_n: Final hidden state for each layer
                - c_n: Final cell state for each layer
        """
        # Ensure proper shape
        data = x.data
        if not isinstance(data[0], list):
            data = [[data]]
        elif not isinstance(data[0][0], list):
            data = [data]

        batch_size = len(data)
        seq_len = len(data[0])

        # Initialize hidden states for all layers
        if hidden is None:
            h_layers = [None] * self.num_layers
            c_layers = [None] * self.num_layers
        else:
            # Split initial states by layer
            h_0, c_0 = hidden
            h_layers = [None] * self.num_layers  # TODO: proper splitting
            c_layers = [None] * self.num_layers

        # Process sequence
        outputs = []

        for t in range(seq_len):
            # Get input for this timestep
            x_t_data = [[data[b][t][f] for f in range(len(data[b][t]))]
                        for b in range(batch_size)]
            x_t = Tensor(x_t_data)

            # Process through each layer
            layer_input = x_t
            for layer_idx, cell in enumerate(self.cells):
                h_prev = h_layers[layer_idx]
                c_prev = c_layers[layer_idx]
                hidden_tuple = (h_prev, c_prev) if h_prev is not None else None

                h_t, c_t = cell.forward(layer_input, hidden_tuple)

                h_layers[layer_idx] = h_t
                c_layers[layer_idx] = c_t
                layer_input = h_t

            # Store output from last layer
            outputs.append(layer_input)

        # Stack outputs: (batch, seq, hidden)
        output_data = [
            [[float(outputs[t].data[b][h])
              for h in range(self.hidden_size)]
             for t in range(seq_len)]
            for b in range(batch_size)
        ]

        # Stack final states
        h_n_data = [[float(h_layers[l].data[b][h])
                     for h in range(self.hidden_size)]
                    for l in range(self.num_layers)
                    for b in range(batch_size)]
        c_n_data = [[float(c_layers[l].data[b][h])
                     for h in range(self.hidden_size)]
                    for l in range(self.num_layers)
                    for b in range(batch_size)]

        return (
            Tensor(output_data),
            (Tensor(h_n_data), Tensor(c_n_data))
        )

    def reset_hidden(self) -> None:
        """Reset hidden states for all layers."""
        for cell in self.cells:
            cell.reset_hidden()

    def parameters(self) -> List[Tensor]:
        """Get all trainable parameters."""
        params = []
        for cell in self.cells:
            params.extend(cell.parameters())
        return params

    def zero_grad(self) -> None:
        """Clear gradients for all parameters."""
        for cell in self.cells:
            cell.zero_grad()
quantml/models/mlp.py
ADDED
@@ -0,0 +1,286 @@
"""
Multi-Layer Perceptron (MLP) implementation.

Provides a flexible MLP builder for creating feedforward neural networks
with configurable layers, activations, and regularization.
"""

from typing import List, Optional, Union, Callable
import math
from quantml.tensor import Tensor
from quantml import ops
from quantml.models.linear import Linear


class MLP:
    """
    Multi-Layer Perceptron (feedforward neural network).

    A flexible MLP builder that creates a sequence of linear layers
    with configurable activations and dropout.

    Attributes:
        layers: List of Linear layers
        activations: Activation function(s) between layers
        dropout_rate: Dropout probability

    Examples:
        >>> # Simple 3-layer MLP
        >>> mlp = MLP([10, 64, 32, 1], activation='relu')
        >>> x = Tensor([[1.0] * 10])
        >>> y = mlp.forward(x)

        >>> # MLP with different activations per layer
        >>> mlp = MLP([10, 64, 1], activation=['relu', None])
    """

    def __init__(
        self,
        layer_sizes: List[int],
        activation: Union[str, List[Optional[str]], Callable, None] = 'relu',
        dropout: float = 0.0,
        bias: bool = True,
        final_activation: Optional[str] = None
    ):
        """
        Initialize MLP.

        Args:
            layer_sizes: List of layer sizes [input_size, hidden1, ..., output_size]
            activation: Activation function(s). Can be:
                - String: 'relu', 'tanh', 'sigmoid', 'leaky_relu', 'gelu', 'swish', None
                - List of strings (one per layer transition)
                - Callable function
            dropout: Dropout probability (0.0 = no dropout)
            bias: Whether to include bias in linear layers
            final_activation: Optional activation for the final layer

        Raises:
            ValueError: If layer_sizes has fewer than 2 elements
        """
        if len(layer_sizes) < 2:
            raise ValueError("layer_sizes must have at least 2 elements (input and output)")

        self.layer_sizes = layer_sizes
        self.dropout_rate = dropout
        self.training = True

        # Build layers
        self.layers: List[Linear] = []
        for i in range(len(layer_sizes) - 1):
            in_size = layer_sizes[i]
            out_size = layer_sizes[i + 1]
            self.layers.append(Linear(in_size, out_size, bias=bias))

        # Parse activations
        num_transitions = len(layer_sizes) - 1
        if isinstance(activation, list):
            if len(activation) != num_transitions:
                raise ValueError(f"activation list must have {num_transitions} elements")
            self.activations = activation
        else:
            # Same activation for all hidden layers, final_activation for last
            self.activations = [activation] * (num_transitions - 1) + [final_activation]

        # Convert string activations to functions
        self.activation_fns = [self._get_activation_fn(a) for a in self.activations]

    def _get_activation_fn(self, name: Optional[str]) -> Optional[Callable]:
        """Get activation function by name."""
        if name is None:
            return None
        if callable(name):
            return name

        activations = {
            'relu': ops.relu,
            'tanh': ops.tanh,
            'sigmoid': ops.sigmoid,
            'leaky_relu': ops.leaky_relu,
            'gelu': ops.gelu,
            'swish': ops.swish,
            'softmax': ops.softmax,
        }

        if name.lower() not in activations:
            raise ValueError(f"Unknown activation: {name}. Available: {list(activations.keys())}")

        return activations[name.lower()]

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass through MLP.

        Args:
            x: Input tensor (batch_size x input_size)

        Returns:
            Output tensor (batch_size x output_size)
        """
        out = x

        for i, (layer, activation_fn) in enumerate(zip(self.layers, self.activation_fns)):
            # Linear transformation
            out = layer.forward(out)

            # Apply activation
            if activation_fn is not None:
                out = activation_fn(out)

            # Apply dropout (except on last layer)
            if self.training and self.dropout_rate > 0 and i < len(self.layers) - 1:
                out = self._apply_dropout(out)

        return out

    def _apply_dropout(self, x: Tensor) -> Tensor:
        """Apply dropout during training."""
        if not self.training or self.dropout_rate <= 0:
            return x

        # Simple dropout: randomly zero out elements
        import random

        data = x.data
        if isinstance(data[0], list):
            dropped = [
                [val if random.random() > self.dropout_rate
                 else 0.0 for val in row]
                for row in data
            ]
            # Scale by 1/(1-p) to maintain expected value
            scale = 1.0 / (1.0 - self.dropout_rate)
            dropped = [[val * scale for val in row] for row in dropped]
        else:
            dropped = [
                val if random.random() > self.dropout_rate else 0.0
                for val in data
            ]
            scale = 1.0 / (1.0 - self.dropout_rate)
            dropped = [val * scale for val in dropped]

        return Tensor(dropped, requires_grad=x.requires_grad)

    def train(self, mode: bool = True) -> 'MLP':
        """
        Set training mode.

        Args:
            mode: If True, enables training mode (dropout active)

        Returns:
            self
        """
        self.training = mode
        return self

    def eval(self) -> 'MLP':
        """
        Set evaluation mode (disables dropout).

        Returns:
            self
        """
        return self.train(False)

    def parameters(self) -> List[Tensor]:
        """Get all trainable parameters."""
        params = []
        for layer in self.layers:
            params.extend(layer.parameters())
        return params

    def zero_grad(self) -> None:
        """Clear gradients for all parameters."""
        for layer in self.layers:
            layer.zero_grad()

    def __repr__(self) -> str:
        """String representation."""
        layers_str = " -> ".join(str(s) for s in self.layer_sizes)
        activations_str = ", ".join(
            str(a) if a is not None else "None" for a in self.activations
        )
        return f"MLP({layers_str}, activations=[{activations_str}], dropout={self.dropout_rate})"


class ResidualMLP(MLP):
    """
    MLP with residual connections.

    Adds skip connections between layers where dimensions match,
    improving gradient flow for deeper networks.

    Examples:
        >>> mlp = ResidualMLP([10, 64, 64, 64, 1], activation='relu')
        >>> x = Tensor([[1.0] * 10])
        >>> y = mlp.forward(x)
    """

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass with residual connections.

        Adds residual connections where input and output dimensions match.
        """
        out = x

        for i, (layer, activation_fn) in enumerate(zip(self.layers, self.activation_fns)):
            # Store for residual
            residual = out

            # Linear transformation
            out = layer.forward(out)

            # Apply activation
            if activation_fn is not None:
                out = activation_fn(out)

            # Residual connection if dimensions match
            if i > 0 and i < len(self.layers) - 1:
                # Check if dimensions match
                out_shape = out.shape if hasattr(out, 'shape') else len(out.data[0]) if isinstance(out.data[0], list) else len(out.data)
                res_shape = residual.shape if hasattr(residual, 'shape') else len(residual.data[0]) if isinstance(residual.data[0], list) else len(residual.data)

                if out_shape == res_shape:
                    out = ops.add(out, residual)

            # Apply dropout (except on last layer)
            if self.training and self.dropout_rate > 0 and i < len(self.layers) - 1:
                out = self._apply_dropout(out)

        return out


def create_mlp(
    input_size: int,
    output_size: int,
    hidden_sizes: List[int],
    activation: str = 'relu',
    output_activation: Optional[str] = None,
    dropout: float = 0.0
) -> MLP:
    """
    Convenience function to create an MLP.

    Args:
        input_size: Input feature dimension
        output_size: Output dimension
        hidden_sizes: List of hidden layer sizes
        activation: Activation for hidden layers
        output_activation: Activation for output layer
        dropout: Dropout probability

    Returns:
        Configured MLP instance

    Examples:
        >>> mlp = create_mlp(10, 1, [64, 32], activation='relu')
    """
    layer_sizes = [input_size] + hidden_sizes + [output_size]
    return MLP(
        layer_sizes,
        activation=activation,
        final_activation=output_activation,
        dropout=dropout
    )