quantmllibrary 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quantml/__init__.py +74 -0
- quantml/autograd.py +154 -0
- quantml/cli/__init__.py +10 -0
- quantml/cli/run_experiment.py +385 -0
- quantml/config/__init__.py +28 -0
- quantml/config/config.py +259 -0
- quantml/data/__init__.py +33 -0
- quantml/data/cache.py +149 -0
- quantml/data/feature_store.py +234 -0
- quantml/data/futures.py +254 -0
- quantml/data/loaders.py +236 -0
- quantml/data/memory_optimizer.py +234 -0
- quantml/data/validators.py +390 -0
- quantml/experiments/__init__.py +23 -0
- quantml/experiments/logger.py +208 -0
- quantml/experiments/results.py +158 -0
- quantml/experiments/tracker.py +223 -0
- quantml/features/__init__.py +25 -0
- quantml/features/base.py +104 -0
- quantml/features/gap_features.py +124 -0
- quantml/features/registry.py +138 -0
- quantml/features/volatility_features.py +140 -0
- quantml/features/volume_features.py +142 -0
- quantml/functional.py +37 -0
- quantml/models/__init__.py +27 -0
- quantml/models/attention.py +258 -0
- quantml/models/dropout.py +130 -0
- quantml/models/gru.py +319 -0
- quantml/models/linear.py +112 -0
- quantml/models/lstm.py +353 -0
- quantml/models/mlp.py +286 -0
- quantml/models/normalization.py +289 -0
- quantml/models/rnn.py +154 -0
- quantml/models/tcn.py +238 -0
- quantml/online.py +209 -0
- quantml/ops.py +1707 -0
- quantml/optim/__init__.py +42 -0
- quantml/optim/adafactor.py +206 -0
- quantml/optim/adagrad.py +157 -0
- quantml/optim/adam.py +267 -0
- quantml/optim/lookahead.py +97 -0
- quantml/optim/quant_optimizer.py +228 -0
- quantml/optim/radam.py +192 -0
- quantml/optim/rmsprop.py +203 -0
- quantml/optim/schedulers.py +286 -0
- quantml/optim/sgd.py +181 -0
- quantml/py.typed +0 -0
- quantml/streaming.py +175 -0
- quantml/tensor.py +462 -0
- quantml/time_series.py +447 -0
- quantml/training/__init__.py +135 -0
- quantml/training/alpha_eval.py +203 -0
- quantml/training/backtest.py +280 -0
- quantml/training/backtest_analysis.py +168 -0
- quantml/training/cv.py +106 -0
- quantml/training/data_loader.py +177 -0
- quantml/training/ensemble.py +84 -0
- quantml/training/feature_importance.py +135 -0
- quantml/training/features.py +364 -0
- quantml/training/futures_backtest.py +266 -0
- quantml/training/gradient_clipping.py +206 -0
- quantml/training/losses.py +248 -0
- quantml/training/lr_finder.py +127 -0
- quantml/training/metrics.py +376 -0
- quantml/training/regularization.py +89 -0
- quantml/training/trainer.py +239 -0
- quantml/training/walk_forward.py +190 -0
- quantml/utils/__init__.py +51 -0
- quantml/utils/gradient_check.py +274 -0
- quantml/utils/logging.py +181 -0
- quantml/utils/ops_cpu.py +231 -0
- quantml/utils/profiling.py +364 -0
- quantml/utils/reproducibility.py +220 -0
- quantml/utils/serialization.py +335 -0
- quantmllibrary-0.1.0.dist-info/METADATA +536 -0
- quantmllibrary-0.1.0.dist-info/RECORD +79 -0
- quantmllibrary-0.1.0.dist-info/WHEEL +5 -0
- quantmllibrary-0.1.0.dist-info/licenses/LICENSE +22 -0
- quantmllibrary-0.1.0.dist-info/top_level.txt +1 -0
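The modules listed above can be imported directly by their file paths once the wheel is installed. The snippet below is a minimal smoke-test sketch assembled from this manifest and the docstring examples in the file diffs that follow; the install command and the chosen sizes are illustrative assumptions, not output or documentation shipped with the package.

# Smoke-test sketch; assumes the wheel is installed, e.g. `pip install quantmllibrary==0.1.0`.
from quantml.tensor import Tensor
from quantml.models.gru import GRU, GRUCell
from quantml.models.linear import Linear

cell = GRUCell(10, 20)                    # single GRU cell: 10 inputs -> 20 hidden units
h = cell.forward(Tensor([[1.0] * 10]))    # one-sample batch, as in the GRUCell docstring

layer = Linear(10, 5)                     # fully connected layer: y = xW^T + b
y = layer.forward(Tensor([[1.0] * 10]))   # expected shape (1, 5) per the Linear docstring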
quantml/models/gru.py
ADDED
@@ -0,0 +1,319 @@
"""
Gated Recurrent Unit (GRU) implementation.

GRU is a simplified variant of LSTM that uses fewer gates
while maintaining similar performance for sequence modeling.
"""

from typing import Optional, Tuple, List
import math
from quantml.tensor import Tensor
from quantml import ops


class GRUCell:
    """
    A single GRU cell.

    Implements the GRU equations:
        r_t = sigmoid(x_t @ W_xr + h_{t-1} @ W_hr + b_r)       # reset gate
        z_t = sigmoid(x_t @ W_xz + h_{t-1} @ W_hz + b_z)       # update gate
        n_t = tanh(x_t @ W_xn + r_t * (h_{t-1} @ W_hn) + b_n)  # candidate
        h_t = (1 - z_t) * n_t + z_t * h_{t-1}                  # hidden state

    Attributes:
        input_size: Size of input features
        hidden_size: Size of hidden state

    Examples:
        >>> cell = GRUCell(10, 20)
        >>> x = Tensor([[1.0] * 10])
        >>> h = cell.forward(x)
    """

    def __init__(self, input_size: int, hidden_size: int, bias: bool = True):
        """
        Initialize GRU cell.

        Args:
            input_size: Number of input features
            hidden_size: Size of hidden state
            bias: Whether to include bias terms
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias

        # Initialize weights using Xavier/Glorot initialization
        # Combined weights for 3 gates: [r, z, n]
        combined_size = 3 * hidden_size
        limit_ih = math.sqrt(6.0 / (input_size + hidden_size))

        # Input-to-hidden weights: (3 * hidden_size, input_size)
        weight_ih_data = [
            [(2.0 * limit_ih * ((i * input_size + j) % 1000) / 1000 - limit_ih)
             for j in range(input_size)]
            for i in range(combined_size)
        ]
        self.weight_ih = Tensor(weight_ih_data, requires_grad=True)

        # Hidden-to-hidden weights: (3 * hidden_size, hidden_size)
        limit_hh = math.sqrt(6.0 / (hidden_size + hidden_size))
        weight_hh_data = [
            [(2.0 * limit_hh * ((i * hidden_size + j) % 1000) / 1000 - limit_hh)
             for j in range(hidden_size)]
            for i in range(combined_size)
        ]
        self.weight_hh = Tensor(weight_hh_data, requires_grad=True)

        # Biases for all 3 gates
        if bias:
            self.bias_ih = Tensor([[0.0] for _ in range(combined_size)], requires_grad=True)
            self.bias_hh = Tensor([[0.0] for _ in range(combined_size)], requires_grad=True)
        else:
            self.bias_ih = None
            self.bias_hh = None

        # Hidden state
        self.hidden = None

    def forward(self, x: Tensor, hidden: Optional[Tensor] = None) -> Tensor:
        """
        Forward pass through GRU cell.

        Args:
            x: Input tensor (batch_size x input_size)
            hidden: Optional previous hidden state h_{t-1}

        Returns:
            New hidden state h_t
        """
        # Ensure 2D input
        x_data = x.data if isinstance(x.data[0], list) else [x.data]
        x_2d = Tensor(x_data)
        batch_size = len(x_data)

        # Initialize hidden state if not provided
        if hidden is None:
            if self.hidden is None:
                h_prev = Tensor([[0.0] * self.hidden_size for _ in range(batch_size)])
            else:
                h_prev = self.hidden
        else:
            h_prev = hidden

        # Compute input projections: x @ W_ih^T
        W_ih_T = self._transpose(self.weight_ih)
        x_proj = ops.matmul(x_2d, W_ih_T)

        if self.bias and self.bias_ih is not None:
            bias_ih_T = self._transpose(self.bias_ih)
            x_proj = ops.add(x_proj, bias_ih_T)

        # Compute hidden projections: h @ W_hh^T
        W_hh_T = self._transpose(self.weight_hh)
        h_proj = ops.matmul(h_prev, W_hh_T)

        if self.bias and self.bias_hh is not None:
            bias_hh_T = self._transpose(self.bias_hh)
            h_proj = ops.add(h_proj, bias_hh_T)

        # Split projections into gates
        # Order: reset, update, new (candidate)
        h = self.hidden_size
        x_r, x_z, x_n = self._split_3(x_proj, h)
        h_r, h_z, h_n = self._split_3(h_proj, h)

        # Reset gate: r_t = sigmoid(x_r + h_r)
        r_t = ops.sigmoid(ops.add(x_r, h_r))

        # Update gate: z_t = sigmoid(x_z + h_z)
        z_t = ops.sigmoid(ops.add(x_z, h_z))

        # Candidate hidden: n_t = tanh(x_n + r_t * h_n)
        n_t = ops.tanh(ops.add(x_n, ops.mul(r_t, h_n)))

        # New hidden state: h_t = (1 - z_t) * n_t + z_t * h_{t-1}
        one_minus_z = ops.sub(Tensor([[1.0] * h for _ in range(batch_size)]), z_t)
        h_t = ops.add(ops.mul(one_minus_z, n_t), ops.mul(z_t, h_prev))

        # Store for next step
        self.hidden = h_t

        return h_t

    def _transpose(self, t: Tensor) -> Tensor:
        """Transpose a 2D tensor."""
        if not isinstance(t.data[0], list):
            data = [t.data]
        else:
            data = t.data

        transposed = [[data[j][i] for j in range(len(data))]
                      for i in range(len(data[0]))]
        return Tensor(transposed, requires_grad=t.requires_grad)

    def _split_3(self, tensor: Tensor, h: int) -> Tuple[Tensor, Tensor, Tensor]:
        """Split tensor into 3 chunks along last dimension."""
        data = tensor.data

        if isinstance(data[0], list):
            r_data = [[row[j] for j in range(0, h)] for row in data]
            z_data = [[row[j] for j in range(h, 2*h)] for row in data]
            n_data = [[row[j] for j in range(2*h, 3*h)] for row in data]
        else:
            r_data = [data[j] for j in range(0, h)]
            z_data = [data[j] for j in range(h, 2*h)]
            n_data = [data[j] for j in range(2*h, 3*h)]

        return (
            Tensor(r_data, requires_grad=tensor.requires_grad),
            Tensor(z_data, requires_grad=tensor.requires_grad),
            Tensor(n_data, requires_grad=tensor.requires_grad)
        )

    def reset_hidden(self) -> None:
        """Reset hidden state."""
        self.hidden = None

    def parameters(self) -> List[Tensor]:
        """Get all trainable parameters."""
        params = [self.weight_ih, self.weight_hh]
        if self.bias and self.bias_ih is not None:
            params.extend([self.bias_ih, self.bias_hh])
        return params

    def zero_grad(self) -> None:
        """Clear gradients for all parameters."""
        for p in self.parameters():
            p.zero_grad()


class GRU:
    """
    Multi-layer GRU module.

    Stacks multiple GRU layers for deeper sequence processing.

    Attributes:
        input_size: Size of input features
        hidden_size: Size of hidden state
        num_layers: Number of stacked GRU layers

    Examples:
        >>> gru = GRU(10, 20, num_layers=2)
        >>> x = Tensor([[[1.0] * 10] * 5])  # batch x seq x features
        >>> outputs, h_n = gru.forward(x)
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        num_layers: int = 1,
        bias: bool = True,
        batch_first: bool = True
    ):
        """
        Initialize multi-layer GRU.

        Args:
            input_size: Number of input features
            hidden_size: Size of hidden state
            num_layers: Number of stacked GRU layers
            bias: Whether to include bias terms
            batch_first: If True, input shape is (batch, seq, features)
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_first = batch_first

        # Create GRU cells for each layer
        self.cells: List[GRUCell] = []
        for i in range(num_layers):
            cell_input_size = input_size if i == 0 else hidden_size
            self.cells.append(GRUCell(cell_input_size, hidden_size, bias))

    def forward(
        self,
        x: Tensor,
        hidden: Optional[Tensor] = None
    ) -> Tuple[Tensor, Tensor]:
        """
        Forward pass through GRU.

        Args:
            x: Input tensor (batch x seq x features) if batch_first
            hidden: Optional initial h_0 for all layers

        Returns:
            Tuple of (outputs, h_n) where:
                - outputs: All hidden states for each timestep
                - h_n: Final hidden state for each layer
        """
        # Ensure proper shape
        data = x.data
        if not isinstance(data[0], list):
            data = [[data]]
        elif not isinstance(data[0][0], list):
            data = [data]

        batch_size = len(data)
        seq_len = len(data[0])

        # Initialize hidden states for all layers
        h_layers = [None] * self.num_layers

        # Process sequence
        outputs = []

        for t in range(seq_len):
            # Get input for this timestep
            x_t_data = [[data[b][t][f] for f in range(len(data[b][t]))]
                        for b in range(batch_size)]
            x_t = Tensor(x_t_data)

            # Process through each layer
            layer_input = x_t
            for layer_idx, cell in enumerate(self.cells):
                h_prev = h_layers[layer_idx]
                h_t = cell.forward(layer_input, h_prev)
                h_layers[layer_idx] = h_t
                layer_input = h_t

            # Store output from last layer
            outputs.append(layer_input)

        # Stack outputs: (batch, seq, hidden)
        output_data = [
            [[float(outputs[t].data[b][h])
              for h in range(self.hidden_size)]
             for t in range(seq_len)]
            for b in range(batch_size)
        ]

        # Stack final states
        h_n_data = [[float(h_layers[l].data[b][h])
                     for h in range(self.hidden_size)]
                    for l in range(self.num_layers)
                    for b in range(batch_size)]

        return Tensor(output_data), Tensor(h_n_data)

    def reset_hidden(self) -> None:
        """Reset hidden states for all layers."""
        for cell in self.cells:
            cell.reset_hidden()

    def parameters(self) -> List[Tensor]:
        """Get all trainable parameters."""
        params = []
        for cell in self.cells:
            params.extend(cell.parameters())
        return params

    def zero_grad(self) -> None:
        """Clear gradients for all parameters."""
        for cell in self.cells:
            cell.zero_grad()
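For orientation, the sketch below exercises the stacked GRU the way its docstring example suggests: a (batch, seq, features) tensor goes in, and per-timestep outputs plus the final hidden state come back. The feature values and the calls around the forward pass are illustrative assumptions drawn only from the APIs visible above, not additional code from the package.

# Usage sketch based on the GRU docstring example (values are illustrative).
from quantml.tensor import Tensor
from quantml.models.gru import GRU

gru = GRU(input_size=10, hidden_size=20, num_layers=2)

# One sample, five timesteps, ten features per step (batch x seq x features).
x = Tensor([[[0.1] * 10 for _ in range(5)]])

outputs, h_n = gru.forward(x)   # outputs: top-layer hidden state at every timestep
gru.zero_grad()                 # clear gradients on every cell's parameters
gru.reset_hidden()              # drop cached state between independent sequences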
quantml/models/linear.py
ADDED
@@ -0,0 +1,112 @@
"""
Linear (fully connected) layer implementation.

This module provides a simple linear layer suitable for quant trading models.
"""

from typing import Optional
import math
from quantml.tensor import Tensor
from quantml import ops


class Linear:
    """
    Linear (fully connected) layer: y = xW^T + b

    Attributes:
        in_features: Number of input features
        out_features: Number of output features
        weight: Weight matrix (out_features x in_features)
        bias: Bias vector (out_features,)

    Examples:
        >>> layer = Linear(10, 5)
        >>> x = Tensor([[1.0] * 10])
        >>> y = layer.forward(x)  # Shape: (1, 5)
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = True,
        weight_init: Optional[Tensor] = None,
        bias_init: Optional[Tensor] = None
    ):
        """
        Initialize linear layer.

        Args:
            in_features: Number of input features
            out_features: Number of output features
            bias: Whether to include bias term
            weight_init: Optional initial weight tensor
            bias_init: Optional initial bias tensor
        """
        self.in_features = in_features
        self.out_features = out_features
        self.bias = bias

        # Initialize weights
        if weight_init is not None:
            self.weight = weight_init
        else:
            # Xavier/Glorot initialization
            limit = math.sqrt(6.0 / (in_features + out_features))
            weight_data = [[(2.0 * limit * (i * out_features + j) / (in_features * out_features) - limit)
                            for j in range(in_features)]
                           for i in range(out_features)]
            self.weight = Tensor(weight_data, requires_grad=True)

        # Initialize bias
        if bias:
            if bias_init is not None:
                self.bias_param = bias_init
            else:
                bias_data = [[0.0] for _ in range(out_features)]
                self.bias_param = Tensor(bias_data, requires_grad=True)
        else:
            self.bias_param = None

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass: y = xW^T + b

        Args:
            x: Input tensor (batch_size x in_features)

        Returns:
            Output tensor (batch_size x out_features)
        """
        # x: (batch, in_features)
        # weight: (out_features, in_features)
        # We need: x @ weight.T
        # For simplicity, we'll do: (x @ weight.T) which is matmul(x, weight.T)

        # Transpose weight: (in_features, out_features)
        weight_T = ops.transpose(self.weight)

        # Matrix multiply: x @ weight_T
        out = ops.matmul(x, weight_T)

        # Add bias if present
        if self.bias and self.bias_param is not None:
            # Broadcast bias to match output shape
            out = ops.add(out, self.bias_param)

        return out

    def parameters(self) -> list:
        """Get all trainable parameters."""
        params = [self.weight]
        if self.bias and self.bias_param is not None:
            params.append(self.bias_param)
        return params

    def zero_grad(self):
        """Clear gradients for all parameters."""
        self.weight.zero_grad()
        if self.bias_param is not None:
            self.bias_param.zero_grad()
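To close, a small sketch of how Linear composes with the ops functions already used in this diff: two layers with a tanh in between, followed by the parameter-handling calls the class exposes. The two-layer arrangement and the input values are assumptions for illustration, not an API documented by the package.

# Composition sketch using only APIs visible in this diff (layer sizes are illustrative).
from quantml.tensor import Tensor
from quantml import ops
from quantml.models.linear import Linear

hidden = Linear(10, 5)
head = Linear(5, 1)

x = Tensor([[1.0] * 10])                          # batch of one, ten features
out = head.forward(ops.tanh(hidden.forward(x)))   # (1, 1) prediction

params = hidden.parameters() + head.parameters()  # tensors an optimizer would update
hidden.zero_grad()                                # clear gradients before the next pass
head.zero_grad()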