quantmllibrary 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quantml/__init__.py +74 -0
- quantml/autograd.py +154 -0
- quantml/cli/__init__.py +10 -0
- quantml/cli/run_experiment.py +385 -0
- quantml/config/__init__.py +28 -0
- quantml/config/config.py +259 -0
- quantml/data/__init__.py +33 -0
- quantml/data/cache.py +149 -0
- quantml/data/feature_store.py +234 -0
- quantml/data/futures.py +254 -0
- quantml/data/loaders.py +236 -0
- quantml/data/memory_optimizer.py +234 -0
- quantml/data/validators.py +390 -0
- quantml/experiments/__init__.py +23 -0
- quantml/experiments/logger.py +208 -0
- quantml/experiments/results.py +158 -0
- quantml/experiments/tracker.py +223 -0
- quantml/features/__init__.py +25 -0
- quantml/features/base.py +104 -0
- quantml/features/gap_features.py +124 -0
- quantml/features/registry.py +138 -0
- quantml/features/volatility_features.py +140 -0
- quantml/features/volume_features.py +142 -0
- quantml/functional.py +37 -0
- quantml/models/__init__.py +27 -0
- quantml/models/attention.py +258 -0
- quantml/models/dropout.py +130 -0
- quantml/models/gru.py +319 -0
- quantml/models/linear.py +112 -0
- quantml/models/lstm.py +353 -0
- quantml/models/mlp.py +286 -0
- quantml/models/normalization.py +289 -0
- quantml/models/rnn.py +154 -0
- quantml/models/tcn.py +238 -0
- quantml/online.py +209 -0
- quantml/ops.py +1707 -0
- quantml/optim/__init__.py +42 -0
- quantml/optim/adafactor.py +206 -0
- quantml/optim/adagrad.py +157 -0
- quantml/optim/adam.py +267 -0
- quantml/optim/lookahead.py +97 -0
- quantml/optim/quant_optimizer.py +228 -0
- quantml/optim/radam.py +192 -0
- quantml/optim/rmsprop.py +203 -0
- quantml/optim/schedulers.py +286 -0
- quantml/optim/sgd.py +181 -0
- quantml/py.typed +0 -0
- quantml/streaming.py +175 -0
- quantml/tensor.py +462 -0
- quantml/time_series.py +447 -0
- quantml/training/__init__.py +135 -0
- quantml/training/alpha_eval.py +203 -0
- quantml/training/backtest.py +280 -0
- quantml/training/backtest_analysis.py +168 -0
- quantml/training/cv.py +106 -0
- quantml/training/data_loader.py +177 -0
- quantml/training/ensemble.py +84 -0
- quantml/training/feature_importance.py +135 -0
- quantml/training/features.py +364 -0
- quantml/training/futures_backtest.py +266 -0
- quantml/training/gradient_clipping.py +206 -0
- quantml/training/losses.py +248 -0
- quantml/training/lr_finder.py +127 -0
- quantml/training/metrics.py +376 -0
- quantml/training/regularization.py +89 -0
- quantml/training/trainer.py +239 -0
- quantml/training/walk_forward.py +190 -0
- quantml/utils/__init__.py +51 -0
- quantml/utils/gradient_check.py +274 -0
- quantml/utils/logging.py +181 -0
- quantml/utils/ops_cpu.py +231 -0
- quantml/utils/profiling.py +364 -0
- quantml/utils/reproducibility.py +220 -0
- quantml/utils/serialization.py +335 -0
- quantmllibrary-0.1.0.dist-info/METADATA +536 -0
- quantmllibrary-0.1.0.dist-info/RECORD +79 -0
- quantmllibrary-0.1.0.dist-info/WHEEL +5 -0
- quantmllibrary-0.1.0.dist-info/licenses/LICENSE +22 -0
- quantmllibrary-0.1.0.dist-info/top_level.txt +1 -0
quantml/models/__init__.py
@@ -0,0 +1,27 @@
"""
QuantML Models

This module provides neural network models optimized for quantitative trading.
"""

from quantml.models.linear import Linear
from quantml.models.rnn import SimpleRNN
from quantml.models.tcn import TCN, TCNBlock
from quantml.models.lstm import LSTM, LSTMCell
from quantml.models.gru import GRU, GRUCell
from quantml.models.mlp import MLP, ResidualMLP, create_mlp
from quantml.models.normalization import BatchNorm1d, LayerNorm
from quantml.models.dropout import Dropout
from quantml.models.attention import SelfAttention, MultiHeadAttention

__all__ = [
    'Linear',
    'SimpleRNN',
    'TCN', 'TCNBlock',
    'LSTM', 'LSTMCell',
    'GRU', 'GRUCell',
    'MLP', 'ResidualMLP', 'create_mlp',
    'BatchNorm1d', 'LayerNorm',
    'Dropout',
    'SelfAttention', 'MultiHeadAttention',
]
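The package re-exports its layers through quantml.models, so they can be composed without importing individual submodules. The snippet below is a minimal usage sketch, not part of the package: it assumes quantmllibrary 0.1.0 is installed and that Linear, Dropout, and Tensor behave as shown in the files later in this diff.

from quantml.tensor import Tensor
from quantml.models import Linear, Dropout

layer = Linear(4, 4)                 # 4-in, 4-out projection (same call pattern as the attention code)
drop = Dropout(p=0.2)

x = Tensor([[1.0, 2.0, 3.0, 4.0]])   # one row of four features
h = drop(layer.forward(x))           # training-mode forward pass with dropout

drop.eval()                          # evaluation mode: dropout becomes the identity
print(len(layer.parameters()))       # number of trainable parameter tensors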
quantml/models/attention.py
@@ -0,0 +1,258 @@
"""
Attention mechanisms.

Implementations of Self-Attention and Multi-Head Attention.
"""

from typing import Optional, List, Tuple
import math
from quantml.tensor import Tensor
from quantml import ops
from quantml.models.linear import Linear


class SelfAttention:
    """
    Scaled Dot-Product Self-Attention (single head).

    Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) V

    Attributes:
        embed_dim: Dimension of embeddings
        dropout: Dropout probability

    Examples:
        >>> attn = SelfAttention(64)
        >>> x = Tensor([[[1.0] * 64] * 10])  # batch x seq x dim
        >>> out = attn.forward(x)
    """

    def __init__(self, embed_dim: int, dropout: float = 0.0):
        self.embed_dim = embed_dim
        self.dropout = dropout

        # Projections
        self.q_proj = Linear(embed_dim, embed_dim)
        self.k_proj = Linear(embed_dim, embed_dim)
        self.v_proj = Linear(embed_dim, embed_dim)
        self.out_proj = Linear(embed_dim, embed_dim)

        # Scaling factor
        self.scale = 1.0 / math.sqrt(embed_dim)

    def forward(
        self,
        x: Tensor,
        mask: Optional[Tensor] = None
    ) -> Tensor:
        """
        Forward pass.

        Args:
            x: Input tensor (batch x seq_len x embed_dim)
            mask: Optional additive mask (batch x seq_len x seq_len)

        Returns:
            Output tensor (batch x seq_len x embed_dim)
        """
        # Linear projections, each (batch x seq_len x embed_dim)
        q = self.q_proj.forward(x)
        k = self.k_proj.forward(x)
        v = self.v_proj.forward(x)

        # Attention scores are Q @ K^T. With Q: (B, S, D) and K: (B, S, D) we
        # need K^T: (B, D, S), but there is no 3D transpose/matmul op yet.
        # NOTE: This is a simplified implementation that falls back to a pure
        # Python loop over the batch dimension; full batched multi-head
        # attention requires more robust tensor ops.
        data = x.data
        if isinstance(data[0][0], list):  # 3D input
            batch_size = len(data)
            out_batches = []

            for b in range(batch_size):
                # Extract batch slice as 2D tensor
                q_b = self._get_batch_slice(q, b)
                k_b = self._get_batch_slice(k, b)
                v_b = self._get_batch_slice(v, b)
                mask_b = self._get_batch_slice(mask, b) if mask is not None else None

                out_b = self._attention_2d(q_b, k_b, v_b, mask_b)
                out_batches.append(out_b.data)

            # Combine back into a 3D tensor.
            # NOTE: Rebuilding from raw data breaks the autograd graph here;
            # proper 3D ops are needed to keep it intact.
            attn_out = Tensor(out_batches, requires_grad=True)
        else:
            # 2D case
            attn_out = self._attention_2d(q, k, v, mask)

        # Output projection
        return self.out_proj.forward(attn_out)

    def _attention_2d(self, Q: Tensor, K: Tensor, V: Tensor, mask: Optional[Tensor]) -> Tensor:
        """Compute attention for a single sample (seq_len x dim)."""
        # K is (S, D); transpose manually to (D, S)
        K_T = self._transpose_2d(K)

        # Scores: (S, D) @ (D, S) -> (S, S)
        scores = ops.matmul(Q, K_T)

        # Scale
        scaled_scores = ops.mul(scores, self.scale)

        # Mask is additive: 0 to keep a position, -inf to mask it out
        if mask is not None:
            scaled_scores = ops.add(scaled_scores, mask)

        # Softmax over the last dimension (rows)
        attn_weights = ops.softmax(scaled_scores, axis=-1)

        # Output: (S, S) @ (S, D) -> (S, D)
        return ops.matmul(attn_weights, V)

    def _transpose_2d(self, t: Tensor) -> Tensor:
        """Transpose a 2D tensor."""
        data = t.data
        rows = len(data)
        cols = len(data[0])
        new_data = [[data[i][j] for i in range(rows)] for j in range(cols)]
        return Tensor(new_data, requires_grad=t.requires_grad)

    def _get_batch_slice(self, t: Tensor, idx: int) -> Tensor:
        """Get a 2D slice from a 3D tensor."""
        return Tensor(t.data[idx], requires_grad=t.requires_grad)

    def parameters(self) -> List[Tensor]:
        """Get parameters."""
        return (self.q_proj.parameters() +
                self.k_proj.parameters() +
                self.v_proj.parameters() +
                self.out_proj.parameters())

    def zero_grad(self) -> None:
        """Zero gradients."""
        for p in self.parameters():
            p.zero_grad()


class MultiHeadAttention:
    """
    Multi-Head Attention.

    Splits the embedding into multiple heads, applies attention independently,
    and concatenates the results.

    Attributes:
        embed_dim: Model dimension
        num_heads: Number of heads
    """

    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0):
        if embed_dim % num_heads != 0:
            raise ValueError(f"embed_dim {embed_dim} must be divisible by num_heads {num_heads}")

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        # In the standard formulation, Q/K/V are projected to
        # (num_heads * head_dim), which equals embed_dim, so one Linear layer
        # per projection plus reshape/split logic is enough.
        self.q_proj = Linear(embed_dim, embed_dim)
        self.k_proj = Linear(embed_dim, embed_dim)
        self.v_proj = Linear(embed_dim, embed_dim)
        self.out_proj = Linear(embed_dim, embed_dim)

        self.scale = 1.0 / math.sqrt(self.head_dim)

    def forward(self, x: Tensor, mask: Optional[Tensor] = None) -> Tensor:
        """
        Simplified implementation. A truly parallel multi-head forward pass
        requires robust reshape/transpose (view/permute) ops.
        """
        # 1. Projections
        q = self.q_proj.forward(x)
        k = self.k_proj.forward(x)
        v = self.v_proj.forward(x)

        # 2. Splitting heads needs proper tensor reshaping, which the library
        # does not provide yet. PLACEHOLDER: delegate to a single-head
        # attention that matches the math of one large head (scores scaled by
        # 1/sqrt(head_dim)) but does not attend over distinct subspaces.
        # Proper multi-head support needs ops.reshape / ops.transpose (permute).
        return self._fallback_single_head(q, k, v, mask)

    def _fallback_single_head(self, q, k, v, mask):
        # Same logic as SelfAttention._attention_2d, with a loop over batches
        data = q.data
        if isinstance(data[0][0], list):  # 3D
            batch_size = len(data)
            out_batches = []
            for b in range(batch_size):
                q_b = self._get_batch_slice(q, b)
                k_b = self._get_batch_slice(k, b)
                v_b = self._get_batch_slice(v, b)
                mask_b = self._get_batch_slice(mask, b) if mask is not None else None

                # Manual attention, scaled by 1/sqrt(head_dim)
                K_T = self._transpose_2d(k_b)
                scores = ops.matmul(q_b, K_T)
                scaled_scores = ops.mul(scores, self.scale)
                if mask_b is not None:
                    scaled_scores = ops.add(scaled_scores, mask_b)
                attn_weights = ops.softmax(scaled_scores, axis=-1)
                out_b = ops.matmul(attn_weights, v_b)

                out_batches.append(out_b.data)
            attn_out = Tensor(out_batches, requires_grad=True)
        else:
            K_T = self._transpose_2d(k)
            scores = ops.matmul(q, K_T)
            scaled_scores = ops.mul(scores, self.scale)
            if mask is not None:
                scaled_scores = ops.add(scaled_scores, mask)
            attn_weights = ops.softmax(scaled_scores, axis=-1)
            attn_out = ops.matmul(attn_weights, v)

        return self.out_proj.forward(attn_out)

    def _transpose_2d(self, t: Tensor) -> Tensor:
        data = t.data
        rows = len(data)
        cols = len(data[0])
        new_data = [[data[i][j] for i in range(rows)] for j in range(cols)]
        return Tensor(new_data, requires_grad=t.requires_grad)

    def _get_batch_slice(self, t: Tensor, idx: int) -> Tensor:
        return Tensor(t.data[idx], requires_grad=t.requires_grad)

    def parameters(self) -> List[Tensor]:
        return (self.q_proj.parameters() +
                self.k_proj.parameters() +
                self.v_proj.parameters() +
                self.out_proj.parameters())

    def zero_grad(self) -> None:
        for p in self.parameters():
            p.zero_grad()
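Both classes treat the mask as additive (0.0 keeps a position, a very negative value or -inf removes it), and MultiHeadAttention currently behaves like one large head. The snippet below is a usage sketch, not part of the package: it assumes quantmllibrary 0.1.0 is installed and builds a hypothetical causal mask, using -1e9 in place of -inf to keep the pure-Python softmax finite.

from quantml.tensor import Tensor
from quantml.models import SelfAttention

seq_len, embed_dim = 4, 8
attn = SelfAttention(embed_dim)

# One sequence (2D path: seq_len x embed_dim)
x = Tensor([[0.1 * (i + j) for j in range(embed_dim)] for i in range(seq_len)])

# Causal mask: position i may only attend to positions j <= i
mask = Tensor([[0.0 if j <= i else -1e9 for j in range(seq_len)]
               for i in range(seq_len)])

out = attn.forward(x, mask=mask)   # (seq_len x embed_dim), each row attends causally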
quantml/models/dropout.py
@@ -0,0 +1,130 @@
"""
Dropout regularization.

Implements a Dropout layer for preventing overfitting.
"""

from typing import Optional, List
import random
from quantml.tensor import Tensor
from quantml import ops


class Dropout:
    """
    Dropout layer.

    Randomly zeroes elements of the input tensor with probability p,
    using samples from a Bernoulli distribution.

    During training, surviving outputs are scaled by 1/(1-p).
    During evaluation, the input is returned unchanged.

    Attributes:
        p: Probability of an element being zeroed
        inplace: If True, do the operation in-place (not supported yet)

    Examples:
        >>> dropout = Dropout(p=0.5)
        >>> x = Tensor([[1.0, 2.0, 3.0]])
        >>> y = dropout(x)
    """

    def __init__(self, p: float = 0.5, inplace: bool = False):
        if p < 0 or p > 1:
            raise ValueError(f"dropout probability has to be between 0 and 1, but got {p}")
        self.p = p
        self.inplace = inplace
        self.training = True

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass.

        Args:
            x: Input tensor

        Returns:
            Tensor with dropout applied
        """
        if not self.training or self.p == 0:
            return x

        if self.p == 1:
            # Everything is dropped; avoid dividing by zero in the scale factor.
            zero_mask = Tensor(self._generate_mask_data(x.data, 0.0), requires_grad=False)
            return ops.mul(x, zero_mask)

        # To keep the autograd graph intact, dropout is expressed as an
        # element-wise multiply with a mask tensor whose entries are either
        # 1/(1-p) (keep and rescale) or 0.0 (drop), rather than rebuilding the
        # output data directly.
        scale = 1.0 / (1.0 - self.p)
        mask_data = self._generate_mask_data(x.data, scale)
        mask = Tensor(mask_data, requires_grad=False)

        return ops.mul(x, mask)

    def _generate_mask_data(self, data, scale):
        """Recursively generate a dropout mask for nested-list data (1D/2D/3D)."""
        if isinstance(data, list):
            return [self._generate_mask_data(item, scale) for item in data]
        else:
            return scale if random.random() > self.p else 0.0

    def __call__(self, x: Tensor) -> Tensor:
        return self.forward(x)

    def train(self, mode: bool = True) -> 'Dropout':
        """Set training mode."""
        self.training = mode
        return self

    def eval(self) -> 'Dropout':
        """Set evaluation mode."""
        return self.train(False)

    def parameters(self) -> List[Tensor]:
        """Get trainable parameters (none for dropout)."""
        return []

    def zero_grad(self) -> None:
        """Clear gradients (no-op)."""
        pass
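The layer implements inverted dropout: with keep probability 1 - p and a scale of 1/(1 - p), the expected value of each output element equals the input, so no rescaling is needed at inference time. The snippet below is a usage sketch, not part of the package; it assumes quantmllibrary 0.1.0 is installed and that Tensor data round-trips as nested lists, as the files above suggest.

from quantml.tensor import Tensor
from quantml.models import Dropout

drop = Dropout(p=0.5)
x = Tensor([[1.0] * 8])

y_train = drop(x)           # training mode: roughly half the entries become 0.0, the rest 2.0
y_eval = drop.eval()(x)     # evaluation mode: the input is returned unchanged

print(y_train.data)         # e.g. [[2.0, 0.0, 2.0, 0.0, ...]] (random per call)
print(y_eval.data)          # [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]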