quantmllibrary-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quantml/__init__.py +74 -0
- quantml/autograd.py +154 -0
- quantml/cli/__init__.py +10 -0
- quantml/cli/run_experiment.py +385 -0
- quantml/config/__init__.py +28 -0
- quantml/config/config.py +259 -0
- quantml/data/__init__.py +33 -0
- quantml/data/cache.py +149 -0
- quantml/data/feature_store.py +234 -0
- quantml/data/futures.py +254 -0
- quantml/data/loaders.py +236 -0
- quantml/data/memory_optimizer.py +234 -0
- quantml/data/validators.py +390 -0
- quantml/experiments/__init__.py +23 -0
- quantml/experiments/logger.py +208 -0
- quantml/experiments/results.py +158 -0
- quantml/experiments/tracker.py +223 -0
- quantml/features/__init__.py +25 -0
- quantml/features/base.py +104 -0
- quantml/features/gap_features.py +124 -0
- quantml/features/registry.py +138 -0
- quantml/features/volatility_features.py +140 -0
- quantml/features/volume_features.py +142 -0
- quantml/functional.py +37 -0
- quantml/models/__init__.py +27 -0
- quantml/models/attention.py +258 -0
- quantml/models/dropout.py +130 -0
- quantml/models/gru.py +319 -0
- quantml/models/linear.py +112 -0
- quantml/models/lstm.py +353 -0
- quantml/models/mlp.py +286 -0
- quantml/models/normalization.py +289 -0
- quantml/models/rnn.py +154 -0
- quantml/models/tcn.py +238 -0
- quantml/online.py +209 -0
- quantml/ops.py +1707 -0
- quantml/optim/__init__.py +42 -0
- quantml/optim/adafactor.py +206 -0
- quantml/optim/adagrad.py +157 -0
- quantml/optim/adam.py +267 -0
- quantml/optim/lookahead.py +97 -0
- quantml/optim/quant_optimizer.py +228 -0
- quantml/optim/radam.py +192 -0
- quantml/optim/rmsprop.py +203 -0
- quantml/optim/schedulers.py +286 -0
- quantml/optim/sgd.py +181 -0
- quantml/py.typed +0 -0
- quantml/streaming.py +175 -0
- quantml/tensor.py +462 -0
- quantml/time_series.py +447 -0
- quantml/training/__init__.py +135 -0
- quantml/training/alpha_eval.py +203 -0
- quantml/training/backtest.py +280 -0
- quantml/training/backtest_analysis.py +168 -0
- quantml/training/cv.py +106 -0
- quantml/training/data_loader.py +177 -0
- quantml/training/ensemble.py +84 -0
- quantml/training/feature_importance.py +135 -0
- quantml/training/features.py +364 -0
- quantml/training/futures_backtest.py +266 -0
- quantml/training/gradient_clipping.py +206 -0
- quantml/training/losses.py +248 -0
- quantml/training/lr_finder.py +127 -0
- quantml/training/metrics.py +376 -0
- quantml/training/regularization.py +89 -0
- quantml/training/trainer.py +239 -0
- quantml/training/walk_forward.py +190 -0
- quantml/utils/__init__.py +51 -0
- quantml/utils/gradient_check.py +274 -0
- quantml/utils/logging.py +181 -0
- quantml/utils/ops_cpu.py +231 -0
- quantml/utils/profiling.py +364 -0
- quantml/utils/reproducibility.py +220 -0
- quantml/utils/serialization.py +335 -0
- quantmllibrary-0.1.0.dist-info/METADATA +536 -0
- quantmllibrary-0.1.0.dist-info/RECORD +79 -0
- quantmllibrary-0.1.0.dist-info/WHEEL +5 -0
- quantmllibrary-0.1.0.dist-info/licenses/LICENSE +22 -0
- quantmllibrary-0.1.0.dist-info/top_level.txt +1 -0
quantml/optim/__init__.py
ADDED
@@ -0,0 +1,42 @@
"""
QuantML Optimizers

This module provides optimization algorithms for training models.
"""

from quantml.optim.sgd import SGD
from quantml.optim.adam import Adam
from quantml.optim.rmsprop import RMSProp
from quantml.optim.adagrad import AdaGrad
from quantml.optim.adafactor import AdaFactor
from quantml.optim.lookahead import Lookahead
from quantml.optim.radam import RAdam
from quantml.optim.quant_optimizer import QuantOptimizer
from quantml.optim.schedulers import (
    LRScheduler,
    StepLR,
    CosineAnnealingLR,
    WarmupLR,
    ReduceLROnPlateau,
    CyclicLR,
    OneCycleLR
)

__all__ = [
    'SGD',
    'Adam',
    'RMSProp',
    'AdaGrad',
    'AdaFactor',
    'Lookahead',
    'RAdam',
    'QuantOptimizer',
    'LRScheduler',
    'StepLR',
    'CosineAnnealingLR',
    'WarmupLR',
    'ReduceLROnPlateau',
    'CyclicLR',
    'OneCycleLR'
]
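As a quick orientation for the rest of this diff, the sketch below exercises these re-exports. It is illustrative only and not part of the wheel; the placeholder usage assumes nothing beyond the names listed in __all__ and the Adam constructor shown later in this diff.

# Illustrative sketch, not part of the package: importing the re-exported names.
from quantml.optim import SGD, Adam, RMSProp, AdaGrad, StepLR, OneCycleLR

optimizer = Adam(lr=0.001)  # keyword matches the Adam.__init__ signature shown below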
quantml/optim/adafactor.py
ADDED
@@ -0,0 +1,206 @@
"""
AdaFactor optimizer implementation.

AdaFactor is a memory-efficient variant of Adam that uses factorized second moment estimates.
"""

from typing import List, Optional, Dict, Any
from quantml.tensor import Tensor
from quantml import ops

# Try to import NumPy
try:
    import numpy as np
    HAS_NUMPY = True
except ImportError:
    HAS_NUMPY = False
    np = None


class AdaFactor:
    """
    AdaFactor optimizer - memory-efficient Adam variant.

    Uses factorized second moment estimates to reduce memory usage.
    Good for quant models with many parameters.

    Attributes:
        lr: Learning rate
        betas: Tuple of (beta1, beta2) for moment estimates
        eps: Small value for numerical stability
        weight_decay: Weight decay coefficient
        m: First moment estimates
        v: Factorized second moment estimates

    Examples:
        >>> optimizer = AdaFactor(lr=0.001, betas=(0.9, 0.999))
        >>> for param in model.parameters():
        >>>     optimizer.step(param)
    """

    def __init__(
        self,
        params: Optional[List[Tensor]] = None,
        lr: float = 0.001,
        betas: tuple = (0.9, 0.999),
        eps: float = 1e-30,
        weight_decay: float = 0.0,
        factor_decay: float = 0.8
    ):
        """
        Initialize AdaFactor optimizer.

        Args:
            params: Optional list of parameters to optimize
            lr: Learning rate
            betas: Tuple of (beta1, beta2) for exponential decay rates
            eps: Small value to prevent division by zero
            weight_decay: Weight decay coefficient
            factor_decay: Decay factor for second moment factorization
        """
        self.params = params if params is not None else []
        self.lr = lr
        self.beta1, self.beta2 = betas
        self.eps = eps
        self.weight_decay = weight_decay
        self.factor_decay = factor_decay
        self.m: Dict[int, Any] = {}  # First moment
        self.v_row: Dict[int, Any] = {}  # Row factors for second moment
        self.v_col: Dict[int, Any] = {}  # Column factors for second moment
        self.step_count = 0

    def step(self, param: Optional[Tensor] = None):
        """Perform a single optimization step."""
        if param is not None:
            self._update_param(param)
        else:
            for p in self.params:
                self._update_param(p)
        self.step_count += 1

    def _update_param(self, param: Tensor):
        """Update a single parameter using AdaFactor algorithm."""
        if not param.requires_grad:
            return

        if param.grad is None:
            return

        param_id = id(param)

        if HAS_NUMPY:
            try:
                grad = param.grad
                if isinstance(grad, np.ndarray):
                    grad_arr = grad
                else:
                    grad_arr = np.array(grad, dtype=np.float64)

                param_arr = param.numpy if param.numpy is not None else np.array(param.data, dtype=np.float64)

                if self.weight_decay > 0:
                    grad_arr = grad_arr + self.weight_decay * param_arr

                # Initialize moments if needed
                if param_id not in self.m:
                    self.m[param_id] = np.zeros_like(param_arr, dtype=np.float64)
                    # Factorize second moment for 2D arrays
                    if grad_arr.ndim == 2:
                        self.v_row[param_id] = np.zeros(grad_arr.shape[0], dtype=np.float64)
                        self.v_col[param_id] = np.zeros(grad_arr.shape[1], dtype=np.float64)
                    else:
                        self.v_row[param_id] = np.zeros_like(grad_arr, dtype=np.float64)
                        self.v_col[param_id] = None

                # Update first moment
                m = self.m[param_id]
                m[:] = self.beta1 * m + (1.0 - self.beta1) * grad_arr

                # Update factorized second moment
                if grad_arr.ndim == 2 and self.v_col[param_id] is not None:
                    # 2D case: factorize
                    v_row = self.v_row[param_id]
                    v_col = self.v_col[param_id]
                    grad_sq = grad_arr ** 2
                    v_row[:] = self.beta2 * v_row + (1.0 - self.beta2) * np.mean(grad_sq, axis=1)
                    v_col[:] = self.beta2 * v_col + (1.0 - self.beta2) * np.mean(grad_sq, axis=0)
                    v_hat = np.outer(v_row, v_col) / np.mean(v_row)
                else:
                    # 1D case: regular second moment
                    v_row = self.v_row[param_id]
                    v_row[:] = self.beta2 * v_row + (1.0 - self.beta2) * (grad_arr ** 2)
                    v_hat = v_row

                # Bias correction
                bias_correction1 = 1.0 - (self.beta1 ** self.step_count)
                m_hat = m / bias_correction1

                # Update parameter
                v_hat_sqrt = np.sqrt(v_hat) + self.eps
                update = m_hat / v_hat_sqrt
                param_update = self.lr * update
                new_param_arr = param_arr - param_update
                param.data = new_param_arr

            except (ValueError, TypeError, AttributeError):
                # Fallback to simplified Adam-like update
                self._update_param_fallback(param)
        else:
            self._update_param_fallback(param)

    def _update_param_fallback(self, param: Tensor):
        """Fallback update using Tensor operations (simplified)."""
        # Simplified fallback - use regular Adam-like update
        if param.grad is None:
            return

        param_id = id(param)

        if param_id not in self.m:
            if isinstance(param.data[0], list):
                self.m[param_id] = Tensor([[0.0] * len(row) for row in param.data])
                self.v_row[param_id] = Tensor([[0.0] * len(row) for row in param.data])
            else:
                self.m[param_id] = Tensor([0.0] * len(param.data))
                self.v_row[param_id] = Tensor([0.0] * len(param.data))

        grad = Tensor(param.grad)
        if self.weight_decay > 0:
            grad = ops.add(grad, ops.mul(param, self.weight_decay))

        m_prev = self.m[param_id]
        m_new = ops.add(ops.mul(m_prev, self.beta1), ops.mul(grad, 1.0 - self.beta1))
        self.m[param_id] = m_new

        v_prev = self.v_row[param_id]
        grad_sq = ops.mul(grad, grad)
        v_new = ops.add(ops.mul(v_prev, self.beta2), ops.mul(grad_sq, 1.0 - self.beta2))
        self.v_row[param_id] = v_new

        bias_correction1 = 1.0 - (self.beta1 ** self.step_count)
        m_hat = ops.div(m_new, bias_correction1)
        v_hat = v_new

        v_hat_sqrt = ops.pow(ops.add(v_hat, self.eps), 0.5)
        update = ops.div(m_hat, v_hat_sqrt)
        param_update = ops.mul(update, self.lr)

        if param.requires_grad:
            param_detached = param.detach()
            param_detached.sub_(param_update)
            param.data = param_detached.data
        else:
            param.sub_(param_update)

    def zero_grad(self, param: Optional[Tensor] = None):
        """Clear gradients."""
        if param is not None:
            param.zero_grad()
        else:
            for p in self.params:
                p.zero_grad()

    def add_param_group(self, params: List[Tensor]):
        """Add a parameter group to optimize."""
        self.params.extend(params)
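The memory saving comes from the 2D branch of _update_param: for an m x n weight matrix it keeps only m row statistics and n column statistics and rebuilds a full-size second-moment estimate from their outer product. The sketch below is illustrative only and not part of the package; it uses plain NumPy and, for simplicity, the raw per-row/per-column means of a single squared gradient rather than the exponential moving averages the optimizer actually maintains.

# Illustrative sketch, not part of the package: the rank-1 reconstruction used in
# the 2D branch of AdaFactor._update_param.
import numpy as np

rng = np.random.default_rng(0)
G = rng.normal(size=(4, 3))              # stand-in gradient for a 4x3 weight matrix
grad_sq = G ** 2

v_row = grad_sq.mean(axis=1)             # shape (4,): row statistics
v_col = grad_sq.mean(axis=0)             # shape (3,): column statistics

# Full-size estimate rebuilt from 4 + 3 stored numbers instead of 4 * 3:
v_hat = np.outer(v_row, v_col) / v_row.mean()

print(v_hat.shape)                               # (4, 3)
print(np.isclose(v_hat.mean(), grad_sq.mean()))  # True: the overall scale is preserved

So per matrix parameter the optimizer stores m + n second-moment entries instead of m * n, which is the reduction the module docstring refers to.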
quantml/optim/adagrad.py
ADDED
@@ -0,0 +1,157 @@
"""
AdaGrad optimizer implementation.

AdaGrad (Adaptive Gradient) adapts learning rates by accumulating squared gradients.
"""

from typing import List, Optional, Dict, Any
from quantml.tensor import Tensor
from quantml import ops

# Try to import NumPy
try:
    import numpy as np
    HAS_NUMPY = True
except ImportError:
    HAS_NUMPY = False
    np = None


class AdaGrad:
    """
    AdaGrad optimizer.

    AdaGrad adapts learning rates by accumulating squared gradients.
    Learning rates decrease for parameters with large gradients.

    Attributes:
        lr: Learning rate
        eps: Small value for numerical stability
        weight_decay: Weight decay coefficient
        accum: Accumulated squared gradients

    Examples:
        >>> optimizer = AdaGrad(lr=0.01)
        >>> for param in model.parameters():
        >>>     optimizer.step(param)
    """

    def __init__(
        self,
        params: Optional[List[Tensor]] = None,
        lr: float = 0.01,
        eps: float = 1e-10,
        weight_decay: float = 0.0
    ):
        """
        Initialize AdaGrad optimizer.

        Args:
            params: Optional list of parameters to optimize
            lr: Learning rate
            eps: Small value to prevent division by zero
            weight_decay: Weight decay (L2 regularization) coefficient
        """
        self.params = params if params is not None else []
        self.lr = lr
        self.eps = eps
        self.weight_decay = weight_decay
        self.accum: Dict[int, Any] = {}  # Accumulated squared gradients

    def step(self, param: Optional[Tensor] = None):
        """Perform a single optimization step."""
        if param is not None:
            self._update_param(param)
        else:
            for p in self.params:
                self._update_param(p)

    def _update_param(self, param: Tensor):
        """Update a single parameter using AdaGrad algorithm."""
        if not param.requires_grad:
            return

        if param.grad is None:
            return

        param_id = id(param)

        if HAS_NUMPY:
            try:
                grad = param.grad
                if isinstance(grad, np.ndarray):
                    grad_arr = grad
                else:
                    grad_arr = np.array(grad, dtype=np.float64)

                param_arr = param.numpy if param.numpy is not None else np.array(param.data, dtype=np.float64)

                if self.weight_decay > 0:
                    grad_arr = grad_arr + self.weight_decay * param_arr

                # Initialize accumulator if needed
                if param_id not in self.accum:
                    self.accum[param_id] = np.zeros_like(param_arr, dtype=np.float64)

                # Accumulate squared gradients
                accum = self.accum[param_id]
                accum[:] = accum + grad_arr ** 2

                # Update parameter: param = param - lr * grad / (sqrt(accum) + eps)
                update = grad_arr / (np.sqrt(accum) + self.eps)
                param_update = self.lr * update
                new_param_arr = param_arr - param_update
                param.data = new_param_arr

            except (ValueError, TypeError, AttributeError):
                self._update_param_fallback(param)
        else:
            self._update_param_fallback(param)

    def _update_param_fallback(self, param: Tensor):
        """Fallback update using Tensor operations."""
        if param.grad is None:
            return

        param_id = id(param)

        if param_id not in self.accum:
            if isinstance(param.data[0], list):
                self.accum[param_id] = [[0.0] * len(row) for row in param.data]
            else:
                self.accum[param_id] = [0.0] * len(param.data)

        grad = param.grad
        if self.weight_decay > 0:
            grad = ops.add(grad, ops.mul(param, self.weight_decay))

        # Accumulate squared gradients
        grad_sq = ops.mul(grad, grad)
        accum = Tensor(self.accum[param_id])
        new_accum = ops.add(accum, grad_sq)
        self.accum[param_id] = new_accum.data

        # Update parameter
        accum_sqrt = ops.pow(ops.add(new_accum, self.eps), 0.5)
        update = ops.div(grad, accum_sqrt)
        param_update = ops.mul(update, self.lr)

        if param.requires_grad:
            param_detached = param.detach()
            param_detached.sub_(param_update)
            param.data = param_detached.data
        else:
            param.sub_(param_update)

    def zero_grad(self, param: Optional[Tensor] = None):
        """Clear gradients."""
        if param is not None:
            param.zero_grad()
        else:
            for p in self.params:
                p.zero_grad()

    def add_param_group(self, params: List[Tensor]):
        """Add a parameter group to optimize."""
        self.params.extend(params)
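The NumPy branch above boils down to a few lines of array arithmetic. The toy loop below is illustrative only and not part of the package (plain NumPy, no quantml imports); it replays that accumulation rule to show the per-coordinate effective step size shrinking as squared gradients accumulate.

# Illustrative sketch, not part of the package: the AdaGrad update rule from
# _update_param, applied to a toy 1-D parameter with a constant gradient.
import numpy as np

lr, eps = 0.01, 1e-10
param = np.array([1.0, -2.0])
grad = np.array([0.5, 0.5])
accum = np.zeros_like(param)

for step in range(3):
    accum += grad ** 2                            # accumulate squared gradients
    param -= lr * grad / (np.sqrt(accum) + eps)   # param = param - lr * grad / (sqrt(accum) + eps)
    print(step, lr / (np.sqrt(accum) + eps))      # effective step size shrinks each step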
quantml/optim/adam.py
ADDED
@@ -0,0 +1,267 @@
"""
Adam optimizer implementation.

Adam (Adaptive Moment Estimation) is an adaptive learning rate optimizer
that combines the benefits of AdaGrad and RMSProp.
"""

from typing import List, Optional, Dict, Any
from quantml.tensor import Tensor
from quantml import ops

# Try to import NumPy
try:
    import numpy as np
    HAS_NUMPY = True
except ImportError:
    HAS_NUMPY = False
    np = None


class Adam:
    """
    Adam optimizer.

    Adam maintains per-parameter adaptive learning rates based on estimates
    of first and second moments of gradients.

    Attributes:
        lr: Learning rate
        betas: Tuple of (beta1, beta2) for moment estimates
        eps: Small value for numerical stability
        m: First moment estimates (momentum)
        v: Second moment estimates (variance)
        step_count: Number of steps taken

    Examples:
        >>> optimizer = Adam(lr=0.001, betas=(0.9, 0.999))
        >>> for param in model.parameters():
        >>>     optimizer.step(param)
    """

    def __init__(
        self,
        params: Optional[List[Tensor]] = None,
        lr: float = 0.001,
        betas: tuple = (0.9, 0.999),
        eps: float = 1e-8,
        weight_decay: float = 0.0
    ):
        """
        Initialize Adam optimizer.

        Args:
            params: Optional list of parameters to optimize
            lr: Learning rate
            betas: Tuple of (beta1, beta2) for exponential decay rates
            eps: Small value to prevent division by zero
            weight_decay: Weight decay (L2 regularization) coefficient
        """
        self.params = params if params is not None else []
        self.lr = lr
        self.beta1, self.beta2 = betas
        self.eps = eps
        self.weight_decay = weight_decay

        # Moment estimates (stored as NumPy arrays for efficiency)
        self.m: Dict[int, Any] = {}  # First moment
        self.v: Dict[int, Any] = {}  # Second moment
        self.step_count = 0

    def step(self, param: Optional[Tensor] = None):
        """
        Perform a single optimization step.

        Args:
            param: Optional single parameter to update
        """
        if param is not None:
            self._update_param(param)
        else:
            for p in self.params:
                self._update_param(p)

        self.step_count += 1

    def _update_param(self, param: Tensor):
        """Update a single parameter using Adam algorithm with direct NumPy operations."""
        if not param.requires_grad:
            return

        if param.grad is None:
            return

        param_id = id(param)

        if HAS_NUMPY:
            try:
                # Get gradient and parameter as NumPy arrays
                grad = param.grad
                if isinstance(grad, np.ndarray):
                    grad_arr = grad
                else:
                    grad_arr = np.array(grad, dtype=np.float64)

                param_arr = param.numpy if param.numpy is not None else np.array(param.data, dtype=np.float64)

                # Initialize moments if needed
                if param_id not in self.m:
                    self.m[param_id] = np.zeros_like(param_arr, dtype=np.float64)
                    self.v[param_id] = np.zeros_like(param_arr, dtype=np.float64)

                # Apply weight decay
                if self.weight_decay > 0:
                    grad_arr = grad_arr + self.weight_decay * param_arr

                # Update biased first moment: m = beta1 * m + (1 - beta1) * grad
                m = self.m[param_id]
                m[:] = self.beta1 * m + (1.0 - self.beta1) * grad_arr

                # Update biased second moment: v = beta2 * v + (1 - beta2) * grad^2
                v = self.v[param_id]
                v[:] = self.beta2 * v + (1.0 - self.beta2) * (grad_arr ** 2)

                # Bias correction
                bias_correction1 = 1.0 - (self.beta1 ** self.step_count)
                bias_correction2 = 1.0 - (self.beta2 ** self.step_count)

                # Compute bias-corrected estimates
                m_hat = m / bias_correction1
                v_hat = v / bias_correction2

                # Update parameter: param = param - lr * m_hat / (sqrt(v_hat) + eps)
                v_hat_sqrt = np.sqrt(v_hat) + self.eps
                update = m_hat / v_hat_sqrt
                param_update = self.lr * update
                new_param_arr = param_arr - param_update

                # Update parameter data directly
                param.data = new_param_arr

            except (ValueError, TypeError, AttributeError):
                # Fallback to Tensor operations
                self._update_param_fallback(param)
        else:
            # Fallback to Tensor operations
            self._update_param_fallback(param)

    def _update_param_fallback(self, param: Tensor):
        """Fallback update using Tensor operations."""
        if param.grad is None:
            return

        param_id = id(param)

        # Initialize moments if needed
        if param_id not in self.m:
            if isinstance(param.data[0], list):
                self.m[param_id] = Tensor([[0.0] * len(row) for row in param.data])
                self.v[param_id] = Tensor([[0.0] * len(row) for row in param.data])
            else:
                self.m[param_id] = Tensor([0.0] * len(param.data))
                self.v[param_id] = Tensor([0.0] * len(param.data))

        # Get gradient
        grad = Tensor(param.grad)

        # Apply weight decay
        if self.weight_decay > 0:
            grad = ops.add(grad, ops.mul(param, self.weight_decay))

        # Update biased first moment estimate: m = beta1 * m + (1 - beta1) * grad
        m_prev = self.m[param_id]
        m_new = ops.add(
            ops.mul(m_prev, self.beta1),
            ops.mul(grad, 1.0 - self.beta1)
        )
        self.m[param_id] = m_new

        # Update biased second moment estimate: v = beta2 * v + (1 - beta2) * grad^2
        v_prev = self.v[param_id]
        grad_sq = ops.mul(grad, grad)
        v_new = ops.add(
            ops.mul(v_prev, self.beta2),
            ops.mul(grad_sq, 1.0 - self.beta2)
        )
        self.v[param_id] = v_new

        # Bias correction
        bias_correction1 = 1.0 - (self.beta1 ** self.step_count)
        bias_correction2 = 1.0 - (self.beta2 ** self.step_count)

        # Compute bias-corrected estimates
        m_hat = ops.div(m_new, bias_correction1)
        v_hat = ops.div(v_new, bias_correction2)

        # Update parameter in-place: param = param - lr * m_hat / (sqrt(v_hat) + eps)
        v_hat_sqrt = ops.pow(ops.add(v_hat, self.eps), 0.5)
        update = ops.div(m_hat, v_hat_sqrt)
        param_update = ops.mul(update, self.lr)

        # Detach and update in-place
        if param.requires_grad:
            param_detached = param.detach()
            param_detached.sub_(param_update)
            param.data = param_detached.data
        else:
            param.sub_(param_update)

    def zero_grad(self, param: Optional[Tensor] = None):
        """
        Clear gradients.

        Args:
            param: Optional single parameter, otherwise clears all
        """
        if param is not None:
            param.zero_grad()
        else:
            for p in self.params:
                p.zero_grad()

    def add_param_group(self, params: List[Tensor]):
        """Add a parameter group to optimize."""
        self.params.extend(params)

    def state_dict(self) -> dict:
        """Get optimizer state dictionary."""
        # Convert NumPy arrays to lists for serialization
        m_data = {}
        v_data = {}
        for k, v in self.m.items():
            if HAS_NUMPY and isinstance(v, np.ndarray):
                m_data[k] = v.tolist()
            elif isinstance(v, Tensor):
                m_data[k] = v.data
            else:
                m_data[k] = v

        for k, v in self.v.items():
            if HAS_NUMPY and isinstance(v, np.ndarray):
                v_data[k] = v.tolist()
            elif isinstance(v, Tensor):
                v_data[k] = v.data
            else:
                v_data[k] = v

        return {
            'step_count': self.step_count,
            'm': m_data,
            'v': v_data
        }

    def load_state_dict(self, state_dict: dict):
        """Load optimizer state from dictionary."""
        self.step_count = state_dict.get('step_count', 0)
        # Reconstruct moment arrays from data
        for k, m_data in state_dict.get('m', {}).items():
            if HAS_NUMPY:
                self.m[int(k)] = np.array(m_data, dtype=np.float64)
            else:
                self.m[int(k)] = Tensor(m_data)
        for k, v_data in state_dict.get('v', {}).items():
            if HAS_NUMPY:
                self.v[int(k)] = np.array(v_data, dtype=np.float64)
            else:
                self.v[int(k)] = Tensor(v_data)
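Adam is the only optimizer in this diff that exposes state_dict / load_state_dict. The sketch below is illustrative only and not part of the package; it shows the round trip using just the methods defined above. Note that the state is keyed by id(param), as in the code, so restored moments only line up while the same parameter objects are in use.

# Illustrative sketch, not part of the package: saving and restoring Adam state.
from quantml.optim import Adam

optimizer = Adam(lr=0.001, betas=(0.9, 0.999), weight_decay=0.0)

# ... training: optimizer.step(param) per parameter, then optimizer.zero_grad() ...

state = optimizer.state_dict()     # {'step_count': ..., 'm': {...}, 'v': {...}}

resumed = Adam(lr=0.001)
resumed.load_state_dict(state)     # step_count and moment estimates are restored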