quantmllibrary 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quantml/__init__.py +74 -0
- quantml/autograd.py +154 -0
- quantml/cli/__init__.py +10 -0
- quantml/cli/run_experiment.py +385 -0
- quantml/config/__init__.py +28 -0
- quantml/config/config.py +259 -0
- quantml/data/__init__.py +33 -0
- quantml/data/cache.py +149 -0
- quantml/data/feature_store.py +234 -0
- quantml/data/futures.py +254 -0
- quantml/data/loaders.py +236 -0
- quantml/data/memory_optimizer.py +234 -0
- quantml/data/validators.py +390 -0
- quantml/experiments/__init__.py +23 -0
- quantml/experiments/logger.py +208 -0
- quantml/experiments/results.py +158 -0
- quantml/experiments/tracker.py +223 -0
- quantml/features/__init__.py +25 -0
- quantml/features/base.py +104 -0
- quantml/features/gap_features.py +124 -0
- quantml/features/registry.py +138 -0
- quantml/features/volatility_features.py +140 -0
- quantml/features/volume_features.py +142 -0
- quantml/functional.py +37 -0
- quantml/models/__init__.py +27 -0
- quantml/models/attention.py +258 -0
- quantml/models/dropout.py +130 -0
- quantml/models/gru.py +319 -0
- quantml/models/linear.py +112 -0
- quantml/models/lstm.py +353 -0
- quantml/models/mlp.py +286 -0
- quantml/models/normalization.py +289 -0
- quantml/models/rnn.py +154 -0
- quantml/models/tcn.py +238 -0
- quantml/online.py +209 -0
- quantml/ops.py +1707 -0
- quantml/optim/__init__.py +42 -0
- quantml/optim/adafactor.py +206 -0
- quantml/optim/adagrad.py +157 -0
- quantml/optim/adam.py +267 -0
- quantml/optim/lookahead.py +97 -0
- quantml/optim/quant_optimizer.py +228 -0
- quantml/optim/radam.py +192 -0
- quantml/optim/rmsprop.py +203 -0
- quantml/optim/schedulers.py +286 -0
- quantml/optim/sgd.py +181 -0
- quantml/py.typed +0 -0
- quantml/streaming.py +175 -0
- quantml/tensor.py +462 -0
- quantml/time_series.py +447 -0
- quantml/training/__init__.py +135 -0
- quantml/training/alpha_eval.py +203 -0
- quantml/training/backtest.py +280 -0
- quantml/training/backtest_analysis.py +168 -0
- quantml/training/cv.py +106 -0
- quantml/training/data_loader.py +177 -0
- quantml/training/ensemble.py +84 -0
- quantml/training/feature_importance.py +135 -0
- quantml/training/features.py +364 -0
- quantml/training/futures_backtest.py +266 -0
- quantml/training/gradient_clipping.py +206 -0
- quantml/training/losses.py +248 -0
- quantml/training/lr_finder.py +127 -0
- quantml/training/metrics.py +376 -0
- quantml/training/regularization.py +89 -0
- quantml/training/trainer.py +239 -0
- quantml/training/walk_forward.py +190 -0
- quantml/utils/__init__.py +51 -0
- quantml/utils/gradient_check.py +274 -0
- quantml/utils/logging.py +181 -0
- quantml/utils/ops_cpu.py +231 -0
- quantml/utils/profiling.py +364 -0
- quantml/utils/reproducibility.py +220 -0
- quantml/utils/serialization.py +335 -0
- quantmllibrary-0.1.0.dist-info/METADATA +536 -0
- quantmllibrary-0.1.0.dist-info/RECORD +79 -0
- quantmllibrary-0.1.0.dist-info/WHEEL +5 -0
- quantmllibrary-0.1.0.dist-info/licenses/LICENSE +22 -0
- quantmllibrary-0.1.0.dist-info/top_level.txt +1 -0
quantml/optim/rmsprop.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RMSProp optimizer implementation.
|
|
3
|
+
|
|
4
|
+
RMSProp (Root Mean Square Propagation) is an adaptive learning rate optimizer
|
|
5
|
+
that maintains a moving average of squared gradients.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import List, Optional, Dict, Any
|
|
9
|
+
from quantml.tensor import Tensor
|
|
10
|
+
from quantml import ops
|
|
11
|
+
|
|
12
|
+
# Try to import NumPy
|
|
13
|
+
try:
|
|
14
|
+
import numpy as np
|
|
15
|
+
HAS_NUMPY = True
|
|
16
|
+
except ImportError:
|
|
17
|
+
HAS_NUMPY = False
|
|
18
|
+
np = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class RMSProp:
    """
    RMSProp (Root Mean Square Propagation) optimizer.

    Maintains an exponential moving average of squared gradients and divides
    each gradient by the square root of that average, yielding a per-parameter
    adaptive learning rate. Classic momentum can optionally be applied on top
    of the RMS-normalized update.

    Update rule (per parameter):
        avg    = alpha * avg + (1 - alpha) * grad^2
        update = grad / (sqrt(avg) + eps)
        param  = param - lr * update        # momentum folded in when > 0

    Attributes:
        lr: Learning rate
        alpha: Smoothing constant (decay factor)
        eps: Small value for numerical stability
        weight_decay: Weight decay coefficient
        momentum: Momentum factor (0 = no momentum)
        squared_avg: Moving average of squared gradients, keyed by id(param)
        momentum_buffer: Momentum buffer, keyed by id(param)

    Examples:
        >>> optimizer = RMSProp(lr=0.001, alpha=0.99)
        >>> for param in model.parameters():
        >>>     optimizer.step(param)
    """

    def __init__(
        self,
        params: Optional[List[Tensor]] = None,
        lr: float = 0.01,
        alpha: float = 0.99,
        eps: float = 1e-8,
        weight_decay: float = 0.0,
        momentum: float = 0.0
    ):
        """
        Initialize RMSProp optimizer.

        Args:
            params: Optional list of parameters to optimize
            lr: Learning rate
            alpha: Smoothing constant (decay factor for squared gradient average)
            eps: Small value to prevent division by zero
            weight_decay: Weight decay (L2 regularization) coefficient
            momentum: Momentum factor (0.0 to disable)
        """
        self.params = params if params is not None else []
        self.lr = lr
        self.alpha = alpha
        self.eps = eps
        self.weight_decay = weight_decay
        self.momentum = momentum
        # State dicts are keyed by id(param): NumPy arrays on the fast path,
        # nested Python lists on the fallback path.
        self.squared_avg: Dict[int, Any] = {}  # Moving average of squared gradients
        self.momentum_buffer: Dict[int, Any] = {}  # Momentum buffer

    def step(self, param: Optional[Tensor] = None):
        """
        Perform a single optimization step.

        Updates `param` when given, otherwise every tensor in self.params.

        Args:
            param: Optional single parameter to update
        """
        if param is not None:
            self._update_param(param)
        else:
            for p in self.params:
                self._update_param(p)

    def _update_param(self, param: Tensor):
        """Update a single parameter using the RMSProp algorithm (NumPy fast path)."""
        if not param.requires_grad:
            return

        if param.grad is None:
            return

        param_id = id(param)

        if HAS_NUMPY:
            try:
                # Get gradient and parameter as NumPy arrays
                grad = param.grad
                if isinstance(grad, np.ndarray):
                    grad_arr = grad
                else:
                    grad_arr = np.array(grad, dtype=np.float64)

                param_arr = param.numpy if param.numpy is not None else np.array(param.data, dtype=np.float64)

                # Apply weight decay (L2 term added to the gradient)
                if self.weight_decay > 0:
                    grad_arr = grad_arr + self.weight_decay * param_arr

                # Initialize squared average if needed
                if param_id not in self.squared_avg:
                    self.squared_avg[param_id] = np.zeros_like(param_arr, dtype=np.float64)

                # Update squared average: avg = alpha * avg + (1 - alpha) * grad^2
                sq_avg = self.squared_avg[param_id]
                sq_avg[:] = self.alpha * sq_avg + (1.0 - self.alpha) * (grad_arr ** 2)

                # Compute update: update = grad / (sqrt(avg) + eps)
                update = grad_arr / (np.sqrt(sq_avg) + self.eps)

                # Apply momentum if enabled
                if self.momentum > 0:
                    if param_id not in self.momentum_buffer:
                        self.momentum_buffer[param_id] = np.zeros_like(param_arr, dtype=np.float64)

                    buf = self.momentum_buffer[param_id]
                    buf[:] = self.momentum * buf + update
                    update = buf

                # Update parameter: param = param - lr * update
                param_update = self.lr * update
                new_param_arr = param_arr - param_update
                param.data = new_param_arr

            except (ValueError, TypeError, AttributeError):
                # Gradient/parameter not convertible to ndarray; use Tensor ops.
                self._update_param_fallback(param)
        else:
            self._update_param_fallback(param)

    def _update_param_fallback(self, param: Tensor):
        """Fallback update using Tensor operations (no NumPy available)."""
        if param.grad is None:
            return

        param_id = id(param)

        # Initialize squared average if needed.
        # Guard the element probe so zero-length parameters don't raise IndexError.
        if param_id not in self.squared_avg:
            if param.data and isinstance(param.data[0], list):
                self.squared_avg[param_id] = [[0.0] * len(row) for row in param.data]
            else:
                self.squared_avg[param_id] = [0.0] * len(param.data)

        grad = param.grad
        if self.weight_decay > 0:
            grad = ops.add(grad, ops.mul(param, self.weight_decay))

        # Update squared average
        grad_sq = ops.mul(grad, grad)
        sq_avg = Tensor(self.squared_avg[param_id])
        new_sq_avg = ops.add(
            ops.mul(sq_avg, self.alpha),
            ops.mul(grad_sq, 1.0 - self.alpha)
        )
        self.squared_avg[param_id] = new_sq_avg.data

        # Compute update: grad / (sqrt(avg) + eps).
        # NOTE(fix): eps is added *after* the square root, matching the NumPy
        # fast path above; the previous code computed sqrt(avg + eps), so the
        # two code paths disagreed.
        sq_avg_sqrt = ops.add(ops.pow(new_sq_avg, 0.5), self.eps)
        update = ops.div(grad, sq_avg_sqrt)

        # Apply momentum
        if self.momentum > 0:
            if param_id not in self.momentum_buffer:
                if param.data and isinstance(param.data[0], list):
                    self.momentum_buffer[param_id] = [[0.0] * len(row) for row in param.data]
                else:
                    self.momentum_buffer[param_id] = [0.0] * len(param.data)

            buf = Tensor(self.momentum_buffer[param_id])
            new_buf = ops.add(ops.mul(buf, self.momentum), update)
            self.momentum_buffer[param_id] = new_buf.data
            update = new_buf

        # Update parameter; detach first so the in-place subtraction does not
        # record into the autograd graph.
        param_update = ops.mul(update, self.lr)
        if param.requires_grad:
            param_detached = param.detach()
            param_detached.sub_(param_update)
            param.data = param_detached.data
        else:
            param.sub_(param_update)

    def zero_grad(self, param: Optional[Tensor] = None):
        """Clear gradients for one parameter, or for all tracked parameters."""
        if param is not None:
            param.zero_grad()
        else:
            for p in self.params:
                p.zero_grad()

    def add_param_group(self, params: List[Tensor]):
        """Add a parameter group to optimize."""
        self.params.extend(params)
|
|
203
|
+
|
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Learning rate schedulers for QuantML.
|
|
3
|
+
|
|
4
|
+
Provides various learning rate scheduling strategies for training optimization.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import List, Optional, Any
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
import math
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class LRScheduler(ABC):
    """
    Abstract base class for learning rate schedulers.

    Subclasses implement get_lr(); step() advances the epoch counter and
    pushes the freshly computed rates into the wrapped optimizer.
    """

    def __init__(self, optimizer: Any, last_epoch: int = -1):
        """
        Initialize scheduler.

        Args:
            optimizer: The optimizer to schedule
            last_epoch: The index of the last epoch
        """
        self.optimizer = optimizer
        self.last_epoch = last_epoch
        # Record the starting rate of every parameter group; optimizers
        # without param_groups are treated as a single group.
        groups = getattr(optimizer, 'param_groups', [optimizer])
        base: List[float] = []
        for group in groups:
            if isinstance(group, dict):
                base.append(group.get('lr', optimizer.lr))
            else:
                base.append(optimizer.lr)
        self.base_lrs = base

    @abstractmethod
    def get_lr(self) -> List[float]:
        """Compute learning rate for current epoch."""
        pass

    def step(self, epoch: Optional[int] = None):
        """
        Advance the scheduler and apply the new rates to the optimizer.

        Args:
            epoch: Current epoch (if None, uses last_epoch + 1)
        """
        self.last_epoch = self.last_epoch + 1 if epoch is None else epoch

        new_lrs = self.get_lr()
        if hasattr(self.optimizer, 'param_groups'):
            for group, rate in zip(self.optimizer.param_groups, new_lrs):
                group['lr'] = rate
        else:
            self.optimizer.lr = new_lrs[0] if new_lrs else self.optimizer.lr
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class StepLR(LRScheduler):
    """Decays the learning rate by ``gamma`` once every ``step_size`` epochs."""

    def __init__(self, optimizer: Any, step_size: int, gamma: float = 0.1, last_epoch: int = -1):
        """
        Initialize StepLR scheduler.

        Args:
            optimizer: The optimizer
            step_size: Period of learning rate decay
            gamma: Multiplicative factor for decay
            last_epoch: The index of the last epoch
        """
        self.step_size = step_size
        self.gamma = gamma
        super().__init__(optimizer, last_epoch)

    def get_lr(self) -> List[float]:
        """Compute learning rate for the current epoch."""
        # Number of completed decay periods determines the exponent.
        decay = self.gamma ** (self.last_epoch // self.step_size)
        return [base_lr * decay for base_lr in self.base_lrs]
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class CosineAnnealingLR(LRScheduler):
    """Anneals the learning rate from its base value to ``eta_min`` along a half cosine."""

    def __init__(self, optimizer: Any, T_max: int, eta_min: float = 0.0, last_epoch: int = -1):
        """
        Initialize CosineAnnealingLR scheduler.

        Args:
            optimizer: The optimizer
            T_max: Maximum number of iterations
            eta_min: Minimum learning rate
            last_epoch: The index of the last epoch
        """
        self.T_max = T_max
        self.eta_min = eta_min
        super().__init__(optimizer, last_epoch)

    def get_lr(self) -> List[float]:
        """Compute learning rate for the current epoch."""
        # Cosine weight runs 1 -> 0 as last_epoch goes 0 -> T_max.
        weight = (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / 2
        return [self.eta_min + (base_lr - self.eta_min) * weight
                for base_lr in self.base_lrs]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class WarmupLR(LRScheduler):
    """Ramps the learning rate up from ~0 to its base value over ``warmup_steps`` epochs."""

    def __init__(self, optimizer: Any, warmup_steps: int, warmup_type: str = 'linear', last_epoch: int = -1):
        """
        Initialize WarmupLR scheduler.

        Args:
            optimizer: The optimizer
            warmup_steps: Number of warmup steps
            warmup_type: Type of warmup ('linear' or 'cosine')
            last_epoch: The index of the last epoch
        """
        self.warmup_steps = warmup_steps
        self.warmup_type = warmup_type
        super().__init__(optimizer, last_epoch)

    def get_lr(self) -> List[float]:
        """Compute learning rate for the current epoch."""
        # After warmup finishes, the base rates are used unchanged.
        if self.last_epoch >= self.warmup_steps:
            return self.base_lrs

        progress = (self.last_epoch + 1) / self.warmup_steps
        if self.warmup_type == 'linear':
            factor = progress
        else:  # cosine ramp: 0 -> 1 along a half cosine
            factor = (1 + math.cos(math.pi * (1 - progress))) / 2
        return [base_lr * factor for base_lr in self.base_lrs]
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class ReduceLROnPlateau:
    """Multiplies the optimizer's LR by ``factor`` after ``patience`` epochs without improvement."""

    def __init__(self, optimizer: Any, mode: str = 'min', factor: float = 0.1,
                 patience: int = 10, threshold: float = 1e-4, min_lr: float = 0.0):
        """
        Initialize ReduceLROnPlateau scheduler.

        Args:
            optimizer: The optimizer
            mode: 'min' or 'max' - whether to reduce LR when metric stops decreasing/increasing
            factor: Factor to multiply LR by
            patience: Number of epochs with no improvement before reducing LR
            threshold: Threshold for measuring improvement
            min_lr: Minimum learning rate
        """
        self.optimizer = optimizer
        self.mode = mode
        self.factor = factor
        self.patience = patience
        self.threshold = threshold
        self.min_lr = min_lr
        self.best = None            # best metric seen so far
        self.num_bad_epochs = 0     # consecutive epochs without improvement
        self.base_lrs = [optimizer.lr]

    def step(self, metrics: float):
        """
        Record a new metric value and reduce the LR if it has plateaued.

        Args:
            metrics: Current metric value
        """
        if self.best is None:
            # First observation becomes the baseline.
            self.best = metrics
        else:
            if self.mode == 'min':
                improved = metrics < self.best - self.threshold
            else:
                improved = metrics > self.best + self.threshold

            if improved:
                self.best = metrics
                self.num_bad_epochs = 0
            else:
                self.num_bad_epochs += 1

        if self.num_bad_epochs >= self.patience:
            self._reduce_lr()
            self.num_bad_epochs = 0

    def _reduce_lr(self):
        """Scale the optimizer's LR by ``factor``, clamped at ``min_lr``."""
        self.optimizer.lr = max(self.optimizer.lr * self.factor, self.min_lr)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
class CyclicLR(LRScheduler):
    """
    Cyclic learning rate scheduler.

    The LR oscillates between ``base_lr`` and ``max_lr``: it rises linearly for
    ``step_size_up`` steps, then falls linearly for ``step_size_down`` steps,
    repeating every cycle. ``triangular2`` halves the amplitude each cycle and
    ``exp_range`` shrinks it by ``gamma ** step``.
    """

    def __init__(self, optimizer: Any, base_lr: float, max_lr: float,
                 step_size_up: int = 2000, step_size_down: Optional[int] = None,
                 mode: str = 'triangular', gamma: float = 1.0, last_epoch: int = -1):
        """
        Initialize CyclicLR scheduler.

        Args:
            optimizer: The optimizer
            base_lr: Lower bound of learning rate
            max_lr: Upper bound of learning rate
            step_size_up: Number of steps to increase LR
            step_size_down: Number of steps to decrease LR (if None, equals step_size_up)
            mode: 'triangular', 'triangular2', or 'exp_range'
            gamma: Scaling factor for 'exp_range' mode
            last_epoch: The index of the last epoch
        """
        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size_up = step_size_up
        self.step_size_down = step_size_down if step_size_down is not None else step_size_up
        self.mode = mode
        self.gamma = gamma
        # Total length of one full cycle (up leg + down leg), in steps.
        self.step_size = self.step_size_up + self.step_size_down
        super().__init__(optimizer, last_epoch)

    def get_lr(self) -> List[float]:
        """Compute learning rate for the current step."""
        cycle = math.floor(1 + self.last_epoch / self.step_size)
        # Fractional position within the current cycle, in [0, 1).
        x = 1 + self.last_epoch / self.step_size - cycle

        up_fraction = self.step_size_up / self.step_size
        if x <= up_fraction:
            # Ascending leg: scale climbs linearly 0 -> 1.
            scale = x * (self.step_size / self.step_size_up)
        else:
            # Descending leg: scale falls linearly 1 -> 0.
            # NOTE(fix): previous code used (self.step_size - x), subtracting a
            # cycle *fraction* from a *step count*, which yielded scales far
            # above 1 instead of the intended downward ramp.
            scale = (1 - x) * (self.step_size / self.step_size_down)

        if self.mode == 'triangular':
            lr = self.base_lr + (self.max_lr - self.base_lr) * max(0, scale)
        elif self.mode == 'triangular2':
            # Amplitude halves every completed cycle.
            lr = self.base_lr + (self.max_lr - self.base_lr) * max(0, scale) / (2 ** (cycle - 1))
        else:  # exp_range
            lr = self.base_lr + (self.max_lr - self.base_lr) * max(0, scale) * (self.gamma ** self.last_epoch)

        return [lr for _ in self.base_lrs]
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
class OneCycleLR(LRScheduler):
    """One-cycle policy: warm up from ``max_lr / div_factor`` to ``max_lr``, then anneal down."""

    def __init__(self, optimizer: Any, max_lr: float, total_steps: int,
                 pct_start: float = 0.3, anneal_strategy: str = 'cos',
                 div_factor: float = 25.0, final_div_factor: float = 10000.0, last_epoch: int = -1):
        """
        Initialize OneCycleLR scheduler.

        Args:
            optimizer: The optimizer
            max_lr: Maximum learning rate
            total_steps: Total number of steps
            pct_start: Percentage of steps for warmup
            anneal_strategy: 'cos' or 'linear' annealing
            div_factor: Initial LR = max_lr / div_factor
            final_div_factor: Final LR = initial_lr / final_div_factor
            last_epoch: The index of the last epoch
        """
        self.max_lr = max_lr
        self.total_steps = total_steps
        self.pct_start = pct_start
        self.anneal_strategy = anneal_strategy
        self.div_factor = div_factor
        self.final_div_factor = final_div_factor
        self.initial_lr = max_lr / div_factor
        self.final_lr = self.initial_lr / final_div_factor
        super().__init__(optimizer, last_epoch)

    def get_lr(self) -> List[float]:
        """Compute learning rate for the current step."""
        warmup_span = self.total_steps * self.pct_start
        if self.last_epoch < warmup_span:
            # Warmup: ramp from initial_lr up to max_lr.
            pct = self.last_epoch / warmup_span
            if self.anneal_strategy == 'cos':
                lr = self.initial_lr + (self.max_lr - self.initial_lr) * (1 + math.cos(math.pi * (1 - pct))) / 2
            else:
                lr = self.initial_lr + (self.max_lr - self.initial_lr) * pct
        else:
            # Annealing: descend from max_lr down to final_lr.
            pct = (self.last_epoch - warmup_span) / (self.total_steps * (1 - self.pct_start))
            if self.anneal_strategy == 'cos':
                lr = self.final_lr + (self.max_lr - self.final_lr) * (1 + math.cos(math.pi * pct)) / 2
            else:
                lr = self.max_lr - (self.max_lr - self.final_lr) * pct

        return [lr for _ in self.base_lrs]
|
|
286
|
+
|
quantml/optim/sgd.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Stochastic Gradient Descent (SGD) optimizer.
|
|
3
|
+
|
|
4
|
+
This module provides the SGD optimizer with optional momentum and weight decay.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import List, Optional, Dict, Any
|
|
8
|
+
from quantml.tensor import Tensor
|
|
9
|
+
from quantml import ops
|
|
10
|
+
|
|
11
|
+
# Try to import NumPy
|
|
12
|
+
try:
|
|
13
|
+
import numpy as np
|
|
14
|
+
HAS_NUMPY = True
|
|
15
|
+
except ImportError:
|
|
16
|
+
HAS_NUMPY = False
|
|
17
|
+
np = None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class SGD:
    """
    Stochastic Gradient Descent optimizer.

    Plain update: param = param - lr * (grad + weight_decay * param)
    With momentum: v = momentum * v + grad, then param = param - lr * v

    Attributes:
        lr: Learning rate
        momentum: Momentum factor (0 = no momentum)
        weight_decay: Weight decay (L2 regularization) factor
        velocity: Momentum velocity for each parameter, keyed by id(param)

    Examples:
        >>> optimizer = SGD(lr=0.01, momentum=0.9)
        >>> for param in model.parameters():
        >>>     optimizer.step(param)
    """

    def __init__(
        self,
        params: Optional[List[Tensor]] = None,
        lr: float = 0.01,
        momentum: float = 0.0,
        weight_decay: float = 0.0
    ):
        """
        Initialize SGD optimizer.

        Args:
            params: Optional list of parameters to optimize
            lr: Learning rate
            momentum: Momentum factor (0.0 to disable)
            weight_decay: Weight decay coefficient
        """
        self.params = params if params is not None else []
        self.lr = lr
        self.momentum = momentum
        self.weight_decay = weight_decay
        # Velocity buffers keyed by id(param): NumPy arrays on the fast path,
        # nested Python lists on the fallback path.
        self.velocity: Dict[int, Any] = {}

    def step(self, param: Optional[Tensor] = None):
        """
        Perform a single optimization step.

        Updates `param` when given, otherwise every tensor in self.params.

        Args:
            param: Optional single parameter to update
        """
        targets = self.params if param is None else [param]
        for target in targets:
            self._update_param(target)

    def _update_param(self, param: Tensor):
        """Update a single parameter, preferring direct NumPy arithmetic."""
        if not param.requires_grad or param.grad is None:
            return

        key = id(param)

        if not HAS_NUMPY:
            # No NumPy installed; use Tensor-based arithmetic.
            self._update_param_fallback(param)
            return

        try:
            # Coerce gradient and parameter to float64 ndarrays.
            raw_grad = param.grad
            if isinstance(raw_grad, np.ndarray):
                grad_arr = raw_grad
            else:
                grad_arr = np.array(raw_grad, dtype=np.float64)

            weights = param.numpy if param.numpy is not None else np.array(param.data, dtype=np.float64)

            # L2 regularization folds into the gradient.
            if self.weight_decay > 0:
                grad_arr = grad_arr + self.weight_decay * weights

            if self.momentum > 0:
                # Lazily create, then update the velocity buffer in place:
                # v = momentum * v + grad
                if key not in self.velocity:
                    self.velocity[key] = np.zeros_like(weights, dtype=np.float64)
                buf = self.velocity[key]
                buf[:] = self.momentum * buf + grad_arr
                direction = buf
            else:
                direction = grad_arr

            # Gradient descent step: param = param - lr * direction
            param.data = weights - self.lr * direction

        except (ValueError, TypeError, AttributeError):
            # Data not convertible to ndarray; fall back to Tensor ops.
            self._update_param_fallback(param)

    def _update_param_fallback(self, param: Tensor):
        """Fallback update using Tensor operations."""
        if param.grad is None:
            return

        key = id(param)
        if key not in self.velocity:
            # Mirror the parameter's 1-D or 2-D list shape with zeros.
            if isinstance(param.data[0], list):
                self.velocity[key] = [[0.0] * len(row) for row in param.data]
            else:
                self.velocity[key] = [0.0] * len(param.data)

        grad = param.grad
        if self.weight_decay > 0:
            grad = ops.add(grad, ops.mul(param, self.weight_decay))

        if self.momentum > 0:
            new_vel = ops.add(ops.mul(Tensor(self.velocity[key]), self.momentum), grad)
            self.velocity[key] = new_vel.data
            direction = new_vel
        else:
            direction = grad

        delta = ops.mul(direction, self.lr)
        if param.requires_grad:
            # Detach so the in-place subtraction stays out of the autograd graph.
            detached = param.detach()
            detached.sub_(delta)
            param.data = detached.data
        else:
            param.sub_(delta)

    def zero_grad(self, param: Optional[Tensor] = None):
        """
        Clear gradients.

        Args:
            param: Optional single parameter, otherwise clears all
        """
        targets = self.params if param is None else [param]
        for target in targets:
            target.zero_grad()

    def add_param_group(self, params: List[Tensor]):
        """Add a parameter group to optimize."""
        self.params.extend(params)
|
|
181
|
+
|
quantml/py.typed
ADDED
|
File without changes
|