quantmllibrary-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. quantml/__init__.py +74 -0
  2. quantml/autograd.py +154 -0
  3. quantml/cli/__init__.py +10 -0
  4. quantml/cli/run_experiment.py +385 -0
  5. quantml/config/__init__.py +28 -0
  6. quantml/config/config.py +259 -0
  7. quantml/data/__init__.py +33 -0
  8. quantml/data/cache.py +149 -0
  9. quantml/data/feature_store.py +234 -0
  10. quantml/data/futures.py +254 -0
  11. quantml/data/loaders.py +236 -0
  12. quantml/data/memory_optimizer.py +234 -0
  13. quantml/data/validators.py +390 -0
  14. quantml/experiments/__init__.py +23 -0
  15. quantml/experiments/logger.py +208 -0
  16. quantml/experiments/results.py +158 -0
  17. quantml/experiments/tracker.py +223 -0
  18. quantml/features/__init__.py +25 -0
  19. quantml/features/base.py +104 -0
  20. quantml/features/gap_features.py +124 -0
  21. quantml/features/registry.py +138 -0
  22. quantml/features/volatility_features.py +140 -0
  23. quantml/features/volume_features.py +142 -0
  24. quantml/functional.py +37 -0
  25. quantml/models/__init__.py +27 -0
  26. quantml/models/attention.py +258 -0
  27. quantml/models/dropout.py +130 -0
  28. quantml/models/gru.py +319 -0
  29. quantml/models/linear.py +112 -0
  30. quantml/models/lstm.py +353 -0
  31. quantml/models/mlp.py +286 -0
  32. quantml/models/normalization.py +289 -0
  33. quantml/models/rnn.py +154 -0
  34. quantml/models/tcn.py +238 -0
  35. quantml/online.py +209 -0
  36. quantml/ops.py +1707 -0
  37. quantml/optim/__init__.py +42 -0
  38. quantml/optim/adafactor.py +206 -0
  39. quantml/optim/adagrad.py +157 -0
  40. quantml/optim/adam.py +267 -0
  41. quantml/optim/lookahead.py +97 -0
  42. quantml/optim/quant_optimizer.py +228 -0
  43. quantml/optim/radam.py +192 -0
  44. quantml/optim/rmsprop.py +203 -0
  45. quantml/optim/schedulers.py +286 -0
  46. quantml/optim/sgd.py +181 -0
  47. quantml/py.typed +0 -0
  48. quantml/streaming.py +175 -0
  49. quantml/tensor.py +462 -0
  50. quantml/time_series.py +447 -0
  51. quantml/training/__init__.py +135 -0
  52. quantml/training/alpha_eval.py +203 -0
  53. quantml/training/backtest.py +280 -0
  54. quantml/training/backtest_analysis.py +168 -0
  55. quantml/training/cv.py +106 -0
  56. quantml/training/data_loader.py +177 -0
  57. quantml/training/ensemble.py +84 -0
  58. quantml/training/feature_importance.py +135 -0
  59. quantml/training/features.py +364 -0
  60. quantml/training/futures_backtest.py +266 -0
  61. quantml/training/gradient_clipping.py +206 -0
  62. quantml/training/losses.py +248 -0
  63. quantml/training/lr_finder.py +127 -0
  64. quantml/training/metrics.py +376 -0
  65. quantml/training/regularization.py +89 -0
  66. quantml/training/trainer.py +239 -0
  67. quantml/training/walk_forward.py +190 -0
  68. quantml/utils/__init__.py +51 -0
  69. quantml/utils/gradient_check.py +274 -0
  70. quantml/utils/logging.py +181 -0
  71. quantml/utils/ops_cpu.py +231 -0
  72. quantml/utils/profiling.py +364 -0
  73. quantml/utils/reproducibility.py +220 -0
  74. quantml/utils/serialization.py +335 -0
  75. quantmllibrary-0.1.0.dist-info/METADATA +536 -0
  76. quantmllibrary-0.1.0.dist-info/RECORD +79 -0
  77. quantmllibrary-0.1.0.dist-info/WHEEL +5 -0
  78. quantmllibrary-0.1.0.dist-info/licenses/LICENSE +22 -0
  79. quantmllibrary-0.1.0.dist-info/top_level.txt +1 -0
quantml/optim/rmsprop.py ADDED
@@ -0,0 +1,203 @@
+ """
+ RMSProp optimizer implementation.
+
+ RMSProp (Root Mean Square Propagation) is an adaptive learning rate optimizer
+ that maintains a moving average of squared gradients.
+ """
+
+ from typing import List, Optional, Dict, Any
+ from quantml.tensor import Tensor
+ from quantml import ops
+
+ # Try to import NumPy
+ try:
+     import numpy as np
+     HAS_NUMPY = True
+ except ImportError:
+     HAS_NUMPY = False
+     np = None
+
+
+ class RMSProp:
+     """
+     RMSProp optimizer.
+
+     RMSProp maintains a moving average of squared gradients and divides
+     the gradient by the root of this average.
+
+     Attributes:
+         lr: Learning rate
+         alpha: Smoothing constant (decay factor)
+         eps: Small value for numerical stability
+         weight_decay: Weight decay coefficient
+         momentum: Momentum factor (0 = no momentum)
+         squared_avg: Moving average of squared gradients
+
+     Examples:
+         >>> optimizer = RMSProp(lr=0.001, alpha=0.99)
+         >>> for param in model.parameters():
+         >>>     optimizer.step(param)
+     """
+
+     def __init__(
+         self,
+         params: Optional[List[Tensor]] = None,
+         lr: float = 0.01,
+         alpha: float = 0.99,
+         eps: float = 1e-8,
+         weight_decay: float = 0.0,
+         momentum: float = 0.0
+     ):
+         """
+         Initialize RMSProp optimizer.
+
+         Args:
+             params: Optional list of parameters to optimize
+             lr: Learning rate
+             alpha: Smoothing constant (decay factor for squared gradient average)
+             eps: Small value to prevent division by zero
+             weight_decay: Weight decay (L2 regularization) coefficient
+             momentum: Momentum factor (0.0 to disable)
+         """
+         self.params = params if params is not None else []
+         self.lr = lr
+         self.alpha = alpha
+         self.eps = eps
+         self.weight_decay = weight_decay
+         self.momentum = momentum
+         self.squared_avg: Dict[int, Any] = {} # Moving average of squared gradients
+         self.momentum_buffer: Dict[int, Any] = {} # Momentum buffer
+
+     def step(self, param: Optional[Tensor] = None):
+         """
+         Perform a single optimization step.
+
+         Args:
+             param: Optional single parameter to update
+         """
+         if param is not None:
+             self._update_param(param)
+         else:
+             for p in self.params:
+                 self._update_param(p)
+
+     def _update_param(self, param: Tensor):
+         """Update a single parameter using RMSProp algorithm."""
+         if not param.requires_grad:
+             return
+
+         if param.grad is None:
+             return
+
+         param_id = id(param)
+
+         if HAS_NUMPY:
+             try:
+                 # Get gradient and parameter as NumPy arrays
+                 grad = param.grad
+                 if isinstance(grad, np.ndarray):
+                     grad_arr = grad
+                 else:
+                     grad_arr = np.array(grad, dtype=np.float64)
+
+                 param_arr = param.numpy if param.numpy is not None else np.array(param.data, dtype=np.float64)
+
+                 # Apply weight decay
+                 if self.weight_decay > 0:
+                     grad_arr = grad_arr + self.weight_decay * param_arr
+
+                 # Initialize squared average if needed
+                 if param_id not in self.squared_avg:
+                     self.squared_avg[param_id] = np.zeros_like(param_arr, dtype=np.float64)
+
+                 # Update squared average: avg = alpha * avg + (1 - alpha) * grad^2
+                 sq_avg = self.squared_avg[param_id]
+                 sq_avg[:] = self.alpha * sq_avg + (1.0 - self.alpha) * (grad_arr ** 2)
+
+                 # Compute update: update = grad / (sqrt(avg) + eps)
+                 update = grad_arr / (np.sqrt(sq_avg) + self.eps)
+
+                 # Apply momentum if enabled
+                 if self.momentum > 0:
+                     if param_id not in self.momentum_buffer:
+                         self.momentum_buffer[param_id] = np.zeros_like(param_arr, dtype=np.float64)
+
+                     buf = self.momentum_buffer[param_id]
+                     buf[:] = self.momentum * buf + update
+                     update = buf
+
+                 # Update parameter: param = param - lr * update
+                 param_update = self.lr * update
+                 new_param_arr = param_arr - param_update
+                 param.data = new_param_arr
+
+             except (ValueError, TypeError, AttributeError):
+                 self._update_param_fallback(param)
+         else:
+             self._update_param_fallback(param)
+
+     def _update_param_fallback(self, param: Tensor):
+         """Fallback update using Tensor operations."""
+         if param.grad is None:
+             return
+
+         param_id = id(param)
+
+         # Initialize squared average if needed
+         if param_id not in self.squared_avg:
+             if isinstance(param.data[0], list):
+                 self.squared_avg[param_id] = [[0.0] * len(row) for row in param.data]
+             else:
+                 self.squared_avg[param_id] = [0.0] * len(param.data)
+
+         grad = param.grad
+         if self.weight_decay > 0:
+             grad = ops.add(grad, ops.mul(param, self.weight_decay))
+
+         # Update squared average
+         grad_sq = ops.mul(grad, grad)
+         sq_avg = Tensor(self.squared_avg[param_id])
+         new_sq_avg = ops.add(
+             ops.mul(sq_avg, self.alpha),
+             ops.mul(grad_sq, 1.0 - self.alpha)
+         )
+         self.squared_avg[param_id] = new_sq_avg.data
+
+         # Compute update
+         sq_avg_sqrt = ops.pow(ops.add(new_sq_avg, self.eps), 0.5)
+         update = ops.div(grad, sq_avg_sqrt)
+
+         # Apply momentum
+         if self.momentum > 0:
+             if param_id not in self.momentum_buffer:
+                 if isinstance(param.data[0], list):
+                     self.momentum_buffer[param_id] = [[0.0] * len(row) for row in param.data]
+                 else:
+                     self.momentum_buffer[param_id] = [0.0] * len(param.data)
+
+             buf = Tensor(self.momentum_buffer[param_id])
+             new_buf = ops.add(ops.mul(buf, self.momentum), update)
+             self.momentum_buffer[param_id] = new_buf.data
+             update = new_buf
+
+         # Update parameter
+         param_update = ops.mul(update, self.lr)
+         if param.requires_grad:
+             param_detached = param.detach()
+             param_detached.sub_(param_update)
+             param.data = param_detached.data
+         else:
+             param.sub_(param_update)
+
+     def zero_grad(self, param: Optional[Tensor] = None):
+         """Clear gradients."""
+         if param is not None:
+             param.zero_grad()
+         else:
+             for p in self.params:
+                 p.zero_grad()
+
+     def add_param_group(self, params: List[Tensor]):
+         """Add a parameter group to optimize."""
+         self.params.extend(params)
+
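For reference, the core rule that _update_param implements above (a moving average of squared gradients, then division of the gradient by the root of that average) reduces to the following NumPy-only sketch on a toy quadratic. This is illustrative only, not part of the package; weight decay and momentum are omitted for brevity.

import numpy as np

# Toy objective f(w) = 0.5 * ||w||^2, so grad f(w) = w.
w = np.array([1.0, -2.0, 3.0])
sq_avg = np.zeros_like(w)   # moving average of squared gradients
lr, alpha, eps = 0.01, 0.99, 1e-8

for _ in range(2000):
    grad = w                                           # gradient of the toy objective
    sq_avg = alpha * sq_avg + (1.0 - alpha) * grad**2  # avg = alpha * avg + (1 - alpha) * grad^2
    w = w - lr * grad / (np.sqrt(sq_avg) + eps)        # param = param - lr * grad / (sqrt(avg) + eps)

print(w)  # ends up in a small band around the minimum at the origin

Because the normalized step has roughly unit magnitude, RMSProp with a fixed learning rate oscillates near the optimum rather than converging exactly, which is why it is usually paired with a learning rate schedule such as those in schedulers.py below.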
quantml/optim/schedulers.py ADDED
@@ -0,0 +1,286 @@
+ """
+ Learning rate schedulers for QuantML.
+
+ Provides various learning rate scheduling strategies for training optimization.
+ """
+
+ from typing import List, Optional, Any
+ from abc import ABC, abstractmethod
+ import math
+
+
+ class LRScheduler(ABC):
+     """
+     Base class for learning rate schedulers.
+
+     All schedulers should inherit from this class and implement
+     the get_lr() and step() methods.
+     """
+
+     def __init__(self, optimizer: Any, last_epoch: int = -1):
+         """
+         Initialize scheduler.
+
+         Args:
+             optimizer: The optimizer to schedule
+             last_epoch: The index of the last epoch
+         """
+         self.optimizer = optimizer
+         self.last_epoch = last_epoch
+         self.base_lrs = [group.get('lr', optimizer.lr) if isinstance(group, dict) else optimizer.lr
+                          for group in getattr(optimizer, 'param_groups', [optimizer])]
+
+     @abstractmethod
+     def get_lr(self) -> List[float]:
+         """Compute learning rate for current epoch."""
+         pass
+
+     def step(self, epoch: Optional[int] = None):
+         """
+         Step the scheduler.
+
+         Args:
+             epoch: Current epoch (if None, uses last_epoch + 1)
+         """
+         if epoch is None:
+             self.last_epoch += 1
+         else:
+             self.last_epoch = epoch
+
+         lrs = self.get_lr()
+         # Update optimizer learning rates
+         if hasattr(self.optimizer, 'param_groups'):
+             for param_group, lr in zip(self.optimizer.param_groups, lrs):
+                 param_group['lr'] = lr
+         else:
+             self.optimizer.lr = lrs[0] if lrs else self.optimizer.lr
+
+
+ class StepLR(LRScheduler):
+     """Step learning rate scheduler - decays LR by gamma every step_size epochs."""
+
+     def __init__(self, optimizer: Any, step_size: int, gamma: float = 0.1, last_epoch: int = -1):
+         """
+         Initialize StepLR scheduler.
+
+         Args:
+             optimizer: The optimizer
+             step_size: Period of learning rate decay
+             gamma: Multiplicative factor for decay
+             last_epoch: The index of the last epoch
+         """
+         self.step_size = step_size
+         self.gamma = gamma
+         super().__init__(optimizer, last_epoch)
+
+     def get_lr(self) -> List[float]:
+         """Compute learning rate."""
+         return [base_lr * (self.gamma ** (self.last_epoch // self.step_size))
+                 for base_lr in self.base_lrs]
+
+
+ class CosineAnnealingLR(LRScheduler):
+     """Cosine annealing learning rate scheduler."""
+
+     def __init__(self, optimizer: Any, T_max: int, eta_min: float = 0.0, last_epoch: int = -1):
+         """
+         Initialize CosineAnnealingLR scheduler.
+
+         Args:
+             optimizer: The optimizer
+             T_max: Maximum number of iterations
+             eta_min: Minimum learning rate
+             last_epoch: The index of the last epoch
+         """
+         self.T_max = T_max
+         self.eta_min = eta_min
+         super().__init__(optimizer, last_epoch)
+
+     def get_lr(self) -> List[float]:
+         """Compute learning rate."""
+         return [self.eta_min + (base_lr - self.eta_min) *
+                 (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / 2
+                 for base_lr in self.base_lrs]
+
+
+ class WarmupLR(LRScheduler):
+     """Warmup learning rate scheduler."""
+
+     def __init__(self, optimizer: Any, warmup_steps: int, warmup_type: str = 'linear', last_epoch: int = -1):
+         """
+         Initialize WarmupLR scheduler.
+
+         Args:
+             optimizer: The optimizer
+             warmup_steps: Number of warmup steps
+             warmup_type: Type of warmup ('linear' or 'cosine')
+             last_epoch: The index of the last epoch
+         """
+         self.warmup_steps = warmup_steps
+         self.warmup_type = warmup_type
+         super().__init__(optimizer, last_epoch)
+
+     def get_lr(self) -> List[float]:
+         """Compute learning rate."""
+         if self.last_epoch < self.warmup_steps:
+             if self.warmup_type == 'linear':
+                 factor = (self.last_epoch + 1) / self.warmup_steps
+             else: # cosine
+                 factor = (1 + math.cos(math.pi * (1 - (self.last_epoch + 1) / self.warmup_steps))) / 2
+             return [base_lr * factor for base_lr in self.base_lrs]
+         return self.base_lrs
+
+
+ class ReduceLROnPlateau:
+     """Reduce learning rate when a metric has stopped improving."""
+
+     def __init__(self, optimizer: Any, mode: str = 'min', factor: float = 0.1,
+                  patience: int = 10, threshold: float = 1e-4, min_lr: float = 0.0):
+         """
+         Initialize ReduceLROnPlateau scheduler.
+
+         Args:
+             optimizer: The optimizer
+             mode: 'min' or 'max' - whether to reduce LR when metric stops decreasing/increasing
+             factor: Factor to multiply LR by
+             patience: Number of epochs with no improvement before reducing LR
+             threshold: Threshold for measuring improvement
+             min_lr: Minimum learning rate
+         """
+         self.optimizer = optimizer
+         self.mode = mode
+         self.factor = factor
+         self.patience = patience
+         self.threshold = threshold
+         self.min_lr = min_lr
+         self.best = None
+         self.num_bad_epochs = 0
+         self.base_lrs = [optimizer.lr]
+
+     def step(self, metrics: float):
+         """
+         Step the scheduler based on metrics.
+
+         Args:
+             metrics: Current metric value
+         """
+         if self.best is None:
+             self.best = metrics
+         else:
+             if self.mode == 'min':
+                 is_better = metrics < self.best - self.threshold
+             else:
+                 is_better = metrics > self.best + self.threshold
+
+             if is_better:
+                 self.best = metrics
+                 self.num_bad_epochs = 0
+             else:
+                 self.num_bad_epochs += 1
+
+             if self.num_bad_epochs >= self.patience:
+                 self._reduce_lr()
+                 self.num_bad_epochs = 0
+
+     def _reduce_lr(self):
+         """Reduce learning rate."""
+         new_lr = max(self.optimizer.lr * self.factor, self.min_lr)
+         self.optimizer.lr = new_lr
+
+
+ class CyclicLR(LRScheduler):
+     """Cyclic learning rate scheduler."""
+
+     def __init__(self, optimizer: Any, base_lr: float, max_lr: float,
+                  step_size_up: int = 2000, step_size_down: Optional[int] = None,
+                  mode: str = 'triangular', gamma: float = 1.0, last_epoch: int = -1):
+         """
+         Initialize CyclicLR scheduler.
+
+         Args:
+             optimizer: The optimizer
+             base_lr: Lower bound of learning rate
+             max_lr: Upper bound of learning rate
+             step_size_up: Number of steps to increase LR
+             step_size_down: Number of steps to decrease LR (if None, equals step_size_up)
+             mode: 'triangular', 'triangular2', or 'exp_range'
+             gamma: Scaling factor for 'exp_range' mode
+             last_epoch: The index of the last epoch
+         """
+         self.base_lr = base_lr
+         self.max_lr = max_lr
+         self.step_size_up = step_size_up
+         self.step_size_down = step_size_down if step_size_down is not None else step_size_up
+         self.mode = mode
+         self.gamma = gamma
+         self.step_size = self.step_size_up + self.step_size_down
+         super().__init__(optimizer, last_epoch)
+
+     def get_lr(self) -> List[float]:
+         """Compute learning rate."""
+         cycle = math.floor(1 + self.last_epoch / self.step_size)
+         x = 1 + self.last_epoch / self.step_size - cycle
+
+         if x <= self.step_size_up / self.step_size:
+             scale = x * (self.step_size / self.step_size_up)
+         else:
+             scale = (self.step_size - x) * (self.step_size / self.step_size_down)
+
+         if self.mode == 'triangular':
+             lr = self.base_lr + (self.max_lr - self.base_lr) * max(0, scale)
+         elif self.mode == 'triangular2':
+             lr = self.base_lr + (self.max_lr - self.base_lr) * max(0, scale) / (2 ** (cycle - 1))
+         else: # exp_range
+             lr = self.base_lr + (self.max_lr - self.base_lr) * max(0, scale) * (self.gamma ** self.last_epoch)
+
+         return [lr for _ in self.base_lrs]
+
+
+ class OneCycleLR(LRScheduler):
+     """One cycle learning rate scheduler."""
+
+     def __init__(self, optimizer: Any, max_lr: float, total_steps: int,
+                  pct_start: float = 0.3, anneal_strategy: str = 'cos',
+                  div_factor: float = 25.0, final_div_factor: float = 10000.0, last_epoch: int = -1):
+         """
+         Initialize OneCycleLR scheduler.
+
+         Args:
+             optimizer: The optimizer
+             max_lr: Maximum learning rate
+             total_steps: Total number of steps
+             pct_start: Percentage of steps for warmup
+             anneal_strategy: 'cos' or 'linear' annealing
+             div_factor: Initial LR = max_lr / div_factor
+             final_div_factor: Final LR = initial_lr / final_div_factor
+             last_epoch: The index of the last epoch
+         """
+         self.max_lr = max_lr
+         self.total_steps = total_steps
+         self.pct_start = pct_start
+         self.anneal_strategy = anneal_strategy
+         self.div_factor = div_factor
+         self.final_div_factor = final_div_factor
+         self.initial_lr = max_lr / div_factor
+         self.final_lr = self.initial_lr / final_div_factor
+         super().__init__(optimizer, last_epoch)
+
+     def get_lr(self) -> List[float]:
+         """Compute learning rate."""
+         if self.last_epoch < self.total_steps * self.pct_start:
+             # Warmup phase
+             pct = self.last_epoch / (self.total_steps * self.pct_start)
+             if self.anneal_strategy == 'cos':
+                 lr = self.initial_lr + (self.max_lr - self.initial_lr) * (1 + math.cos(math.pi * (1 - pct))) / 2
+             else:
+                 lr = self.initial_lr + (self.max_lr - self.initial_lr) * pct
+         else:
+             # Annealing phase
+             pct = (self.last_epoch - self.total_steps * self.pct_start) / (self.total_steps * (1 - self.pct_start))
+             if self.anneal_strategy == 'cos':
+                 lr = self.final_lr + (self.max_lr - self.final_lr) * (1 + math.cos(math.pi * pct)) / 2
+             else:
+                 lr = self.max_lr - (self.max_lr - self.final_lr) * pct
+
+         return [lr for _ in self.base_lrs]
+
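A minimal usage sketch of these schedulers, assuming the module is importable as quantml.optim.schedulers (its file location in the wheel) and using a hypothetical stand-in optimizer that exposes only an lr attribute, which is the fallback path LRScheduler takes when the optimizer has no param_groups:

from quantml.optim.schedulers import StepLR, ReduceLROnPlateau

class DummyOpt:
    """Stand-in for an optimizer; only the lr attribute is required by the fallback path."""
    def __init__(self, lr):
        self.lr = lr

opt = DummyOpt(lr=0.1)
sched = StepLR(opt, step_size=10, gamma=0.5)
for epoch in range(30):
    sched.step()        # epochs 0-9 -> 0.1, 10-19 -> 0.05, 20-29 -> 0.025
print(opt.lr)           # 0.025

opt = DummyOpt(lr=0.1)
plateau = ReduceLROnPlateau(opt, mode='min', factor=0.1, patience=2)
for loss in [1.0, 0.9, 0.9, 0.9, 0.9]:
    plateau.step(loss)  # after `patience` epochs without improvement, lr is multiplied by `factor`
print(opt.lr)           # 0.01

Note that ReduceLROnPlateau deliberately does not subclass LRScheduler: it is stepped with a metric value rather than an epoch index.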
quantml/optim/sgd.py ADDED
@@ -0,0 +1,181 @@
+ """
+ Stochastic Gradient Descent (SGD) optimizer.
+
+ This module provides the SGD optimizer with optional momentum and weight decay.
+ """
+
+ from typing import List, Optional, Dict, Any
+ from quantml.tensor import Tensor
+ from quantml import ops
+
+ # Try to import NumPy
+ try:
+     import numpy as np
+     HAS_NUMPY = True
+ except ImportError:
+     HAS_NUMPY = False
+     np = None
+
+
+ class SGD:
+     """
+     Stochastic Gradient Descent optimizer.
+
+     Updates parameters using: param = param - lr * (grad + weight_decay * param)
+     With momentum: v = momentum * v + grad, param = param - lr * v
+
+     Attributes:
+         lr: Learning rate
+         momentum: Momentum factor (0 = no momentum)
+         weight_decay: Weight decay (L2 regularization) factor
+         velocity: Momentum velocity for each parameter
+
+     Examples:
+         >>> optimizer = SGD(lr=0.01, momentum=0.9)
+         >>> for param in model.parameters():
+         >>>     optimizer.step(param)
+     """
+
+     def __init__(
+         self,
+         params: Optional[List[Tensor]] = None,
+         lr: float = 0.01,
+         momentum: float = 0.0,
+         weight_decay: float = 0.0
+     ):
+         """
+         Initialize SGD optimizer.
+
+         Args:
+             params: Optional list of parameters to optimize
+             lr: Learning rate
+             momentum: Momentum factor (0.0 to disable)
+             weight_decay: Weight decay coefficient
+         """
+         self.params = params if params is not None else []
+         self.lr = lr
+         self.momentum = momentum
+         self.weight_decay = weight_decay
+         self.velocity: Dict[int, Any] = {} # Store velocity as NumPy arrays
+
+     def step(self, param: Optional[Tensor] = None):
+         """
+         Perform a single optimization step.
+
+         If param is provided, updates that parameter.
+         Otherwise, updates all parameters in self.params.
+
+         Args:
+             param: Optional single parameter to update
+         """
+         if param is not None:
+             self._update_param(param)
+         else:
+             for p in self.params:
+                 self._update_param(p)
+
+     def _update_param(self, param: Tensor):
+         """Update a single parameter using direct NumPy operations."""
+         if not param.requires_grad:
+             return
+
+         if param.grad is None:
+             return
+
+         param_id = id(param)
+
+         # Get gradient as NumPy array if possible
+         if HAS_NUMPY:
+             try:
+                 # Get gradient as NumPy array
+                 grad = param.grad
+                 if isinstance(grad, np.ndarray):
+                     grad_arr = grad
+                 else:
+                     grad_arr = np.array(grad, dtype=np.float64)
+
+                 # Get parameter as NumPy array
+                 param_arr = param.numpy if param.numpy is not None else np.array(param.data, dtype=np.float64)
+
+                 # Apply weight decay
+                 if self.weight_decay > 0:
+                     grad_arr = grad_arr + self.weight_decay * param_arr
+
+                 # Update velocity if momentum is used
+                 if self.momentum > 0:
+                     if param_id not in self.velocity:
+                         # Initialize velocity to zero
+                         self.velocity[param_id] = np.zeros_like(param_arr, dtype=np.float64)
+
+                     # v = momentum * v + grad
+                     vel = self.velocity[param_id]
+                     vel[:] = self.momentum * vel + grad_arr
+                     update = vel
+                 else:
+                     update = grad_arr
+
+                 # Compute parameter update: param = param - lr * update
+                 param_update = self.lr * update
+                 new_param_arr = param_arr - param_update
+
+                 # Update parameter data directly
+                 param.data = new_param_arr
+
+             except (ValueError, TypeError, AttributeError):
+                 # Fallback to Tensor operations
+                 self._update_param_fallback(param)
+         else:
+             # Fallback to Tensor operations
+             self._update_param_fallback(param)
+
+     def _update_param_fallback(self, param: Tensor):
+         """Fallback update using Tensor operations."""
+         if param.grad is None:
+             return
+
+         param_id = id(param)
+         if param_id not in self.velocity:
+             if isinstance(param.data[0], list):
+                 self.velocity[param_id] = [[0.0] * len(row) for row in param.data]
+             else:
+                 self.velocity[param_id] = [0.0] * len(param.data)
+
+         grad = param.grad
+         if self.weight_decay > 0:
+             grad = ops.add(grad, ops.mul(param, self.weight_decay))
+
+         if self.momentum > 0:
+             vel = self.velocity[param_id]
+             vel_tensor = Tensor(vel)
+             new_vel = ops.add(ops.mul(vel_tensor, self.momentum), grad)
+             self.velocity[param_id] = new_vel.data
+             update = new_vel
+         else:
+             update = grad
+
+         if param.requires_grad:
+             param_detached = param.detach()
+             param_update = ops.mul(update, self.lr)
+             param_detached.sub_(param_update)
+             param.data = param_detached.data
+         else:
+             param_update = ops.mul(update, self.lr)
+             param.sub_(param_update)
+
+     def zero_grad(self, param: Optional[Tensor] = None):
+         """
+         Clear gradients.
+
+         Args:
+             param: Optional single parameter, otherwise clears all
+         """
+         if param is not None:
+             param.zero_grad()
+         else:
+             for p in self.params:
+                 p.zero_grad()
+
+     def add_param_group(self, params: List[Tensor]):
+         """Add a parameter group to optimize."""
+         self.params.extend(params)
+
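The SGD docstring above summarizes the momentum update as v = momentum * v + grad followed by param = param - lr * v. A standalone NumPy sketch of that arithmetic on the same toy quadratic used earlier; it illustrates the math of _update_param, not the package's Tensor API, and the names are hypothetical:

import numpy as np

# Toy objective f(w) = 0.5 * ||w||^2, so grad f(w) = w.
w = np.array([1.0, -2.0, 3.0])
vel = np.zeros_like(w)                 # momentum velocity, as in SGD.velocity
lr, momentum, weight_decay = 0.1, 0.9, 0.0

for _ in range(200):
    grad = w + weight_decay * w        # gradient plus optional L2 term
    vel = momentum * vel + grad        # v = momentum * v + grad
    w = w - lr * vel                   # param = param - lr * v

print(w)  # converges toward the minimum at the origin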
quantml/py.typed ADDED
File without changes