quantmllibrary-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. quantml/__init__.py +74 -0
  2. quantml/autograd.py +154 -0
  3. quantml/cli/__init__.py +10 -0
  4. quantml/cli/run_experiment.py +385 -0
  5. quantml/config/__init__.py +28 -0
  6. quantml/config/config.py +259 -0
  7. quantml/data/__init__.py +33 -0
  8. quantml/data/cache.py +149 -0
  9. quantml/data/feature_store.py +234 -0
  10. quantml/data/futures.py +254 -0
  11. quantml/data/loaders.py +236 -0
  12. quantml/data/memory_optimizer.py +234 -0
  13. quantml/data/validators.py +390 -0
  14. quantml/experiments/__init__.py +23 -0
  15. quantml/experiments/logger.py +208 -0
  16. quantml/experiments/results.py +158 -0
  17. quantml/experiments/tracker.py +223 -0
  18. quantml/features/__init__.py +25 -0
  19. quantml/features/base.py +104 -0
  20. quantml/features/gap_features.py +124 -0
  21. quantml/features/registry.py +138 -0
  22. quantml/features/volatility_features.py +140 -0
  23. quantml/features/volume_features.py +142 -0
  24. quantml/functional.py +37 -0
  25. quantml/models/__init__.py +27 -0
  26. quantml/models/attention.py +258 -0
  27. quantml/models/dropout.py +130 -0
  28. quantml/models/gru.py +319 -0
  29. quantml/models/linear.py +112 -0
  30. quantml/models/lstm.py +353 -0
  31. quantml/models/mlp.py +286 -0
  32. quantml/models/normalization.py +289 -0
  33. quantml/models/rnn.py +154 -0
  34. quantml/models/tcn.py +238 -0
  35. quantml/online.py +209 -0
  36. quantml/ops.py +1707 -0
  37. quantml/optim/__init__.py +42 -0
  38. quantml/optim/adafactor.py +206 -0
  39. quantml/optim/adagrad.py +157 -0
  40. quantml/optim/adam.py +267 -0
  41. quantml/optim/lookahead.py +97 -0
  42. quantml/optim/quant_optimizer.py +228 -0
  43. quantml/optim/radam.py +192 -0
  44. quantml/optim/rmsprop.py +203 -0
  45. quantml/optim/schedulers.py +286 -0
  46. quantml/optim/sgd.py +181 -0
  47. quantml/py.typed +0 -0
  48. quantml/streaming.py +175 -0
  49. quantml/tensor.py +462 -0
  50. quantml/time_series.py +447 -0
  51. quantml/training/__init__.py +135 -0
  52. quantml/training/alpha_eval.py +203 -0
  53. quantml/training/backtest.py +280 -0
  54. quantml/training/backtest_analysis.py +168 -0
  55. quantml/training/cv.py +106 -0
  56. quantml/training/data_loader.py +177 -0
  57. quantml/training/ensemble.py +84 -0
  58. quantml/training/feature_importance.py +135 -0
  59. quantml/training/features.py +364 -0
  60. quantml/training/futures_backtest.py +266 -0
  61. quantml/training/gradient_clipping.py +206 -0
  62. quantml/training/losses.py +248 -0
  63. quantml/training/lr_finder.py +127 -0
  64. quantml/training/metrics.py +376 -0
  65. quantml/training/regularization.py +89 -0
  66. quantml/training/trainer.py +239 -0
  67. quantml/training/walk_forward.py +190 -0
  68. quantml/utils/__init__.py +51 -0
  69. quantml/utils/gradient_check.py +274 -0
  70. quantml/utils/logging.py +181 -0
  71. quantml/utils/ops_cpu.py +231 -0
  72. quantml/utils/profiling.py +364 -0
  73. quantml/utils/reproducibility.py +220 -0
  74. quantml/utils/serialization.py +335 -0
  75. quantmllibrary-0.1.0.dist-info/METADATA +536 -0
  76. quantmllibrary-0.1.0.dist-info/RECORD +79 -0
  77. quantmllibrary-0.1.0.dist-info/WHEEL +5 -0
  78. quantmllibrary-0.1.0.dist-info/licenses/LICENSE +22 -0
  79. quantmllibrary-0.1.0.dist-info/top_level.txt +1 -0
quantml/optim/__init__.py ADDED
@@ -0,0 +1,42 @@
+ """
+ QuantML Optimizers
+
+ This module provides optimization algorithms for training models.
+ """
+
+ from quantml.optim.sgd import SGD
+ from quantml.optim.adam import Adam
+ from quantml.optim.rmsprop import RMSProp
+ from quantml.optim.adagrad import AdaGrad
+ from quantml.optim.adafactor import AdaFactor
+ from quantml.optim.lookahead import Lookahead
+ from quantml.optim.radam import RAdam
+ from quantml.optim.quant_optimizer import QuantOptimizer
+ from quantml.optim.schedulers import (
+     LRScheduler,
+     StepLR,
+     CosineAnnealingLR,
+     WarmupLR,
+     ReduceLROnPlateau,
+     CyclicLR,
+     OneCycleLR
+ )
+
+ __all__ = [
+     'SGD',
+     'Adam',
+     'RMSProp',
+     'AdaGrad',
+     'AdaFactor',
+     'Lookahead',
+     'RAdam',
+     'QuantOptimizer',
+     'LRScheduler',
+     'StepLR',
+     'CosineAnnealingLR',
+     'WarmupLR',
+     'ReduceLROnPlateau',
+     'CyclicLR',
+     'OneCycleLR'
+ ]
+
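Everything above is re-exported at the package level, so downstream code imports optimizers and schedulers directly from `quantml.optim`. A minimal sketch of that usage, based only on the exports listed here and the constructor defaults shown later in this diff; the scheduler constructors are not part of this diff, so they are not instantiated:

```python
# Sketch only: relies on the re-exports in quantml/optim/__init__.py above and
# on the constructor signatures shown in adam.py and adagrad.py below.
from quantml.optim import Adam, AdaGrad, SGD  # all re-exported names

adam = Adam(lr=0.001, betas=(0.9, 0.999))  # defaults per adam.py in this diff
adagrad = AdaGrad(lr=0.01)                 # defaults per adagrad.py in this diff
# SGD (and the schedulers) are exported as well, but their signatures are not
# shown in this diff, so they are only imported here.
```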
quantml/optim/adafactor.py ADDED
@@ -0,0 +1,206 @@
+ """
+ AdaFactor optimizer implementation.
+
+ AdaFactor is a memory-efficient variant of Adam that uses factorized second moment estimates.
+ """
+
+ from typing import List, Optional, Dict, Any
+ from quantml.tensor import Tensor
+ from quantml import ops
+
+ # Try to import NumPy
+ try:
+     import numpy as np
+     HAS_NUMPY = True
+ except ImportError:
+     HAS_NUMPY = False
+     np = None
+
+
+ class AdaFactor:
+     """
+     AdaFactor optimizer - memory-efficient Adam variant.
+
+     Uses factorized second moment estimates to reduce memory usage.
+     Good for quant models with many parameters.
+
+     Attributes:
+         lr: Learning rate
+         betas: Tuple of (beta1, beta2) for moment estimates
+         eps: Small value for numerical stability
+         weight_decay: Weight decay coefficient
+         m: First moment estimates
+         v: Factorized second moment estimates
+
+     Examples:
+         >>> optimizer = AdaFactor(lr=0.001, betas=(0.9, 0.999))
+         >>> for param in model.parameters():
+         >>> optimizer.step(param)
+     """
+
+     def __init__(
+         self,
+         params: Optional[List[Tensor]] = None,
+         lr: float = 0.001,
+         betas: tuple = (0.9, 0.999),
+         eps: float = 1e-30,
+         weight_decay: float = 0.0,
+         factor_decay: float = 0.8
+     ):
+         """
+         Initialize AdaFactor optimizer.
+
+         Args:
+             params: Optional list of parameters to optimize
+             lr: Learning rate
+             betas: Tuple of (beta1, beta2) for exponential decay rates
+             eps: Small value to prevent division by zero
+             weight_decay: Weight decay coefficient
+             factor_decay: Decay factor for second moment factorization
+         """
+         self.params = params if params is not None else []
+         self.lr = lr
+         self.beta1, self.beta2 = betas
+         self.eps = eps
+         self.weight_decay = weight_decay
+         self.factor_decay = factor_decay
+         self.m: Dict[int, Any] = {} # First moment
+         self.v_row: Dict[int, Any] = {} # Row factors for second moment
+         self.v_col: Dict[int, Any] = {} # Column factors for second moment
+         self.step_count = 0
+
+     def step(self, param: Optional[Tensor] = None):
+         """Perform a single optimization step."""
+         if param is not None:
+             self._update_param(param)
+         else:
+             for p in self.params:
+                 self._update_param(p)
+         self.step_count += 1
+
+     def _update_param(self, param: Tensor):
+         """Update a single parameter using AdaFactor algorithm."""
+         if not param.requires_grad:
+             return
+
+         if param.grad is None:
+             return
+
+         param_id = id(param)
+
+         if HAS_NUMPY:
+             try:
+                 grad = param.grad
+                 if isinstance(grad, np.ndarray):
+                     grad_arr = grad
+                 else:
+                     grad_arr = np.array(grad, dtype=np.float64)
+
+                 param_arr = param.numpy if param.numpy is not None else np.array(param.data, dtype=np.float64)
+
+                 if self.weight_decay > 0:
+                     grad_arr = grad_arr + self.weight_decay * param_arr
+
+                 # Initialize moments if needed
+                 if param_id not in self.m:
+                     self.m[param_id] = np.zeros_like(param_arr, dtype=np.float64)
+                     # Factorize second moment for 2D arrays
+                     if grad_arr.ndim == 2:
+                         self.v_row[param_id] = np.zeros(grad_arr.shape[0], dtype=np.float64)
+                         self.v_col[param_id] = np.zeros(grad_arr.shape[1], dtype=np.float64)
+                     else:
+                         self.v_row[param_id] = np.zeros_like(grad_arr, dtype=np.float64)
+                         self.v_col[param_id] = None
+
+                 # Update first moment
+                 m = self.m[param_id]
+                 m[:] = self.beta1 * m + (1.0 - self.beta1) * grad_arr
+
+                 # Update factorized second moment
+                 if grad_arr.ndim == 2 and self.v_col[param_id] is not None:
+                     # 2D case: factorize
+                     v_row = self.v_row[param_id]
+                     v_col = self.v_col[param_id]
+                     grad_sq = grad_arr ** 2
+                     v_row[:] = self.beta2 * v_row + (1.0 - self.beta2) * np.mean(grad_sq, axis=1)
+                     v_col[:] = self.beta2 * v_col + (1.0 - self.beta2) * np.mean(grad_sq, axis=0)
+                     v_hat = np.outer(v_row, v_col) / np.mean(v_row)
+                 else:
+                     # 1D case: regular second moment
+                     v_row = self.v_row[param_id]
+                     v_row[:] = self.beta2 * v_row + (1.0 - self.beta2) * (grad_arr ** 2)
+                     v_hat = v_row
+
+                 # Bias correction
+                 bias_correction1 = 1.0 - (self.beta1 ** self.step_count)
+                 m_hat = m / bias_correction1
+
+                 # Update parameter
+                 v_hat_sqrt = np.sqrt(v_hat) + self.eps
+                 update = m_hat / v_hat_sqrt
+                 param_update = self.lr * update
+                 new_param_arr = param_arr - param_update
+                 param.data = new_param_arr
+
+             except (ValueError, TypeError, AttributeError):
+                 # Fallback to simplified Adam-like update
+                 self._update_param_fallback(param)
+         else:
+             self._update_param_fallback(param)
+
+     def _update_param_fallback(self, param: Tensor):
+         """Fallback update using Tensor operations (simplified)."""
+         # Simplified fallback - use regular Adam-like update
+         if param.grad is None:
+             return
+
+         param_id = id(param)
+
+         if param_id not in self.m:
+             if isinstance(param.data[0], list):
+                 self.m[param_id] = Tensor([[0.0] * len(row) for row in param.data])
+                 self.v_row[param_id] = Tensor([[0.0] * len(row) for row in param.data])
+             else:
+                 self.m[param_id] = Tensor([0.0] * len(param.data))
+                 self.v_row[param_id] = Tensor([0.0] * len(param.data))
+
+         grad = Tensor(param.grad)
+         if self.weight_decay > 0:
+             grad = ops.add(grad, ops.mul(param, self.weight_decay))
+
+         m_prev = self.m[param_id]
+         m_new = ops.add(ops.mul(m_prev, self.beta1), ops.mul(grad, 1.0 - self.beta1))
+         self.m[param_id] = m_new
+
+         v_prev = self.v_row[param_id]
+         grad_sq = ops.mul(grad, grad)
+         v_new = ops.add(ops.mul(v_prev, self.beta2), ops.mul(grad_sq, 1.0 - self.beta2))
+         self.v_row[param_id] = v_new
+
+         bias_correction1 = 1.0 - (self.beta1 ** self.step_count)
+         m_hat = ops.div(m_new, bias_correction1)
+         v_hat = v_new
+
+         v_hat_sqrt = ops.pow(ops.add(v_hat, self.eps), 0.5)
+         update = ops.div(m_hat, v_hat_sqrt)
+         param_update = ops.mul(update, self.lr)
+
+         if param.requires_grad:
+             param_detached = param.detach()
+             param_detached.sub_(param_update)
+             param.data = param_detached.data
+         else:
+             param.sub_(param_update)
+
+     def zero_grad(self, param: Optional[Tensor] = None):
+         """Clear gradients."""
+         if param is not None:
+             param.zero_grad()
+         else:
+             for p in self.params:
+                 p.zero_grad()
+
+     def add_param_group(self, params: List[Tensor]):
+         """Add a parameter group to optimize."""
+         self.params.extend(params)
+
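The memory saving in `adafactor.py` comes from storing one row factor and one column factor per 2-D parameter instead of a full second-moment matrix; `_update_param` rebuilds the matrix as an outer product normalized by the mean of the row factor. A standalone NumPy sketch of that reconstruction (no quantml imports; first update from zero-initialized factors, mirroring the code above):

```python
import numpy as np

beta2 = 0.999
grad = np.random.default_rng(0).normal(size=(4, 3))
grad_sq = grad ** 2

# First update from zero-initialized factors, as in _update_param above.
v_row = (1.0 - beta2) * grad_sq.mean(axis=1)   # shape (4,): one value per row
v_col = (1.0 - beta2) * grad_sq.mean(axis=0)   # shape (3,): one value per column

# Full second-moment estimate rebuilt as an outer product, normalized by the
# mean of the row factor; 4 + 3 numbers are stored instead of 4 * 3.
v_hat = np.outer(v_row, v_col) / v_row.mean()
print(v_hat.shape)  # (4, 3)
```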
quantml/optim/adagrad.py ADDED
@@ -0,0 +1,157 @@
+ """
+ AdaGrad optimizer implementation.
+
+ AdaGrad (Adaptive Gradient) adapts learning rates by accumulating squared gradients.
+ """
+
+ from typing import List, Optional, Dict, Any
+ from quantml.tensor import Tensor
+ from quantml import ops
+
+ # Try to import NumPy
+ try:
+     import numpy as np
+     HAS_NUMPY = True
+ except ImportError:
+     HAS_NUMPY = False
+     np = None
+
+
+ class AdaGrad:
+     """
+     AdaGrad optimizer.
+
+     AdaGrad adapts learning rates by accumulating squared gradients.
+     Learning rates decrease for parameters with large gradients.
+
+     Attributes:
+         lr: Learning rate
+         eps: Small value for numerical stability
+         weight_decay: Weight decay coefficient
+         accum: Accumulated squared gradients
+
+     Examples:
+         >>> optimizer = AdaGrad(lr=0.01)
+         >>> for param in model.parameters():
+         >>> optimizer.step(param)
+     """
+
+     def __init__(
+         self,
+         params: Optional[List[Tensor]] = None,
+         lr: float = 0.01,
+         eps: float = 1e-10,
+         weight_decay: float = 0.0
+     ):
+         """
+         Initialize AdaGrad optimizer.
+
+         Args:
+             params: Optional list of parameters to optimize
+             lr: Learning rate
+             eps: Small value to prevent division by zero
+             weight_decay: Weight decay (L2 regularization) coefficient
+         """
+         self.params = params if params is not None else []
+         self.lr = lr
+         self.eps = eps
+         self.weight_decay = weight_decay
+         self.accum: Dict[int, Any] = {} # Accumulated squared gradients
+
+     def step(self, param: Optional[Tensor] = None):
+         """Perform a single optimization step."""
+         if param is not None:
+             self._update_param(param)
+         else:
+             for p in self.params:
+                 self._update_param(p)
+
+     def _update_param(self, param: Tensor):
+         """Update a single parameter using AdaGrad algorithm."""
+         if not param.requires_grad:
+             return
+
+         if param.grad is None:
+             return
+
+         param_id = id(param)
+
+         if HAS_NUMPY:
+             try:
+                 grad = param.grad
+                 if isinstance(grad, np.ndarray):
+                     grad_arr = grad
+                 else:
+                     grad_arr = np.array(grad, dtype=np.float64)
+
+                 param_arr = param.numpy if param.numpy is not None else np.array(param.data, dtype=np.float64)
+
+                 if self.weight_decay > 0:
+                     grad_arr = grad_arr + self.weight_decay * param_arr
+
+                 # Initialize accumulator if needed
+                 if param_id not in self.accum:
+                     self.accum[param_id] = np.zeros_like(param_arr, dtype=np.float64)
+
+                 # Accumulate squared gradients
+                 accum = self.accum[param_id]
+                 accum[:] = accum + grad_arr ** 2
+
+                 # Update parameter: param = param - lr * grad / (sqrt(accum) + eps)
+                 update = grad_arr / (np.sqrt(accum) + self.eps)
+                 param_update = self.lr * update
+                 new_param_arr = param_arr - param_update
+                 param.data = new_param_arr
+
+             except (ValueError, TypeError, AttributeError):
+                 self._update_param_fallback(param)
+         else:
+             self._update_param_fallback(param)
+
+     def _update_param_fallback(self, param: Tensor):
+         """Fallback update using Tensor operations."""
+         if param.grad is None:
+             return
+
+         param_id = id(param)
+
+         if param_id not in self.accum:
+             if isinstance(param.data[0], list):
+                 self.accum[param_id] = [[0.0] * len(row) for row in param.data]
+             else:
+                 self.accum[param_id] = [0.0] * len(param.data)
+
+         grad = param.grad
+         if self.weight_decay > 0:
+             grad = ops.add(grad, ops.mul(param, self.weight_decay))
+
+         # Accumulate squared gradients
+         grad_sq = ops.mul(grad, grad)
+         accum = Tensor(self.accum[param_id])
+         new_accum = ops.add(accum, grad_sq)
+         self.accum[param_id] = new_accum.data
+
+         # Update parameter
+         accum_sqrt = ops.pow(ops.add(new_accum, self.eps), 0.5)
+         update = ops.div(grad, accum_sqrt)
+         param_update = ops.mul(update, self.lr)
+
+         if param.requires_grad:
+             param_detached = param.detach()
+             param_detached.sub_(param_update)
+             param.data = param_detached.data
+         else:
+             param.sub_(param_update)
+
+     def zero_grad(self, param: Optional[Tensor] = None):
+         """Clear gradients."""
+         if param is not None:
+             param.zero_grad()
+         else:
+             for p in self.params:
+                 p.zero_grad()
+
+     def add_param_group(self, params: List[Tensor]):
+         """Add a parameter group to optimize."""
+         self.params.extend(params)
+
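The rule commented in `_update_param` above is `param = param - lr * grad / (sqrt(accum) + eps)`, where `accum` is the running sum of squared gradients. A standalone NumPy sketch of that rule (no quantml imports), showing how coordinates with larger past gradients take progressively smaller steps:

```python
import numpy as np

lr, eps = 0.01, 1e-10                 # defaults from the AdaGrad constructor above
param = np.array([0.5, -0.25, 1.0])
accum = np.zeros_like(param)          # running sum of squared gradients (AdaGrad.accum)

for grad in (np.array([0.1, -0.2, 0.05]), np.array([0.08, -0.15, 0.02])):
    accum += grad ** 2
    param -= lr * grad / (np.sqrt(accum) + eps)   # per-coordinate adaptive step

print(param)
```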
quantml/optim/adam.py ADDED
@@ -0,0 +1,267 @@
+ """
+ Adam optimizer implementation.
+
+ Adam (Adaptive Moment Estimation) is an adaptive learning rate optimizer
+ that combines the benefits of AdaGrad and RMSProp.
+ """
+
+ from typing import List, Optional, Dict, Any
+ from quantml.tensor import Tensor
+ from quantml import ops
+
+ # Try to import NumPy
+ try:
+     import numpy as np
+     HAS_NUMPY = True
+ except ImportError:
+     HAS_NUMPY = False
+     np = None
+
+
+ class Adam:
+     """
+     Adam optimizer.
+
+     Adam maintains per-parameter adaptive learning rates based on estimates
+     of first and second moments of gradients.
+
+     Attributes:
+         lr: Learning rate
+         betas: Tuple of (beta1, beta2) for moment estimates
+         eps: Small value for numerical stability
+         m: First moment estimates (momentum)
+         v: Second moment estimates (variance)
+         step_count: Number of steps taken
+
+     Examples:
+         >>> optimizer = Adam(lr=0.001, betas=(0.9, 0.999))
+         >>> for param in model.parameters():
+         >>> optimizer.step(param)
+     """
+
+     def __init__(
+         self,
+         params: Optional[List[Tensor]] = None,
+         lr: float = 0.001,
+         betas: tuple = (0.9, 0.999),
+         eps: float = 1e-8,
+         weight_decay: float = 0.0
+     ):
+         """
+         Initialize Adam optimizer.
+
+         Args:
+             params: Optional list of parameters to optimize
+             lr: Learning rate
+             betas: Tuple of (beta1, beta2) for exponential decay rates
+             eps: Small value to prevent division by zero
+             weight_decay: Weight decay (L2 regularization) coefficient
+         """
+         self.params = params if params is not None else []
+         self.lr = lr
+         self.beta1, self.beta2 = betas
+         self.eps = eps
+         self.weight_decay = weight_decay
+
+         # Moment estimates (stored as NumPy arrays for efficiency)
+         self.m: Dict[int, Any] = {} # First moment
+         self.v: Dict[int, Any] = {} # Second moment
+         self.step_count = 0
+
+     def step(self, param: Optional[Tensor] = None):
+         """
+         Perform a single optimization step.
+
+         Args:
+             param: Optional single parameter to update
+         """
+         if param is not None:
+             self._update_param(param)
+         else:
+             for p in self.params:
+                 self._update_param(p)
+
+         self.step_count += 1
+
+     def _update_param(self, param: Tensor):
+         """Update a single parameter using Adam algorithm with direct NumPy operations."""
+         if not param.requires_grad:
+             return
+
+         if param.grad is None:
+             return
+
+         param_id = id(param)
+
+         if HAS_NUMPY:
+             try:
+                 # Get gradient and parameter as NumPy arrays
+                 grad = param.grad
+                 if isinstance(grad, np.ndarray):
+                     grad_arr = grad
+                 else:
+                     grad_arr = np.array(grad, dtype=np.float64)
+
+                 param_arr = param.numpy if param.numpy is not None else np.array(param.data, dtype=np.float64)
+
+                 # Initialize moments if needed
+                 if param_id not in self.m:
+                     self.m[param_id] = np.zeros_like(param_arr, dtype=np.float64)
+                     self.v[param_id] = np.zeros_like(param_arr, dtype=np.float64)
+
+                 # Apply weight decay
+                 if self.weight_decay > 0:
+                     grad_arr = grad_arr + self.weight_decay * param_arr
+
+                 # Update biased first moment: m = beta1 * m + (1 - beta1) * grad
+                 m = self.m[param_id]
+                 m[:] = self.beta1 * m + (1.0 - self.beta1) * grad_arr
+
+                 # Update biased second moment: v = beta2 * v + (1 - beta2) * grad^2
+                 v = self.v[param_id]
+                 v[:] = self.beta2 * v + (1.0 - self.beta2) * (grad_arr ** 2)
+
+                 # Bias correction
+                 bias_correction1 = 1.0 - (self.beta1 ** self.step_count)
+                 bias_correction2 = 1.0 - (self.beta2 ** self.step_count)
+
+                 # Compute bias-corrected estimates
+                 m_hat = m / bias_correction1
+                 v_hat = v / bias_correction2
+
+                 # Update parameter: param = param - lr * m_hat / (sqrt(v_hat) + eps)
+                 v_hat_sqrt = np.sqrt(v_hat) + self.eps
+                 update = m_hat / v_hat_sqrt
+                 param_update = self.lr * update
+                 new_param_arr = param_arr - param_update
+
+                 # Update parameter data directly
+                 param.data = new_param_arr
+
+             except (ValueError, TypeError, AttributeError):
+                 # Fallback to Tensor operations
+                 self._update_param_fallback(param)
+         else:
+             # Fallback to Tensor operations
+             self._update_param_fallback(param)
+
+     def _update_param_fallback(self, param: Tensor):
+         """Fallback update using Tensor operations."""
+         if param.grad is None:
+             return
+
+         param_id = id(param)
+
+         # Initialize moments if needed
+         if param_id not in self.m:
+             if isinstance(param.data[0], list):
+                 self.m[param_id] = Tensor([[0.0] * len(row) for row in param.data])
+                 self.v[param_id] = Tensor([[0.0] * len(row) for row in param.data])
+             else:
+                 self.m[param_id] = Tensor([0.0] * len(param.data))
+                 self.v[param_id] = Tensor([0.0] * len(param.data))
+
+         # Get gradient
+         grad = Tensor(param.grad)
+
+         # Apply weight decay
+         if self.weight_decay > 0:
+             grad = ops.add(grad, ops.mul(param, self.weight_decay))
+
+         # Update biased first moment estimate: m = beta1 * m + (1 - beta1) * grad
+         m_prev = self.m[param_id]
+         m_new = ops.add(
+             ops.mul(m_prev, self.beta1),
+             ops.mul(grad, 1.0 - self.beta1)
+         )
+         self.m[param_id] = m_new
+
+         # Update biased second moment estimate: v = beta2 * v + (1 - beta2) * grad^2
+         v_prev = self.v[param_id]
+         grad_sq = ops.mul(grad, grad)
+         v_new = ops.add(
+             ops.mul(v_prev, self.beta2),
+             ops.mul(grad_sq, 1.0 - self.beta2)
+         )
+         self.v[param_id] = v_new
+
+         # Bias correction
+         bias_correction1 = 1.0 - (self.beta1 ** self.step_count)
+         bias_correction2 = 1.0 - (self.beta2 ** self.step_count)
+
+         # Compute bias-corrected estimates
+         m_hat = ops.div(m_new, bias_correction1)
+         v_hat = ops.div(v_new, bias_correction2)
+
+         # Update parameter in-place: param = param - lr * m_hat / (sqrt(v_hat) + eps)
+         v_hat_sqrt = ops.pow(ops.add(v_hat, self.eps), 0.5)
+         update = ops.div(m_hat, v_hat_sqrt)
+         param_update = ops.mul(update, self.lr)
+
+         # Detach and update in-place
+         if param.requires_grad:
+             param_detached = param.detach()
+             param_detached.sub_(param_update)
+             param.data = param_detached.data
+         else:
+             param.sub_(param_update)
+
+     def zero_grad(self, param: Optional[Tensor] = None):
+         """
+         Clear gradients.
+
+         Args:
+             param: Optional single parameter, otherwise clears all
+         """
+         if param is not None:
+             param.zero_grad()
+         else:
+             for p in self.params:
+                 p.zero_grad()
+
+     def add_param_group(self, params: List[Tensor]):
+         """Add a parameter group to optimize."""
+         self.params.extend(params)
+
+     def state_dict(self) -> dict:
+         """Get optimizer state dictionary."""
+         # Convert NumPy arrays to lists for serialization
+         m_data = {}
+         v_data = {}
+         for k, v in self.m.items():
+             if HAS_NUMPY and isinstance(v, np.ndarray):
+                 m_data[k] = v.tolist()
+             elif isinstance(v, Tensor):
+                 m_data[k] = v.data
+             else:
+                 m_data[k] = v
+
+         for k, v in self.v.items():
+             if HAS_NUMPY and isinstance(v, np.ndarray):
+                 v_data[k] = v.tolist()
+             elif isinstance(v, Tensor):
+                 v_data[k] = v.data
+             else:
+                 v_data[k] = v
+
+         return {
+             'step_count': self.step_count,
+             'm': m_data,
+             'v': v_data
+         }
+
+     def load_state_dict(self, state_dict: dict):
+         """Load optimizer state from dictionary."""
+         self.step_count = state_dict.get('step_count', 0)
+         # Reconstruct moment arrays from data
+         for k, m_data in state_dict.get('m', {}).items():
+             if HAS_NUMPY:
+                 self.m[int(k)] = np.array(m_data, dtype=np.float64)
+             else:
+                 self.m[int(k)] = Tensor(m_data)
+         for k, v_data in state_dict.get('v', {}).items():
+             if HAS_NUMPY:
+                 self.v[int(k)] = np.array(v_data, dtype=np.float64)
+             else:
+                 self.v[int(k)] = Tensor(v_data)
+
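Unlike the other optimizers in this diff, `Adam` also exposes `state_dict()` / `load_state_dict()`, with moment estimates keyed by `id(param)` and converted to plain lists for serialization. A minimal round-trip sketch, assuming the package is installed; `Tensor`-from-list construction follows the fallback paths in this same file, and the moment dictionaries stay empty until `step()` has run with gradients present:

```python
from quantml.tensor import Tensor
from quantml.optim import Adam

# A small 1-D parameter; list construction mirrors the fallback code in adam.py.
w = Tensor([0.5, -0.25, 1.0])

opt = Adam(params=[w], lr=0.001)
state = opt.state_dict()         # {'step_count': 0, 'm': {}, 'v': {}} before any step

opt2 = Adam(params=[w], lr=0.001)
opt2.load_state_dict(state)      # restores step_count and any saved moment estimates
```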