adv-optm 2.2.1.dev2__tar.gz → 2.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/PKG-INFO +5 -9
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/README.md +4 -8
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/__init__.py +1 -1
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/optim/Adopt_adv.py +3 -1
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/optim/Prodigy_adv.py +4 -4
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/optim/Simplified_AdEMAMix.py +3 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/util/update_util.py +6 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm.egg-info/PKG-INFO +5 -9
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/setup.py +1 -1
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/LICENSE +0 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/optim/AdaMuon_adv.py +0 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/optim/AdamW_adv.py +0 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/optim/Lion_Prodigy_adv.py +0 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/optim/Lion_adv.py +0 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/optim/Muon_adv.py +0 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/optim/SignSGD_adv.py +0 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/optim/__init__.py +0 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/util/Kourkoutas.py +0 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/util/Muon_AuxAdam.py +0 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/util/Muon_util.py +0 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/util/OrthoGrad.py +0 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/util/__init__.py +0 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/util/factorization_util.py +0 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/util/lion_k.py +0 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/util/param_update.py +0 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm.egg-info/SOURCES.txt +0 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm.egg-info/dependency_links.txt +0 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm.egg-info/requires.txt +0 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm.egg-info/top_level.txt +0 -0
- {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: adv_optm
|
|
3
|
-
Version: 2.2.1.dev2
|
|
3
|
+
Version: 2.2.3
|
|
4
4
|
Summary: A family of highly efficient, lightweight yet powerful optimizers.
|
|
5
5
|
Home-page: https://github.com/Koratahiu/Advanced_Optimizers
|
|
6
6
|
Author: Koratahiu
|
|
@@ -37,6 +37,10 @@ A comprehensive, all-in-one collection of optimization algorithms for deep learn
|
|
|
37
37
|
|
|
38
38
|
## 🔥 What's New
|
|
39
39
|
|
|
40
|
+
### in 2.2.2
|
|
41
|
+
|
|
42
|
+
- `Simplified_AdEMAMix` now uses the same LR as AdamW for all `beta1` and `alpha_grad` values!
|
|
43
|
+
|
|
40
44
|
### in 2.1.x
|
|
41
45
|
|
|
42
46
|
- Added Signum (SignSGD with momentum): A new optimizer in the family (SignSGD_adv)
|
|
@@ -195,14 +199,6 @@ This library integrates multiple state-of-the-art optimization techniques valida
|
|
|
195
199
|
| `beta1` | 0.99 | Controls accumulator memory length:<br>• Small BS: **0.99–0.9999**<br>• Large BS: **0.9** |
|
|
196
200
|
| `Grad α` | 100 | Most critical parameter:<br>• Inversely scales with batch size<br>• **100–10** for small BS (≤32)<br>• **1–0.1** for large BS (≥512) |
|
|
197
201
|
|
|
198
|
-
> ⚠️ **Critical**: Requires **~100x smaller learning rate** than AdamW (e.g., 1e-6 vs 1e-4).
|
|
199
|
-
> For `Prodigy_Adv`, set `initial_d` to:
|
|
200
|
-
> - **LoRA**: `1e-8`
|
|
201
|
-
> - **Full FT**: `1e-10`
|
|
202
|
-
> - **Embedding**: `1e-7`
|
|
203
|
-
|
|
204
|
-
> ⚠️ **Incompatible** with: **Cautious**, **Grams**, **atan2**, and standard update clipping.
|
|
205
|
-
|
|
206
202
|
---
|
|
207
203
|
|
|
208
204
|
### atan2
|
|
@@ -6,6 +6,10 @@ A comprehensive, all-in-one collection of optimization algorithms for deep learn
|
|
|
6
6
|
|
|
7
7
|
## 🔥 What's New
|
|
8
8
|
|
|
9
|
+
### in 2.2.2
|
|
10
|
+
|
|
11
|
+
- `Simplified_AdEMAMix` now uses the same LR as AdamW for all `beta1` and `alpha_grad` values!
|
|
12
|
+
|
|
9
13
|
### in 2.1.x
|
|
10
14
|
|
|
11
15
|
- Added Signum (SignSGD with momentum): A new optimizer in the family (SignSGD_adv)
|
|
@@ -164,14 +168,6 @@ This library integrates multiple state-of-the-art optimization techniques valida
|
|
|
164
168
|
| `beta1` | 0.99 | Controls accumulator memory length:<br>• Small BS: **0.99–0.9999**<br>• Large BS: **0.9** |
|
|
165
169
|
| `Grad α` | 100 | Most critical parameter:<br>• Inversely scales with batch size<br>• **100–10** for small BS (≤32)<br>• **1–0.1** for large BS (≥512) |
|
|
166
170
|
|
|
167
|
-
> ⚠️ **Critical**: Requires **~100x smaller learning rate** than AdamW (e.g., 1e-6 vs 1e-4).
|
|
168
|
-
> For `Prodigy_Adv`, set `initial_d` to:
|
|
169
|
-
> - **LoRA**: `1e-8`
|
|
170
|
-
> - **Full FT**: `1e-10`
|
|
171
|
-
> - **Embedding**: `1e-7`
|
|
172
|
-
|
|
173
|
-
> ⚠️ **Incompatible** with: **Cautious**, **Grams**, **atan2**, and standard update clipping.
|
|
174
|
-
|
|
175
171
|
---
|
|
176
172
|
|
|
177
173
|
### atan2
|
|
@@ -7,7 +7,7 @@ from ..util import param_update
|
|
|
7
7
|
from ..util.factorization_util import _get_effective_shape, _reconstruct_state, _factorize_state, _nnmf
|
|
8
8
|
from ..util.OrthoGrad import _orthogonalize_gradient
|
|
9
9
|
from ..util.Kourkoutas import KourkoutasHelper
|
|
10
|
-
from ..util.update_util import _grams_update, _cautious_update
|
|
10
|
+
from ..util.update_util import _grams_update, _cautious_update, _scale_sim_AdEMAMix_update
|
|
11
11
|
|
|
12
12
|
A = 4 / math.pi
|
|
13
13
|
|
|
@@ -279,6 +279,8 @@ class Adopt_adv(torch.optim.Optimizer):
|
|
|
279
279
|
lr = group['lr']
|
|
280
280
|
step_param_fn = self._step_parameter
|
|
281
281
|
|
|
282
|
+
if self.Simplified_AdEMAMix:
|
|
283
|
+
lr = _scale_sim_AdEMAMix_update(beta1, state['step'] + 1, group["alpha_grad"], lr)
|
|
282
284
|
|
|
283
285
|
step_param_fn(p, grad, state, group, lr, beta1, beta2, random_int_tensor)
|
|
284
286
|
|
|
@@ -9,7 +9,7 @@ from ..util import param_update
|
|
|
9
9
|
from ..util.OrthoGrad import _orthogonalize_gradient
|
|
10
10
|
from ..util.Kourkoutas import KourkoutasHelper
|
|
11
11
|
from ..util.factorization_util import _get_effective_shape, _reconstruct_state, _factorize_state
|
|
12
|
-
from ..util.update_util import _grams_update, _cautious_update
|
|
12
|
+
from ..util.update_util import _grams_update, _cautious_update, _scale_sim_AdEMAMix_update
|
|
13
13
|
|
|
14
14
|
A = 4 / math.pi
|
|
15
15
|
|
|
@@ -188,9 +188,6 @@ class Prodigy_adv(torch.optim.Optimizer):
|
|
|
188
188
|
use_atan2 = False
|
|
189
189
|
if kourkoutas_beta and not (betas[1] > beta2_min):
|
|
190
190
|
raise ValueError(f"For Kourkoutas-β, betas[1] (as beta2_max) must be > beta2_min. Got {betas[1]} and {beta2_min}")
|
|
191
|
-
if Simplified_AdEMAMix and alpha_grad > 0 and not d_limiter:
|
|
192
|
-
# scales d_coef by alpha_grad, this force prodigy to behave well with Simplified_AdEMAMix.
|
|
193
|
-
d_coef = d_coef/alpha_grad
|
|
194
191
|
|
|
195
192
|
defaults = {
|
|
196
193
|
"lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay, "cautious_wd": cautious_wd,
|
|
@@ -349,6 +346,9 @@ class Prodigy_adv(torch.optim.Optimizer):
|
|
|
349
346
|
d = group['d']
|
|
350
347
|
step_param_fn = self._step_parameter
|
|
351
348
|
|
|
349
|
+
if self.Simplified_AdEMAMix:
|
|
350
|
+
dlr = _scale_sim_AdEMAMix_update(self.beta1, state['step'] + 1, group["alpha_grad"], dlr)
|
|
351
|
+
|
|
352
352
|
step_param_fn(p, grad, state, group, beta2, d, dlr, random_int_tensor)
|
|
353
353
|
|
|
354
354
|
state['step'] += 1
|
|
@@ -7,6 +7,7 @@ from ..util import param_update
|
|
|
7
7
|
from ..util.OrthoGrad import _orthogonalize_gradient
|
|
8
8
|
from ..util.Kourkoutas import KourkoutasHelper
|
|
9
9
|
from ..util.factorization_util import _get_effective_shape, _reconstruct_state, _factorize_state
|
|
10
|
+
from ..util.update_util import _scale_sim_AdEMAMix_update
|
|
10
11
|
|
|
11
12
|
# A little helper from the original simplified_AdEMAMix
|
|
12
13
|
def linear_hl_warmup_scheduler(step, beta_end, beta_start=0, warmup=1):
|
|
@@ -237,6 +238,8 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
|
|
|
237
238
|
|
|
238
239
|
lr = group["lr"]
|
|
239
240
|
|
|
241
|
+
lr = _scale_sim_AdEMAMix_update(beta1, state['step'] + 1, group["alpha_grad"], lr)
|
|
242
|
+
|
|
240
243
|
random_int_tensor = None
|
|
241
244
|
|
|
242
245
|
if group.get('compiled_optimizer', False):
|
|
@@ -22,3 +22,9 @@ def _cautious_update(mt: torch.Tensor, grad: torch.Tensor, inplace: bool=False):
|
|
|
22
22
|
update_mt = mt.mul(mask)
|
|
23
23
|
del mask
|
|
24
24
|
return update_mt
|
|
25
|
+
|
|
26
|
+
def _scale_sim_AdEMAMix_update(beta: float, current_step: int, alpha_grad: float, lr: float):
|
|
27
|
+
momentum_scale = (1 - beta ** current_step) / (1 - beta)
|
|
28
|
+
total_scale = 1 / (momentum_scale + alpha_grad)
|
|
29
|
+
lr = lr * total_scale
|
|
30
|
+
return lr
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: adv_optm
|
|
3
|
-
Version: 2.2.1.dev2
|
|
3
|
+
Version: 2.2.3
|
|
4
4
|
Summary: A family of highly efficient, lightweight yet powerful optimizers.
|
|
5
5
|
Home-page: https://github.com/Koratahiu/Advanced_Optimizers
|
|
6
6
|
Author: Koratahiu
|
|
@@ -37,6 +37,10 @@ A comprehensive, all-in-one collection of optimization algorithms for deep learn
|
|
|
37
37
|
|
|
38
38
|
## 🔥 What's New
|
|
39
39
|
|
|
40
|
+
### in 2.2.2
|
|
41
|
+
|
|
42
|
+
- `Simplified_AdEMAMix` now uses the same LR as AdamW for all `beta1` and `alpha_grad` values!
|
|
43
|
+
|
|
40
44
|
### in 2.1.x
|
|
41
45
|
|
|
42
46
|
- Added Signum (SignSGD with momentum): A new optimizer in the family (SignSGD_adv)
|
|
@@ -195,14 +199,6 @@ This library integrates multiple state-of-the-art optimization techniques valida
|
|
|
195
199
|
| `beta1` | 0.99 | Controls accumulator memory length:<br>• Small BS: **0.99–0.9999**<br>• Large BS: **0.9** |
|
|
196
200
|
| `Grad α` | 100 | Most critical parameter:<br>• Inversely scales with batch size<br>• **100–10** for small BS (≤32)<br>• **1–0.1** for large BS (≥512) |
|
|
197
201
|
|
|
198
|
-
> ⚠️ **Critical**: Requires **~100x smaller learning rate** than AdamW (e.g., 1e-6 vs 1e-4).
|
|
199
|
-
> For `Prodigy_Adv`, set `initial_d` to:
|
|
200
|
-
> - **LoRA**: `1e-8`
|
|
201
|
-
> - **Full FT**: `1e-10`
|
|
202
|
-
> - **Embedding**: `1e-7`
|
|
203
|
-
|
|
204
|
-
> ⚠️ **Incompatible** with: **Cautious**, **Grams**, **atan2**, and standard update clipping.
|
|
205
|
-
|
|
206
202
|
---
|
|
207
203
|
|
|
208
204
|
### atan2
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|