adv-optm 2.2.1.dev2__tar.gz → 2.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/PKG-INFO +5 -9
  2. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/README.md +4 -8
  3. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/__init__.py +1 -1
  4. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/optim/Adopt_adv.py +3 -1
  5. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/optim/Prodigy_adv.py +4 -4
  6. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/optim/Simplified_AdEMAMix.py +3 -0
  7. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/util/update_util.py +6 -0
  8. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm.egg-info/PKG-INFO +5 -9
  9. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/setup.py +1 -1
  10. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/LICENSE +0 -0
  11. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/optim/AdaMuon_adv.py +0 -0
  12. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/optim/AdamW_adv.py +0 -0
  13. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/optim/Lion_Prodigy_adv.py +0 -0
  14. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/optim/Lion_adv.py +0 -0
  15. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/optim/Muon_adv.py +0 -0
  16. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/optim/SignSGD_adv.py +0 -0
  17. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/optim/__init__.py +0 -0
  18. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/util/Kourkoutas.py +0 -0
  19. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/util/Muon_AuxAdam.py +0 -0
  20. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/util/Muon_util.py +0 -0
  21. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/util/OrthoGrad.py +0 -0
  22. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/util/__init__.py +0 -0
  23. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/util/factorization_util.py +0 -0
  24. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/util/lion_k.py +0 -0
  25. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm/util/param_update.py +0 -0
  26. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm.egg-info/SOURCES.txt +0 -0
  27. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm.egg-info/dependency_links.txt +0 -0
  28. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm.egg-info/requires.txt +0 -0
  29. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/adv_optm.egg-info/top_level.txt +0 -0
  30. {adv_optm-2.2.1.dev2 → adv_optm-2.2.3}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: adv_optm
3
- Version: 2.2.1.dev2
3
+ Version: 2.2.3
4
4
  Summary: A family of highly efficient, lightweight yet powerful optimizers.
5
5
  Home-page: https://github.com/Koratahiu/Advanced_Optimizers
6
6
  Author: Koratahiu
@@ -37,6 +37,10 @@ A comprehensive, all-in-one collection of optimization algorithms for deep learn
37
37
 
38
38
  ## 🔥 What's New
39
39
 
40
+ ### in 2.2.2
41
+
42
+ - `Simplified_AdEMAMix` now uses the same LR as AdamW for all `beta1` and `alpha_grad` values!
43
+
40
44
  ### in 2.1.x
41
45
 
42
46
  - Added Signum (SignSGD with momentum): A new optimizer in the family (SignSGD_adv)
@@ -195,14 +199,6 @@ This library integrates multiple state-of-the-art optimization techniques valida
195
199
  | `beta1` | 0.99 | Controls accumulator memory length:<br>• Small BS: **0.99–0.9999**<br>• Large BS: **0.9** |
196
200
  | `Grad α` | 100 | Most critical parameter:<br>• Inversely scales with batch size<br>• **100–10** for small BS (≤32)<br>• **1–0.1** for large BS (≥512) |
197
201
 
198
- > ⚠️ **Critical**: Requires **~100x smaller learning rate** than AdamW (e.g., 1e-6 vs 1e-4).
199
- > For `Prodigy_Adv`, set `initial_d` to:
200
- > - **LoRA**: `1e-8`
201
- > - **Full FT**: `1e-10`
202
- > - **Embedding**: `1e-7`
203
-
204
- > ⚠️ **Incompatible** with: **Cautious**, **Grams**, **atan2**, and standard update clipping.
205
-
206
202
  ---
207
203
 
208
204
  ### atan2
@@ -6,6 +6,10 @@ A comprehensive, all-in-one collection of optimization algorithms for deep learn
6
6
 
7
7
  ## 🔥 What's New
8
8
 
9
+ ### in 2.2.2
10
+
11
+ - `Simplified_AdEMAMix` now uses the same LR as AdamW for all `beta1` and `alpha_grad` values!
12
+
9
13
  ### in 2.1.x
10
14
 
11
15
  - Added Signum (SignSGD with momentum): A new optimizer in the family (SignSGD_adv)
@@ -164,14 +168,6 @@ This library integrates multiple state-of-the-art optimization techniques valida
164
168
  | `beta1` | 0.99 | Controls accumulator memory length:<br>• Small BS: **0.99–0.9999**<br>• Large BS: **0.9** |
165
169
  | `Grad α` | 100 | Most critical parameter:<br>• Inversely scales with batch size<br>• **100–10** for small BS (≤32)<br>• **1–0.1** for large BS (≥512) |
166
170
 
167
- > ⚠️ **Critical**: Requires **~100x smaller learning rate** than AdamW (e.g., 1e-6 vs 1e-4).
168
- > For `Prodigy_Adv`, set `initial_d` to:
169
- > - **LoRA**: `1e-8`
170
- > - **Full FT**: `1e-10`
171
- > - **Embedding**: `1e-7`
172
-
173
- > ⚠️ **Incompatible** with: **Cautious**, **Grams**, **atan2**, and standard update clipping.
174
-
175
171
  ---
176
172
 
177
173
  ### atan2
@@ -22,4 +22,4 @@ __all__ = [
22
22
  "SignSGD_adv",
23
23
  ]
24
24
 
25
- __version__ = "2.2.1.dev2"
25
+ __version__ = "2.2.3"
@@ -7,7 +7,7 @@ from ..util import param_update
7
7
  from ..util.factorization_util import _get_effective_shape, _reconstruct_state, _factorize_state, _nnmf
8
8
  from ..util.OrthoGrad import _orthogonalize_gradient
9
9
  from ..util.Kourkoutas import KourkoutasHelper
10
- from ..util.update_util import _grams_update, _cautious_update
10
+ from ..util.update_util import _grams_update, _cautious_update, _scale_sim_AdEMAMix_update
11
11
 
12
12
  A = 4 / math.pi
13
13
 
@@ -279,6 +279,8 @@ class Adopt_adv(torch.optim.Optimizer):
279
279
  lr = group['lr']
280
280
  step_param_fn = self._step_parameter
281
281
 
282
+ if self.Simplified_AdEMAMix:
283
+ lr = _scale_sim_AdEMAMix_update(beta1, state['step'] + 1, group["alpha_grad"], lr)
282
284
 
283
285
  step_param_fn(p, grad, state, group, lr, beta1, beta2, random_int_tensor)
284
286
 
@@ -9,7 +9,7 @@ from ..util import param_update
9
9
  from ..util.OrthoGrad import _orthogonalize_gradient
10
10
  from ..util.Kourkoutas import KourkoutasHelper
11
11
  from ..util.factorization_util import _get_effective_shape, _reconstruct_state, _factorize_state
12
- from ..util.update_util import _grams_update, _cautious_update
12
+ from ..util.update_util import _grams_update, _cautious_update, _scale_sim_AdEMAMix_update
13
13
 
14
14
  A = 4 / math.pi
15
15
 
@@ -188,9 +188,6 @@ class Prodigy_adv(torch.optim.Optimizer):
188
188
  use_atan2 = False
189
189
  if kourkoutas_beta and not (betas[1] > beta2_min):
190
190
  raise ValueError(f"For Kourkoutas-β, betas[1] (as beta2_max) must be > beta2_min. Got {betas[1]} and {beta2_min}")
191
- if Simplified_AdEMAMix and alpha_grad > 0 and not d_limiter:
192
- # scales d_coef by alpha_grad, this force prodigy to behave well with Simplified_AdEMAMix.
193
- d_coef = d_coef/alpha_grad
194
191
 
195
192
  defaults = {
196
193
  "lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay, "cautious_wd": cautious_wd,
@@ -349,6 +346,9 @@ class Prodigy_adv(torch.optim.Optimizer):
349
346
  d = group['d']
350
347
  step_param_fn = self._step_parameter
351
348
 
349
+ if self.Simplified_AdEMAMix:
350
+ dlr = _scale_sim_AdEMAMix_update(self.beta1, state['step'] + 1, group["alpha_grad"], dlr)
351
+
352
352
  step_param_fn(p, grad, state, group, beta2, d, dlr, random_int_tensor)
353
353
 
354
354
  state['step'] += 1
@@ -7,6 +7,7 @@ from ..util import param_update
7
7
  from ..util.OrthoGrad import _orthogonalize_gradient
8
8
  from ..util.Kourkoutas import KourkoutasHelper
9
9
  from ..util.factorization_util import _get_effective_shape, _reconstruct_state, _factorize_state
10
+ from ..util.update_util import _scale_sim_AdEMAMix_update
10
11
 
11
12
  # A little helper from the original simplified_AdEMAMix
12
13
  def linear_hl_warmup_scheduler(step, beta_end, beta_start=0, warmup=1):
@@ -237,6 +238,8 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
237
238
 
238
239
  lr = group["lr"]
239
240
 
241
+ lr = _scale_sim_AdEMAMix_update(beta1, state['step'] + 1, group["alpha_grad"], lr)
242
+
240
243
  random_int_tensor = None
241
244
 
242
245
  if group.get('compiled_optimizer', False):
@@ -22,3 +22,9 @@ def _cautious_update(mt: torch.Tensor, grad: torch.Tensor, inplace: bool=False):
22
22
  update_mt = mt.mul(mask)
23
23
  del mask
24
24
  return update_mt
25
+
26
+ def _scale_sim_AdEMAMix_update(beta: float, current_step: int, alpha_grad: float, lr: float):
27
+ momentum_scale = (1 - beta ** current_step) / (1 - beta)
28
+ total_scale = 1 / (momentum_scale + alpha_grad)
29
+ lr = lr * total_scale
30
+ return lr
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: adv_optm
3
- Version: 2.2.1.dev2
3
+ Version: 2.2.3
4
4
  Summary: A family of highly efficient, lightweight yet powerful optimizers.
5
5
  Home-page: https://github.com/Koratahiu/Advanced_Optimizers
6
6
  Author: Koratahiu
@@ -37,6 +37,10 @@ A comprehensive, all-in-one collection of optimization algorithms for deep learn
37
37
 
38
38
  ## 🔥 What's New
39
39
 
40
+ ### in 2.2.2
41
+
42
+ - `Simplified_AdEMAMix` now uses the same LR as AdamW for all `beta1` and `alpha_grad` values!
43
+
40
44
  ### in 2.1.x
41
45
 
42
46
  - Added Signum (SignSGD with momentum): A new optimizer in the family (SignSGD_adv)
@@ -195,14 +199,6 @@ This library integrates multiple state-of-the-art optimization techniques valida
195
199
  | `beta1` | 0.99 | Controls accumulator memory length:<br>• Small BS: **0.99–0.9999**<br>• Large BS: **0.9** |
196
200
  | `Grad α` | 100 | Most critical parameter:<br>• Inversely scales with batch size<br>• **100–10** for small BS (≤32)<br>• **1–0.1** for large BS (≥512) |
197
201
 
198
- > ⚠️ **Critical**: Requires **~100x smaller learning rate** than AdamW (e.g., 1e-6 vs 1e-4).
199
- > For `Prodigy_Adv`, set `initial_d` to:
200
- > - **LoRA**: `1e-8`
201
- > - **Full FT**: `1e-10`
202
- > - **Embedding**: `1e-7`
203
-
204
- > ⚠️ **Incompatible** with: **Cautious**, **Grams**, **atan2**, and standard update clipping.
205
-
206
202
  ---
207
203
 
208
204
  ### atan2
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
5
5
 
6
6
  setup(
7
7
  name="adv_optm",
8
- version="2.2.1.dev2",
8
+ version="2.2.3",
9
9
  author="Koratahiu",
10
10
  author_email="hiuhonor@gmail.com",
11
11
  license='Apache 2.0',
File without changes
File without changes