adv-optm 1.2.dev1.tar.gz → 1.2.dev2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (28)
  1. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/PKG-INFO +1 -1
  2. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm/__init__.py +1 -1
  3. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm/optim/AdamW_adv.py +8 -4
  4. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm/optim/Muon_adv.py +57 -2
  5. adv_optm-1.2.dev2/adv_optm/util/MuonAdam_helper.py +31 -0
  6. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm.egg-info/PKG-INFO +1 -1
  7. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm.egg-info/SOURCES.txt +1 -0
  8. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/setup.py +1 -1
  9. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/LICENSE +0 -0
  10. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/README.md +0 -0
  11. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm/optim/Adopt_adv.py +0 -0
  12. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm/optim/Lion_Prodigy_adv.py +0 -0
  13. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm/optim/Lion_adv.py +0 -0
  14. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm/optim/Prodigy_adv.py +0 -0
  15. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm/optim/Simplified_AdEMAMix.py +0 -0
  16. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm/optim/__init__.py +0 -0
  17. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm/util/BF16_Stochastic_Rounding.py +0 -0
  18. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm/util/Effective_Shape.py +0 -0
  19. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm/util/Kourkoutas.py +0 -0
  20. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm/util/NNMF.py +0 -0
  21. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm/util/Newton_Schulz.py +0 -0
  22. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm/util/One_Bit_Boolean.py +0 -0
  23. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm/util/OrthoGrad.py +0 -0
  24. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm/util/__init__.py +0 -0
  25. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm.egg-info/dependency_links.txt +0 -0
  26. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm.egg-info/requires.txt +0 -0
  27. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm.egg-info/top_level.txt +0 -0
  28. {adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/setup.cfg +0 -0

{adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: adv_optm
-Version: 1.2.dev1
+Version: 1.2.dev2
 Summary: A family of highly efficient, lightweight yet powerful optimizers.
 Home-page: https://github.com/Koratahiu/Advanced_Optimizers
 Author: Koratahiu

{adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm/__init__.py
@@ -18,4 +18,4 @@ __all__ = [
     "Muon_adv",
 ]

-__version__ = "1.2.dev1"
+__version__ = "1.2.dev2"

{adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm/optim/AdamW_adv.py
@@ -107,6 +107,7 @@ class AdamW_adv(torch.optim.Optimizer):
         k_logging: int = 0,
         layer_key_fn: Optional[Callable] = None,
         nnmf_factor: bool = False,
+        _is_delegate: bool = False,
     ):
         if not (lr >= 0.0):
             raise ValueError(f"Learning-rate should be >= 0.0. Got {lr}")
@@ -137,10 +138,11 @@ class AdamW_adv(torch.optim.Optimizer):
         self.factored = nnmf_factor
         self.kourkoutas_beta = kourkoutas_beta
         self.layer_key_fn = layer_key_fn
-        super().__init__(params, defaults)
-
-        if self.kourkoutas_beta:
-            self.kourkoutas_helper = KourkoutasHelper(self)
+        if not _is_delegate:
+            super().__init__(params, defaults)
+        else:
+            self.defaults = defaults
+        self.kourkoutas_helper = None

     @property
     def supports_fused_back_pass(self):
@@ -158,6 +160,8 @@ class AdamW_adv(torch.optim.Optimizer):
     def step_parameter(self, p: torch.Tensor, group: dict, i: int | None = None):
         if p.grad is None:
             return
+        if group.get('kourkoutas_beta', False) and self.kourkoutas_helper is None:
+            self.kourkoutas_helper = KourkoutasHelper(self)

         grad = p.grad
         if grad.dtype != torch.float32 and self.factored:
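
The `_is_delegate` flag added above exists so that Muon_adv can construct a throwaway AdamW_adv purely to borrow its default hyperparameters and per-parameter step logic, without registering any parameters with torch. A minimal sketch of the difference, assuming the 1.2.dev2 module layout shown in this diff and leaving the remaining AdamW_adv constructor arguments at their defaults:

```python
# Minimal sketch, assuming the 1.2.dev2 layout shown above; all other
# AdamW_adv constructor arguments keep their defaults.
import torch
from adv_optm.optim.AdamW_adv import AdamW_adv

# Normal construction: parameters are registered through
# torch.optim.Optimizer.__init__ as usual.
model = torch.nn.Linear(4, 4)
regular = AdamW_adv(model.parameters(), lr=1e-4)
print(len(regular.param_groups))            # 1

# Delegate construction (what Muon_adv does internally): super().__init__
# is skipped, so nothing is registered with torch; the instance only keeps
# its defaults dict, and the Kourkoutas helper stays unset until needed.
delegate = AdamW_adv([], lr=1e-4, _is_delegate=True)
print(isinstance(delegate.defaults, dict))  # True
print(hasattr(delegate, "param_groups"))    # expected: False
```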

{adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm/optim/Muon_adv.py
@@ -1,5 +1,8 @@
 import torch
-from typing import Optional
+from typing import Optional, Callable
+
+from .AdamW_adv import AdamW_adv
+from ..util.MuonAdam_helper import MuonAdamHelper

 from ..util.BF16_Stochastic_Rounding import add_stochastic_
 from ..util.Newton_Schulz import _newton_schulz_iteration
@@ -18,6 +21,10 @@ class Muon_adv(torch.optim.Optimizer):
     This implementation is designed for 2D parameters (e.g., linear layers) and
     can handle other-dimensional parameters (e.g., 1D bias, 4D convolutional layers) by
     flattening/reshaping them.
+
+    This version can also operate in a hybrid mode, using an auxiliary AdamW
+    optimizer for specific parameters (e.g., biases, norms, embeddings) as
+    defined by a `layer_key_fn`.

     Args:
         params (iterable): iterable of parameters to optimize or dicts defining
@@ -39,6 +46,16 @@ class Muon_adv(torch.optim.Optimizer):
             matrices to apply low-rank compression (default: True).
         nnmf_factor (bool): whether to use the factorization or disable it to use
             the uncompressed optimizer. (default: False)
+        MuonWithAuxAdam (bool): If True, enables the hybrid optimizer mode.
+            Parameters designated by `layer_key_fn` will be optimized with
+            AdamW_adv instead of Muon. (default: False)
+        layer_key_fn (Optional[Callable]): A function that takes a parameter `p`
+            and returns a key. If the key is 'adam', the parameter is handled by
+            the auxiliary AdamW optimizer. All other keys are handled by Muon.
+            Only used when `MuonWithAuxAdam` is True. (default: None)
+        adam_kwargs (Optional[dict]): A dictionary of keyword arguments to pass
+            to the auxiliary AdamW_adv optimizer. Only used when
+            `MuonWithAuxAdam` is True. (default: None)
     """

     def __init__(
@@ -55,6 +72,11 @@ class Muon_adv(torch.optim.Optimizer):
         vector_reshape_muon: bool = False,
         vector_reshape: bool = True,
         nnmf_factor: bool = False,
+        # hybrid optimizer mode
+        MuonWithAuxAdam: bool = False,
+        layer_key_fn: Optional[Callable] = None,
+        muon_adam_lr: float = 1e-4,
+        adam_kwargs: Optional[dict] = None,
     ):
         if not (lr >= 0.0):
             raise ValueError(f"Learning-rate should be >= 0.0. Got {lr}")
@@ -73,8 +95,29 @@ class Muon_adv(torch.optim.Optimizer):
             "vector_reshape_muon": vector_reshape_muon,
         }
         self.stochastic_rounding = stochastic_rounding
+
+        self.MuonWithAuxAdam = MuonWithAuxAdam
+        self.helper = None
+        self.aux_adam = None
+
+        if self.MuonWithAuxAdam:
+            adam_kwargs = adam_kwargs or {}
+            # Create a delegate AdamW optimizer to get its default hyperparameters.
+            self.aux_adam = AdamW_adv(
+                [],
+                lr=muon_adam_lr,
+                **adam_kwargs,
+                _is_delegate=True
+            )
+            # Update the defaults dictionary
+            defaults.update(self.aux_adam.defaults)
+
         super().__init__(params, defaults)

+        if self.MuonWithAuxAdam:
+            self.helper = MuonAdamHelper(self, layer_key_fn)
+
+
     @property
     def supports_fused_back_pass(self):
         return True
@@ -89,6 +132,18 @@ class Muon_adv(torch.optim.Optimizer):

     @torch.no_grad()
     def step_parameter(self, p: torch.Tensor, group: dict, i: int | None = None):
+        if self.MuonWithAuxAdam:
+            optim_type = self.helper.get_optimizer_type(p)
+            if optim_type == 'adam':
+                # Delegate to the AdamW_adv optimizer's logic.
+                # We need to temporarily "lend" our state and param_groups
+                # to the delegate so it has the full context to work with,
+                # especially for features like Kourkoutas-beta.
+                self.aux_adam.state = self.state
+                self.aux_adam.param_groups = self.param_groups
+                self.aux_adam.step_parameter(p, group, i)
+                return
+
         if p.grad is None:
             return

@@ -244,4 +299,4 @@ class Muon_adv(torch.optim.Optimizer):
             for i, p in enumerate(group['params']):
                 self.step_parameter(p, group, i)

-        return loss
+        return loss
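
Taken together, the Muon_adv changes expose the hybrid mode through the new constructor arguments MuonWithAuxAdam, layer_key_fn, muon_adam_lr, and adam_kwargs. A hedged usage sketch, assuming the API shown in this diff; the model, routing rule, and learning rates below are illustrative and not taken from the package:

```python
# Usage sketch for the hybrid Muon + auxiliary AdamW mode, assuming the
# 1.2.dev2 signatures shown above; hyperparameters here are illustrative.
import torch
import torch.nn as nn
from adv_optm.optim.Muon_adv import Muon_adv

model = nn.Sequential(nn.Linear(32, 64), nn.ReLU(), nn.Linear(64, 10))

def layer_key_fn(p: torch.Tensor) -> str:
    # Send non-matrix parameters (biases, norms, embeddings) to AdamW_adv;
    # everything 2-D or higher stays with Muon.
    return 'adam' if p.ndim < 2 else 'muon'

opt = Muon_adv(
    model.parameters(),
    lr=1e-3,                              # Muon lr
    MuonWithAuxAdam=True,                 # enable hybrid mode
    layer_key_fn=layer_key_fn,            # 'adam' keys go to the delegate
    muon_adam_lr=1e-4,                    # lr of the auxiliary AdamW_adv
    adam_kwargs={"nnmf_factor": False},   # forwarded to AdamW_adv
)

loss = model(torch.randn(8, 32)).sum()
loss.backward()
opt.step()        # per-parameter routing happens inside step_parameter
opt.zero_grad()
```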

adv_optm-1.2.dev2/adv_optm/util/MuonAdam_helper.py (new file)
@@ -0,0 +1,31 @@
+from torch.optim import Optimizer
+from typing import Callable, Optional
+
+class MuonAdamHelper:
+    """
+    A helper class for Muon_adv to decide whether to use Muon or a delegate
+    AdamW optimizer for a given parameter based on a keying function.
+    """
+    def __init__(self, optimizer: Optimizer, layer_key_fn: Optional[Callable]):
+        if not hasattr(optimizer, 'param_groups'):
+            raise TypeError("optimizer must be a valid torch.optim.Optimizer instance.")
+        self.optimizer = optimizer
+
+        if layer_key_fn is None:
+            # If no function is provided, default all parameters to 'muon'.
+            self.layer_key_fn = lambda p: 'muon'
+        else:
+            self.layer_key_fn = layer_key_fn
+
+    def get_optimizer_type(self, p: "torch.Tensor") -> str:
+        """
+        Returns the designated optimizer type ('adam' or 'muon') for a parameter.
+
+        The user-provided layer_key_fn should return 'adam' for parameters
+        to be handled by the auxiliary AdamW optimizer. Any other return
+        value is treated as 'muon'.
+        """
+        key = self.layer_key_fn(p)
+        if key == 'adam':
+            return 'adam'
+        return 'muon'
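
The helper itself only wraps the keying function and normalizes its output to 'adam' or 'muon'. A standalone sketch of that routing, using the module path introduced above; the torch.optim.SGD instance is there only to satisfy the param_groups check, and the ndim-based rule is illustrative:

```python
# Standalone sketch of MuonAdamHelper routing; the SGD optimizer and the
# ndim-based keying rule are illustrative, not part of the package.
import torch
from adv_optm.util.MuonAdam_helper import MuonAdamHelper

weight = torch.nn.Parameter(torch.randn(64, 32))   # 2-D -> expect 'muon'
bias = torch.nn.Parameter(torch.randn(64))         # 1-D -> expect 'adam'
dummy_opt = torch.optim.SGD([weight, bias], lr=0.1)

helper = MuonAdamHelper(dummy_opt, lambda p: 'adam' if p.ndim < 2 else 'muon')
print(helper.get_optimizer_type(weight))   # 'muon'
print(helper.get_optimizer_type(bias))     # 'adam'

# With no layer_key_fn, every parameter is treated as a Muon parameter.
default_helper = MuonAdamHelper(dummy_opt, None)
print(default_helper.get_optimizer_type(bias))   # 'muon'
```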

{adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: adv_optm
-Version: 1.2.dev1
+Version: 1.2.dev2
 Summary: A family of highly efficient, lightweight yet powerful optimizers.
 Home-page: https://github.com/Koratahiu/Advanced_Optimizers
 Author: Koratahiu

{adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/adv_optm.egg-info/SOURCES.txt
@@ -18,6 +18,7 @@ adv_optm/optim/__init__.py
 adv_optm/util/BF16_Stochastic_Rounding.py
 adv_optm/util/Effective_Shape.py
 adv_optm/util/Kourkoutas.py
+adv_optm/util/MuonAdam_helper.py
 adv_optm/util/NNMF.py
 adv_optm/util/Newton_Schulz.py
 adv_optm/util/One_Bit_Boolean.py

{adv_optm-1.2.dev1 → adv_optm-1.2.dev2}/setup.py
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:

 setup(
     name="adv_optm",
-    version="1.2.dev1",
+    version="1.2.dev2",
     author="Koratahiu",
     author_email="hiuhonor@gmail.com",
     license='Apache 2.0',