adv-optm 1.2.dev3.tar.gz → 1.2.dev5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of adv-optm might be problematic.

Files changed (29)
  1. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/PKG-INFO +1 -1
  2. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm/__init__.py +1 -1
  3. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm/optim/AdaMuon_adv.py +43 -21
  4. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm/optim/Muon_adv.py +35 -17
  5. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm.egg-info/PKG-INFO +1 -1
  6. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/setup.py +1 -1
  7. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/LICENSE +0 -0
  8. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/README.md +0 -0
  9. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm/optim/AdamW_adv.py +0 -0
  10. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm/optim/Adopt_adv.py +0 -0
  11. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm/optim/Lion_Prodigy_adv.py +0 -0
  12. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm/optim/Lion_adv.py +0 -0
  13. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm/optim/Prodigy_adv.py +0 -0
  14. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm/optim/Simplified_AdEMAMix.py +0 -0
  15. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm/optim/__init__.py +0 -0
  16. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm/util/BF16_Stochastic_Rounding.py +0 -0
  17. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm/util/Effective_Shape.py +0 -0
  18. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm/util/Kourkoutas.py +0 -0
  19. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm/util/MuonAdam_helper.py +0 -0
  20. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm/util/NNMF.py +0 -0
  21. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm/util/Newton_Schulz.py +0 -0
  22. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm/util/One_Bit_Boolean.py +0 -0
  23. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm/util/OrthoGrad.py +0 -0
  24. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm/util/__init__.py +0 -0
  25. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm.egg-info/SOURCES.txt +0 -0
  26. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm.egg-info/dependency_links.txt +0 -0
  27. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm.egg-info/requires.txt +0 -0
  28. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/adv_optm.egg-info/top_level.txt +0 -0
  29. {adv_optm-1.2.dev3 → adv_optm-1.2.dev5}/setup.cfg +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: adv_optm
-Version: 1.2.dev3
+Version: 1.2.dev5
 Summary: A family of highly efficient, lightweight yet powerful optimizers.
 Home-page: https://github.com/Koratahiu/Advanced_Optimizers
 Author: Koratahiu
adv_optm/__init__.py
@@ -20,4 +20,4 @@ __all__ = [
     "AdaMuon_adv",
 ]

-__version__ = "1.2.dev3"
+__version__ = "1.2.dev5"
adv_optm/optim/AdaMuon_adv.py
@@ -135,7 +135,7 @@ class AdaMuon_adv(torch.optim.Optimizer):
             print("Warning: nesterov is incompatible with Simplified_AdEMAMix, Disabling cautious.")
             nesterov = False

-        defaults = {
+        muon_defaults = {
             "lr": lr, "betas": betas, "weight_decay": weight_decay,
             "eps": eps, "rms_target": rms_target, "ns_steps": ns_steps,
             "ns_eps": ns_eps, "ns_coeffs": ns_coeffs, "nnmf_factor": nnmf_factor,
@@ -154,22 +154,45 @@ class AdaMuon_adv(torch.optim.Optimizer):
         self.helper = None
         self.aux_adam = None

-        if self.MuonWithAuxAdam:
-            adam_kwargs = adam_kwargs or {}
-            # Create a delegate AdamW optimizer to get its default hyperparameters.
-            self.aux_adam = AdamW_adv(
-                [],
-                lr=muon_adam_lr,
-                **adam_kwargs,
-                _is_delegate=True
-            )
-            # Update the defaults dictionary
-            defaults.update(self.aux_adam.defaults)
-
-        super().__init__(params, defaults)
+        if not self.MuonWithAuxAdam:
+            super().__init__(params, muon_defaults)
+            return

-        if self.MuonWithAuxAdam:
-            self.helper = MuonAdamHelper(self, layer_key_fn)
+        # HYBRID OPTIMIZER LOGIC
+        adam_kwargs = adam_kwargs or {}
+        self.aux_adam = AdamW_adv(
+            [],
+            lr=muon_adam_lr,
+            **adam_kwargs,
+            _is_delegate=True
+        )
+        adam_defaults = self.aux_adam.defaults
+
+        final_param_groups = []
+        _layer_key_fn = layer_key_fn if layer_key_fn is not None else lambda p: 'muon'
+
+        for group in params:
+            # All params in a group are of the same type
+            first_param = group['params'][0]
+            key = _layer_key_fn(first_param)
+            optim_type = 'adam' if key == 'adam' else 'muon'
+
+            new_group = group.copy()
+            defaults_to_use = adam_defaults if optim_type == 'adam' else muon_defaults
+
+            for key, value in defaults_to_use.items():
+                new_group.setdefault(key, value)
+            if '_kourkoutas_beta' not in new_group:
+                if optim_type == 'adam':
+                    new_group['_kourkoutas_beta'] = False
+                else:
+                    new_group['_kourkoutas_beta'] = muon_defaults['_kourkoutas_beta']
+            final_param_groups.append(new_group)
+
+        super().__init__(final_param_groups, {})
+
+        # Now that self is initialized, create the helper
+        self.helper = MuonAdamHelper(self, layer_key_fn)


     @property
@@ -196,21 +219,20 @@ class AdaMuon_adv(torch.optim.Optimizer):

     @torch.no_grad()
     def step_parameter(self, p: torch.Tensor, group: dict, i: int | None = None):
+        if group['_kourkoutas_beta'] and self._kourkoutas_helper is None:
+            self._kourkoutas_helper = KourkoutasHelper(self)
+
         if self.MuonWithAuxAdam:
             optim_type = self.helper.get_optimizer_type(p)
             if optim_type == 'adam':
                 # Delegate to the AdamW_adv optimizer's logic.
                 # We need to temporarily "lend" our state and param_groups
-                # to the delegate so it has the full context to work with,
-                # especially for features like Kourkoutas-beta.
                 self.aux_adam.state = self.state
                 self.aux_adam.param_groups = self.param_groups
+
                 self.aux_adam.step_parameter(p, group, i)
                 return

-        if group['_kourkoutas_beta'] and self._kourkoutas_helper is None:
-            self._kourkoutas_helper = KourkoutasHelper(self)
-
         if p.grad is None:
             return

adv_optm/optim/Muon_adv.py
@@ -100,7 +100,7 @@ class Muon_adv(torch.optim.Optimizer):
             print("Warning: nesterov is incompatible with Simplified_AdEMAMix, Disabling cautious.")
             nesterov = False

-        defaults = {
+        muon_defaults = {
             "lr": lr, "beta1": beta1, "weight_decay": weight_decay,
             "nesterov": nesterov, "ns_steps": ns_steps, "ns_eps": ns_eps,
             "ns_coeffs": ns_coeffs, "nnmf_factor": nnmf_factor,
@@ -114,23 +114,41 @@ class Muon_adv(torch.optim.Optimizer):
         self.helper = None
         self.aux_adam = None

-        if self.MuonWithAuxAdam:
-            adam_kwargs = adam_kwargs or {}
-            # Create a delegate AdamW optimizer to get its default hyperparameters.
-            self.aux_adam = AdamW_adv(
-                [],
-                lr=muon_adam_lr,
-                **adam_kwargs,
-                _is_delegate=True
-            )
-            # Update the defaults dictionary
-            defaults.update(self.aux_adam.defaults)
-
-        super().__init__(params, defaults)
+        if not self.MuonWithAuxAdam:
+            super().__init__(params, muon_defaults)
+            return

-        if self.MuonWithAuxAdam:
-            self.helper = MuonAdamHelper(self, layer_key_fn)
+        # HYBRID OPTIMIZER LOGIC
+        adam_kwargs = adam_kwargs or {}
+        self.aux_adam = AdamW_adv(
+            [],
+            lr=muon_adam_lr,
+            **adam_kwargs,
+            _is_delegate=True
+        )
+        adam_defaults = self.aux_adam.defaults
+
+        final_param_groups = []
+        _layer_key_fn = layer_key_fn if layer_key_fn is not None else lambda p: 'muon'
+
+        for group in params:
+            first_param = group['params'][0]
+            key = _layer_key_fn(first_param)
+            optim_type = 'adam' if key == 'adam' else 'muon'
+
+            new_group = group.copy()
+            defaults_to_use = adam_defaults if optim_type == 'adam' else muon_defaults
+
+            for key, value in defaults_to_use.items():
+                new_group.setdefault(key, value)
+
+            final_param_groups.append(new_group)
+
+        super().__init__(final_param_groups, {})

+        # Now that self is initialized, create the helper
+        self.helper = MuonAdamHelper(self, layer_key_fn)
+

     @property
     def supports_fused_back_pass(self):
@@ -335,4 +353,4 @@ class Muon_adv(torch.optim.Optimizer):
             for i, p in enumerate(group['params']):
                 self.step_parameter(p, group, i)

-        return loss
+        return loss
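Muon_adv's constructor receives the same restructuring, minus the _kourkoutas_beta handling that is specific to AdaMuon_adv: each group is routed by layer_key_fn and then filled from either the Muon or the delegate AdamW defaults via setdefault, so caller-supplied hyperparameters always win over defaults. A standalone sketch of that merge rule, using illustrative values rather than the package's actual defaults:

# Each group keeps what the caller set explicitly and inherits only the
# missing keys from the defaults of the optimizer type it is routed to.
muon_defaults = {'lr': 0.02, 'weight_decay': 0.0}     # illustrative values only
adam_defaults = {'lr': 1e-3, 'betas': (0.9, 0.999)}   # illustrative values only

group = {'params': ['w'], 'lr': 0.05}                 # caller overrides lr
defaults_to_use = muon_defaults                       # this group was routed to 'muon'

new_group = dict(group)
for key, value in defaults_to_use.items():
    new_group.setdefault(key, value)

print(new_group)  # {'params': ['w'], 'lr': 0.05, 'weight_decay': 0.0}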
adv_optm.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: adv_optm
-Version: 1.2.dev3
+Version: 1.2.dev5
 Summary: A family of highly efficient, lightweight yet powerful optimizers.
 Home-page: https://github.com/Koratahiu/Advanced_Optimizers
 Author: Koratahiu
setup.py
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:

 setup(
     name="adv_optm",
-    version="1.2.dev3",
+    version="1.2.dev5",
     author="Koratahiu",
     author_email="hiuhonor@gmail.com",
     license='Apache 2.0',