adv-optm 1.2.dev10.tar.gz → 1.2.dev12.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of adv-optm might be problematic.

Files changed (28)
  1. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/PKG-INFO +1 -1
  2. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/__init__.py +1 -1
  3. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/optim/AdaMuon_adv.py +3 -65
  4. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/optim/AdamW_adv.py +4 -8
  5. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/optim/Muon_adv.py +5 -66
  6. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/util/Kourkoutas.py +1 -40
  7. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm.egg-info/PKG-INFO +1 -1
  8. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/setup.py +1 -1
  9. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/LICENSE +0 -0
  10. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/README.md +0 -0
  11. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/optim/Adopt_adv.py +0 -0
  12. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/optim/Lion_Prodigy_adv.py +0 -0
  13. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/optim/Lion_adv.py +0 -0
  14. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/optim/Prodigy_adv.py +0 -0
  15. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/optim/Simplified_AdEMAMix.py +0 -0
  16. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/optim/__init__.py +0 -0
  17. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/util/BF16_Stochastic_Rounding.py +0 -0
  18. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/util/Effective_Shape.py +0 -0
  19. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/util/NNMF.py +0 -0
  20. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/util/Newton_Schulz.py +0 -0
  21. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/util/One_Bit_Boolean.py +0 -0
  22. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/util/OrthoGrad.py +0 -0
  23. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/util/__init__.py +0 -0
  24. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm.egg-info/SOURCES.txt +0 -0
  25. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm.egg-info/dependency_links.txt +0 -0
  26. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm.egg-info/requires.txt +0 -0
  27. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm.egg-info/top_level.txt +0 -0
  28. {adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/setup.cfg +0 -0
{adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: adv_optm
- Version: 1.2.dev10
+ Version: 1.2.dev12
  Summary: A family of highly efficient, lightweight yet powerful optimizers.
  Home-page: https://github.com/Koratahiu/Advanced_Optimizers
  Author: Koratahiu
{adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/__init__.py

@@ -20,4 +20,4 @@ __all__ = [
  "AdaMuon_adv",
  ]

- __version__ = "1.2.dev10"
+ __version__ = "1.2.dev12"
{adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/optim/AdaMuon_adv.py

@@ -11,7 +11,7 @@ from ..util.One_Bit_Boolean import _pack_bools, _unpack_bools

  class AdaMuon_adv(torch.optim.Optimizer):
  """
- Implements the AdaMuon optimizer algorithm.
+ IImplements an advanced AdaMuon optimizer algorithm.

  AdaMuon combines the geometry-aware updates of Muon with the element-wise
  adaptivity of Adam. It is designed for 2D parameters (e.g., linear layers)
@@ -25,9 +25,6 @@ class AdaMuon_adv(torch.optim.Optimizer):
  3. An RMS-aligned rescaling strategy to match the update magnitude of Adam,
  allowing for reuse of learning rate schedules.

- Can also operate in a hybrid mode, using an auxiliary AdamW
- optimizer for specific parameters (e.g., biases, norms, embeddings) as
- defined by a `layer_key_fn`.

  Args:
  params (iterable): iterable of parameters to optimize or dicts defining
@@ -69,12 +66,6 @@ class AdaMuon_adv(torch.optim.Optimizer):
  (default: 128)
  nnmf_factor (bool): whether to use the factorization or disable it to use
  the uncompressed optimizer. (default: False)
- MuonWithAuxAdam (bool): If True, enables the hybrid optimizer mode.
- Parameters designated by `layer_key_fn` will be optimized with
- AdamW_adv instead of Muon. (default: False)
- adam_kwargs (Optional[dict]): A dictionary of keyword arguments to pass
- to the auxiliary AdamW_adv optimizer. Only used when
- `MuonWithAuxAdam` is True. (default: None)
  """

  def __init__(
@@ -99,10 +90,6 @@ class AdaMuon_adv(torch.optim.Optimizer):
  low_rank_ortho: bool = False,
  ortho_rank: int = 128,
  nnmf_factor: bool = False,
- # hybrid optimizer mode
- MuonWithAuxAdam: bool = False,
- muon_adam_lr: float = 1e-4,
- adam_kwargs: Optional[dict] = None,
  ):
  if not (lr >= 0.0):
  raise ValueError(f"Learning-rate should be >= 0.0. Got {lr}")
@@ -114,7 +101,7 @@ class AdaMuon_adv(torch.optim.Optimizer):
  print("Warning: nesterov is incompatible with Simplified_AdEMAMix, Disabling cautious.")
  nesterov = False

- muon_defaults = {
+ defaults = {
  "lr": lr, "betas": betas, "weight_decay": weight_decay,
  "eps": eps, "rms_target": rms_target, "ns_steps": ns_steps,
  "ns_eps": ns_eps, "ns_coeffs": ns_coeffs, "nnmf_factor": nnmf_factor,
@@ -127,34 +114,7 @@ class AdaMuon_adv(torch.optim.Optimizer):
  }
  self.stochastic_rounding = stochastic_rounding

- self.MuonWithAuxAdam = MuonWithAuxAdam
- self.aux_adam = None
-
- if not self.MuonWithAuxAdam:
- super().__init__(params, muon_defaults)
- return
-
- # HYBRID OPTIMIZER LOGIC
- adam_kwargs = adam_kwargs or {}
- self.aux_adam = AdamW_adv(
- [],
- lr=muon_adam_lr,
- **adam_kwargs,
- _is_delegate=True
- )
- adam_defaults = self.aux_adam.defaults
-
- final_param_groups = []
- for group in params:
- optim_type = group.get('optim_type', 'muon')
- defaults_to_use = adam_defaults if optim_type == 'adam' else muon_defaults
-
- new_group = group.copy()
- for key, value in defaults_to_use.items():
- new_group.setdefault(key, value)
- final_param_groups.append(new_group)
-
- super().__init__(final_param_groups, muon_defaults)
+ super().__init__(params, defaults)


  @property
@@ -169,30 +129,8 @@ class AdaMuon_adv(torch.optim.Optimizer):
  def supports_flat_params(self):
  return False

- @property
- def kourkoutas_helper(self):
- """
- Exposes the kourkoutas_helper from the auxiliary AdamW optimizer,
- if it exists. This allows external access for logging K-b.
- """
- if self.aux_adam and hasattr(self.aux_adam, 'kourkoutas_helper'):
- return self.aux_adam.kourkoutas_helper
- return None
-
  @torch.no_grad()
  def step_parameter(self, p: torch.Tensor, group: dict, i: int | None = None):
- if self.MuonWithAuxAdam:
- optim_type = group.get('optim_type')
- if optim_type == 'adam':
- # Delegate to the AdamW_adv optimizer's logic.
- # We need to temporarily "lend" our state and param_groups
- # to the delegate so it has the full context to work with,
- # especially for features like Kourkoutas-beta.
- self.aux_adam.state = self.state
- self.aux_adam.param_groups = self.param_groups
- self.aux_adam.step_parameter(p, group, i)
- return
-
  if p.grad is None:
  return

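With the hybrid `MuonWithAuxAdam` path removed, `AdaMuon_adv.__init__` now hands the parameter groups straight to `torch.optim.Optimizer` with a single set of Muon defaults. As a hedged sketch only (the two-optimizer split, the model, and the keyword values below are assumptions, not documented usage), the removed hybrid mode could be approximated by running a separate AdamW_adv instance alongside AdaMuon_adv:

    import torch
    from adv_optm import AdaMuon_adv, AdamW_adv  # assumed top-level exports

    model = torch.nn.Sequential(torch.nn.Linear(16, 32), torch.nn.LayerNorm(32))

    # Hypothetical split: 2D weights go to AdaMuon, 1D params (biases, norms) to AdamW.
    muon_params = [p for p in model.parameters() if p.ndim >= 2]
    adam_params = [p for p in model.parameters() if p.ndim < 2]

    opt_muon = AdaMuon_adv(muon_params, lr=1e-3)  # remaining kwargs left at their defaults
    opt_adam = AdamW_adv(adam_params, lr=1e-4)

    loss = model(torch.randn(4, 16)).sum()
    loss.backward()
    for opt in (opt_muon, opt_adam):
        opt.step()       # assumes the usual torch.optim step() entry point
        opt.zero_grad()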
{adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/optim/AdamW_adv.py

@@ -107,7 +107,6 @@ class AdamW_adv(torch.optim.Optimizer):
  k_logging: int = 0,
  layer_key_fn: Optional[Callable] = None,
  nnmf_factor: bool = False,
- _is_delegate: bool = False,
  ):
  if not (lr >= 0.0):
  raise ValueError(f"Learning-rate should be >= 0.0. Got {lr}")
@@ -138,11 +137,10 @@ class AdamW_adv(torch.optim.Optimizer):
  self.factored = nnmf_factor
  self.kourkoutas_beta = kourkoutas_beta
  self.layer_key_fn = layer_key_fn
- if not _is_delegate:
- super().__init__(params, defaults)
- else:
- self.defaults = defaults
- self.kourkoutas_helper = None
+ super().__init__(params, defaults)
+
+ if self.kourkoutas_beta:
+ self.kourkoutas_helper = KourkoutasHelper(self)

  @property
  def supports_fused_back_pass(self):
@@ -160,8 +158,6 @@ class AdamW_adv(torch.optim.Optimizer):
  def step_parameter(self, p: torch.Tensor, group: dict, i: int | None = None):
  if p.grad is None:
  return
- if group.get('kourkoutas_beta', False) and self.kourkoutas_helper is None:
- self.kourkoutas_helper = KourkoutasHelper(self)

  grad = p.grad
  if grad.dtype != torch.float32 and self.factored:
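The practical effect of the AdamW_adv change is that the Kourkoutas-β helper is now built eagerly in `__init__` whenever `kourkoutas_beta` is truthy, rather than lazily on the first `step_parameter` call, and the internal `_is_delegate` escape hatch is gone. A minimal sketch, assuming the rest of the constructor signature behaves as shown in the diff:

    import torch
    from adv_optm import AdamW_adv  # assumed top-level export

    params = [torch.nn.Parameter(torch.randn(8, 8))]
    opt = AdamW_adv(params, lr=1e-3, kourkoutas_beta=True)

    # The helper used to appear only after the first optimization step;
    # it is now available immediately after construction, e.g. for logging hooks.
    assert opt.kourkoutas_helper is not None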
{adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/optim/Muon_adv.py

@@ -23,10 +23,6 @@ class Muon_adv(torch.optim.Optimizer):
  This implementation is designed for 2D parameters (e.g., linear layers) and
  can handle other-dimensional parameters (e.g., 1D bias, 4D convolutional layers) by
  flattening/reshaping them.
-
- Can also operate in a hybrid mode, using an auxiliary AdamW
- optimizer for specific parameters (e.g., biases, norms, embeddings) as
- defined by a `layer_key_fn`.

  Args:
  params (iterable): iterable of parameters to optimize or dicts defining
@@ -69,12 +65,6 @@ class Muon_adv(torch.optim.Optimizer):
  normuon_lr_scale (float): Scaling factor for the NorMuon learning rate.
  (default: 0.2)
  normuon_atan2 (bool): whether to use the atan2 for NorMuon. (default: False)
- MuonWithAuxAdam (bool): If True, enables the hybrid optimizer mode.
- Parameters designated by `layer_key_fn` will be optimized with
- AdamW_adv instead of Muon. (default: False)
- adam_kwargs (Optional[dict]): A dictionary of keyword arguments to pass
- to the auxiliary AdamW_adv optimizer. Only used when
- `MuonWithAuxAdam` is True. (default: None)
  """

  def __init__(
@@ -102,10 +92,6 @@ class Muon_adv(torch.optim.Optimizer):
  normuon_eps: float = 1e-8,
  normuon_lr_scale: float = 0.2,
  normuon_atan2: bool = False,
- # hybrid optimizer mode
- MuonWithAuxAdam: bool = False,
- muon_adam_lr: float = 1e-4,
- adam_kwargs: Optional[dict] = None,
  ):
  if not (lr >= 0.0):
  raise ValueError(f"Learning-rate should be >= 0.0. Got {lr}")
@@ -121,7 +107,7 @@ class Muon_adv(torch.optim.Optimizer):
  print("Warning: nesterov is incompatible with Simplified_AdEMAMix, Disabling cautious.")
  nesterov = False

- muon_defaults = {
+ defaults = {
  "lr": lr, "beta1": beta1, "weight_decay": weight_decay,
  "nesterov": nesterov, "ns_steps": ns_steps, "ns_eps": ns_eps,
  "ns_coeffs": ns_coeffs, "nnmf_factor": nnmf_factor,
@@ -137,34 +123,7 @@ class Muon_adv(torch.optim.Optimizer):
  }
  self.stochastic_rounding = stochastic_rounding

- self.MuonWithAuxAdam = MuonWithAuxAdam
- self.aux_adam = None
-
- if not self.MuonWithAuxAdam:
- super().__init__(params, muon_defaults)
- return
-
- # HYBRID OPTIMIZER LOGIC
- adam_kwargs = adam_kwargs or {}
- self.aux_adam = AdamW_adv(
- [],
- lr=muon_adam_lr,
- **adam_kwargs,
- _is_delegate=True
- )
- adam_defaults = self.aux_adam.defaults
-
- final_param_groups = []
- for group in params:
- optim_type = group.get('optim_type', 'muon')
- defaults_to_use = adam_defaults if optim_type == 'adam' else muon_defaults
-
- new_group = group.copy()
- for key, value in defaults_to_use.items():
- new_group.setdefault(key, value)
- final_param_groups.append(new_group)
-
- super().__init__(final_param_groups, muon_defaults)
+ super().__init__(params, defaults)


  @property
@@ -179,30 +138,8 @@ class Muon_adv(torch.optim.Optimizer):
  def supports_flat_params(self):
  return False

- @property
- def kourkoutas_helper(self):
- """
- Exposes the kourkoutas_helper from the auxiliary AdamW optimizer,
- if it exists. This allows external access for logging K-b.
- """
- if self.aux_adam and hasattr(self.aux_adam, 'kourkoutas_helper'):
- return self.aux_adam.kourkoutas_helper
- return None
-
  @torch.no_grad()
  def step_parameter(self, p: torch.Tensor, group: dict, i: int | None = None):
- if self.MuonWithAuxAdam:
- optim_type = group.get('optim_type')
- if optim_type == 'adam':
- # Delegate to the AdamW_adv optimizer's logic.
- # We need to temporarily "lend" our state and param_groups
- # to the delegate so it has the full context to work with,
- # especially for features like Kourkoutas-beta.
- self.aux_adam.state = self.state
- self.aux_adam.param_groups = self.param_groups
- self.aux_adam.step_parameter(p, group, i)
- return
-
  if p.grad is None:
  return

@@ -242,7 +179,9 @@ class Muon_adv(torch.optim.Optimizer):

  # NorMuon state initialization
  if group['normuon_variant']:
- if len(p.shape) >= 2 or state['reshaped_1d_muon']:
+ if state['factored']:
+ state['normuon_v'] = torch.zeros(d1, device=p.device, dtype=torch.float32)
+ elif len(p.shape) >= 2 or state['reshaped_1d_muon']:
  num_rows = p.shape[0] if len(p.shape) >= 2 else state['effective_shape'][0]
  state['normuon_v'] = torch.zeros(num_rows, device=p.device, dtype=torch.float32)

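The last Muon_adv hunk extends NorMuon state initialization to factored (NNMF-compressed) parameters: when `state['factored']` is set, the per-row buffer `normuon_v` is now allocated with length `d1` instead of being skipped. A standalone sketch of the resulting branch order (not the package's exact code; `d1` is assumed to be the first dimension of the factorized effective shape):

    import torch

    def init_normuon_v(p: torch.Tensor, state: dict, d1: int) -> None:
        """Sketch of the updated NorMuon buffer allocation."""
        if state['factored']:
            # New branch: one accumulator entry per row of the d1 x d2 effective shape.
            state['normuon_v'] = torch.zeros(d1, device=p.device, dtype=torch.float32)
        elif len(p.shape) >= 2 or state['reshaped_1d_muon']:
            num_rows = p.shape[0] if len(p.shape) >= 2 else state['effective_shape'][0]
            state['normuon_v'] = torch.zeros(num_rows, device=p.device, dtype=torch.float32)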
{adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm/util/Kourkoutas.py

@@ -24,9 +24,6 @@ class KourkoutasHelper:
  # making it compatible with fused back pass mechanisms.
  self._build_layer_info_if_needed()

- if self.optimizer.param_groups[0].get('k_logging', 0) > 0:
- self.print_layer_info()
-
  def _build_layer_info_if_needed(self):
  """Builds a map of layers and the parameters they contain."""
  if self._layer_info_built:
@@ -53,31 +50,9 @@ class KourkoutasHelper:
  if layer_key not in self.layer_info:
  self.layer_info[layer_key] = {'params': [], 'group_ref': group}
  self.layer_info[layer_key]['params'].append(p)
-
- k_logging_interval = self.optimizer.param_groups[0].get('k_logging', 0)
- if k_logging_interval > 0:
- print(f"[Kourkoutas-β Debug] Layer info built. Found {len(self.layer_info)} unique layers/buckets.")

  self._layer_info_built = True

- def print_layer_info(self):
- """Prints the contents of self.layer_info for debugging."""
- print("\n--- BEGIN self.layer_info DUMP ---")
- if not self.layer_info:
- print("Layer info is empty. Make sure the optimizer has parameters.")
- return
-
- for layer_key, info in self.layer_info.items():
- param_count = len(info['params'])
- first_param_details = ""
- if param_count > 0:
- p = info['params'][0]
- first_param_details = f" (Example param shape: {list(p.shape)}, dtype: {p.dtype})"
-
- print(f"Key: {layer_key}, Params: {param_count}{first_param_details}")
-
- print("--- END self.layer_info DUMP ---\n")
-
  def prepare_step(self, current_step: int):
  """
  Calculates dynamic beta2 for all layers using the completed scalar accumulators
@@ -85,9 +60,8 @@
  """

  beta2_log = []
- first_layer_key = next(iter(self.layer_info), None)
  # These are just for the sample log, initialize them
- sun, pooled_grad_norm, prev_r_ema_val, r_ema_tensor = (torch.tensor(0.0),)*4
+ sun, pooled_grad_norm, r_ema_tensor = (torch.tensor(0.0),)*3

  # The optimizer that owns this helper holds the master defaults for K-b.
  # This is crucial in hybrid optimizers where some param_groups might not
@@ -124,7 +98,6 @@
  accumulator = self.layer_state[layer_key]['sum_sq_accumulator']

  pooled_grad_norm = torch.sqrt(accumulator)
- prev_r_ema_val = r_ema_tensor.item() # for logging

  # Update the persistent EMA tensor in-place.
  r_ema_tensor.mul_(ema_alpha).add_(pooled_grad_norm, alpha=1.0 - ema_alpha)
@@ -150,21 +123,9 @@
  if beta2_log:
  beta2_tensor = torch.tensor(beta2_log, device='cpu')
  self.last_beta2_stats = {
- 'min': beta2_tensor.min().item(),
- 'max': beta2_tensor.max().item(),
  'mean': beta2_tensor.mean().item(),
  }

- # Handle periodic console logging
- k_logging_interval = self.optimizer.param_groups[0].get('k_logging', 0)
- is_logging_step = k_logging_interval > 0 and (current_step + 1) % k_logging_interval == 0
- if is_logging_step and self.last_beta2_stats:
- if first_layer_key:
- print(f"\n[Kourkoutas-β Debug] Step {current_step + 1} - Sample Layer '{first_layer_key}':")
- print(f" - Grad Norm: {pooled_grad_norm.item():.4e}, Prev EMA: {prev_r_ema_val:.4e}, New EMA: {r_ema_tensor.item():.4e}")
- print(f" - Sunspike: {sun.item():.4f}, Dynamic Beta2: {self.layer_state[first_layer_key]['dynamic_beta2']:.4f}")
- print(f"[Kourkoutas-β Debug] Step {current_step + 1} Overall Beta2 Stats: Min={self.last_beta2_stats['min']:.4f}, Max={self.last_beta2_stats['max']:.4f}, Mean={self.last_beta2_stats['mean']:.4f}")
-
  def maybe_prepare_step(self, current_step: int):
  """
  A universal guard that calls prepare_step() exactly once per training step.
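Taken together, the Kourkoutas.py hunks strip the `k_logging` console output: `print_layer_info`, the per-step debug prints, and the 'min'/'max' entries of `last_beta2_stats` are removed, while the per-layer gradient-norm EMA update itself (`r_ema_tensor.mul_(ema_alpha).add_(pooled_grad_norm, alpha=1.0 - ema_alpha)`) is unchanged. External code that reported those stats should now rely on the remaining 'mean' key. A hedged sketch of such a logging hook, assuming `opt` is an AdamW_adv built with `kourkoutas_beta=True` as in the earlier sketch:

    # Hypothetical post-step logging; 'opt' is an AdamW_adv with kourkoutas_beta=True.
    stats = getattr(opt.kourkoutas_helper, 'last_beta2_stats', None)
    if stats:
        # Only 'mean' survives this release; 'min'/'max' were removed with the debug logging.
        print(f"Kourkoutas-beta dynamic beta2 (mean): {stats['mean']:.4f}")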
{adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/adv_optm.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: adv_optm
- Version: 1.2.dev10
+ Version: 1.2.dev12
  Summary: A family of highly efficient, lightweight yet powerful optimizers.
  Home-page: https://github.com/Koratahiu/Advanced_Optimizers
  Author: Koratahiu
{adv_optm-1.2.dev10 → adv_optm-1.2.dev12}/setup.py

@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:

  setup(
  name="adv_optm",
- version="1.2.dev10",
+ version="1.2.dev12",
  author="Koratahiu",
  author_email="hiuhonor@gmail.com",
  license='Apache 2.0',