adv-optm 1.2.dev4__py3-none-any.whl → 1.2.dev6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of adv-optm might be problematic; consult the package registry listing for more details.

adv_optm/__init__.py CHANGED
@@ -20,4 +20,4 @@ __all__ = [
20
20
  "AdaMuon_adv",
21
21
  ]
22
22
 
23
- __version__ = "1.2.dev4"
23
+ __version__ = "1.2.dev6"
@@ -182,7 +182,11 @@ class AdaMuon_adv(torch.optim.Optimizer):
182
182
 
183
183
  for key, value in defaults_to_use.items():
184
184
  new_group.setdefault(key, value)
185
-
185
+ if '_kourkoutas_beta' not in new_group:
186
+ if optim_type == 'adam':
187
+ new_group['_kourkoutas_beta'] = False
188
+ else:
189
+ new_group['_kourkoutas_beta'] = muon_defaults['_kourkoutas_beta']
186
190
  final_param_groups.append(new_group)
187
191
 
188
192
  super().__init__(final_param_groups, {})
@@ -225,10 +229,6 @@ class AdaMuon_adv(torch.optim.Optimizer):
225
229
  # We need to temporarily "lend" our state and param_groups
226
230
  self.aux_adam.state = self.state
227
231
  self.aux_adam.param_groups = self.param_groups
228
-
229
- # Ensure the aux optimizer uses the same Kourkoutas helper instance.
230
- if self._kourkoutas_helper is not None:
231
- self.aux_adam.kourkoutas_helper = self._kourkoutas_helper
232
232
 
233
233
  self.aux_adam.step_parameter(p, group, i)
234
234
  return
@@ -86,9 +86,17 @@ class KourkoutasHelper:
86
86
  # These are just for the sample log, initialize them
87
87
  sun, pooled_grad_norm, prev_r_ema_val, r_ema_tensor = (torch.tensor(0.0),)*4
88
88
 
89
+ # The optimizer that owns this helper holds the master defaults for K-b.
90
+ # This is crucial in hybrid optimizers where some param_groups might not
91
+ # have all K-b keys populated, preventing KeyErrors.
92
+ master_defaults = self.optimizer.defaults
93
+
89
94
  for layer_key, info in self.layer_info.items():
90
95
  params, group = info['params'], info['group_ref']
91
96
 
97
+ if not group.get('kourkoutas_beta', False):
98
+ continue
99
+
92
100
  first_param_in_layer = info['params'][0]
93
101
  param_state = self.optimizer.state[first_param_in_layer]
94
102
 
@@ -100,6 +108,15 @@ class KourkoutasHelper:
100
108
  if 'kourkoutas_r_ema' not in param_state:
101
109
  param_state['kourkoutas_r_ema'] = torch.tensor(0.0, device=first_param_in_layer.device, dtype=torch.float32)
102
110
 
111
+ # Use group-specific K-b settings, falling back to the optimizer's master defaults.
112
+ # This makes the helper robust against param groups that enable kourkoutas_beta
113
+ # but are missing the other required hyperparameters.
114
+ ema_alpha = group.get('ema_alpha', master_defaults['ema_alpha'])
115
+ beta2_max = group.get('betas', master_defaults['betas'])[1]
116
+ beta2_min = group.get('beta2_min', master_defaults['beta2_min'])
117
+ tiny_spike = group.get('tiny_spike', master_defaults['tiny_spike'])
118
+ k_warmup_steps = group.get('k_warmup_steps', master_defaults['k_warmup_steps'])
119
+
103
120
  r_ema_tensor = param_state['kourkoutas_r_ema']
104
121
  accumulator = self.layer_state[layer_key]['sum_sq_accumulator']
105
122
 
@@ -107,17 +124,16 @@ class KourkoutasHelper:
107
124
  prev_r_ema_val = r_ema_tensor.item() # for logging
108
125
 
109
126
  # Update the persistent EMA tensor in-place.
110
- r_ema_tensor.mul_(group['ema_alpha']).add_(pooled_grad_norm, alpha=1.0 - group['ema_alpha'])
127
+ r_ema_tensor.mul_(ema_alpha).add_(pooled_grad_norm, alpha=1.0 - ema_alpha)
111
128
 
112
- beta2_max = group['betas'][1]
113
129
  sun = torch.tensor(0.0, device=r_ema_tensor.device) # Default sun to 0 for warmup
114
130
 
115
- if current_step < group['k_warmup_steps']:
131
+ if current_step < k_warmup_steps:
116
132
  beta2 = beta2_max
117
133
  else:
118
- raw = pooled_grad_norm / (r_ema_tensor + group['tiny_spike'])
134
+ raw = pooled_grad_norm / (r_ema_tensor + tiny_spike)
119
135
  sun = raw / (1.0 + raw)
120
- beta2 = beta2_max - (beta2_max - group['beta2_min']) * sun
136
+ beta2 = beta2_max - (beta2_max - beta2_min) * sun
121
137
 
122
138
  # Store the final calculated beta2 in the helper's transient state for this step.
123
139
  self.layer_state[layer_key]['dynamic_beta2'] = beta2.item() if isinstance(beta2, torch.Tensor) else beta2
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: adv_optm
3
- Version: 1.2.dev4
3
+ Version: 1.2.dev6
4
4
  Summary: A family of highly efficient, lightweight yet powerful optimizers.
5
5
  Home-page: https://github.com/Koratahiu/Advanced_Optimizers
6
6
  Author: Koratahiu
@@ -1,5 +1,5 @@
1
- adv_optm/__init__.py,sha256=bB7_VywKpvZbcGCjtZoF8giQgcUgoziISBgIaEUpcAw,379
2
- adv_optm/optim/AdaMuon_adv.py,sha256=s5UkR2YJ_Z10SiBokT97eq4tCHc2D8BEOFDx5AOMryQ,20983
1
+ adv_optm/__init__.py,sha256=PXTst9vLSSIgeIiqqsDcq1FehR-HFUCBT7cP6bDTteA,379
2
+ adv_optm/optim/AdaMuon_adv.py,sha256=hTGSH8wzmQ-NYIcqV6EAEbqCxxfEwmmMWaIadX1qiuQ,21009
3
3
  adv_optm/optim/AdamW_adv.py,sha256=7IvdD1rqYeHZwQCZU9X0H7x87MCKcHQ5M68GLuMCkvE,17702
4
4
  adv_optm/optim/Adopt_adv.py,sha256=C2FsEZGvCk9q4YNKAj0qIxdZ5AfPlda-1lIpSX0a1nE,21256
5
5
  adv_optm/optim/Lion_Prodigy_adv.py,sha256=LEA3UYJpPeFnmxeniLNv1u2LKKj4ufx3Bq_MLw-nWXk,14617
@@ -10,15 +10,15 @@ adv_optm/optim/Simplified_AdEMAMix.py,sha256=sY-vThMVgADRh0ar9WHkrM2n8UcgQLQC1YV
10
10
  adv_optm/optim/__init__.py,sha256=hpUWE6CKtt_rvMdgQVb3PtjhfZAvAxTq6hp8H8rIpBo,489
11
11
  adv_optm/util/BF16_Stochastic_Rounding.py,sha256=Q5H0BcogmE4atP65dLoI21HKSf50lRdsBDfeF6v9Tbg,1548
12
12
  adv_optm/util/Effective_Shape.py,sha256=TBvIk1V8IuTbbBsxuekJA4e_v8JlR5Nujtut8RTWAm4,318
13
- adv_optm/util/Kourkoutas.py,sha256=MDQaNVH8jqzaefks2RShveo44dpYDz88WStwUJ3iF0s,8724
13
+ adv_optm/util/Kourkoutas.py,sha256=WPAjxaH9pGVtLK_QJcwjkJOnN02Hfyu0F2T90hbhtqo,9662
14
14
  adv_optm/util/MuonAdam_helper.py,sha256=7rnNMujZVDaqo1g22QscMyPlZvIHQQSLHMED9_I8QWU,1250
15
15
  adv_optm/util/NNMF.py,sha256=yRf5IP5Sjq0Uf0DxN0Q8NxEGSdD-f1ULziLVDOjY8K4,639
16
16
  adv_optm/util/Newton_Schulz.py,sha256=wJ_sKRaGVIsOofQ737my4ng494qX_pfgOqlDDmYtnCg,1377
17
17
  adv_optm/util/One_Bit_Boolean.py,sha256=Wat49esdwohuN-OHOFMW8D0aOQgV9cP5Rl8z6yfmpos,1068
18
18
  adv_optm/util/OrthoGrad.py,sha256=NzInuBQGy_Ja__M1R9XbvqVaQ0fhGbtGgFE9YON7B3I,707
19
19
  adv_optm/util/__init__.py,sha256=jAaUfaAjFrTJ6-Q915ezAbq0efRbpYjriW2OdeCbSzo,433
20
- adv_optm-1.2.dev4.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
21
- adv_optm-1.2.dev4.dist-info/METADATA,sha256=jNczVxIPq0LuusXuGrZ23CQ4CrMNOfJdBDpDQgulMUw,14022
22
- adv_optm-1.2.dev4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
23
- adv_optm-1.2.dev4.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
24
- adv_optm-1.2.dev4.dist-info/RECORD,,
20
+ adv_optm-1.2.dev6.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
21
+ adv_optm-1.2.dev6.dist-info/METADATA,sha256=IIuYXoR2_uWQo7LjFO-MnBymyygc-Ntd9Fh0HHxnieA,14022
22
+ adv_optm-1.2.dev6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
23
+ adv_optm-1.2.dev6.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
24
+ adv_optm-1.2.dev6.dist-info/RECORD,,