adv-optm 1.1.0.dev4.tar.gz → 1.1.0.dev5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of adv-optm might be problematic.

Files changed (25)
  1. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/PKG-INFO +1 -1
  2. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/adv_optm/__init__.py +1 -1
  3. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/adv_optm/optim/AdamW_adv.py +3 -3
  4. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/adv_optm/optim/Adopt_adv.py +3 -3
  5. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/adv_optm/optim/Lion_Prodigy_adv.py +1 -1
  6. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/adv_optm/optim/Lion_adv.py +1 -1
  7. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/adv_optm/optim/Prodigy_adv.py +13 -6
  8. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/adv_optm/optim/Simplified_AdEMAMix.py +3 -3
  9. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/adv_optm/util/Kourkoutas.py +54 -25
  10. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/adv_optm.egg-info/PKG-INFO +1 -1
  11. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/setup.py +1 -1
  12. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/LICENSE +0 -0
  13. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/README.md +0 -0
  14. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/adv_optm/optim/__init__.py +0 -0
  15. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/adv_optm/util/BF16_Stochastic_Rounding.py +0 -0
  16. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/adv_optm/util/Effective_Shape.py +0 -0
  17. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/adv_optm/util/NNMF.py +0 -0
  18. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/adv_optm/util/One_Bit_Boolean.py +0 -0
  19. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/adv_optm/util/OrthoGrad.py +0 -0
  20. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/adv_optm/util/__init__.py +0 -0
  21. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/adv_optm.egg-info/SOURCES.txt +0 -0
  22. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/adv_optm.egg-info/dependency_links.txt +0 -0
  23. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/adv_optm.egg-info/requires.txt +0 -0
  24. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/adv_optm.egg-info/top_level.txt +0 -0
  25. {adv_optm-1.1.0.dev4 → adv_optm-1.1.0.dev5}/setup.cfg +0 -0

PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: adv_optm
-Version: 1.1.0.dev4
+Version: 1.1.0.dev5
 Summary: A family of highly efficient, lightweight yet powerful optimizers.
 Home-page: https://github.com/Koratahiu/Advanced_Optimizers
 Author: Koratahiu

adv_optm/__init__.py

@@ -16,4 +16,4 @@ __all__ = [
     "Lion_Prodigy_adv",
 ]
 
-__version__ = "1.1.0.dev4"
+__version__ = "1.1.0.dev5"

adv_optm/optim/AdamW_adv.py

@@ -100,8 +100,8 @@ class AdamW_adv(torch.optim.Optimizer):
         alpha: float = 5.0,
         t_alpha: int | None = None,
         kourkoutas_beta: bool = False,
-        beta2_min: float = 0.88,
-        ema_alpha: float = 0.93,
+        beta2_min: float = 0.9,
+        ema_alpha: float = 0.95,
         tiny_spike: float = 1e-9,
         k_warmup_steps: int = 0,
         k_logging: int = 0,
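
The Kourkoutas-β defaults move from beta2_min=0.88 / ema_alpha=0.93 to 0.9 / 0.95 across every optimizer in this release. A hedged usage sketch with the new defaults; the lr and betas arguments and the top-level import are assumptions, since this diff only shows the Kourkoutas-related parameters:

import torch
from adv_optm import AdamW_adv  # import path assumed

model = torch.nn.Linear(10, 10)
opt = AdamW_adv(
    model.parameters(),
    lr=1e-3,                # assumed standard argument
    betas=(0.9, 0.999),     # betas[1] serves as the beta2_max ceiling for dynamic beta2
    kourkoutas_beta=True,
    beta2_min=0.9,          # new default (was 0.88)
    ema_alpha=0.95,         # new default (was 0.93)
    k_warmup_steps=100,     # beta2 stays at beta2_max for the first 100 steps
)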

@@ -167,7 +167,7 @@ class AdamW_adv(torch.optim.Optimizer):
                 state = self.state[p]
 
                 # State Initialization
-                if len(state) == 0:
+                if 'step' not in state:
                     state['step'] = 0
 
                 should_factor = (
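
The same guard change (len(state) == 0 → 'step' not in state) is applied in every optimizer below. It matters because other components can now place keys in a parameter's state before the optimizer's own initialization runs, notably the Kourkoutas helper, which stores a persistent kourkoutas_r_ema tensor there (see the Kourkoutas.py hunks). A minimal illustration with hypothetical values:

state = {}
state['kourkoutas_r_ema'] = 0.0   # another component touched the state first

if len(state) == 0:               # old guard: never fires, 'step' is never created
    state['step'] = 0

if 'step' not in state:           # new guard: initialization still happens
    state['step'] = 0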

adv_optm/optim/Adopt_adv.py

@@ -120,8 +120,8 @@ class Adopt_adv(torch.optim.Optimizer):
         Simplified_AdEMAMix: bool = False,
         alpha_grad: float = 100.0,
         kourkoutas_beta: bool = False,
-        beta2_min: float = 0.88,
-        ema_alpha: float = 0.93,
+        beta2_min: float = 0.9,
+        ema_alpha: float = 0.95,
         tiny_spike: float = 1e-9,
         k_warmup_steps: int = 0,
         k_logging: int = 0,

@@ -195,7 +195,7 @@ class Adopt_adv(torch.optim.Optimizer):
                 state = self.state[p]
 
                 # State Initialization
-                if len(state) == 0:
+                if 'step' not in state:
                     state['step'] = 0
 
                 should_factor = (

adv_optm/optim/Lion_Prodigy_adv.py

@@ -151,7 +151,7 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
                 state = self.state[p]
 
                 # State Initialization
-                if len(state) == 0:
+                if 'step' not in state:
                     state['step'] = 0
 
                 should_factor = (

adv_optm/optim/Lion_adv.py

@@ -99,7 +99,7 @@ class Lion_adv(torch.optim.Optimizer):
                 state = self.state[p]
 
                 # State Initialization
-                if len(state) == 0:
+                if 'step' not in state:
                     state['step'] = 0
 
                 should_factor = (

adv_optm/optim/Prodigy_adv.py

@@ -88,6 +88,9 @@ class Prodigy_adv(torch.optim.Optimizer):
         prodigy_steps (int): If greater than zero, disable Prodigy's stepsize adjustments
             after the specified optimiser step and release all state memory required by Prodigy
             (default: 0).
+        d_limiter (bool): whether to clamp the new step size estimate (`d_hat`)
+            to prevent sudden, volatile increases in the adaptive step size (`d`).
+            (default: False)
         kourkoutas_beta (bool): whether to enable the layer-wise dynamic β₂ logic.
             If `False`, the optimizer behaves as standard AdamW/Prodigy. (default: False)
         beta2_min (float): The minimum value for dynamic β₂, used during periods of

@@ -141,9 +144,11 @@
         fsdp_in_use: bool = False,
         slice_p: int = 11,
         prodigy_steps: int = 0,
+        d_limiter: bool = False,
+        # K-b parameters
         kourkoutas_beta: bool = False,
-        beta2_min: float = 0.88,
-        ema_alpha: float = 0.93,
+        beta2_min: float = 0.9,
+        ema_alpha: float = 0.95,
         tiny_spike: float = 1e-9,
         k_warmup_steps: int = 0,
         k_logging: int = 0,
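
A hedged usage sketch for the new d_limiter flag; the lr argument (typically 1.0 for Prodigy-style optimizers) and the top-level import are assumptions, while the other keyword arguments mirror the signature shown above:

import torch
from adv_optm import Prodigy_adv  # import path assumed

model = torch.nn.Linear(10, 10)
opt = Prodigy_adv(
    model.parameters(),
    lr=1.0,                 # assumed; Prodigy-style optimizers usually keep lr at 1.0
    d_limiter=True,         # clamp each new d_hat to at most ~1.19x the current d
    kourkoutas_beta=True,
    beta2_min=0.9,
    ema_alpha=0.95,
)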

@@ -175,8 +180,8 @@
             use_atan2 = False
         if kourkoutas_beta and not (betas[1] > beta2_min):
             raise ValueError(f"For Kourkoutas-β, betas[1] (as beta2_max) must be > beta2_min. Got {betas[1]} and {beta2_min}")
-        if Simplified_AdEMAMix and alpha_grad > 0:
-            # scales d_coef by alpha_grad, this force prodigy to behave well with Simplified_AdEMAMix
+        if Simplified_AdEMAMix and alpha_grad > 0 and not d_limiter:
+            # scales d_coef by alpha_grad, this force prodigy to behave well with Simplified_AdEMAMix.
             d_coef = d_coef/alpha_grad
 
         defaults = {

@@ -186,7 +191,7 @@
             "beta3_ema": beta3_ema, "alpha": alpha, "t_alpha": t_alpha,
             "beta3": beta3, "d": d0, "d0": d0, "d_max": d0, "d_numerator": 0.0, "d_coef": d_coef,
             "growth_rate": growth_rate, "safeguard_warmup": safeguard_warmup, "k": 0, "slice_p": slice_p,
-            "fsdp_in_use": fsdp_in_use, "prodigy_steps": prodigy_steps,
+            "fsdp_in_use": fsdp_in_use, "prodigy_steps": prodigy_steps, "d_limiter": d_limiter,
             "alpha_grad": alpha_grad,
             "kourkoutas_beta": kourkoutas_beta, "beta2_min": beta2_min, "ema_alpha": ema_alpha,
             "tiny_spike": tiny_spike, "k_warmup_steps": k_warmup_steps, "k_logging": k_logging,

@@ -251,7 +256,7 @@
                 state = self.state[p]
 
                 # State Initialization
-                if len(state) == 0:
+                if 'step' not in state:
                     state['step'] = 0
 
                 should_factor = (

@@ -512,6 +517,8 @@
         d_hat = self.d
         if global_d_denom > 0:
             d_hat = d_coef * global_d_numerator / global_d_denom
+            if g_group['d_limiter']:
+                d_hat = min(self.d * (2 ** 0.25), d_hat)
         if self.d == g_group['d0']:
             self.d = max(self.d, d_hat)
         d_max = max(d_max, d_hat)
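
This is the limiter itself: with d_limiter enabled, a freshly estimated d_hat may exceed the current d by at most a factor of 2 ** 0.25 (about 1.19x per step), so the adaptive step size has to grow over several steps instead of jumping after one noisy estimate. A standalone sketch of that clamp, using hypothetical names rather than the optimizer's internals:

def limited_d_hat(d_prev: float, d_numerator: float, d_denom: float,
                  d_coef: float = 1.0, d_limiter: bool = True) -> float:
    d_hat = d_prev
    if d_denom > 0:
        d_hat = d_coef * d_numerator / d_denom
        if d_limiter:
            # Allow at most ~19% growth over the previous d per step.
            d_hat = min(d_prev * (2 ** 0.25), d_hat)
    return d_hat

# A spiky estimate that would triple d is capped to ~1.19x instead.
print(limited_d_hat(d_prev=1e-4, d_numerator=3.0, d_denom=1e4))  # ≈ 1.19e-4, not 3e-4

This presumably also explains the constructor change earlier in this file: when the limiter is active, the d_coef/alpha_grad pre-scaling for Simplified_AdEMAMix is skipped.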

adv_optm/optim/Simplified_AdEMAMix.py

@@ -90,8 +90,8 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
         stochastic_rounding: bool = True,
         orthogonal_gradient: bool = False,
         kourkoutas_beta: bool = False,
-        beta2_min: float = 0.88,
-        ema_alpha: float = 0.93,
+        beta2_min: float = 0.9,
+        ema_alpha: float = 0.95,
         tiny_spike: float = 1e-9,
         k_warmup_steps: int = 0,
         k_logging: int = 0,

@@ -152,7 +152,7 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
                 state = self.state[p]
 
                 # State Initialization
-                if len(state) == 0:
+                if 'step' not in state:
                     state['step'] = 0
 
                 should_factor = (

adv_optm/util/Kourkoutas.py

@@ -11,9 +11,8 @@ class KourkoutasHelper:
         if not hasattr(optimizer, 'param_groups'):
             raise TypeError("optimizer must be a valid torch.optim.Optimizer instance.")
         self.optimizer = optimizer
-
-        # State managed by the helper
         self.layer_state = {}
+
         self.layer_info = {}
         self._layer_info_built = False
         self._current_step_prepared = -1

@@ -25,6 +24,9 @@
         # making it compatible with fused back pass mechanisms.
         self._build_layer_info_if_needed()
 
+        if self.optimizer.param_groups[0].get('k_logging', 0) > 0:
+            self.print_layer_info()
+
     def _build_layer_info_if_needed(self):
         """Builds a map of layers and the parameters they contain."""
         if self._layer_info_built:

@@ -48,6 +50,24 @@
 
         self._layer_info_built = True
 
+    def print_layer_info(self):
+        """Prints the contents of self.layer_info for debugging."""
+        print("\n--- BEGIN self.layer_info DUMP ---")
+        if not self.layer_info:
+            print("Layer info is empty. Make sure the optimizer has parameters.")
+            return
+
+        for layer_key, info in self.layer_info.items():
+            param_count = len(info['params'])
+            first_param_details = ""
+            if param_count > 0:
+                p = info['params'][0]
+                first_param_details = f" (Example param shape: {list(p.shape)}, dtype: {p.dtype})"
+
+            print(f"Key: {layer_key}, Params: {param_count}{first_param_details}")
+
+        print("--- END self.layer_info DUMP ---\n")
+
     def prepare_step(self, current_step: int):
         """
         Calculates dynamic beta2 for all layers using the completed scalar accumulators
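
Per the hunk above, the dump fires automatically whenever k_logging is set above zero in the first param group, at the point where the helper builds its layer map. A hedged sketch of the kind of mapping the dump walks over; the layer key and grouping below are invented for illustration, not taken from the package:

import torch

# Hypothetical layer_info-style mapping: layer key -> its parameters plus a reference
# to the param group they belong to (mirroring the 'params' / 'group_ref' keys above).
layer = torch.nn.Linear(8, 4)
layer_info = {
    "linear_0": {"params": list(layer.parameters()), "group_ref": {"k_logging": 50}},
}

for layer_key, info in layer_info.items():
    p = info["params"][0]
    print(f"Key: {layer_key}, Params: {len(info['params'])} "
          f"(Example param shape: {list(p.shape)}, dtype: {p.dtype})")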

@@ -55,44 +75,50 @@
         """
 
         beta2_log = []
+        first_layer_key = next(iter(self.layer_info), None)
         # These are just for the sample log, initialize them
-        sun, pooled_grad_norm, r_ema = (torch.tensor(0.0),)*3
-
+        sun, pooled_grad_norm, prev_r_ema_val, r_ema_tensor = (torch.tensor(0.0),)*4
 
         for layer_key, info in self.layer_info.items():
             params, group = info['params'], info['group_ref']
-
+
+            first_param_in_layer = info['params'][0]
+            param_state = self.optimizer.state[first_param_in_layer]
+
             if layer_key not in self.layer_state:
                 self.layer_state[layer_key] = {
-                    'r_ema_grad_norm': torch.tensor(0.0, device=params[0].device, dtype=torch.float32),
-                    'sum_sq_accumulator': torch.tensor(0.0, device=params[0].device, dtype=torch.float32)
+                    'sum_sq_accumulator': torch.tensor(0.0, device=first_param_in_layer.device, dtype=torch.float32)
                 }
 
-            layer_state = self.layer_state[layer_key]
-
-            # Use the completed accumulator from the previous step
-            pooled_grad_norm = torch.sqrt(layer_state['sum_sq_accumulator'])
-
-            r_ema = layer_state['r_ema_grad_norm']
+            if 'kourkoutas_r_ema' not in param_state:
+                param_state['kourkoutas_r_ema'] = torch.tensor(0.0, device=first_param_in_layer.device, dtype=torch.float32)
 
-            # EMA is always updated, even during warmup
-            r_ema.mul_(group['ema_alpha']).add_(pooled_grad_norm, alpha=1.0 - group['ema_alpha'])
-
-            sun = torch.tensor(0.0, device=r_ema.device) # Default sun to 0 for warmup
+            r_ema_tensor = param_state['kourkoutas_r_ema']
+            accumulator = self.layer_state[layer_key]['sum_sq_accumulator']
+
+            pooled_grad_norm = torch.sqrt(accumulator)
+            prev_r_ema_val = r_ema_tensor.item() # for logging
+
+            # Update the persistent EMA tensor in-place.
+            r_ema_tensor.mul_(group['ema_alpha']).add_(pooled_grad_norm, alpha=1.0 - group['ema_alpha'])
+
             beta2_max = group['betas'][1]
-
-            # --- CONSOLIDATED WARMUP LOGIC ---
+            sun = torch.tensor(0.0, device=r_ema_tensor.device) # Default sun to 0 for warmup
+
             if current_step < group['k_warmup_steps']:
                 beta2 = beta2_max
             else:
-                raw = pooled_grad_norm / (r_ema + group['tiny_spike'])
+                raw = pooled_grad_norm / (r_ema_tensor + group['tiny_spike'])
                 sun = raw / (1.0 + raw)
                 beta2 = beta2_max - (beta2_max - group['beta2_min']) * sun
 
-            layer_state['dynamic_beta2'] = beta2.item() if isinstance(beta2, torch.Tensor) else beta2
-            layer_state['sum_sq_accumulator'].zero_()
+            # Store the final calculated beta2 in the helper's transient state for this step.
+            self.layer_state[layer_key]['dynamic_beta2'] = beta2.item() if isinstance(beta2, torch.Tensor) else beta2
+
+            # Reset the accumulator for the next optimizer step.
+            accumulator.zero_()
 
-            beta2_log.append(layer_state['dynamic_beta2'])
+            beta2_log.append(self.layer_state[layer_key]['dynamic_beta2'])
 
         # Always compute stats for TensorBoard
         if beta2_log:
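
The substantive change here is twofold: the running EMA of pooled gradient norms moves from the helper's own layer_state into the optimizer's per-parameter state (param_state['kourkoutas_r_ema'], keyed on the layer's first parameter), so it can be carried along by the optimizer's state_dict, while the per-layer β₂ is still derived from the same "sunspike" ratio. A minimal standalone sketch of that β₂ computation, with hypothetical names and the new default constants:

import torch

def dynamic_beta2(pooled_grad_norm: torch.Tensor, r_ema: torch.Tensor,
                  beta2_max: float = 0.999, beta2_min: float = 0.9,
                  ema_alpha: float = 0.95, tiny_spike: float = 1e-9) -> float:
    # Update the layer's EMA of pooled gradient norms in-place, as prepare_step does.
    r_ema.mul_(ema_alpha).add_(pooled_grad_norm, alpha=1.0 - ema_alpha)
    # "Sunspike" ratio: near 0 for calm gradients, approaching 1 on a spike.
    raw = pooled_grad_norm / (r_ema + tiny_spike)
    sun = raw / (1.0 + raw)
    # Interpolate between beta2_max (calm) and beta2_min (spiky).
    return float(beta2_max - (beta2_max - beta2_min) * sun)

r_ema = torch.tensor(1.0)          # pretend the EMA has already warmed up to ~1.0
for norm in (1.0, 1.1, 5.0):       # the spike on the third step drags beta2 toward beta2_min
    print(dynamic_beta2(torch.tensor(norm), r_ema))

With a fresh zero-valued EMA every gradient looks like a spike, which is why the real code pins β₂ at beta2_max during the first k_warmup_steps.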

@@ -107,9 +133,12 @@
         k_logging_interval = self.optimizer.param_groups[0].get('k_logging', 0)
         is_logging_step = k_logging_interval > 0 and (current_step + 1) % k_logging_interval == 0
         if is_logging_step and self.last_beta2_stats:
+            if first_layer_key:
+                print(f"\n[Kourkoutas-β Debug] Step {current_step + 1} - Sample Layer '{first_layer_key}':")
+                print(f" - Grad Norm: {pooled_grad_norm.item():.4e}, Prev EMA: {prev_r_ema_val:.4e}, New EMA: {r_ema_tensor.item():.4e}")
+                print(f" - Sunspike: {sun.item():.4f}, Dynamic Beta2: {self.layer_state[first_layer_key]['dynamic_beta2']:.4f}")
             print(f"[Kourkoutas-β Debug] Step {current_step + 1} Overall Beta2 Stats: Min={self.last_beta2_stats['min']:.4f}, Max={self.last_beta2_stats['max']:.4f}, Mean={self.last_beta2_stats['mean']:.4f}")
 
-
     def maybe_prepare_step(self, current_step: int):
         """
         A universal guard that calls prepare_step() exactly once per training step.

@@ -125,9 +154,9 @@
         layer_key = self.optimizer.layer_key_fn(p)
 
         if layer_key in self.layer_info:
+            # Initialize the transient state for this layer if it's the first time in the step.
            if layer_key not in self.layer_state:
                self.layer_state[layer_key] = {
-                    'r_ema_grad_norm': torch.tensor(0.0, device=p.device, dtype=torch.float32),
                    'sum_sq_accumulator': torch.tensor(0.0, device=p.device, dtype=torch.float32)
                }
            # Accumulate for the *next* step's prepare_step call
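
Taken together, the helper now keeps only the transient per-step accumulator in layer_state, while the persistent EMA lives in the optimizer state. The cycle is: each parameter's squared-gradient sum is added to sum_sq_accumulator during one step, then prepare_step pools it, updates the EMA, and zeroes the accumulator at the start of the next step. A compact standalone sketch of that two-phase cycle, with hypothetical names:

import torch

accumulator = torch.tensor(0.0)   # transient: one step's sum of squared grads for a layer
r_ema = torch.tensor(0.0)         # persistent: analogous to param_state['kourkoutas_r_ema']

def accumulate(grad: torch.Tensor) -> None:
    # Phase 1 (during the step): add this parameter's squared-gradient sum.
    accumulator.add_(grad.pow(2).sum())

def prepare(ema_alpha: float = 0.95) -> torch.Tensor:
    # Phase 2 (start of the next step): pool, update the EMA in-place, reset the accumulator.
    pooled = torch.sqrt(accumulator)
    r_ema.mul_(ema_alpha).add_(pooled, alpha=1.0 - ema_alpha)
    accumulator.zero_()
    return pooled

accumulate(torch.ones(3, 3))      # pooled norm will be 3.0
print(prepare(), r_ema)           # tensor(3.) tensor(0.1500)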

adv_optm.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: adv_optm
-Version: 1.1.0.dev4
+Version: 1.1.0.dev5
 Summary: A family of highly efficient, lightweight yet powerful optimizers.
 Home-page: https://github.com/Koratahiu/Advanced_Optimizers
 Author: Koratahiu

setup.py

@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setup(
     name="adv_optm",
-    version="1.1.0.dev4",
+    version="1.1.0.dev5",
     author="Koratahiu",
     author_email="hiuhonor@gmail.com",
     license='Apache 2.0',