adv-optm 1.1.0.dev4__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of adv-optm might be problematic.
- adv_optm/__init__.py +1 -1
- adv_optm/optim/AdamW_adv.py +5 -5
- adv_optm/optim/Adopt_adv.py +5 -5
- adv_optm/optim/Lion_Prodigy_adv.py +1 -1
- adv_optm/optim/Lion_adv.py +1 -1
- adv_optm/optim/Prodigy_adv.py +16 -11
- adv_optm/optim/Simplified_AdEMAMix.py +4 -4
- adv_optm/util/Kourkoutas.py +64 -28
- adv_optm-1.1.1.dist-info/METADATA +275 -0
- adv_optm-1.1.1.dist-info/RECORD +20 -0
- adv_optm-1.1.0.dev4.dist-info/METADATA +0 -174
- adv_optm-1.1.0.dev4.dist-info/RECORD +0 -20
- {adv_optm-1.1.0.dev4.dist-info → adv_optm-1.1.1.dist-info}/WHEEL +0 -0
- {adv_optm-1.1.0.dev4.dist-info → adv_optm-1.1.1.dist-info}/licenses/LICENSE +0 -0
- {adv_optm-1.1.0.dev4.dist-info → adv_optm-1.1.1.dist-info}/top_level.txt +0 -0
adv_optm/__init__.py
CHANGED
adv_optm/optim/AdamW_adv.py
CHANGED
@@ -10,7 +10,7 @@ from ..util.Kourkoutas import KourkoutasHelper
 
 class AdamW_adv(torch.optim.Optimizer):
     """
-    Implements
+    Implements an advanced AdamW algorithm.
     This is an advanced version of AdamW with optional features like
     low-rank factorization of optimizer states (SMMF), OrthoGrad, etc.
 
@@ -67,7 +67,7 @@ class AdamW_adv(torch.optim.Optimizer):
         "sunspike" ratio calculation to prevent division by zero. Corresponds
         to `ε_spike` in the paper. (default: 1e-9)
         k_warmup_steps (int): The number of initial steps during which β₂ is held
-            at a fixed
+            at a fixed beta2 value before the
             dynamic logic activates. (default: 0)
         k_logging (int): if > 0 and kourkoutas_beta=True, enables periodic console
             logging of Kourkoutas-β statistics (min, max, mean of `β₂` across layers)
@@ -100,8 +100,8 @@ class AdamW_adv(torch.optim.Optimizer):
         alpha: float = 5.0,
         t_alpha: int | None = None,
         kourkoutas_beta: bool = False,
-        beta2_min: float = 0.
-        ema_alpha: float = 0.
+        beta2_min: float = 0.9,
+        ema_alpha: float = 0.95,
         tiny_spike: float = 1e-9,
         k_warmup_steps: int = 0,
         k_logging: int = 0,
@@ -167,7 +167,7 @@ class AdamW_adv(torch.optim.Optimizer):
                 state = self.state[p]
 
                 # State Initialization
-                if
+                if 'step' not in state:
                     state['step'] = 0
 
                 should_factor = (
adv_optm/optim/Adopt_adv.py
CHANGED
@@ -10,7 +10,7 @@ from ..util.Kourkoutas import KourkoutasHelper
 
 class Adopt_adv(torch.optim.Optimizer):
     """
-    Implements
+    Implements an advanced ADOPT algorithm.
 
     The ADOPT update rule modifies Adam by:
     1. **Initialization:** The second moment `v` is initialized as `v₀ = g₀²`.
@@ -85,7 +85,7 @@ class Adopt_adv(torch.optim.Optimizer):
         "sunspike" ratio calculation to prevent division by zero. Corresponds
         to `ε_spike` in the paper. (default: 1e-9)
         k_warmup_steps (int): The number of initial steps during which β₂ is held
-            at a fixed
+            at a fixed beta2 value before the
             dynamic logic activates. (default: 0)
         k_logging (int): if > 0 and kourkoutas_beta=True, enables periodic console
             logging of Kourkoutas-β statistics (min, max, mean of `β₂` across layers)
@@ -120,8 +120,8 @@ class Adopt_adv(torch.optim.Optimizer):
         Simplified_AdEMAMix: bool = False,
         alpha_grad: float = 100.0,
         kourkoutas_beta: bool = False,
-        beta2_min: float = 0.
-        ema_alpha: float = 0.
+        beta2_min: float = 0.9,
+        ema_alpha: float = 0.95,
         tiny_spike: float = 1e-9,
         k_warmup_steps: int = 0,
         k_logging: int = 0,
@@ -195,7 +195,7 @@ class Adopt_adv(torch.optim.Optimizer):
                 state = self.state[p]
 
                 # State Initialization
-                if
+                if 'step' not in state:
                     state['step'] = 0
 
                 should_factor = (
adv_optm/optim/Lion_adv.py
CHANGED
adv_optm/optim/Prodigy_adv.py
CHANGED
@@ -14,7 +14,7 @@ from ..util.Kourkoutas import KourkoutasHelper
 
 class Prodigy_adv(torch.optim.Optimizer):
     """
-    Implements
+    Implements an advanced Prodigy algorithm.
     This is an advanced version of Prodigy with optional features like
     low-rank factorization of optimizer states (SMMF), OrthoGrad, etc.
 
@@ -88,6 +88,9 @@ class Prodigy_adv(torch.optim.Optimizer):
         prodigy_steps (int): If greater than zero, disable Prodigy's stepsize adjustments
             after the specified optimiser step and release all state memory required by Prodigy
             (default: 0).
+        d_limiter (bool): whether to clamp the new step size estimate (`d_hat`)
+            to prevent sudden, volatile increases in the adaptive step size (`d`).
+            (default: False)
         kourkoutas_beta (bool): whether to enable the layer-wise dynamic β₂ logic.
             If `False`, the optimizer behaves as standard AdamW/Prodigy. (default: False)
         beta2_min (float): The minimum value for dynamic β₂, used during periods of
@@ -100,7 +103,7 @@ class Prodigy_adv(torch.optim.Optimizer):
         "sunspike" ratio calculation to prevent division by zero. Corresponds
         to `ε_spike` in the paper. (default: 1e-9)
         k_warmup_steps (int): The number of initial steps during which β₂ is held
-            at a fixed
+            at a fixed beta2 value before the
             dynamic logic activates. (default: 0)
         k_logging (int): if > 0 and kourkoutas_beta=True, enables periodic console
             logging of Kourkoutas-β statistics (min, max, mean of `β₂` across layers)
@@ -141,9 +144,11 @@ class Prodigy_adv(torch.optim.Optimizer):
         fsdp_in_use: bool = False,
         slice_p: int = 11,
         prodigy_steps: int = 0,
+        d_limiter: bool = False,
+        # K-b parameters
         kourkoutas_beta: bool = False,
-        beta2_min: float = 0.
-        ema_alpha: float = 0.
+        beta2_min: float = 0.9,
+        ema_alpha: float = 0.95,
         tiny_spike: float = 1e-9,
         k_warmup_steps: int = 0,
         k_logging: int = 0,
@@ -175,8 +180,8 @@ class Prodigy_adv(torch.optim.Optimizer):
             use_atan2 = False
         if kourkoutas_beta and not (betas[1] > beta2_min):
             raise ValueError(f"For Kourkoutas-β, betas[1] (as beta2_max) must be > beta2_min. Got {betas[1]} and {beta2_min}")
-        if Simplified_AdEMAMix and alpha_grad > 0:
-            # scales d_coef by alpha_grad, this force prodigy to behave well with Simplified_AdEMAMix
+        if Simplified_AdEMAMix and alpha_grad > 0 and not d_limiter:
+            # scales d_coef by alpha_grad, this force prodigy to behave well with Simplified_AdEMAMix.
             d_coef = d_coef/alpha_grad
 
         defaults = {
@@ -186,7 +191,7 @@ class Prodigy_adv(torch.optim.Optimizer):
            "beta3_ema": beta3_ema, "alpha": alpha, "t_alpha": t_alpha,
            "beta3": beta3, "d": d0, "d0": d0, "d_max": d0, "d_numerator": 0.0, "d_coef": d_coef,
            "growth_rate": growth_rate, "safeguard_warmup": safeguard_warmup, "k": 0, "slice_p": slice_p,
-           "fsdp_in_use": fsdp_in_use, "prodigy_steps": prodigy_steps,
+           "fsdp_in_use": fsdp_in_use, "prodigy_steps": prodigy_steps, "d_limiter": d_limiter,
            "alpha_grad": alpha_grad,
            "kourkoutas_beta": kourkoutas_beta, "beta2_min": beta2_min, "ema_alpha": ema_alpha,
            "tiny_spike": tiny_spike, "k_warmup_steps": k_warmup_steps, "k_logging": k_logging,
@@ -251,7 +256,7 @@ class Prodigy_adv(torch.optim.Optimizer):
                 state = self.state[p]
 
                 # State Initialization
-                if
+                if 'step' not in state:
                     state['step'] = 0
 
                 should_factor = (
@@ -306,10 +311,8 @@ class Prodigy_adv(torch.optim.Optimizer):
                     self.kourkoutas_helper.accumulate_gradient_sq_norm(p, grad)
                     # Get the dynamic beta2 calculated in prepare_step()
                     beta2 = self.kourkoutas_helper.get_beta2(p, group, current_step)
-                    beta3 = math.sqrt(beta2)
                 else:
                     beta2 = self.beta2_default
-                    beta3 = self.beta3
 
                 if self.use_AdEMAMix:
                     beta3_ema = group['beta3_ema']
@@ -446,7 +449,7 @@ class Prodigy_adv(torch.optim.Optimizer):
                 self.d_numerator += (self.d / d0) * self.dlr * torch.dot(grad_flat[::slice_p], p0.data - p_flat[::slice_p]).item()
 
                 alpha = ((self.d / d0) * self.d) if safeguard_warmup else ((self.d / d0) * self.dlr)
-                s.mul_(beta3).add_(grad_flat[::slice_p], alpha=alpha)
+                s.mul_(self.beta3).add_(grad_flat[::slice_p], alpha=alpha)
                 self.d_denom += s.abs().sum().item()
 
                 del s, p0, grad_flat, p_flat, alpha
@@ -512,6 +515,8 @@ class Prodigy_adv(torch.optim.Optimizer):
         d_hat = self.d
         if global_d_denom > 0:
             d_hat = d_coef * global_d_numerator / global_d_denom
+            if g_group['d_limiter']:
+                d_hat = min(self.d * (2 ** 0.25), d_hat)
         if self.d == g_group['d0']:
             self.d = max(self.d, d_hat)
         d_max = max(d_max, d_hat)
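The `d_limiter` flag added in this release caps how quickly the new Prodigy step-size estimate `d_hat` may raise `d` between steps. A minimal standalone sketch of the clamping rule from the last hunk above (the numbers are hypothetical, just to show the effect):

```python
# Sketch of the d_limiter clamp shown in the hunk above: a fresh estimate may
# raise d by at most a factor of 2**0.25 per optimizer step.
d = 1e-6          # current adaptive step size (hypothetical value)
d_hat = 5e-6      # fresh estimate from d_coef * numerator / denominator

d_hat_limited = min(d * (2 ** 0.25), d_hat)
print(f"{d_hat_limited:.3e}")   # 1.189e-06: growth capped to ~19% for this step
```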
adv_optm/optim/Simplified_AdEMAMix.py
CHANGED
@@ -61,7 +61,7 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
         "sunspike" ratio calculation to prevent division by zero. Corresponds
         to `ε_spike` in the paper. (default: 1e-9)
         k_warmup_steps (int): The number of initial steps during which β₂ is held
-            at a fixed
+            at a fixed beta2 value before the
             dynamic logic activates. (default: 0)
         k_logging (int): if > 0 and kourkoutas_beta=True, enables periodic console
             logging of Kourkoutas-β statistics (min, max, mean of `β₂` across layers)
@@ -90,8 +90,8 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
         stochastic_rounding: bool = True,
         orthogonal_gradient: bool = False,
         kourkoutas_beta: bool = False,
-        beta2_min: float = 0.
-        ema_alpha: float = 0.
+        beta2_min: float = 0.9,
+        ema_alpha: float = 0.95,
         tiny_spike: float = 1e-9,
         k_warmup_steps: int = 0,
         k_logging: int = 0,
@@ -152,7 +152,7 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
                 state = self.state[p]
 
                 # State Initialization
-                if
+                if 'step' not in state:
                     state['step'] = 0
 
                 should_factor = (
adv_optm/util/Kourkoutas.py
CHANGED
@@ -11,9 +11,8 @@ class KourkoutasHelper:
         if not hasattr(optimizer, 'param_groups'):
             raise TypeError("optimizer must be a valid torch.optim.Optimizer instance.")
         self.optimizer = optimizer
-
-        # State managed by the helper
         self.layer_state = {}
+
         self.layer_info = {}
         self._layer_info_built = False
         self._current_step_prepared = -1
@@ -25,14 +24,24 @@ class KourkoutasHelper:
         # making it compatible with fused back pass mechanisms.
         self._build_layer_info_if_needed()
 
+        if self.optimizer.param_groups[0].get('k_logging', 0) > 0:
+            self.print_layer_info()
+
     def _build_layer_info_if_needed(self):
         """Builds a map of layers and the parameters they contain."""
         if self._layer_info_built:
             return
 
-        if
-
-
+        if hasattr(self.optimizer, 'layer_key_fn') and self.optimizer.layer_key_fn is not None:
+            # A custom key function was provided by the user. We will use it.
+            pass
+        else:
+            # No key function was provided. Default to coarse, shape-based bucketing.
+            self.optimizer.layer_key_fn = lambda p: \
+                (id(p),) if p.dim() == 2 and 1 <= p.shape[0] <= 10 and p.shape[1] in {768, 1280, 4096} \
+                else tuple(p.shape)
+            # This ensures that we won't mix embeddings with tokens (1 to 10)
+            # TODO find a better way to safeguard the embeddings
 
         for group in self.optimizer.param_groups:
             for p in group['params']:
@@ -48,6 +57,24 @@ class KourkoutasHelper:
 
         self._layer_info_built = True
 
+    def print_layer_info(self):
+        """Prints the contents of self.layer_info for debugging."""
+        print("\n--- BEGIN self.layer_info DUMP ---")
+        if not self.layer_info:
+            print("Layer info is empty. Make sure the optimizer has parameters.")
+            return
+
+        for layer_key, info in self.layer_info.items():
+            param_count = len(info['params'])
+            first_param_details = ""
+            if param_count > 0:
+                p = info['params'][0]
+                first_param_details = f" (Example param shape: {list(p.shape)}, dtype: {p.dtype})"
+
+            print(f"Key: {layer_key}, Params: {param_count}{first_param_details}")
+
+        print("--- END self.layer_info DUMP ---\n")
+
     def prepare_step(self, current_step: int):
         """
         Calculates dynamic beta2 for all layers using the completed scalar accumulators
@@ -55,44 +82,50 @@ class KourkoutasHelper:
         """
 
         beta2_log = []
+        first_layer_key = next(iter(self.layer_info), None)
         # These are just for the sample log, initialize them
-        sun, pooled_grad_norm,
-
+        sun, pooled_grad_norm, prev_r_ema_val, r_ema_tensor = (torch.tensor(0.0),)*4
 
         for layer_key, info in self.layer_info.items():
             params, group = info['params'], info['group_ref']
-
+
+            first_param_in_layer = info['params'][0]
+            param_state = self.optimizer.state[first_param_in_layer]
+
             if layer_key not in self.layer_state:
                 self.layer_state[layer_key] = {
-                    '
-                    'sum_sq_accumulator': torch.tensor(0.0, device=params[0].device, dtype=torch.float32)
+                    'sum_sq_accumulator': torch.tensor(0.0, device=first_param_in_layer.device, dtype=torch.float32)
                 }
 
-
-
-            # Use the completed accumulator from the previous step
-            pooled_grad_norm = torch.sqrt(layer_state['sum_sq_accumulator'])
-
-            r_ema = layer_state['r_ema_grad_norm']
+            if 'kourkoutas_r_ema' not in param_state:
+                param_state['kourkoutas_r_ema'] = torch.tensor(0.0, device=first_param_in_layer.device, dtype=torch.float32)
 
-
-
-
-
+            r_ema_tensor = param_state['kourkoutas_r_ema']
+            accumulator = self.layer_state[layer_key]['sum_sq_accumulator']
+
+            pooled_grad_norm = torch.sqrt(accumulator)
+            prev_r_ema_val = r_ema_tensor.item() # for logging
+
+            # Update the persistent EMA tensor in-place.
+            r_ema_tensor.mul_(group['ema_alpha']).add_(pooled_grad_norm, alpha=1.0 - group['ema_alpha'])
+
             beta2_max = group['betas'][1]
-
-
+            sun = torch.tensor(0.0, device=r_ema_tensor.device) # Default sun to 0 for warmup
+
             if current_step < group['k_warmup_steps']:
                 beta2 = beta2_max
             else:
-                raw = pooled_grad_norm / (
+                raw = pooled_grad_norm / (r_ema_tensor + group['tiny_spike'])
                 sun = raw / (1.0 + raw)
                 beta2 = beta2_max - (beta2_max - group['beta2_min']) * sun
 
-
-            layer_state['
+            # Store the final calculated beta2 in the helper's transient state for this step.
+            self.layer_state[layer_key]['dynamic_beta2'] = beta2.item() if isinstance(beta2, torch.Tensor) else beta2
+
+            # Reset the accumulator for the next optimizer step.
+            accumulator.zero_()
 
-            beta2_log.append(layer_state['dynamic_beta2'])
+            beta2_log.append(self.layer_state[layer_key]['dynamic_beta2'])
 
         # Always compute stats for TensorBoard
         if beta2_log:
@@ -107,9 +140,12 @@ class KourkoutasHelper:
         k_logging_interval = self.optimizer.param_groups[0].get('k_logging', 0)
         is_logging_step = k_logging_interval > 0 and (current_step + 1) % k_logging_interval == 0
         if is_logging_step and self.last_beta2_stats:
+            if first_layer_key:
+                print(f"\n[Kourkoutas-β Debug] Step {current_step + 1} - Sample Layer '{first_layer_key}':")
+                print(f"  - Grad Norm: {pooled_grad_norm.item():.4e}, Prev EMA: {prev_r_ema_val:.4e}, New EMA: {r_ema_tensor.item():.4e}")
+                print(f"  - Sunspike: {sun.item():.4f}, Dynamic Beta2: {self.layer_state[first_layer_key]['dynamic_beta2']:.4f}")
             print(f"[Kourkoutas-β Debug] Step {current_step + 1} Overall Beta2 Stats: Min={self.last_beta2_stats['min']:.4f}, Max={self.last_beta2_stats['max']:.4f}, Mean={self.last_beta2_stats['mean']:.4f}")
 
-
     def maybe_prepare_step(self, current_step: int):
         """
         A universal guard that calls prepare_step() exactly once per training step.
@@ -125,9 +161,9 @@ class KourkoutasHelper:
             layer_key = self.optimizer.layer_key_fn(p)
 
             if layer_key in self.layer_info:
+                # Initialize the transient state for this layer if it's the first time in the step.
                 if layer_key not in self.layer_state:
                     self.layer_state[layer_key] = {
-                        'r_ema_grad_norm': torch.tensor(0.0, device=p.device, dtype=torch.float32),
                         'sum_sq_accumulator': torch.tensor(0.0, device=p.device, dtype=torch.float32)
                     }
                 # Accumulate for the *next* step's prepare_step call
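As the `_build_layer_info_if_needed` hunk shows, the helper only installs its shape-based bucketing lambda when the optimizer carries no `layer_key_fn`; a user-supplied key function takes precedence, and the attribute is checked when the layer map is first built. A minimal sketch of overriding it (the import path, the `lr` default, and the grouping rule here are illustrative assumptions, not documented package API):

```python
import torch
from adv_optm import Prodigy_adv  # assumed re-export of adv_optm/optim/Prodigy_adv.py

model = torch.nn.Linear(16, 4)
opt = Prodigy_adv(model.parameters(), lr=1.0, kourkoutas_beta=True)

# Bucket parameters by shape so weights and biases land in separate Kourkoutas-β
# layers; the helper reads this attribute lazily, before installing its default lambda.
opt.layer_key_fn = lambda p: tuple(p.shape)
```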
adv_optm-1.1.1.dist-info/METADATA
ADDED
@@ -0,0 +1,275 @@
Metadata-Version: 2.4
Name: adv_optm
Version: 1.1.1
Summary: A family of highly efficient, lightweight yet powerful optimizers.
Home-page: https://github.com/Koratahiu/Advanced_Optimizers
Author: Koratahiu
Author-email: hiuhonor@gmail.com
License: Apache 2.0
Keywords: llm,fine-tuning,memory-efficient,low-rank,compression,pytorch,optimizer,adam
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Operating System :: OS Independent
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: torch>=2.0
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: keywords
Dynamic: license
Dynamic: license-file
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary

# Advanced Optimizers (AIO)

A comprehensive, all-in-one collection of optimization algorithms for deep learning, designed for **maximum efficiency**, **minimal memory footprint**, and **superior performance** across diverse model architectures and training scenarios.

[](https://pypi.org/project/adv_optm/)

---

## 📦 Installation

```bash
pip install adv_optm
```

---

## 🧠 Core Innovations

This library integrates multiple state-of-the-art optimization techniques validated through extensive research and practical training, with **1-bit compression for optimizer states**:

### **Memory-Efficient Optimization (SMMF-inspired)**
- **Paper**: [SMMF: Square-Matricized Momentum Factorization](https://arxiv.org/abs/2412.08894)
- **Approach**: Uses rank-1 non-negative matrix factorization with reconstruction cycle (factor → reconstruct → update → factor)
- **Innovation**:
  - First moment split into **1-bit sign + absolute value**
  - Final storage: **four factored vectors + one 1-bit sign state**
  - Preserves Adam-like update quality with drastically reduced memory

---
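To picture the factor → reconstruct → update → factor cycle described above, the sketch below uses Adafactor-style row/column statistics as a stand-in for the rank-1 factorization. It illustrates the idea only; it is not the package's exact SMMF procedure, which additionally carries the 1-bit sign state.

```python
import torch

V = torch.rand(4, 6)                  # e.g. a non-negative second-moment matrix

# Factor: keep only one vector per row and one per column between steps.
row = V.sum(dim=1, keepdim=True)      # shape (4, 1)
col = V.sum(dim=0, keepdim=True)      # shape (1, 6)

# Reconstruct a rank-1 approximation, update it with new statistics, re-factor.
V_hat = row @ col / V.sum()
print(V.numel(), row.numel() + col.numel())   # 24 floats stored -> 10 floats stored
```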
## ⚡ Performance Characteristics

### Memory Efficiency (SDXL Model – 6.5GB)
| Optimizer | Memory Usage | Description |
|-----------|--------------|-------------|
| `Adopt_Factored` | 328 MB | 4 small vectors + 1-bit state |
| `Adopt_Factored + AdEMAMix` | 625 MB | 6 small vectors + two 1-bit states |
| `Simplified_AdEMAMix` | 328 MB | Same as standard factored (no extra state) |

### Speed Comparison (SDXL, Batch Size 4)
| Optimizer | Speed | Notes |
|-----------|-------|-------|
| `Adafactor` | ~8.5s/it | Baseline |
| `Adopt_Factored` | ~10s/it | +18% overhead from compression |
| `Adopt_Factored + AdEMAMix` | ~12s/it | +41% overhead (3 factored states) |

---

## 🧪 Available Optimizers

### Standard Optimizers (All support `factored=True/False`)
| Optimizer | Description | Best For |
|-----------|-------------|----------|
| `Adam_Adv` | Advanced Adam implementation | General purpose |
| `Adopt_Adv` | Adam-variant with independent beta2 | Stable training for small batch size regimes |
| `Prodigy_Adv` | Prodigy with D-Adaptation | Adam with automatic LR tuning |
| `Simplified_AdEMAMix` | Adam variant with accumulator momentum | Small/large batch training when tuned correctly |
| `Lion_Adv` | Advanced Lion implementation | Memory-constrained environments |
| `Prodigy_Lion_Adv` | Prodigy + Lion combination | Lion with automatic LR tuning |

---
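A minimal usage sketch with one of the optimizers listed above. The class name follows the module layout in this wheel (`adv_optm/optim/AdamW_adv.py`); the top-level re-export and the `lr` argument are assumptions, since `adv_optm/__init__.py` is not shown in this diff:

```python
import torch
from adv_optm import AdamW_adv  # assumed; otherwise: from adv_optm.optim.AdamW_adv import AdamW_adv

model = torch.nn.Linear(128, 64)
optimizer = AdamW_adv(
    model.parameters(),
    lr=1e-4,                # assumed standard argument
    kourkoutas_beta=True,   # layer-wise dynamic β₂ (parameter name from the diff)
    k_warmup_steps=50,      # hold β₂ fixed for the first 50 steps
)

for _ in range(10):
    loss = model(torch.randn(8, 128)).pow(2).mean()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
```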
## ⚙️ Feature Matrix

| Feature | Adam_Adv | Adopt_Adv | Prodigy_Adv | Simplified_AdEMAMix | Lion_Adv |
|---------|----------|-----------|-------------|---------------------|----------|
| Factored | ✓ | ✓ | ✓ | ✓ | ✓ |
| AdEMAMix | ✓ | ✓ | ✓ | ✗ | ✗ |
| Simplified_AdEMAMix | ✗ | ✓ | ✓ | ✓ | ✗ |
| OrthoGrad | ✓ | ✓ | ✓ | ✓ | ✓ |
| Grams | ✓ | ✓ | ✓ | ✗ | ✗ |
| Cautious | ✓ | ✓ | ✓ | ✗ | ✓ |
| atan2 | ✓ | ✓ | ✓ | ✗ | ✗ |
| Stochastic Rounding | ✓ | ✓ | ✓ | ✓ | ✓ |
| Fused Backward Pass | ✓ | ✓ | ✓ | ✓ | ✓ |
| **Kourkoutas-β** | ✓ | ✓ | ✓ | ✓ | ✗ |

---

## 🛠️ Comprehensive Feature Guide

### A. Universal Safe Features
*These features work with all optimizers and are generally safe to enable.*

| Feature | Description | Recommended Usage | Performance Impact | Theoretical Basis | Compatibility |
|--------|-------------|-------------------|--------------------|-------------------|--------------|
| **Fused Back Pass** | Fuses backward pass; gradients used immediately and memory freed on-the-fly | Memory-constrained environments | Reduces peak memory | Memory optimization | All optimizers |
| **Stochastic Rounding** | Replaces nearest rounding with stochastic rounding to preserve small gradient updates in BF16 | BF16 training | Minimal overhead (<5%) | [Revisiting BFloat16 Training](https://arxiv.org/abs/2010.06192) | All optimizers |
| **OrthoGrad** | Removes gradient component parallel to weights to reduce overfitting (see the sketch after this table) | Full fine-tuning without weight decay | +33% time overhead (BS=4); less at larger BS | [Grokking at Edge](https://github.com/LucasPrietoAl/grokking-at-the-edge-of-numerical-stability) | All optimizers |
| **Factored** | Memory-efficient optimization via rank-1 1-bit factorization of optimizer states | Large models / memory-limited hardware | Adds compression overhead | [SMMF](https://arxiv.org/abs/2412.08894) | All optimizers |
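A minimal sketch of the OrthoGrad projection referenced in the table above: the gradient component parallel to the weight vector is removed before the update. This illustrates the core idea only; the package's own implementation lives in `adv_optm/util/OrthoGrad.py` and may differ in details such as rescaling.

```python
import torch

def orthogonal_gradient(p: torch.Tensor, grad: torch.Tensor, eps: float = 1e-30) -> torch.Tensor:
    """Return grad with its component parallel to p removed (illustrative sketch)."""
    w, g = p.flatten(), grad.flatten()
    coeff = torch.dot(w, g) / (torch.dot(w, w) + eps)   # scalar projection of g onto w
    return (g - coeff * w).view_as(grad)

w = torch.randn(4, 4)
g = torch.randn(4, 4)
g_orth = orthogonal_gradient(w, g)
print(torch.dot(w.flatten(), g_orth.flatten()))   # ~0: parallel component removed
```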
### B. Individual Features

| Feature | Description | Recommended Usage | Performance Impact | Theoretical Basis | Compatibility |
|--------|-------------|-------------------|--------------------|-------------------|--------------|
| **Cautious** | Only applies update if gradient direction aligns with momentum direction | Accelerating convergence | No overhead | [C-Optim](https://github.com/kyleliang919/C-Optim) | Adam/Adopt/Prodigy/Lion |
| **Grams** | Update direction derived purely from current gradient | When Cautious is insufficient | No overhead | [Grams](https://github.com/Gunale0926/Grams) | Adam/Adopt/Prodigy |
| **AdEMAMix** | Dual EMA system that retains relevance of gradients over tens of thousands of steps | Long training runs, especially where model forgetting is a concern | +1 state memory | [AdEMAMix](https://arxiv.org/abs/2409.03137) | Adam/Adopt/Prodigy |
| **Simplified_AdEMAMix** | Accumulator-based momentum, single EMA variant of AdEMAMix | All scenarios when tuned correctly | No overhead | [Connections](https://arxiv.org/abs/2502.02431) | Adam/Adopt/Prodigy |
| **atan2** | Robust epsilon replacement with built-in gradient clipping | Use for stable bounded updates (or for Adopt as it needs that) | No overhead | [Adam-atan2](https://github.com/lucidrains/adam-atan2-pytorch) | Adam/Adopt/Prodigy |
| **Kourkoutas-β** | Layer-wise adaptive β₂ based on gradient “sunspike” ratio | Noisy/small/large-batch/high-LR training | No overhead | [Kourkoutas-β]() | Adam/Adopt/Prodigy/Simplified_AdEMAMix |

> **Note**: If both **Cautious** and **Grams** are enabled, **Grams takes precedence** and Cautious is disabled.

---

## 🔍 Feature Deep Dives

### AdEMAMix

- Adds a **slow-decaying second EMA** (`beta3`) that retains gradient memory over tens of thousands of steps (see the sketch after this section).
- Particularly effective for **small batch sizes**, where Adam’s standard first moment is nearly useless.
- **Reference**: [AdaMeM: Memory Efficient Momentum for Adafactor](https://openreview.net/forum?id=fZqMVTz7K5)

#### Tunable Hyperparameters
| Parameter | Default | Tuning Guide |
|-----------|---------|--------------|
| `beta3` | 0.9999 | • Runs >120k steps: **0.9999**<br>• Runs ≤120k steps: **0.999** |
| `alpha` | 5 | • Reduce to **2–3** if diverging<br>• Increase to strengthen long-term memory |

> ✅ **Pro Tip**: Set `beta1=0` in Adam/Adopt/Prodigy to skip standard EMA entirely and rely solely on AdEMAMix’s slow EMA, ideal for small-batch regimes.

---
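For intuition, a compact sketch of the two-EMA numerator that AdEMAMix adds on top of Adam, following the paper's formulation (bias correction omitted; the variable names here do not mirror the package's internal state keys):

```python
import torch

beta1, beta3, beta2, alpha, eps = 0.9, 0.9999, 0.999, 5.0, 1e-8
m_fast = torch.zeros(10)   # standard Adam first moment
m_slow = torch.zeros(10)   # slow, long-memory EMA controlled by beta3
v = torch.zeros(10)        # second moment

grad = torch.randn(10)
m_fast.mul_(beta1).add_(grad, alpha=1 - beta1)
m_slow.mul_(beta3).add_(grad, alpha=1 - beta3)
v.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

# The slow EMA enters the numerator scaled by alpha, so old gradients keep influence.
update = (m_fast + alpha * m_slow) / (v.sqrt() + eps)
```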
### Simplified_AdEMAMix

- Introduced in [Connections between Schedule-Free Optimizers, AdEMAMix, and Accelerated SGD Variants (arXiv:2502.02431)](https://arxiv.org/abs/2502.02431).
- Replaces Adam’s first moment with a **gradient accumulator**, combining the stability of long memory with responsiveness to recent gradients.
- **Key insight**: Classical momentum **does not accelerate** in noisy (small-batch) regimes; this accumulator does.

#### Tunable Hyperparameters
| Parameter | Default | Tuning Guide |
|----------|---------|--------------|
| `beta1` | 0.99 | Controls accumulator memory length:<br>• Small BS: **0.99–0.9999**<br>• Large BS: **0.9** |
| `Grad α` | 100 | Most critical parameter:<br>• Inversely scales with batch size<br>• **100–10** for small BS (≤32)<br>• **1–0.1** for large BS (≥512) |

> ⚠️ **Critical**: Requires **~100x smaller learning rate** than AdamW (e.g., 1e-6 vs 1e-4).
> For `Prodigy_Adv`, set `initial_d` to:
> - **LoRA**: `1e-8`
> - **Full FT**: `1e-10`
> - **Embedding**: `1e-7`

> ⚠️ **Incompatible** with: **Cautious**, **Grams**, **atan2**, and standard gradient clipping.

#### Performance Validation

**Small Batch Training (SDXL, BS=2, 1.8K steps)**


- **🟢 Prodigy_Adv** (beta1=0.9, d0=1e-5): Final LR = 2.9e-4
- **🔵 Prodigy_Adv + Simplified_AdEMAMix** (beta1=0.99, α=100, d0=1e-7): Final LR = 5.8e-6

**Results**:
- Faster convergence and higher final performance with Simplified_AdEMAMix
- D-Adaptation automatically compensates for aggressive updates
- Generated samples show **significantly better quality**

---
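To make the tuning guidance above concrete, a hedged constructor sketch pairing `Adopt_adv` with the accumulator momentum, using the flag names visible in the `Adopt_adv.py` diff earlier (`Simplified_AdEMAMix`, `alpha_grad`); the import path and the `lr` argument are assumptions:

```python
import torch
from adv_optm import Adopt_adv  # assumed re-export; adjust to your install

model = torch.nn.Linear(64, 64)
optimizer = Adopt_adv(
    model.parameters(),
    lr=1e-6,                   # roughly 100x smaller than a typical AdamW LR, per the note above
    Simplified_AdEMAMix=True,  # accumulator-style momentum (flag name from the diff)
    alpha_grad=100.0,          # "Grad α": large values for small batches
)
```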
### atan2

- Replaces `eps` in Adam-family optimizers with a **scale-invariant**, bounded update rule (see the sketch after this list).
- Automatically clips updates to **[-2, 2]**, preventing destabilizing jumps.
- **Highly recommended** for `Adopt_Adv`, which is prone to instability without clipping.

---
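A small sketch of the atan2-style update from the Adam-atan2 reference linked above. Because the denominator is non-negative, `atan2` is bounded by ±π/2, so the scaled update stays within roughly [-2, 2]; the scaling constants below are illustrative, not necessarily the package's values:

```python
import torch

a, b = 1.27, 1.0
m = torch.randn(5)   # first moment (bias-corrected in a real optimizer)
v = torch.rand(5)    # second moment

update = a * torch.atan2(m, b * v.sqrt())              # replaces m / (v.sqrt() + eps)
print(bool((update.abs() <= a * torch.pi / 2).all()))  # True: updates are bounded
```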
### **Kourkoutas-β**

**Kourkoutas-β** introduces a **sunspike-driven, layer-wise adaptive second-moment decay (β₂)** as an optional enhancement for `Adam_Adv`, `Adopt_Adv`, `Prodigy_Adv`, and `Simplified_AdEMAMix`.

Instead of using a fixed β₂ (e.g., 0.999 or 0.95), it **dynamically modulates β₂ per layer** based on a bounded *sunspike ratio*:

- **During gradient bursts** → β₂ ↓ toward `Lower β₂` → faster reaction
- **During calm phases** → β₂ ↑ toward `The Selected β₂` → stronger smoothing

This is especially effective for **noisy training, small batch sizes, and high learning rates**, where gradient norms shift abruptly due to noise or aggressive LR schedules.
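A minimal numeric sketch of that modulation rule, mirroring the formula in `adv_optm/util/Kourkoutas.py` shown earlier (pooled gradient norm divided by its running EMA, squashed into a bounded sunspike ratio):

```python
def dynamic_beta2(grad_norm, r_ema, beta2_max=0.999, beta2_min=0.9, tiny_spike=1e-9):
    raw = grad_norm / (r_ema + tiny_spike)   # how far the norm sits above its running EMA
    sun = raw / (1.0 + raw)                  # bounded to [0, 1)
    return beta2_max - (beta2_max - beta2_min) * sun

print(dynamic_beta2(grad_norm=0.01, r_ema=0.01))  # calm phase -> ~0.9495
print(dynamic_beta2(grad_norm=1.00, r_ema=0.01))  # burst      -> ~0.9010
```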
#### Pros/Cons

| **Category** | **Details** |
|--------------|-------------|
| ✅ **Pros** | • **Layer-wise adaptation** blends benefits of high β₂ (strong smoothing) and low β₂ (fast reaction).<br>• **Robust to sudden loss landscape shifts**, reacts quickly during gradient bursts, smooths during calm phases.<br>• **High tolerance to aggressive learning rates**. |
| ⚠️ **Cons** | • **Potentially unstable at the start of training** due to unreliable early gradient norms; mitigated by using `K-β Warmup Steps`. |

> 💡 **Best Practice**: Set `K_warmup_steps` equal to your standard LR warmup steps. During warmup, the optimizer uses the static `beta2`; adaptation begins only after warmup ends.

> 🔍 **Debugging Aid**: Enable `K_Logging` to monitor (min, max, mean) of dynamic β₂ values across layers every *N* steps.

#### 📊 Performance Validation

**ADAMW_ADV - full SDXL finetuning (aggressive LR: 3e-5) (BS=4, 2.5K steps)**
<img width="1460" height="382" alt="image" src="https://github.com/user-attachments/assets/007f278a-fbac-4f3d-9cc7-274c3b959cdd" />

- 🟣 Fixed `beta2=0.999`
- 🟠 Auto K-beta

**Observations:**
- K-beta is clearly better and more robust/stable for high LRs.

> 📚 **Reference**:
> - Paper: [Kourkoutas-β: A Sunspike-Driven Adam Optimizer with Desert Flair](https://arxiv.org/abs/2508.12996)
> - Code: [kbeta](https://github.com/sck-at-ucy/kbeta)

---

## Recommended Preset (Tested on LoRA/FT/Embedding)

```yaml
Learning Rate: 1
optimizer: PRODIGY_Adv
settings:
  - beta1: 0.99 # Controls momentum decay, ~100-step effective memory. Adjust to 0.999 (1000 steps) or 0.9999 (10000 steps) based on training length and stability needs.
  - beta2: 0.999
  - kourkoutas_beta: True # For Kourkoutas-β
  - K-β Warmup Steps: 50 # Or 100, 200, depending on your run
  - Simplified_AdEMAMix: True
  - Grad α: 100
  - OrthoGrad: True
  - weight_decay: 0.0
  - initial_d:
      • LoRA: 1e-8
      • Full fine-tune: 1e-10
      • Embedding: 1e-7
  - d_coef: 1
  - d_limiter: True # To stabilize Prodigy with Simplified_AdEMAMix
  - factored: False # Can be true or false, quality should not degrade due to Simplified_AdEMAMix’s high tolerance to 1-bit factorization.
```

> ✅ **Why it works**:
> - `Kourkoutas-β` adapts β₂ per layer automatically
> - `Simplified_AdEMAMix` ensures responsiveness in small-batch noise
> - `OrthoGrad` prevents overfitting without weight decay

---
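For reference, a sketch of the same preset expressed as keyword arguments against the `Prodigy_adv` signature visible in the diff earlier. Arguments not shown in that diff (`lr`, `betas`, `weight_decay`, `d0`, `orthogonal_gradient`, `factored`) are assumptions based on the README's terminology:

```python
import torch
from adv_optm import Prodigy_adv  # assumed re-export of adv_optm/optim/Prodigy_adv.py

model = torch.nn.Linear(64, 64)
optimizer = Prodigy_adv(
    model.parameters(),
    lr=1.0,                    # Prodigy-style: keep LR at 1 and let d adapt
    betas=(0.99, 0.999),
    weight_decay=0.0,
    kourkoutas_beta=True,
    k_warmup_steps=50,         # K-β warmup
    Simplified_AdEMAMix=True,
    alpha_grad=100.0,          # "Grad α" in the preset
    orthogonal_gradient=True,  # OrthoGrad (assumed flag name)
    d0=1e-8,                   # "initial_d" for LoRA
    d_coef=1.0,
    d_limiter=True,
    factored=False,            # assumed flag name
)
```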
## 📚 References

1. [Revisiting BFloat16 Training](https://arxiv.org/abs/2010.06192)
2. [SMMF: Square-Matricized Momentum Factorization](https://arxiv.org/abs/2412.08894)
3. [The AdEMAMix Optimizer](https://arxiv.org/abs/2409.03137)
4. [Connections between Schedule-Free Optimizers, AdEMAMix, and Accelerated SGD](https://arxiv.org/abs/2502.02431)
5. [AdaMeM: Memory Efficient Momentum for Adafactor](https://openreview.net/forum?id=fZqMVTz7K5)
6. [Kourkoutas-β: A Sunspike-Driven Adam Optimizer with Desert Flair](https://arxiv.org/abs/2508.12996)
adv_optm-1.1.1.dist-info/RECORD
ADDED
@@ -0,0 +1,20 @@
adv_optm/__init__.py,sha256=TL9XFW3kQQ2Xrxl6UULMftBzNvg7uTIcxMRD0vTttPk,306
adv_optm/optim/AdamW_adv.py,sha256=ddEUVOif1gfZPgEJNrEGZ2wnha4MPMWw5ppPd8acQ3o,17457
adv_optm/optim/Adopt_adv.py,sha256=fhH3hS9K6z5Blxc7NFfzpCrUGbl9EQnwLPmKDxBC1zg,21415
adv_optm/optim/Lion_Prodigy_adv.py,sha256=aJ9orEEw0QYbrDzn1be0SHvOBlIkLwWG9RpWFuNMskM,13163
adv_optm/optim/Lion_adv.py,sha256=aGNAplZlyXYgVllYcV_s4bK8iC4fv6EizFoWIMNLdBc,8299
adv_optm/optim/Prodigy_adv.py,sha256=nD59cAWOJJCjZdIiuD5hD9MWO5sTjPQSvq-3dwGTcEM,25875
adv_optm/optim/Simplified_AdEMAMix.py,sha256=gPjMhKulzmAeO42foe-d7xW0AcB50vKFYsvHgxbD3uc,12949
adv_optm/optim/__init__.py,sha256=pcP865H2j1tut2VfTUhzQh7V8TF_tzPjqFnjMfFed2k,382
adv_optm/util/BF16_Stochastic_Rounding.py,sha256=Q5H0BcogmE4atP65dLoI21HKSf50lRdsBDfeF6v9Tbg,1548
adv_optm/util/Effective_Shape.py,sha256=TBvIk1V8IuTbbBsxuekJA4e_v8JlR5Nujtut8RTWAm4,318
adv_optm/util/Kourkoutas.py,sha256=woyJfX7l4eieeg0pC5XrILBLvwECwbD3a6ou1K6qjKU,8706
adv_optm/util/NNMF.py,sha256=yRf5IP5Sjq0Uf0DxN0Q8NxEGSdD-f1ULziLVDOjY8K4,639
adv_optm/util/One_Bit_Boolean.py,sha256=Wat49esdwohuN-OHOFMW8D0aOQgV9cP5Rl8z6yfmpos,1068
adv_optm/util/OrthoGrad.py,sha256=NzInuBQGy_Ja__M1R9XbvqVaQ0fhGbtGgFE9YON7B3I,707
adv_optm/util/__init__.py,sha256=qoyIF0jcLjs_vSEcsv36clw5LFNBEbifyXrrVxMH-G4,349
adv_optm-1.1.1.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
adv_optm-1.1.1.dist-info/METADATA,sha256=F30-DuFinS-633wznIM27NBGU5asYpnKdiExchOFPcI,14019
adv_optm-1.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
adv_optm-1.1.1.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
adv_optm-1.1.1.dist-info/RECORD,,
adv_optm-1.1.0.dev4.dist-info/METADATA
REMOVED
@@ -1,174 +0,0 @@
Metadata-Version: 2.4
Name: adv_optm
Version: 1.1.0.dev4
Summary: A family of highly efficient, lightweight yet powerful optimizers.
Home-page: https://github.com/Koratahiu/Advanced_Optimizers
Author: Koratahiu
Author-email: hiuhonor@gmail.com
License: Apache 2.0
Keywords: llm,fine-tuning,memory-efficient,low-rank,compression,pytorch,optimizer,adam
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Operating System :: OS Independent
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: torch>=2.0
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: keywords
Dynamic: license
Dynamic: license-file
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary

# Advanced Optimizers (AIO)

A comprehensive, all-in-one collection of optimization algorithms for deep learning, designed for maximum efficiency, minimal memory footprint, and superior performance across diverse model architectures and training scenarios.

[](https://pypi.org/project/adv_optm/)

---

## 📦 Installation

```bash
pip install adv_optm
```

---

## 🧠 Core Innovations

This library integrates multiple state-of-the-art optimization techniques validated through extensive research and practical training, with 1-bit compression for optimizer states:

### **Memory-Efficient Optimization (SMMF-inspired)**
- **Paper**: [SMMF: Square-Matricized Momentum Factorization](https://arxiv.org/abs/2412.08894)
- **Approach**: Uses rank-1 non-negative matrix factorization with reconstruction cycle (factor → reconstruct → update → factor)
- **Innovation**:
  - First moment split into **1-bit sign + absolute value**
  - Final storage: **four factored vectors + one 1-bit sign state**
  - Preserves Adam-like update quality with drastically reduced memory

---

## ⚡ Performance Characteristics

### Memory Efficiency (SDXL Model - 6.5GB)
| Optimizer | Memory Usage | Description |
|-----------|--------------|-------------|
| `Adopt_Factored` | 328 MB | 4 small vectors + 1-bit state |
| `Adopt_Factored + AdEMAMix` | 625 MB | 6 small vectors + two 1-bit states |
| `Simplified_AdEMAMix` | 328 MB | Same as standard factored (no extra state) |

### Speed Comparison (SDXL, Batch Size 4)
| Optimizer | Speed | Notes |
|-----------|-------|-------|
| `Adafactor` | ~8.5s/it | Baseline |
| `Adopt_Factored` | ~10s/it | +18% overhead from compression |
| `Adopt_Factored + AdEMAMix` | ~12s/it | +41% overhead (3 factored states) |

---

## 🧪 Available Optimizers

### Standard Optimizers (All support `factored=True/False`)
| Optimizer | Description | Best For |
|-----------|-------------|----------|
| `Adam_Adv` | Advanced Adam implementation | General purpose |
| `Adopt_Adv` | Adam-variant with independent beta2 | Stable training for small batch size regimes |
| `Prodigy_Adv` | Prodigy with D-Adaptation | Adam with automatic LR tuning |
| `Simplified_AdEMAMix` | Adam variant with accumulator momentum | Small/large batch training when tuned correctly |
| `Lion_Adv` | Advanced Lion implementation | Memory-constrained environments |
| `Prodigy_Lion_Adv` | Prodigy + Lion combination | Lion with automatic LR tuning |

### Feature Matrix
| Feature | Adam_Adv | Adopt_Adv | Prodigy_Adv | Simplified_AdEMAMix | Lion_Adv |
|---------|----------|-----------|-------------|---------------------|----------|
| Factored | ✓ | ✓ | ✓ | ✓ | ✓ |
| AdEMAMix | ✓ | ✓ | ✓ | ✗ | ✗ |
| Simplified_AdEMAMix | ✗ | ✗ | ✓ | ✓ | ✗ |
| OrthoGrad | ✓ | ✓ | ✓ | ✓ | ✓ |
| Grams | ✓ | ✓ | ✓ | ✗ | ✗ |
| Cautious | ✓ | ✓ | ✓ | ✗ | ✓ |
| atan2 | ✓ | ✓ | ✓ | ✗ | ✗ |
| Stochastic Rounding | ✓ | ✓ | ✓ | ✓ | ✓ |
| Fused Backward Pass | ✓ | ✓ | ✓ | ✓ | ✓ |

---

## ⚙️ Key Features & Parameters

### Comprehensive Feature Guide

| Feature | Description | Recommended Usage | Performance Impact | Theoretical Basis | Compatibility |
|---------|-------------|-------------------|--------------------|-------------------|--------------|
| **Factored** | Memory-efficient optimization using rank-1 factorization | Enable for large models (>1B params) or limited VRAM | +12-41% time overhead, 1-bit memory usage | [SMMF](https://arxiv.org/abs/2412.08894) | All optimizers |
| **AdEMAMix** | Dual EMA system for momentum | Use for long training runs (10k+ steps) | +1 state memory. | [AdEMAMix](https://arxiv.org/abs/2409.03137) | Adam/Adopt/Prodigy |
| **Simplified_AdEMAMix** | Accumulator-based momentum | Small batch training (≤32) | Same memory as standard, no extra overhead | [Schedule-Free Connections](https://arxiv.org/abs/2502.02431) | Adam/Prodigy |
| **OrthoGrad** | Removes gradient component parallel to weights | Full finetuning without weight decay | +33% time overhead, no memory impact | [Grokking at Edge](https://github.com/LucasPrietoAl/grokking-at-the-edge-of-numerical-stability) | All optimizers |
| **Stochastic Rounding** | Improves precision for BF16 training | BF16 training | Minimal overhead (<5%) | [Revisiting BFloat16 Training](https://arxiv.org/abs/2010.06192) | All optimizers |
| **atan2** | Robust eps replacement + built-in clipping | Use with Adopt or unstable training | No overhead | [Adam-atan2](https://github.com/lucidrains/adam-atan2-pytorch) | Adam/Adopt/prodigy |
| **Cautious** | Update only when the direction align with the gradients | should faster the convergence | No overhead | [C-Optim](https://github.com/kyleliang919/C-Optim) | Adam/Adopt/prodigy |
| **Grams** | Update direction from the gradients | should have a stronger effect than cautious | No overhead | [Grams](https://github.com/Gunale0926/Grams) | Adam/Adopt/prodigy |

---

## Simplified_AdEMAMix Parameters
Simplified_AdEMAMix replaces standard momentum with an accumulator for better small-large batch performance.

| Parameter | Recommended Values | Description |
|-----------|---------------------|-------------|
| `beta1` | 0.9 (large BS), 0.99-0.9999 (small BS) | Determines memory length of accumulator |
| `alpha` | 100-10 (small BS), 1-0 (large BS) | Gradient smoothing factor |

**Alpha Tuning Guide**:
| Batch Size | Recommended α | Rationale |
|------------|---------------|-----------|
| Small (≤32) | 100, 50, 20, 10 | Emphasizes recent gradients for quick adaptation |
| Medium (32-512) | 10, 5, 2, 1 | Balanced approach |
| Large (≥512) | 1, 0.5, 0 | Emphasizes historical gradients for stability |

⚠️ **Important**: Use **~100x smaller learning rate** with Simplified_AdEMAMix compared to AdamW (e.g., 1e-6 instead of 1e-4)

### 📊 Performance Validation
Small Batch Training (SDXL, BS=2, 1.8K steps)


- **🟢 Prodigy_adv** (beta1=0.9, d0=1e-5): Final LR=2.9e-4
- **🔵 Prodigy_adv + Simplified_AdEMAMix** (beta1=0.99, α=100, d0=1e-7): Final LR=5.8e-6

**Results**:
- Simplified_AdEMAMix shows faster convergence and better final performance
- D-Adaptation automatically handles aggressive updates (50x smaller LR)
- Generated samples show significantly better quality with Simplified_AdEMAMix

---

## ⚠️ Known Limitations

### 1. Prodigy_Adv Sensitivity
- Highly sensitive to gradient modifications (Adopt normalization, low-rank factorization)
- May fail to increase learning rate in some LoRA scenarios
- **Fix**: Disable factorization or set beta1=0

### 2. Aggressive Learning Rates
- Can destabilize factored first moment
- **Recommendation**: Check Prodigy learning rate as reference for safe LR threshold

---

## 📚 References

1. [SMMF: Square-Matricized Momentum Factorization](https://arxiv.org/abs/2412.08894)
2. [The AdEMAMix Optimizer: Better, Faster, Older](https://arxiv.org/abs/2409.03137)
3. [Connections between Schedule-Free Optimizers, AdEMAMix, and Accelerated SGD Variants](https://arxiv.org/abs/2502.02431)

---
adv_optm-1.1.0.dev4.dist-info/RECORD
REMOVED
@@ -1,20 +0,0 @@
adv_optm/__init__.py,sha256=H4E_1__pXxRu4PSgQCzGi7WuFqVjTfex2Yduz3B3peI,311
adv_optm/optim/AdamW_adv.py,sha256=H4XlYZELwiFvXt0A9wMlRNiw9c8rmPMspHDCvR_SZIQ,17487
adv_optm/optim/Adopt_adv.py,sha256=0uMROjCw3wGOyp0ZX_xjwMVaXHJ395ifntcgY0MZt3M,21460
adv_optm/optim/Lion_Prodigy_adv.py,sha256=xIrwibQ2i919EHEACLCrKe5JBnS-s2Ai35yeJ1Bn1MA,13159
adv_optm/optim/Lion_adv.py,sha256=6G1CukJB_pC7l9HwFEuY1ydsNHZFabVmOvcHDsHHVuQ,8295
adv_optm/optim/Prodigy_adv.py,sha256=EeSfYu8IIeZX1Dk8MlD71vGOpMadtnW2iMhHxPDL2XQ,25574
adv_optm/optim/Simplified_AdEMAMix.py,sha256=b4GaSI-TX6wFBqGxZeoJPbf2nVRCEtB3WVb1olDgY14,12980
adv_optm/optim/__init__.py,sha256=pcP865H2j1tut2VfTUhzQh7V8TF_tzPjqFnjMfFed2k,382
adv_optm/util/BF16_Stochastic_Rounding.py,sha256=Q5H0BcogmE4atP65dLoI21HKSf50lRdsBDfeF6v9Tbg,1548
adv_optm/util/Effective_Shape.py,sha256=TBvIk1V8IuTbbBsxuekJA4e_v8JlR5Nujtut8RTWAm4,318
adv_optm/util/Kourkoutas.py,sha256=st9hO2I0Xcby0LLq1MhxiEsPyNzEkNpJO_WfYvkioKg,6606
adv_optm/util/NNMF.py,sha256=yRf5IP5Sjq0Uf0DxN0Q8NxEGSdD-f1ULziLVDOjY8K4,639
adv_optm/util/One_Bit_Boolean.py,sha256=Wat49esdwohuN-OHOFMW8D0aOQgV9cP5Rl8z6yfmpos,1068
adv_optm/util/OrthoGrad.py,sha256=NzInuBQGy_Ja__M1R9XbvqVaQ0fhGbtGgFE9YON7B3I,707
adv_optm/util/__init__.py,sha256=qoyIF0jcLjs_vSEcsv36clw5LFNBEbifyXrrVxMH-G4,349
adv_optm-1.1.0.dev4.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
adv_optm-1.1.0.dev4.dist-info/METADATA,sha256=Ue6x-vthnxradX5tH1ver4LVbWMEMmqPjMVO8KjTdhI,8427
adv_optm-1.1.0.dev4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
adv_optm-1.1.0.dev4.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
adv_optm-1.1.0.dev4.dist-info/RECORD,,
{adv_optm-1.1.0.dev4.dist-info → adv_optm-1.1.1.dist-info}/WHEEL: File without changes
{adv_optm-1.1.0.dev4.dist-info → adv_optm-1.1.1.dist-info}/licenses/LICENSE: File without changes
{adv_optm-1.1.0.dev4.dist-info → adv_optm-1.1.1.dist-info}/top_level.txt: File without changes