adv-optm 0.1.4.tar.gz → 0.1.6.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of adv-optm might be problematic.
- {adv_optm-0.1.4 → adv_optm-0.1.6}/PKG-INFO +1 -1
- {adv_optm-0.1.4 → adv_optm-0.1.6}/adv_optm/__init__.py +1 -1
- {adv_optm-0.1.4 → adv_optm-0.1.6}/adv_optm/optim/AdamW_adv.py +29 -17
- {adv_optm-0.1.4 → adv_optm-0.1.6}/adv_optm/optim/Prodigy_adv.py +3 -3
- {adv_optm-0.1.4 → adv_optm-0.1.6}/adv_optm.egg-info/PKG-INFO +1 -1
- {adv_optm-0.1.4 → adv_optm-0.1.6}/setup.py +1 -1
- {adv_optm-0.1.4 → adv_optm-0.1.6}/LICENSE +0 -0
- {adv_optm-0.1.4 → adv_optm-0.1.6}/README.md +0 -0
- {adv_optm-0.1.4 → adv_optm-0.1.6}/adv_optm/optim/Adopt_adv.py +0 -0
- {adv_optm-0.1.4 → adv_optm-0.1.6}/adv_optm/optim/Lion_Prodigy_adv.py +0 -0
- {adv_optm-0.1.4 → adv_optm-0.1.6}/adv_optm/optim/Lion_adv.py +0 -0
- {adv_optm-0.1.4 → adv_optm-0.1.6}/adv_optm/optim/__init__.py +0 -0
- {adv_optm-0.1.4 → adv_optm-0.1.6}/adv_optm/util/BF16_Stochastic_Rounding.py +0 -0
- {adv_optm-0.1.4 → adv_optm-0.1.6}/adv_optm/util/Effective_Shape.py +0 -0
- {adv_optm-0.1.4 → adv_optm-0.1.6}/adv_optm/util/NNMF.py +0 -0
- {adv_optm-0.1.4 → adv_optm-0.1.6}/adv_optm/util/One_Bit_Boolean.py +0 -0
- {adv_optm-0.1.4 → adv_optm-0.1.6}/adv_optm/util/OrthoGrad.py +0 -0
- {adv_optm-0.1.4 → adv_optm-0.1.6}/adv_optm/util/__init__.py +0 -0
- {adv_optm-0.1.4 → adv_optm-0.1.6}/adv_optm.egg-info/SOURCES.txt +0 -0
- {adv_optm-0.1.4 → adv_optm-0.1.6}/adv_optm.egg-info/dependency_links.txt +0 -0
- {adv_optm-0.1.4 → adv_optm-0.1.6}/adv_optm.egg-info/requires.txt +0 -0
- {adv_optm-0.1.4 → adv_optm-0.1.6}/adv_optm.egg-info/top_level.txt +0 -0
- {adv_optm-0.1.4 → adv_optm-0.1.6}/setup.cfg +0 -0
{adv_optm-0.1.4 → adv_optm-0.1.6}/adv_optm/optim/AdamW_adv.py

@@ -21,7 +21,10 @@ class AdamW_adv(torch.optim.Optimizer):
             averages of gradient and its square (default: (0.9, 0.999))
         eps (float): term added to the denominator to improve
             numerical stability (default: 1e-8)
-        weight_decay (float): weight decay (L2 penalty) (default: 0)
+        weight_decay (float): weight decay (L2 penalty) (default: 0).
+        use_bias_correction (bool): whether to use bias correction for the first
+            and second moment estimates, as in the original Adam paper.
+            (default: True)
         vector_reshape (bool): whether to reshape 1D vectors into 2D
             matrices to apply low-rank compression (default: True).
         stochastic_rounding (bool): whether to use stochastic
@@ -50,7 +53,7 @@ class AdamW_adv(torch.optim.Optimizer):
            highly recommended to prevent instability at the beginning of training,
            as it gradually introduces the stabilizing slow momentum term. During
            the warmup, `alpha` ramps from 0 to its target value. If `None`,
-           the scheduler is disabled
+           the scheduler is disabled. (default: None)
        factored (bool): whether to use the factorization or disable it to use
            the uncompressed optimizer. (default: True)
    """
@@ -62,6 +65,7 @@ class AdamW_adv(torch.optim.Optimizer):
         betas: tuple[float, float] = (0.9, 0.999),
         eps: float = 1e-8,
         weight_decay: float = 0.0,
+        use_bias_correction: bool = True,
         vector_reshape: bool = True,
         stochastic_rounding: bool = True,
         use_atan2: bool = False,
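For orientation, a minimal usage sketch of the new flag. The import path and the toy model below are assumptions; the keyword names are the ones visible in the signature above, and any argument not shown keeps its library default.

```python
import torch
from adv_optm import AdamW_adv  # assumed import path; the class lives in adv_optm/optim/AdamW_adv.py

# Any torch.nn.Module works the same way; this is just a toy example.
model = torch.nn.Linear(16, 4)

optimizer = AdamW_adv(
    model.parameters(),
    lr=1e-3,
    betas=(0.9, 0.999),
    eps=1e-8,
    weight_decay=0.0,
    use_bias_correction=True,   # new in 0.1.6
    vector_reshape=True,
    stochastic_rounding=True,
    use_atan2=False,
)

loss = model(torch.randn(8, 16)).pow(2).mean()
loss.backward()
optimizer.step()
optimizer.zero_grad()
```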
@@ -86,7 +90,7 @@ class AdamW_adv(torch.optim.Optimizer):
         defaults = {
             "lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay,
             "vector_reshape": vector_reshape, "use_atan2": use_atan2,
-            "use_orthograd": use_orthograd,
+            "use_orthograd": use_orthograd, "use_bias_correction": use_bias_correction,
             "beta3_ema": beta3_ema, "alpha": alpha, "t_alpha": t_alpha,
         }
         self.stochastic_rounding = stochastic_rounding
@@ -159,17 +163,25 @@ class AdamW_adv(torch.optim.Optimizer):
                if beta1 > 0:
                    state['exp_avg'] = torch.zeros_like(p, device=device, dtype=dtype)
                    if self.use_AdEMAMix:
-                        state['exp_avg_slow'] = torch.zeros_like(p, dtype=dtype)
+                        state['exp_avg_slow'] = torch.zeros_like(p, device=device, dtype=dtype)
                state['exp_avg_sq'] = torch.zeros_like(p, device=device, dtype=dtype)

+            step = state['step'] + 1
+            if group['use_bias_correction']:
+                bias_correction1 = 1.0 - beta1 ** step
+                bias_correction2 = 1.0 - beta2 ** step
+            else:
+                bias_correction1 = 1
+                bias_correction2 = 1
+            step_size = group['lr'] / bias_correction1
+
            if self.use_AdEMAMix:
                beta3_ema = group['beta3_ema']
                alpha = group['alpha']
                t_alpha = group['t_alpha']
-                current_step = state['step'] + 1
                alpha_t = alpha
-                if t_alpha is not None and t_alpha > 0 and current_step < t_alpha:
-                    alpha_t = min(current_step * alpha / t_alpha, alpha)
+                if t_alpha is not None and t_alpha > 0 and step < t_alpha:
+                    alpha_t = min(step * alpha / t_alpha, alpha)

            if state['factored']:
                d1, d2 = state['effective_shape']
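The added block is the textbook Adam bias correction: the moment buffers start at zero, so the early exponential averages are biased toward zero, and dividing by 1 - beta**step compensates for that, most visibly during the first few hundred steps. A self-contained sketch of the same arithmetic (plain Python, not the library's code path):

```python
# Mirrors the bias-correction factors added in 0.1.6 (assumed to follow the
# standard Adam formulation shown in the diff above).
beta1, beta2, lr = 0.9, 0.999, 1e-3

for step in (1, 10, 100, 1000):
    bias_correction1 = 1.0 - beta1 ** step   # tends to 1 as step grows
    bias_correction2 = 1.0 - beta2 ** step
    step_size = lr / bias_correction1        # larger effective step early on
    print(f"step={step:4d}  bc1={bias_correction1:.4f}  "
          f"bc2={bias_correction2:.4f}  step_size={step_size:.6f}")
```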
@@ -206,19 +218,19 @@ class AdamW_adv(torch.optim.Optimizer):
                    mt_slow.mul_(beta3_ema).add_(grad_reshaped, alpha=1.0 - beta3_ema)
                    update = mt + (alpha_t * mt_slow) if beta1 > 0 else grad_reshaped + (alpha_t * mt_slow)
                else:
-                    update = mt if beta1 > 0 else grad_reshaped
+                    update = mt.clone() if beta1 > 0 else grad_reshaped.clone()
                del grad_reshaped

                if group['use_atan2']:
                    a = 1.2732395
-                    denom = vt.sqrt()
+                    denom = (vt.sqrt() / (bias_correction2**0.5))
                    update.atan2_(denom).mul_(a)
                else:
-                    denom = vt.sqrt()
-                    update.div_(denom.add_(group['eps']))
+                    denom = (vt.sqrt() / (bias_correction2**0.5)).add_(group['eps'])
+                    update.div_(denom)
                del denom

-                update.view(p.shape).mul_(
+                update = update.view(p.shape).mul_(step_size)

                # Compress updated moments and store new factors
                if beta1 > 0:
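A note on the constant: 1.2732395 is 4/pi. With use_atan2 enabled the update is computed as atan2(m, sqrt(v_hat)) * 4/pi rather than m / (sqrt(v_hat) + eps); the two agree when |m| is small relative to sqrt(v_hat), but the atan2 form is bounded and never divides by zero, which is presumably why that branch drops the eps term. A small numerical comparison with made-up values, not the library's code:

```python
import torch

a = 4.0 / torch.pi                              # ~1.2732395, the constant in the diff
m = torch.tensor([1e-8, 1e-3, 0.1, 1.0])        # example first-moment values
v_sqrt = torch.tensor([1e-6, 1e-2, 0.5, 1.0])   # example sqrt of second moment

classic = m / (v_sqrt + 1e-8)                   # eps-based branch: update.div_(denom)
atan2_form = torch.atan2(m, v_sqrt) * a         # use_atan2 branch, bounded in [-2, 2]

print(classic)
print(atan2_form)
```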
@@ -252,20 +264,20 @@ class AdamW_adv(torch.optim.Optimizer):
                    exp_avg_slow.mul_(beta3_ema).add_(grad, alpha=1 - beta3_ema)
                    update = exp_avg + (alpha_t * exp_avg_slow) if beta1 > 0 else grad + (alpha_t * exp_avg_slow)
                else:
-                    update = exp_avg if beta1 > 0 else grad
+                    update = exp_avg.clone() if beta1 > 0 else grad.clone()

                exp_avg_sq.mul_(beta2).addcmul_(grad, grad.conj(), value=1 - beta2)

                if group['use_atan2']:
                    a = 1.2732395
-                    denom = exp_avg_sq.sqrt()
+                    denom = (exp_avg_sq.sqrt() / (bias_correction2**0.5))
                    update.atan2_(denom).mul_(a)
                else:
-                    denom = exp_avg_sq.sqrt()
-                    update.div_(denom.add_(group['eps']))
+                    denom = (exp_avg_sq.sqrt() / (bias_correction2**0.5)).add_(group['eps'])
+                    update.div_(denom)
                del denom

-                update.mul_(
+                update.mul_(step_size)

                # Decoupled weight decay
                if group["weight_decay"] != 0:
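The `.clone()` changes in both code paths read as an aliasing fix: with AdEMAMix disabled, `update = exp_avg` made `update` share storage with the stored first-moment buffer (or with the gradient), so the in-place `atan2_`/`div_`/`mul_` calls that follow would have overwritten optimizer state. A minimal reproduction of that hazard (illustrative only, not the library's code):

```python
import torch

exp_avg = torch.tensor([1.0, 2.0, 3.0])   # stand-in for the optimizer state buffer

update = exp_avg            # old pattern: same storage as the state buffer
update.mul_(0.001)          # in-place scaling also rewrites exp_avg
print(exp_avg)              # tensor([0.0010, 0.0020, 0.0030]) -- state corrupted

exp_avg = torch.tensor([1.0, 2.0, 3.0])
update = exp_avg.clone()    # 0.1.6 pattern: independent copy
update.mul_(0.001)
print(exp_avg)              # tensor([1., 2., 3.]) -- state preserved
```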
{adv_optm-0.1.4 → adv_optm-0.1.6}/adv_optm/optim/Prodigy_adv.py

@@ -265,7 +265,7 @@ class Prodigy_adv(torch.optim.Optimizer):
                    mt_slow.mul_(beta3_ema).add_(grad_reshaped, alpha=self.d * (1.0 - beta3_ema))
                    update = mt + (alpha_t * mt_slow) if self.beta1 > 0 else grad_reshaped + (alpha_t * mt_slow)
                else:
-                    update = mt if self.beta1 > 0 else grad_reshaped
+                    update = mt.clone() if self.beta1 > 0 else grad_reshaped.clone()
                del grad_reshaped

                if group['use_atan2']:
@@ -277,7 +277,7 @@ class Prodigy_adv(torch.optim.Optimizer):
                    update.div_(denom.add_(self.d * group['eps']))
                del denom

-                update.view(p.shape).mul_(self.dlr)
+                update = update.view(p.shape).mul_(self.dlr)

                # Compress updated moments and store new factors
                if self.beta1 > 0:
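The rebinding change here (and its counterpart in AdamW_adv above) appears to address a shape issue rather than a scaling one: `.view(p.shape)` returns a view that shares storage, so the old in-place `mul_` did scale the data, but the p-shaped view was discarded and the name `update` still referred to the reshaped 2-D tensor in the steps that follow. A small illustration with made-up shapes:

```python
import torch

p_shape = (4,)                 # pretend parameter shape
update = torch.ones(2, 2)      # factored/reshaped 2-D working tensor
dlr = 0.1

# Old pattern: the in-place mul_ scales the shared storage, but the p-shaped
# view is thrown away and `update` is still 2-D afterwards.
update.view(p_shape).mul_(dlr)
print(update.shape)            # torch.Size([2, 2])

# New pattern: rebinding keeps the scaled, parameter-shaped view around for
# the later steps (weight decay, applying the update to the parameter).
update = update.view(p_shape).mul_(dlr)
print(update.shape)            # torch.Size([4])
```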
@@ -311,7 +311,7 @@ class Prodigy_adv(torch.optim.Optimizer):
                    exp_avg_slow.mul_(beta3_ema).add_(grad, alpha=self.d * (1.0 - beta3_ema))
                    update = exp_avg + (alpha_t * exp_avg_slow) if self.beta1 > 0 else grad + (alpha_t * exp_avg_slow)
                else:
-                    update = exp_avg if self.beta1 > 0 else grad
+                    update = exp_avg.clone() if self.beta1 > 0 else grad.clone()

                exp_avg_sq.mul_(self.beta2).addcmul_(grad, grad.conj(), value=self.d * self.d * (1.0 - self.beta2))