adv-optm 0.1.4__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (23)
  1. {adv_optm-0.1.4 → adv_optm-0.1.5}/PKG-INFO +1 -1
  2. {adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm/__init__.py +1 -1
  3. {adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm/optim/AdamW_adv.py +29 -17
  4. {adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm/optim/Prodigy_adv.py +2 -2
  5. {adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm.egg-info/PKG-INFO +1 -1
  6. {adv_optm-0.1.4 → adv_optm-0.1.5}/setup.py +1 -1
  7. {adv_optm-0.1.4 → adv_optm-0.1.5}/LICENSE +0 -0
  8. {adv_optm-0.1.4 → adv_optm-0.1.5}/README.md +0 -0
  9. {adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm/optim/Adopt_adv.py +0 -0
  10. {adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm/optim/Lion_Prodigy_adv.py +0 -0
  11. {adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm/optim/Lion_adv.py +0 -0
  12. {adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm/optim/__init__.py +0 -0
  13. {adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm/util/BF16_Stochastic_Rounding.py +0 -0
  14. {adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm/util/Effective_Shape.py +0 -0
  15. {adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm/util/NNMF.py +0 -0
  16. {adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm/util/One_Bit_Boolean.py +0 -0
  17. {adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm/util/OrthoGrad.py +0 -0
  18. {adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm/util/__init__.py +0 -0
  19. {adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm.egg-info/SOURCES.txt +0 -0
  20. {adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm.egg-info/dependency_links.txt +0 -0
  21. {adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm.egg-info/requires.txt +0 -0
  22. {adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm.egg-info/top_level.txt +0 -0
  23. {adv_optm-0.1.4 → adv_optm-0.1.5}/setup.cfg +0 -0

{adv_optm-0.1.4 → adv_optm-0.1.5}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: adv_optm
-Version: 0.1.4
+Version: 0.1.5
 Summary: A family of highly efficient, lightweight yet powerful optimizers.
 Home-page: https://github.com/Koratahiu/Advanced_Optimizers
 Author: Koratahiu

{adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm/__init__.py
@@ -14,4 +14,4 @@ __all__ = [
     "Lion_Prodigy_adv",
 ]
 
-__version__ = "0.1.4"
+__version__ = "0.1.5"

{adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm/optim/AdamW_adv.py
@@ -21,7 +21,10 @@ class AdamW_adv(torch.optim.Optimizer):
             averages of gradient and its square (default: (0.9, 0.999))
         eps (float): term added to the denominator to improve
             numerical stability (default: 1e-8)
-        weight_decay (float): weight decay (L2 penalty) (default: 0)
+        weight_decay (float): weight decay (L2 penalty) (default: 0).
+        use_bias_correction (bool): whether to use bias correction for the first
+            and second moment estimates, as in the original Adam paper.
+            (default: True)
         vector_reshape (bool): whether to reshape 1D vectors into 2D
             matrices to apply low-rank compression (default: True).
         stochastic_rounding (bool): whether to use stochastic
@@ -50,7 +53,7 @@ class AdamW_adv(torch.optim.Optimizer):
             highly recommended to prevent instability at the beginning of training,
             as it gradually introduces the stabilizing slow momentum term. During
             the warmup, `alpha` ramps from 0 to its target value. If `None`,
-            the scheduler is disabled and th
+            the scheduler is disabled. (default: None)
         factored (bool): whether to use the factorization or disable it to use
             the uncompressed optimizer. (default: True)
     """
@@ -62,6 +65,7 @@ class AdamW_adv(torch.optim.Optimizer):
         betas: tuple[float, float] = (0.9, 0.999),
         eps: float = 1e-8,
         weight_decay: float = 0.0,
+        use_bias_correction: bool = True,
         vector_reshape: bool = True,
         stochastic_rounding: bool = True,
         use_atan2: bool = False,
@@ -86,7 +90,7 @@ class AdamW_adv(torch.optim.Optimizer):
         defaults = {
             "lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay,
             "vector_reshape": vector_reshape, "use_atan2": use_atan2,
-            "use_orthograd": use_orthograd,
+            "use_orthograd": use_orthograd, "use_bias_correction": use_bias_correction,
             "beta3_ema": beta3_ema, "alpha": alpha, "t_alpha": t_alpha,
         }
         self.stochastic_rounding = stochastic_rounding
@@ -159,17 +163,25 @@ class AdamW_adv(torch.optim.Optimizer):
                     if beta1 > 0:
                         state['exp_avg'] = torch.zeros_like(p, device=device, dtype=dtype)
                         if self.use_AdEMAMix:
-                            state['exp_avg_slow'] = torch.zeros_like(p, dtype=dtype)
+                            state['exp_avg_slow'] = torch.zeros_like(p, device=device, dtype=dtype)
                     state['exp_avg_sq'] = torch.zeros_like(p, device=device, dtype=dtype)
 
+                step = state['step'] + 1
+                if group['use_bias_correction']:
+                    bias_correction1 = 1.0 - beta1 ** step
+                    bias_correction2 = 1.0 - beta2 ** step
+                else:
+                    bias_correction1 = 1
+                    bias_correction2 = 1
+                step_size = group['lr'] / bias_correction1
+
                 if self.use_AdEMAMix:
                     beta3_ema = group['beta3_ema']
                     alpha = group['alpha']
                     t_alpha = group['t_alpha']
-                    current_step = state['step'] + 1
                     alpha_t = alpha
-                    if t_alpha is not None and t_alpha > 0 and current_step < t_alpha:
-                        alpha_t = min(current_step * alpha / t_alpha, alpha)
+                    if t_alpha is not None and t_alpha > 0 and step < t_alpha:
+                        alpha_t = min(step * alpha / t_alpha, alpha)
 
                 if state['factored']:
                     d1, d2 = state['effective_shape']
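
The new block above computes standard Adam bias correction. A quick sketch of how the factors behave over training, with assumed hyperparameters (illustrative only, not the package's code):

beta1, beta2, lr = 0.9, 0.999, 1e-3

for step in (1, 10, 1000):
    bias_correction1 = 1.0 - beta1 ** step   # approaches 1.0 as training proceeds
    bias_correction2 = 1.0 - beta2 ** step
    step_size = lr / bias_correction1        # larger effective step early on
    print(step, round(bias_correction1, 4), round(bias_correction2, 4), step_size)
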
@@ -206,19 +218,19 @@ class AdamW_adv(torch.optim.Optimizer):
                         mt_slow.mul_(beta3_ema).add_(grad_reshaped, alpha=1.0 - beta3_ema)
                         update = mt + (alpha_t * mt_slow) if beta1 > 0 else grad_reshaped + (alpha_t * mt_slow)
                     else:
-                        update = mt if beta1 > 0 else grad_reshaped
+                        update = mt.clone() if beta1 > 0 else grad_reshaped.clone()
                     del grad_reshaped
 
                     if group['use_atan2']:
                         a = 1.2732395
-                        denom = vt.sqrt()
+                        denom = (vt.sqrt() / (bias_correction2**0.5))
                         update.atan2_(denom).mul_(a)
                     else:
-                        denom = vt.sqrt()
-                        update.div_(denom.add_(group['eps']))
+                        denom = (vt.sqrt() / (bias_correction2**0.5)).add_(group['eps'])
+                        update.div_(denom)
                     del denom
 
-                    update.view(p.shape).mul_(group['lr'])
+                    update.view(p.shape).mul_(step_size)
 
                     # Compress updated moments and store new factors
                     if beta1 > 0:
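
In the atan2 branch above, the ratio of first to (bias-corrected) second moment is bounded via atan2 and rescaled by a ≈ 4/π, so no eps is needed. A rough standalone sketch with made-up tensors (assumptions: plain torch, no factorization):

import torch

a = 1.2732395                             # same constant as above, roughly 4/pi
m = torch.randn(8)                        # stand-in for the first moment
v = torch.rand(8)                         # stand-in for the second moment
bias_correction2 = 1.0 - 0.999 ** 10      # example value at step 10
denom = v.sqrt() / (bias_correction2 ** 0.5)
update = torch.atan2(m, denom).mul(a)     # bounded update, eps-free
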
@@ -252,20 +264,20 @@ class AdamW_adv(torch.optim.Optimizer):
                         exp_avg_slow.mul_(beta3_ema).add_(grad, alpha=1 - beta3_ema)
                         update = exp_avg + (alpha_t * exp_avg_slow) if beta1 > 0 else grad + (alpha_t * exp_avg_slow)
                     else:
-                        update = exp_avg if beta1 > 0 else grad
+                        update = exp_avg.clone() if beta1 > 0 else grad.clone()
 
                     exp_avg_sq.mul_(beta2).addcmul_(grad, grad.conj(), value=1 - beta2)
 
                     if group['use_atan2']:
                         a = 1.2732395
-                        denom = exp_avg_sq.sqrt()
+                        denom = (exp_avg_sq.sqrt() / (bias_correction2**0.5))
                         update.atan2_(denom).mul_(a)
                     else:
-                        denom = exp_avg_sq.sqrt()
-                        update.div_(denom.add_(group['eps']))
+                        denom = (exp_avg_sq.sqrt() / (bias_correction2**0.5)).add_(group['eps'])
+                        update.div_(denom)
                     del denom
 
-                    update.mul_(group['lr'])
+                    update.mul_(step_size)
 
                     # Decoupled weight decay
                     if group["weight_decay"] != 0:
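
The decoupled weight decay applied after this update shrinks the parameters directly rather than adding an L2 term to the gradient. A minimal sketch of the idea (assumed standalone example, not the package's code):

import torch

p = torch.randn(8)
lr, weight_decay = 1e-3, 1e-2
p.mul_(1.0 - lr * weight_decay)   # decoupled decay: p <- p * (1 - lr * wd)
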

{adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm/optim/Prodigy_adv.py
@@ -265,7 +265,7 @@ class Prodigy_adv(torch.optim.Optimizer):
                         mt_slow.mul_(beta3_ema).add_(grad_reshaped, alpha=self.d * (1.0 - beta3_ema))
                         update = mt + (alpha_t * mt_slow) if self.beta1 > 0 else grad_reshaped + (alpha_t * mt_slow)
                     else:
-                        update = mt if self.beta1 > 0 else grad_reshaped
+                        update = mt.clone() if self.beta1 > 0 else grad_reshaped.clone()
                     del grad_reshaped
 
                     if group['use_atan2']:
@@ -311,7 +311,7 @@ class Prodigy_adv(torch.optim.Optimizer):
                         exp_avg_slow.mul_(beta3_ema).add_(grad, alpha=self.d * (1.0 - beta3_ema))
                         update = exp_avg + (alpha_t * exp_avg_slow) if self.beta1 > 0 else grad + (alpha_t * exp_avg_slow)
                     else:
-                        update = exp_avg if self.beta1 > 0 else grad
+                        update = exp_avg.clone() if self.beta1 > 0 else grad.clone()
 
                     exp_avg_sq.mul_(self.beta2).addcmul_(grad, grad.conj(), value=self.d * self.d * (1.0 - self.beta2))
 
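Why the .clone() matters in these hunks: without it, `update` aliases the stored moment buffer, so the in-place ops that follow (atan2_, div_, mul_) would also mutate the optimizer state. A small illustration (assumed standalone example):

import torch

exp_avg = torch.ones(3)
update = exp_avg                 # alias, as in the old code
update.mul_(0.5)
print(exp_avg)                   # tensor([0.5000, 0.5000, 0.5000]) -- state corrupted

exp_avg = torch.ones(3)
update = exp_avg.clone()         # independent copy, as in the new code
update.mul_(0.5)
print(exp_avg)                   # tensor([1., 1., 1.]) -- state preserved
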

{adv_optm-0.1.4 → adv_optm-0.1.5}/adv_optm.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: adv_optm
-Version: 0.1.4
+Version: 0.1.5
 Summary: A family of highly efficient, lightweight yet powerful optimizers.
 Home-page: https://github.com/Koratahiu/Advanced_Optimizers
 Author: Koratahiu

{adv_optm-0.1.4 → adv_optm-0.1.5}/setup.py
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setup(
     name="adv_optm",
-    version="0.1.4",
+    version="0.1.5",
     author="Koratahiu",
     author_email="hiuhonor@gmail.com",
     license='Apache 2.0',