adv-optm 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of adv-optm might be problematic.

adv_optm/__init__.py CHANGED
@@ -10,4 +10,4 @@ __all__ = [
     "Adopt_adv",
 ]
 
-__version__ = "0.1.0"
+__version__ = "0.1.1"
adv_optm/optim/AdamW_adv.py CHANGED
@@ -37,7 +37,7 @@ class AdamW_adv(torch.optim.Optimizer):
             combined with the primary momentum (`mt`) to stabilize updates,
             especially in noisy, small-batch settings. If `False`, the
             optimizer behaves as standard AdamW. (default: False)
-        beta3 (float): The decay rate for the slow exponential moving average of
+        beta3_ema (float): The decay rate for the slow exponential moving average of
             the momentum (only used when `use_AdEMAMix` is `True`). A higher
             value (e.g., 0.9999) gives the EMA a longer memory, making it more
             stable but slower to adapt. A lower value (e.g., 0.999) is often
@@ -71,7 +71,7 @@ class AdamW_adv(torch.optim.Optimizer):
         use_grams: bool = False,
         use_orthograd: bool = False,
         use_AdEMAMix: bool = False,
-        beta3: float = 0.9999,
+        beta3_ema: float = 0.9999,
         alpha: float = 5.0,
         t_alpha: int | None = None,
         factored: bool = True,
@@ -89,7 +89,7 @@ class AdamW_adv(torch.optim.Optimizer):
             "lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay,
             "vector_reshape": vector_reshape, "use_atan2": use_atan2,
             "use_orthograd": use_orthograd, "use_bias_correction": use_bias_correction,
-            "beta3": beta3, "alpha": alpha, "t_alpha": t_alpha,
+            "beta3_ema": beta3_ema, "alpha": alpha, "t_alpha": t_alpha,
         }
         self.stochastic_rounding = stochastic_rounding
         self.use_cautious = use_cautious
@@ -162,7 +162,7 @@ class AdamW_adv(torch.optim.Optimizer):
 
         beta1, beta2 = group['betas']
         if self.use_AdEMAMix:
-            beta3 = group['beta3']
+            beta3_ema = group['beta3_ema']
             alpha = group['alpha']
             t_alpha = group['t_alpha']
             current_step = state['step'] + 1
@@ -201,7 +201,7 @@ class AdamW_adv(torch.optim.Optimizer):
                 torch.where(unpacked_sign_slow, mt_slow, -mt_slow, out=mt_slow)
                 del unpacked_sign_slow
 
-                mt_slow.mul_(beta3).add_(grad_reshaped, alpha=1.0 - beta3)
+                mt_slow.mul_(beta3_ema).add_(grad_reshaped, alpha=1.0 - beta3_ema)
                 update_m = mt + (alpha_t * mt_slow)
             else:
                 update_m = mt
@@ -245,7 +245,7 @@ class AdamW_adv(torch.optim.Optimizer):
 
             if self.use_AdEMAMix:
                 exp_avg_slow = state['exp_avg_slow']
-                exp_avg_slow.mul_(beta3).add_(grad, alpha=1 - beta3)
+                exp_avg_slow.mul_(beta3_ema).add_(grad, alpha=1 - beta3_ema)
                 update_m = exp_avg + (alpha_t * exp_avg_slow)
             else:
                 update_m = exp_avg
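The rename is a breaking change for callers: these hunks replace the `beta3` keyword with `beta3_ema` in the docstring, the signature, the defaults dict, and both update paths, and no alias for the old name is visible in the diff. A minimal call-site sketch against the 0.1.1 signature (the import path and the tiny training step are assumptions for illustration, not taken from the package):

```python
import torch
from adv_optm import AdamW_adv  # import path assumed from the package layout in RECORD

model = torch.nn.Linear(128, 10)

opt = AdamW_adv(
    model.parameters(),
    lr=1e-3,
    use_AdEMAMix=True,  # the slow EMA is only used when this flag is set
    beta3_ema=0.9999,   # was `beta3=` in 0.1.0; the old keyword no longer exists in 0.1.1
    alpha=5.0,          # weight of the slow EMA in the combined update
)

loss = model(torch.randn(4, 128)).sum()
loss.backward()
opt.step()
opt.zero_grad()
```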
adv_optm/optim/Adopt_adv.py CHANGED
@@ -48,7 +48,7 @@ class Adopt_adv(torch.optim.Optimizer):
             combined with the primary momentum (`mt`) to stabilize updates,
             especially in noisy, small-batch settings. If `False`, the
             optimizer behaves as standard ADOPT. (default: False)
-        beta3 (float): The decay rate for the slow exponential moving average of
+        beta3_ema (float): The decay rate for the slow exponential moving average of
             the momentum (only used when `use_AdEMAMix` is `True`). A higher
             value (e.g., 0.9999) gives the EMA a longer memory, making it more
             stable but slower to adapt. A lower value (e.g., 0.999) is often
@@ -83,7 +83,7 @@ class Adopt_adv(torch.optim.Optimizer):
         use_grams: bool = False,
         use_orthograd: bool = False,
         use_AdEMAMix: bool = False,
-        beta3: float = 0.9999,
+        beta3_ema: float = 0.9999,
         alpha: float = 5.0,
         t_alpha: int | None = None,
         factored: bool = True,
@@ -99,7 +99,7 @@ class Adopt_adv(torch.optim.Optimizer):
 
         defaults = {
             "lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay,
-            "vector_reshape": vector_reshape, "beta3": beta3, "alpha": alpha,
+            "vector_reshape": vector_reshape, "beta3_ema": beta3_ema, "alpha": alpha,
             "t_alpha": t_alpha,
         }
         self.clip_lambda = clip_lambda
@@ -179,7 +179,7 @@ class Adopt_adv(torch.optim.Optimizer):
 
         beta1, beta2 = group['betas']
         if self.use_AdEMAMix:
-            beta3 = group['beta3']
+            beta3_ema = group['beta3_ema']
             alpha = group['alpha']
             t_alpha = group['t_alpha']
             # Use step+1 for 1-based step count in scheduler
@@ -236,7 +236,7 @@ class Adopt_adv(torch.optim.Optimizer):
                 del mask
 
             if self.use_AdEMAMix:
-                mt_slow = mt_slow_prev.mul_(beta3).add_(normalized_grad, alpha=1.0 - beta3)
+                mt_slow = mt_slow_prev.mul_(beta3_ema).add_(normalized_grad, alpha=1.0 - beta3_ema)
                 update = mt + (alpha_t * mt_slow)
                 update = update.view(p.shape)
             else:
@@ -293,7 +293,7 @@ class Adopt_adv(torch.optim.Optimizer):
                 del mask
 
             if self.use_AdEMAMix:
-                m_slow.mul_(beta3).add_(normalized_grad, alpha=1.0 - beta3)
+                m_slow.mul_(beta3_ema).add_(normalized_grad, alpha=1.0 - beta3_ema)
                 update = m + (alpha_t * m_slow)
             else:
                 update = m
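Adopt_adv receives the same rename across its docstring, signature, defaults, and both (factored and uncompressed) update paths. Stripped of the optimizer's state handling, the slow-EMA blend these hunks implement reduces to the following self-contained sketch (tensor names mirror the diff; the values are illustrative):

```python
import torch

beta1, beta3_ema, alpha_t = 0.9, 0.9999, 5.0

grad = torch.randn(10)     # current gradient
mt = torch.zeros(10)       # fast EMA (first moment), short memory
mt_slow = torch.zeros(10)  # slow EMA added by AdEMAMix

# Fast EMA tracks recent gradients closely.
mt.mul_(beta1).add_(grad, alpha=1 - beta1)

# Slow EMA: a beta3_ema near 1 gives a very long memory, which is what
# stabilizes updates in noisy, small-batch settings.
mt_slow.mul_(beta3_ema).add_(grad, alpha=1 - beta3_ema)

# The update direction blends both, exactly as in the hunks above.
update = mt + alpha_t * mt_slow
```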
adv_optm/optim/Prodigy_adv.py CHANGED
@@ -1,5 +1,6 @@
 import torch
-from typing import Optional
+import torch.distributed as dist
+
 import math
 
 from ..util.BF16_Stochastic_Rounding import add_stochastic_
@@ -54,6 +55,23 @@ class Prodigy_adv(torch.optim.Optimizer):
             the scheduler is disabled and th
         factored (bool): whether to use the factorization or disable it to use
             the uncompressed optimizer. (default: True)
+        d0 (float):
+            Initial D estimate for D-adaptation (default 1e-6). Rarely needs changing.
+        d_coef (float):
+            Coefficient in the expression for the estimate of d (default 1.0).
+            Values such as 0.5 and 2.0 typically work as well.
+            Changing this parameter is the preferred way to tune the method.
+        growth_rate (float):
+            Prevent the D estimate from growing faster than this multiplicative rate.
+            Default is inf, for unrestricted. Values like 1.02 give a kind of learning
+            rate warmup effect.
+        fsdp_in_use (bool):
+            If you're using sharded parameters, this should be set to True. The optimizer
+            will attempt to auto-detect this, but if you're using an implementation other
+            than PyTorch's builtin version, the auto-detection won't work.
+        slice_p (int): Reduce memory usage by calculating LR adaptation statistics on only every
+            pth entry of each tensor. For values greater than 1 this is an approximation to standard
+            Prodigy. Values ~11 are reasonable (default 11).
     """
 
     def __init__(
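The new docstring entries describe the Prodigy-style knobs, and the next hunk adds the matching `fsdp_in_use` parameter to the signature. A hedged constructor sketch using only keywords visible in these hunks (values are illustrative; the import path is assumed):

```python
import torch
from adv_optm import Prodigy_adv  # import path assumed from the package layout in RECORD

model = torch.nn.Linear(64, 64)

opt = Prodigy_adv(
    model.parameters(),
    lr=1.0,             # Prodigy-style optimizers are typically run at lr=1.0 and adapt d instead
    d_coef=1.0,         # the preferred knob for tuning the adapted step size
    growth_rate=1.02,   # caps how fast d may grow per step (acts like a soft warmup)
    slice_p=11,         # compute d statistics on every 11th tensor entry to save memory
    fsdp_in_use=False,  # set True for sharded setups the auto-detection cannot see
)
```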
@@ -80,6 +98,7 @@ class Prodigy_adv(torch.optim.Optimizer):
         d_coef: float = 1,
         growth_rate: float = float('inf'),
         safeguard_warmup: bool = False,
+        fsdp_in_use: bool = False,
         slice_p: int = 11,
     ):
         if not (lr >= 0.0):
@@ -98,12 +117,14 @@ class Prodigy_adv(torch.optim.Optimizer):
             "beta3_ema": beta3_ema, "alpha": alpha, "t_alpha": t_alpha,
             "beta3": beta3, "d": d0, "d0": d0, "d_max": d0, "d_numerator": 0.0, "d_coef": d_coef,
             "growth_rate": growth_rate, "safeguard_warmup": safeguard_warmup, "k": 0, "slice_p": slice_p,
+            "fsdp_in_use": fsdp_in_use,
         }
         self.stochastic_rounding = stochastic_rounding
         self.use_cautious = use_cautious
         self.use_grams = use_grams
         self.use_AdEMAMix = use_AdEMAMix
         self.factored = factored
+        self.fsdp_in_use = fsdp_in_use
         super().__init__(params, defaults)
         self.init_step()
 
@@ -142,6 +163,9 @@ class Prodigy_adv(torch.optim.Optimizer):
         if p.grad is None:
             return
 
+        if hasattr(p, "_fsdp_flattened"):
+            self.fsdp_in_use = True
+
         grad = p.grad
         if grad.dtype != torch.float32 and self.factored:
             grad = grad.float()
@@ -349,8 +373,16 @@ class Prodigy_adv(torch.optim.Optimizer):
         g_group = self.param_groups[0]
         d_max, d_coef, growth_rate = g_group['d_max'], g_group['d_coef'], g_group['growth_rate']
 
-        global_d_numerator = self.d_numerator
-        global_d_denom = self.d_denom
+        if self.fsdp_in_use and dist.is_available() and dist.is_initialized():
+            # Use the device of the first parameter to avoid hardcoding '.cuda()'
+            device = self.param_groups[0]['params'][0].device
+            dist_tensor = torch.tensor([self.d_numerator, self.d_denom], device=device)
+            dist.all_reduce(dist_tensor, op=dist.ReduceOp.SUM)
+            global_d_numerator = dist_tensor[0].item()
+            global_d_denom = dist_tensor[1].item()
+        else:
+            global_d_numerator = self.d_numerator
+            global_d_denom = self.d_denom
 
         d_hat = self.d
         if global_d_denom > 0:
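This reduction only runs when `fsdp_in_use` is set (or auto-detected via the `_fsdp_flattened` attribute in the earlier hunk) and a process group is initialized, so single-process behavior is unchanged. A rough sketch of the sharded setup it targets, using PyTorch's built-in FSDP (the launch recipe and model are placeholders, not part of this package):

```python
import os
import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from adv_optm import Prodigy_adv  # import path assumed from the package layout in RECORD

# Launched with `torchrun --nproc_per_node=N train.py`
dist.init_process_group("nccl")
torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))

model = FSDP(torch.nn.Linear(1024, 1024).cuda())

# Each rank holds only its shard of the parameters, so d_numerator / d_denom are
# partial sums; the all_reduce in the hunk above restores the global statistics
# and keeps the adapted d identical across ranks.
opt = Prodigy_adv(model.parameters(), fsdp_in_use=True)
```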
adv_optm-0.1.0.dist-info/METADATA → adv_optm-0.1.1.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: adv_optm
-Version: 0.1.0
+Version: 0.1.1
 Summary: A family of highly efficient, lightweight yet powerful optimizers.
 Home-page: https://github.com/Koratahiu/Advanced_Optimizers
 Author: Koratahiu
adv_optm-0.1.0.dist-info/RECORD → adv_optm-0.1.1.dist-info/RECORD RENAMED
@@ -1,7 +1,7 @@
-adv_optm/__init__.py,sha256=4JNXqWmFkMvsUIQorZLy43BbyqZiJxMRQkCCr09sPKw,172
-adv_optm/optim/AdamW_adv.py,sha256=cvCl3bRfkENfbXwfdzZZ8k3AJ_tNx-c5kBgaguf5fnQ,12689
-adv_optm/optim/Adopt_adv.py,sha256=jDmz2Fky2t5Gv9VY5UzltF5b5TDtY3xS5pNlnj-Eox4,14952
-adv_optm/optim/Prodigy_adv.py,sha256=FFATlt4VFb7o3UocP_W4KjBIzJa_0ncsji7BsFFU_9E,15482
+adv_optm/__init__.py,sha256=Ol6hg_EdQH1AXJsa_9l5iWnlUXuOXwD-6eU1OweL87A,172
+adv_optm/optim/AdamW_adv.py,sha256=VGGzLhLh6CdY4I8mxmlzIC90rWnc9oGNuuXK8vE1dE0,12729
+adv_optm/optim/Adopt_adv.py,sha256=-GRpXWISCq6HPkd7UB1S57jSzsg2D3nAhAt6082_7Ms,14992
+adv_optm/optim/Prodigy_adv.py,sha256=5N5GsTWYg_0q_R95E_ryZVa3zSe-q30p_bFK5dXOUpM,17311
 adv_optm/optim/__init__.py,sha256=kX9MQhLQZGlKFPCGLXsZtooigs4wXULTEmNSSOJvcCY,178
 adv_optm/util/BF16_Stochastic_Rounding.py,sha256=Q5H0BcogmE4atP65dLoI21HKSf50lRdsBDfeF6v9Tbg,1548
 adv_optm/util/Effective_Shape.py,sha256=TBvIk1V8IuTbbBsxuekJA4e_v8JlR5Nujtut8RTWAm4,318
@@ -10,8 +10,8 @@ adv_optm/util/One_Bit_Boolean.py,sha256=Wat49esdwohuN-OHOFMW8D0aOQgV9cP5Rl8z6yfm
 adv_optm/util/OrthoGrad.py,sha256=NzInuBQGy_Ja__M1R9XbvqVaQ0fhGbtGgFE9YON7B3I,707
 adv_optm/util/Randomized_SVD.py,sha256=TFG417hh1t5f1n_mChnbgdQhpMoi37O04xVCe8wz8Qc,1708
 adv_optm/util/__init__.py,sha256=3yYKo23JDfHDZdGcjrDKxH8nYjk5KDB-i44kW-J4sPk,367
-adv_optm-0.1.0.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
-adv_optm-0.1.0.dist-info/METADATA,sha256=ig2YmYzdS6DmX0KEIGkdwX-n9eciG2S2aZYog1feqmE,6342
-adv_optm-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-adv_optm-0.1.0.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
-adv_optm-0.1.0.dist-info/RECORD,,
+adv_optm-0.1.1.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+adv_optm-0.1.1.dist-info/METADATA,sha256=Mej63zbzvVh1YkAydQojP6SZSqz_46JA6-Y_3i3b2Fs,6342
+adv_optm-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+adv_optm-0.1.1.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
+adv_optm-0.1.1.dist-info/RECORD,,