adv-optm 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of adv-optm might be problematic.

adv_optm/__init__.py CHANGED
@@ -16,4 +16,4 @@ __all__ = [
     "Lion_Prodigy_adv",
 ]
 
-__version__ = "1.0.1"
+__version__ = "1.0.3"
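The user-facing API change in 1.0.3 is the rename of the `use_orthograd` keyword to `orthogonal_gradient` in every optimizer touched below; the other behavioral change is the Prodigy `d` scaling in Lion_Prodigy_adv, shown later. A minimal migration sketch, assuming `AdamW_adv` is re-exported at the package top level as the `__all__` list above suggests:

    import torch
    from adv_optm import AdamW_adv  # assumption: top-level re-export, as for Lion_Prodigy_adv above

    model = torch.nn.Linear(4, 2)
    # 1.0.1 spelled the flag use_orthograd; 1.0.3 renames it:
    optimizer = AdamW_adv(model.parameters(), lr=1e-3, orthogonal_gradient=True)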
adv_optm/optim/AdamW_adv.py CHANGED
@@ -33,7 +33,7 @@ class AdamW_adv(torch.optim.Optimizer):
         grams_moment (bool): whether to use Grams-style updates. (default: False)
         cautious_mask (bool): whether to use cautious masking to align the gradient's
             direction with the first moment's. (default: False)
-        use_orthograd (bool): whether to use OrthoGrad. (default: False)
+        orthogonal_gradient (bool): whether to use OrthoGrad. (default: False)
         use_AdEMAMix (bool): whether to enable the AdEMAMix feature. This adds
             a second, slow-moving average of the momentum (`mt_slow`) which is
             combined with the primary momentum (`mt`) to stabilize updates,
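The docstring above describes AdEMAMix in terms of a fast momentum `mt` and a slow EMA `mt_slow` blended with a mixing coefficient. A minimal standalone sketch of that idea, with names taken from the docstring and constants from the defaults further down (the class's real update also handles details such as bias correction and the `t_alpha` warmup of `alpha`):

    import torch

    beta1, beta3_ema, alpha = 0.9, 0.9999, 5.0
    grad = torch.randn(8)
    mt = torch.zeros(8)        # fast (primary) momentum
    mt_slow = torch.zeros(8)   # slow AdEMAMix momentum

    mt.mul_(beta1).add_(grad, alpha=1 - beta1)
    mt_slow.mul_(beta3_ema).add_(grad, alpha=1 - beta3_ema)
    numerator = mt + alpha * mt_slow   # combined momentum used in place of mt alone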
@@ -71,7 +71,7 @@ class AdamW_adv(torch.optim.Optimizer):
         use_atan2: bool = False,
         cautious_mask: bool = False,
         grams_moment: bool = False,
-        use_orthograd: bool = False,
+        orthogonal_gradient: bool = False,
         use_AdEMAMix: bool = False,
         beta3_ema: float = 0.9999,
         alpha: float = 5.0,
@@ -93,7 +93,7 @@ class AdamW_adv(torch.optim.Optimizer):
         defaults = {
             "lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay,
             "vector_reshape": vector_reshape, "use_atan2": use_atan2,
-            "use_orthograd": use_orthograd, "use_bias_correction": use_bias_correction,
+            "orthogonal_gradient": orthogonal_gradient, "use_bias_correction": use_bias_correction,
             "beta3_ema": beta3_ema, "alpha": alpha, "t_alpha": t_alpha,
         }
         self.stochastic_rounding = stochastic_rounding
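The `orthogonal_gradient` entry registered above gates the call to `_orthogonalize_gradient` in the step loop shown in the next hunk. That helper lives in adv_optm/util/OrthoGrad.py (unchanged in this release) and applies the OrthoGrad idea of stripping the component of the gradient that points along the parameter vector. A hedged sketch of that projection, not the package's exact helper:

    import torch

    def orthogonalize_gradient(p: torch.Tensor, grad: torch.Tensor, eps: float = 1e-30) -> torch.Tensor:
        w, g = p.reshape(-1).float(), grad.reshape(-1).float()
        # remove the projection of g onto w: g_orth = g - (<w, g> / <w, w>) * w
        g_orth = g - (torch.dot(w, g) / (torch.dot(w, w) + eps)) * w
        # OrthoGrad variants typically rescale so the original gradient norm is preserved
        g_orth = g_orth * (g.norm() / (g_orth.norm() + eps))
        return g_orth.reshape_as(grad).to(grad.dtype)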
@@ -123,7 +123,7 @@ class AdamW_adv(torch.optim.Optimizer):
                 grad = p.grad
                 if grad.dtype != torch.float32 and self.factored:
                     grad = grad.float()
-                if group["use_orthograd"]:
+                if group["orthogonal_gradient"]:
                     grad = _orthogonalize_gradient(p, grad)
                 state = self.state[p]
 
adv_optm/optim/Adopt_adv.py CHANGED
@@ -40,7 +40,7 @@ class Adopt_adv(torch.optim.Optimizer):
             direction with the first moment's. (default: False)
         grams_moment (bool): whether to combine the gradient's direction with the
             first moment's magnitude (default: False).
-        use_orthograd (bool): whether to use OrthoGrad. (default: False)
+        orthogonal_gradient (bool): whether to use OrthoGrad. (default: False)
         use_AdEMAMix (bool): whether to enable the AdEMAMix feature. This adds
             a second, slow-moving average of the momentum (`mt_slow`) which is
             combined with the primary momentum (`mt`) to stabilize updates,
@@ -89,7 +89,7 @@ class Adopt_adv(torch.optim.Optimizer):
         use_atan2: bool = False,
         cautious_mask: bool = False,
         grams_moment: bool = False,
-        use_orthograd: bool = False,
+        orthogonal_gradient: bool = False,
         use_AdEMAMix: bool = False,
         beta3_ema: float = 0.9999,
         alpha: float = 5.0,
@@ -131,7 +131,7 @@ class Adopt_adv(torch.optim.Optimizer):
         self.use_atan2 = use_atan2 and not Simplified_AdEMAMix
         self.cautious_mask = cautious_mask and not Simplified_AdEMAMix
         self.grams_moment = grams_moment and not Simplified_AdEMAMix
-        self.use_orthograd = use_orthograd
+        self.orthogonal_gradient = orthogonal_gradient
         self.use_AdEMAMix = use_AdEMAMix and not Simplified_AdEMAMix
         self.Simplified_AdEMAMix = Simplified_AdEMAMix
         self.factored = nnmf_factor
@@ -152,7 +152,7 @@ class Adopt_adv(torch.optim.Optimizer):
                 grad = p.grad
                 if self.factored and grad.dtype != torch.float32:
                     grad = grad.float()
-                if self.use_orthograd:
+                if self.orthogonal_gradient:
                     grad = _orthogonalize_gradient(p, grad)
                 state = self.state[p]
 
adv_optm/optim/Lion_Prodigy_adv.py CHANGED
@@ -60,7 +60,7 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
         weight_decay: float = 0.0,
         vector_reshape: bool = True,
         stochastic_rounding: bool = True,
-        use_orthograd: bool = False,
+        orthogonal_gradient: bool = False,
         cautious_mask: bool = False,
         clip_threshold: float = 0.0,
         nnmf_factor: bool = True,
@@ -85,7 +85,7 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
             betas=betas,
             weight_decay=weight_decay,
             vector_reshape=vector_reshape,
-            use_orthograd=use_orthograd,
+            orthogonal_gradient=orthogonal_gradient,
             clip_threshold=clip_threshold,
             beta3=beta3, d=d0, d0=d0, d_max=d0, d_numerator=0.0, d_coef=d_coef,
             growth_rate=growth_rate, safeguard_warmup=safeguard_warmup, k=0, slice_p=slice_p,
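The `clip_threshold` default registered above drives the optional gradient clipping kept as context in the next hunk: when the gradient norm exceeds the threshold, the whole gradient is rescaled by threshold / norm. A trivial worked example with hypothetical numbers:

    clip_threshold, grad_norm = 1.0, 4.0
    clip_coef = clip_threshold / grad_norm   # 0.25, so grad.mul_(clip_coef) shrinks the norm back to 1.0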
@@ -146,7 +146,7 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
                     if grad_norm > group["clip_threshold"]:
                         clip_coef = group["clip_threshold"] / grad_norm
                         grad.mul_(clip_coef)
-                if group["use_orthograd"]:
+                if group["orthogonal_gradient"]:
                     grad = _orthogonalize_gradient(p, grad)
                 state = self.state[p]
 
@@ -195,7 +195,7 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
                 exp_avg = exp_avg.float()
 
             # Compute update term c_t = β1*m_{t-1} + (1-β1)*g_t
-            signed_update = exp_avg.clone().mul_(self.beta1).add_(grad_reshaped, alpha=(1-self.beta1)).sign_()
+            signed_update = exp_avg.clone().mul_(self.beta1).add_(grad_reshaped, alpha=self.d * (1-self.beta1)).sign_()
 
             if self.cautious_mask:
                 mask = (signed_update * grad_reshaped > 0).to(grad_reshaped.dtype)
@@ -222,7 +222,7 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
             # Compute update term and sign for the update
             if exp_avg.dtype != torch.float32 and self.factored:
                 exp_avg = exp_avg.float()
-            signed_update = exp_avg.clone().mul_(self.beta1).add_(grad, alpha=(1-self.beta1)).sign_()
+            signed_update = exp_avg.clone().mul_(self.beta1).add_(grad, alpha=self.d * (1-self.beta1)).sign_()
 
             if self.cautious_mask:
                 mask = (signed_update * grad > 0).to(grad.dtype)
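The two hunks above are the only behavioral change shown in this diff besides the rename: in both the factored (reshaped) and uncompressed paths, the fresh-gradient term of Lion's update is now scaled by Prodigy's adapted step-size estimate `d`, so the sign is taken of β1·m_{t-1} + d·(1-β1)·g_t rather than β1·m_{t-1} + (1-β1)·g_t. A standalone sketch of the changed expression with hypothetical values, mirroring the diff line itself:

    import torch

    beta1, d = 0.9, 0.05            # d: Prodigy's current step-size estimate
    exp_avg = torch.randn(4)        # m_{t-1}
    grad = torch.randn(4)           # g_t
    signed_update = exp_avg.clone().mul_(beta1).add_(grad, alpha=d * (1 - beta1)).sign_()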
adv_optm/optim/Lion_adv.py CHANGED
@@ -43,7 +43,7 @@ class Lion_adv(torch.optim.Optimizer):
         weight_decay: float = 0.0,
         vector_reshape: bool = True,
         stochastic_rounding: bool = True,
-        use_orthograd: bool = False,
+        orthogonal_gradient: bool = False,
         cautious_mask: bool = False,
         clip_threshold: float = 0.0,
         nnmf_factor: bool = True,
@@ -60,7 +60,7 @@ class Lion_adv(torch.optim.Optimizer):
             betas=betas,
             weight_decay=weight_decay,
             vector_reshape=vector_reshape,
-            use_orthograd=use_orthograd,
+            orthogonal_gradient=orthogonal_gradient,
             clip_threshold=clip_threshold,
         )
         self.stochastic_rounding = stochastic_rounding
@@ -94,7 +94,7 @@ class Lion_adv(torch.optim.Optimizer):
                     if grad_norm > group["clip_threshold"]:
                         clip_coef = group["clip_threshold"] / grad_norm
                         grad.mul_(clip_coef)
-                if group["use_orthograd"]:
+                if group["orthogonal_gradient"]:
                     grad = _orthogonalize_gradient(p, grad)
                 state = self.state[p]
 
adv_optm/optim/Prodigy_adv.py CHANGED
@@ -32,7 +32,7 @@ class Prodigy_adv(torch.optim.Optimizer):
         grams_moment (bool): whether to use Grams-style updates. (default: False)
         cautious_mask (bool): whether to use cautious masking to align the gradient's
             direction with the first moment's. (default: False)
-        use_orthograd (bool): whether to use OrthoGrad. (default: False)
+        orthogonal_gradient (bool): whether to use OrthoGrad. (default: False)
         use_AdEMAMix (bool): whether to enable the AdEMAMix feature. This adds
             a second, slow-moving average of the momentum (`mt_slow`) which is
             combined with the primary momentum (`mt`) to stabilize updates,
@@ -99,7 +99,7 @@ class Prodigy_adv(torch.optim.Optimizer):
         use_atan2: bool = False,
         cautious_mask: bool = False,
         grams_moment: bool = False,
-        use_orthograd: bool = False,
+        orthogonal_gradient: bool = False,
         use_AdEMAMix: bool = False,
         beta3_ema: float = 0.9999,
         alpha: float = 5.0,
@@ -148,7 +148,7 @@ class Prodigy_adv(torch.optim.Optimizer):
         defaults = {
             "lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay,
             "vector_reshape": vector_reshape, "use_atan2": use_atan2,
-            "use_orthograd": use_orthograd,
+            "orthogonal_gradient": orthogonal_gradient,
             "beta3_ema": beta3_ema, "alpha": alpha, "t_alpha": t_alpha,
             "beta3": beta3, "d": d0, "d0": d0, "d_max": d0, "d_numerator": 0.0, "d_coef": d_coef,
             "growth_rate": growth_rate, "safeguard_warmup": safeguard_warmup, "k": 0, "slice_p": slice_p,
@@ -206,7 +206,7 @@ class Prodigy_adv(torch.optim.Optimizer):
                 grad = p.grad
                 if grad.dtype != torch.float32 and self.factored:
                     grad = grad.float()
-                if group["use_orthograd"]:
+                if group["orthogonal_gradient"]:
                     grad = _orthogonalize_gradient(p, grad)
                 state = self.state[p]
 
adv_optm/optim/Simplified_AdEMAMix.py CHANGED
@@ -46,7 +46,7 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
             matrices to apply low-rank compression (default: True).
         stochastic_rounding (bool): whether to use stochastic
             rounding for BF16 parameter updates (default: True).
-        use_orthograd (bool): whether to use OrthoGrad. (default: False)
+        orthogonal_gradient (bool): whether to use OrthoGrad. (default: False)
         nnmf_factor (bool): whether to use the factorization or disable it to use
             the uncompressed optimizer. (default: False)
     """
@@ -64,7 +64,7 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
         use_bias_correction: bool = True,
         vector_reshape: bool = True,
         stochastic_rounding: bool = True,
-        use_orthograd: bool = False,
+        orthogonal_gradient: bool = False,
         nnmf_factor: bool = False,
     ):
         if not (lr >= 0.0):
@@ -82,7 +82,7 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
             "lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay,
             "alpha_grad": alpha_grad, "beta1_warmup": beta1_warmup, "min_beta1": min_beta1,
             "vector_reshape": vector_reshape,
-            "use_orthograd": use_orthograd, "use_bias_correction": use_bias_correction,
+            "orthogonal_gradient": orthogonal_gradient, "use_bias_correction": use_bias_correction,
         }
         self.stochastic_rounding = stochastic_rounding
         self.factored = nnmf_factor
@@ -108,7 +108,7 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
                 grad = p.grad
                 if grad.dtype != torch.float32 and self.factored:
                     grad = grad.float()
-                if group["use_orthograd"]:
+                if group["orthogonal_gradient"]:
                     grad = _orthogonalize_gradient(p, grad)
                 state = self.state[p]
 
adv_optm-1.0.1.dist-info/METADATA → adv_optm-1.0.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: adv_optm
-Version: 1.0.1
+Version: 1.0.3
 Summary: A family of highly efficient, lightweight yet powerful optimizers.
 Home-page: https://github.com/Koratahiu/Advanced_Optimizers
 Author: Koratahiu
adv_optm-1.0.3.dist-info/RECORD ADDED
@@ -0,0 +1,19 @@
+adv_optm/__init__.py,sha256=zL7hnbPAHt7w-0fZQld04Pt58F-aYaRAqz15e-RZh-Y,306
+adv_optm/optim/AdamW_adv.py,sha256=aTuYcJgd_EcZOrs6TDgBrBKw3wtU5LPzE5WvTBDDeEo,14317
+adv_optm/optim/Adopt_adv.py,sha256=lElmraSiIZiGu9W6ELXnIPZNEEYi1ZWuvuemgPZOixk,17484
+adv_optm/optim/Lion_Prodigy_adv.py,sha256=sGzhts9a6gHfCkuHTB5L9IrClo4c6UThzYYErBwqOaA,12844
+adv_optm/optim/Lion_adv.py,sha256=6G1CukJB_pC7l9HwFEuY1ydsNHZFabVmOvcHDsHHVuQ,8295
+adv_optm/optim/Prodigy_adv.py,sha256=8XUpu19BaBmHb-R9K3jgwySDbtVaLU1_Drtttc_zITs,22461
+adv_optm/optim/Simplified_AdEMAMix.py,sha256=tb3d6Cw_nGwcTzYUhDnKqyP7GzjD1hn8k4WqGG5lhmw,9813
+adv_optm/optim/__init__.py,sha256=pcP865H2j1tut2VfTUhzQh7V8TF_tzPjqFnjMfFed2k,382
+adv_optm/util/BF16_Stochastic_Rounding.py,sha256=Q5H0BcogmE4atP65dLoI21HKSf50lRdsBDfeF6v9Tbg,1548
+adv_optm/util/Effective_Shape.py,sha256=TBvIk1V8IuTbbBsxuekJA4e_v8JlR5Nujtut8RTWAm4,318
+adv_optm/util/NNMF.py,sha256=yRf5IP5Sjq0Uf0DxN0Q8NxEGSdD-f1ULziLVDOjY8K4,639
+adv_optm/util/One_Bit_Boolean.py,sha256=Wat49esdwohuN-OHOFMW8D0aOQgV9cP5Rl8z6yfmpos,1068
+adv_optm/util/OrthoGrad.py,sha256=NzInuBQGy_Ja__M1R9XbvqVaQ0fhGbtGgFE9YON7B3I,707
+adv_optm/util/__init__.py,sha256=qoyIF0jcLjs_vSEcsv36clw5LFNBEbifyXrrVxMH-G4,349
+adv_optm-1.0.3.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+adv_optm-1.0.3.dist-info/METADATA,sha256=Cx9bqS9VFt2nBey-H7GxVS0AXwNzTy0eW5NtSW6uXKk,8422
+adv_optm-1.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+adv_optm-1.0.3.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
+adv_optm-1.0.3.dist-info/RECORD,,
adv_optm-1.0.1.dist-info/RECORD DELETED
@@ -1,19 +0,0 @@
-adv_optm/__init__.py,sha256=zRfL5MVYJYRNKJAwBSjRKCU6Xo5vW8RbFlTEENHpKxg,306
-adv_optm/optim/AdamW_adv.py,sha256=O0q35c5CO9l1qr9YW3SWBuvkw-x04Ns9T03ewLjD1Ok,14287
-adv_optm/optim/Adopt_adv.py,sha256=PlkO2jfjbw_aPwkRVIqdB9U-3xIfnvlkZmRMjpeoXfc,17454
-adv_optm/optim/Lion_Prodigy_adv.py,sha256=YcrfoZ7Sh0O7tj2OXdOzB3K3oOzGXmEg0z55_iSiPRg,12802
-adv_optm/optim/Lion_adv.py,sha256=uME0YTV5yTZs3bCBg9BQSF0PqlfJJE1BB2q5kWeh49w,8271
-adv_optm/optim/Prodigy_adv.py,sha256=pL5fOqBcraaQqnZn1tQ7f3Ph2SAJ9ZPN65LNTHlrys4,22431
-adv_optm/optim/Simplified_AdEMAMix.py,sha256=7vr413DsxuHTzh1G9mdXpyB8_1F00GW82iiuUD_jbyg,9783
-adv_optm/optim/__init__.py,sha256=pcP865H2j1tut2VfTUhzQh7V8TF_tzPjqFnjMfFed2k,382
-adv_optm/util/BF16_Stochastic_Rounding.py,sha256=Q5H0BcogmE4atP65dLoI21HKSf50lRdsBDfeF6v9Tbg,1548
-adv_optm/util/Effective_Shape.py,sha256=TBvIk1V8IuTbbBsxuekJA4e_v8JlR5Nujtut8RTWAm4,318
-adv_optm/util/NNMF.py,sha256=yRf5IP5Sjq0Uf0DxN0Q8NxEGSdD-f1ULziLVDOjY8K4,639
-adv_optm/util/One_Bit_Boolean.py,sha256=Wat49esdwohuN-OHOFMW8D0aOQgV9cP5Rl8z6yfmpos,1068
-adv_optm/util/OrthoGrad.py,sha256=NzInuBQGy_Ja__M1R9XbvqVaQ0fhGbtGgFE9YON7B3I,707
-adv_optm/util/__init__.py,sha256=qoyIF0jcLjs_vSEcsv36clw5LFNBEbifyXrrVxMH-G4,349
-adv_optm-1.0.1.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
-adv_optm-1.0.1.dist-info/METADATA,sha256=IP6zZKr9fGFNBm1D6yvzQr1bFCMQn4CVUcl4qd_yR8M,8422
-adv_optm-1.0.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-adv_optm-1.0.1.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
-adv_optm-1.0.1.dist-info/RECORD,,