adv-optm 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- adv_optm/__init__.py +1 -1
- adv_optm/optim/AdamW_adv.py +4 -4
- adv_optm/optim/Adopt_adv.py +4 -4
- adv_optm/optim/Lion_Prodigy_adv.py +5 -5
- adv_optm/optim/Lion_adv.py +3 -3
- adv_optm/optim/Prodigy_adv.py +4 -4
- adv_optm/optim/Simplified_AdEMAMix.py +4 -4
- {adv_optm-1.0.1.dist-info → adv_optm-1.0.3.dist-info}/METADATA +1 -1
- adv_optm-1.0.3.dist-info/RECORD +19 -0
- adv_optm-1.0.1.dist-info/RECORD +0 -19
- {adv_optm-1.0.1.dist-info → adv_optm-1.0.3.dist-info}/WHEEL +0 -0
- {adv_optm-1.0.1.dist-info → adv_optm-1.0.3.dist-info}/licenses/LICENSE +0 -0
- {adv_optm-1.0.1.dist-info → adv_optm-1.0.3.dist-info}/top_level.txt +0 -0
adv_optm/__init__.py
CHANGED
adv_optm/optim/AdamW_adv.py
CHANGED

@@ -33,7 +33,7 @@ class AdamW_adv(torch.optim.Optimizer):
    grams_moment (bool): whether to use Grams-style updates. (default: False)
    cautious_mask (bool): whether to use cautious masking to align the gradient's
        direction with the first moment's. (default: False)
-
+   orthogonal_gradient (bool): whether to use OrthoGrad. (default: False)
    use_AdEMAMix (bool): whether to enable the AdEMAMix feature. This adds
        a second, slow-moving average of the momentum (`mt_slow`) which is
        combined with the primary momentum (`mt`) to stabilize updates,

@@ -71,7 +71,7 @@ class AdamW_adv(torch.optim.Optimizer):
    use_atan2: bool = False,
    cautious_mask: bool = False,
    grams_moment: bool = False,
-
+   orthogonal_gradient: bool = False,
    use_AdEMAMix: bool = False,
    beta3_ema: float = 0.9999,
    alpha: float = 5.0,

@@ -93,7 +93,7 @@ class AdamW_adv(torch.optim.Optimizer):
    defaults = {
        "lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay,
        "vector_reshape": vector_reshape, "use_atan2": use_atan2,
-       "
+       "orthogonal_gradient": orthogonal_gradient, "use_bias_correction": use_bias_correction,
        "beta3_ema": beta3_ema, "alpha": alpha, "t_alpha": t_alpha,
    }
    self.stochastic_rounding = stochastic_rounding

@@ -123,7 +123,7 @@ class AdamW_adv(torch.optim.Optimizer):
    grad = p.grad
    if grad.dtype != torch.float32 and self.factored:
        grad = grad.float()
-   if group["
+   if group["orthogonal_gradient"]:
        grad = _orthogonalize_gradient(p, grad)
    state = self.state[p]
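The `orthogonal_gradient` flag introduced here (the removed counterparts are cut off in this view, but the change reads as a rename of the existing OrthoGrad switch) routes each gradient through `_orthogonalize_gradient` from adv_optm/util/OrthoGrad.py before any moments are updated. That helper is not part of this diff; the snippet below is only a rough sketch of the usual OrthoGrad projection, assuming the standard formulation: strip the component of the gradient parallel to the parameter vector, then rescale back to the original gradient norm.

import torch

def orthogonalize_gradient_sketch(p: torch.Tensor, grad: torch.Tensor, eps: float = 1e-30) -> torch.Tensor:
    # Hypothetical stand-in for adv_optm.util.OrthoGrad._orthogonalize_gradient.
    w, g = p.reshape(-1).float(), grad.reshape(-1).float()
    # Remove the radial component: g_orth = g - (<w, g> / <w, w>) * w
    g_orth = g - (torch.dot(w, g) / (torch.dot(w, w) + eps)) * w
    # Rescale so the projected gradient keeps the original norm.
    g_orth = g_orth * (g.norm() / (g_orth.norm() + eps))
    return g_orth.view_as(grad).to(grad.dtype)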
adv_optm/optim/Adopt_adv.py
CHANGED

@@ -40,7 +40,7 @@ class Adopt_adv(torch.optim.Optimizer):
        direction with the first moment's. (default: False)
    grams_moment (bool): whether to combine the gradient's direction with the
        first moment's magnitude (default: False).
-
+   orthogonal_gradient (bool): whether to use OrthoGrad. (default: False)
    use_AdEMAMix (bool): whether to enable the AdEMAMix feature. This adds
        a second, slow-moving average of the momentum (`mt_slow`) which is
        combined with the primary momentum (`mt`) to stabilize updates,

@@ -89,7 +89,7 @@ class Adopt_adv(torch.optim.Optimizer):
    use_atan2: bool = False,
    cautious_mask: bool = False,
    grams_moment: bool = False,
-
+   orthogonal_gradient: bool = False,
    use_AdEMAMix: bool = False,
    beta3_ema: float = 0.9999,
    alpha: float = 5.0,

@@ -131,7 +131,7 @@ class Adopt_adv(torch.optim.Optimizer):
    self.use_atan2 = use_atan2 and not Simplified_AdEMAMix
    self.cautious_mask = cautious_mask and not Simplified_AdEMAMix
    self.grams_moment = grams_moment and not Simplified_AdEMAMix
-   self.
+   self.orthogonal_gradient = orthogonal_gradient
    self.use_AdEMAMix = use_AdEMAMix and not Simplified_AdEMAMix
    self.Simplified_AdEMAMix = Simplified_AdEMAMix
    self.factored = nnmf_factor

@@ -152,7 +152,7 @@ class Adopt_adv(torch.optim.Optimizer):
    grad = p.grad
    if self.factored and grad.dtype != torch.float32:
        grad = grad.float()
-   if self.
+   if self.orthogonal_gradient:
        grad = _orthogonalize_gradient(p, grad)
    state = self.state[p]
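The `use_AdEMAMix` behaviour described in the docstrings above (a second, slow EMA `mt_slow` driven by `beta3_ema`, blended into the update with weight `alpha`) is untouched by this release and its code path is not shown here. As a minimal sketch of how such a mix is typically formed, with names mirroring the documented parameters and bias correction omitted (assumed, not the package's exact code):

import torch

def ademamix_numerator(mt, mt_slow, grad, beta1=0.9, beta3_ema=0.9999, alpha=5.0):
    # Fast EMA of the gradient, as in Adam.
    mt.mul_(beta1).add_(grad, alpha=1.0 - beta1)
    # Slow EMA with a beta very close to 1; it changes direction only gradually.
    mt_slow.mul_(beta3_ema).add_(grad, alpha=1.0 - beta3_ema)
    # The update numerator blends both momenta; alpha weights the slow term.
    return mt + alpha * mt_slow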
adv_optm/optim/Lion_Prodigy_adv.py
CHANGED

@@ -60,7 +60,7 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
    weight_decay: float = 0.0,
    vector_reshape: bool = True,
    stochastic_rounding: bool = True,
-
+   orthogonal_gradient: bool = False,
    cautious_mask: bool = False,
    clip_threshold: float = 0.0,
    nnmf_factor: bool = True,

@@ -85,7 +85,7 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
    betas=betas,
    weight_decay=weight_decay,
    vector_reshape=vector_reshape,
-
+   orthogonal_gradient=orthogonal_gradient,
    clip_threshold=clip_threshold,
    beta3=beta3, d=d0, d0=d0, d_max=d0, d_numerator=0.0, d_coef=d_coef,
    growth_rate=growth_rate, safeguard_warmup=safeguard_warmup, k=0, slice_p=slice_p,

@@ -146,7 +146,7 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
    if grad_norm > group["clip_threshold"]:
        clip_coef = group["clip_threshold"] / grad_norm
        grad.mul_(clip_coef)
-   if group["
+   if group["orthogonal_gradient"]:
        grad = _orthogonalize_gradient(p, grad)
    state = self.state[p]

@@ -195,7 +195,7 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
    exp_avg = exp_avg.float()

    # Compute update term c_t = β1*m_{t-1} + (1-β1)*g_t
-   signed_update = exp_avg.clone().mul_(self.beta1).add_(grad_reshaped, alpha=(1-self.beta1)).sign_()
+   signed_update = exp_avg.clone().mul_(self.beta1).add_(grad_reshaped, alpha=self.d * (1-self.beta1)).sign_()

    if self.cautious_mask:
        mask = (signed_update * grad_reshaped > 0).to(grad_reshaped.dtype)

@@ -222,7 +222,7 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
    # Compute update term and sign for the update
    if exp_avg.dtype != torch.float32 and self.factored:
        exp_avg = exp_avg.float()
-   signed_update = exp_avg.clone().mul_(self.beta1).add_(grad, alpha=(1-self.beta1)).sign_()
+   signed_update = exp_avg.clone().mul_(self.beta1).add_(grad, alpha=self.d * (1-self.beta1)).sign_()

    if self.cautious_mask:
        mask = (signed_update * grad > 0).to(grad.dtype)
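Besides the flag rename, the substantive change in Lion_Prodigy_adv is in the two `signed_update` lines above: the raw gradient's contribution is now pre-scaled by the Prodigy step-size estimate `self.d` before the sign is taken. A simplified sketch of that step follows, with the cautious mask from the surrounding context included; the parameter write and the momentum update at the end are assumed Prodigy-Lion conventions, not lines shown in this diff.

import torch

def lion_prodigy_step_sketch(p, grad, exp_avg, lr, d, beta1=0.9, beta2=0.99, cautious=False):
    # Assumes it runs under torch.no_grad(), as optimizer steps do.
    # 1.0.3 behaviour: the gradient term is scaled by the current d estimate.
    signed_update = exp_avg.clone().mul_(beta1).add_(grad, alpha=d * (1 - beta1)).sign_()
    if cautious:
        # Cautious masking: keep only components whose sign agrees with the gradient.
        mask = (signed_update * grad > 0).to(grad.dtype)
        signed_update = signed_update * mask
    # Assumed: the step and the momentum also scale with d (standard Prodigy-Lion form).
    p.add_(signed_update, alpha=-d * lr)
    exp_avg.mul_(beta2).add_(grad, alpha=d * (1 - beta2))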
adv_optm/optim/Lion_adv.py
CHANGED

@@ -43,7 +43,7 @@ class Lion_adv(torch.optim.Optimizer):
    weight_decay: float = 0.0,
    vector_reshape: bool = True,
    stochastic_rounding: bool = True,
-
+   orthogonal_gradient: bool = False,
    cautious_mask: bool = False,
    clip_threshold: float = 0.0,
    nnmf_factor: bool = True,

@@ -60,7 +60,7 @@ class Lion_adv(torch.optim.Optimizer):
    betas=betas,
    weight_decay=weight_decay,
    vector_reshape=vector_reshape,
-
+   orthogonal_gradient=orthogonal_gradient,
    clip_threshold=clip_threshold,
    )
    self.stochastic_rounding = stochastic_rounding

@@ -94,7 +94,7 @@ class Lion_adv(torch.optim.Optimizer):
    if grad_norm > group["clip_threshold"]:
        clip_coef = group["clip_threshold"] / grad_norm
        grad.mul_(clip_coef)
-   if group["
+   if group["orthogonal_gradient"]:
        grad = _orthogonalize_gradient(p, grad)
    state = self.state[p]
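For callers, the visible effect of the release is the constructor keyword itself: Lion_adv (and the other optimizers above) now accept `orthogonal_gradient`. A usage sketch, assuming the classes are re-exported from the package root as the RECORD's adv_optm/__init__.py suggests:

import torch
from adv_optm import Lion_adv  # assumed re-export; otherwise import from adv_optm.optim.Lion_adv

model = torch.nn.Linear(128, 64)
opt = Lion_adv(
    model.parameters(),
    lr=1e-4,
    orthogonal_gradient=True,  # the flag renamed/added in this release
    cautious_mask=True,
)

loss = model(torch.randn(8, 128)).pow(2).mean()
loss.backward()
opt.step()
opt.zero_grad()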
adv_optm/optim/Prodigy_adv.py
CHANGED

@@ -32,7 +32,7 @@ class Prodigy_adv(torch.optim.Optimizer):
    grams_moment (bool): whether to use Grams-style updates. (default: False)
    cautious_mask (bool): whether to use cautious masking to align the gradient's
        direction with the first moment's. (default: False)
-
+   orthogonal_gradient (bool): whether to use OrthoGrad. (default: False)
    use_AdEMAMix (bool): whether to enable the AdEMAMix feature. This adds
        a second, slow-moving average of the momentum (`mt_slow`) which is
        combined with the primary momentum (`mt`) to stabilize updates,

@@ -99,7 +99,7 @@ class Prodigy_adv(torch.optim.Optimizer):
    use_atan2: bool = False,
    cautious_mask: bool = False,
    grams_moment: bool = False,
-
+   orthogonal_gradient: bool = False,
    use_AdEMAMix: bool = False,
    beta3_ema: float = 0.9999,
    alpha: float = 5.0,

@@ -148,7 +148,7 @@ class Prodigy_adv(torch.optim.Optimizer):
    defaults = {
        "lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay,
        "vector_reshape": vector_reshape, "use_atan2": use_atan2,
-       "
+       "orthogonal_gradient": orthogonal_gradient,
        "beta3_ema": beta3_ema, "alpha": alpha, "t_alpha": t_alpha,
        "beta3": beta3, "d": d0, "d0": d0, "d_max": d0, "d_numerator": 0.0, "d_coef": d_coef,
        "growth_rate": growth_rate, "safeguard_warmup": safeguard_warmup, "k": 0, "slice_p": slice_p,

@@ -206,7 +206,7 @@ class Prodigy_adv(torch.optim.Optimizer):
    grad = p.grad
    if grad.dtype != torch.float32 and self.factored:
        grad = grad.float()
-   if group["
+   if group["orthogonal_gradient"]:
        grad = _orthogonalize_gradient(p, grad)
    state = self.state[p]
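The `grams_moment` option repeated in these docstrings is spelled out in the Adopt_adv description above: combine the gradient's direction with the first moment's magnitude. A one-line sketch of a Grams-style update term (assumed illustration, not the package's code):

import torch

def grams_update_sketch(grad: torch.Tensor, adam_update: torch.Tensor) -> torch.Tensor:
    # Take the magnitude of the usual Adam-like update, but force the
    # elementwise sign to follow the current gradient.
    return torch.sign(grad) * adam_update.abs()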
adv_optm/optim/Simplified_AdEMAMix.py
CHANGED

@@ -46,7 +46,7 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
        matrices to apply low-rank compression (default: True).
    stochastic_rounding (bool): whether to use stochastic
        rounding for BF16 parameter updates (default: True).
-
+   orthogonal_gradient (bool): whether to use OrthoGrad. (default: False)
    nnmf_factor (bool): whether to use the factorization or disable it to use
        the uncompressed optimizer. (default: False)
    """

@@ -64,7 +64,7 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
    use_bias_correction: bool = True,
    vector_reshape: bool = True,
    stochastic_rounding: bool = True,
-
+   orthogonal_gradient: bool = False,
    nnmf_factor: bool = False,
    ):
    if not (lr >= 0.0):

@@ -82,7 +82,7 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
        "lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay,
        "alpha_grad": alpha_grad, "beta1_warmup": beta1_warmup, "min_beta1": min_beta1,
        "vector_reshape": vector_reshape,
-       "
+       "orthogonal_gradient": orthogonal_gradient, "use_bias_correction": use_bias_correction,
    }
    self.stochastic_rounding = stochastic_rounding
    self.factored = nnmf_factor

@@ -108,7 +108,7 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
    grad = p.grad
    if grad.dtype != torch.float32 and self.factored:
        grad = grad.float()
-   if group["
+   if group["orthogonal_gradient"]:
        grad = _orthogonalize_gradient(p, grad)
    state = self.state[p]
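Simplified_AdEMAMix, like the other optimizers, documents stochastic rounding for BF16 parameter updates; the backing helper adv_optm/util/BF16_Stochastic_Rounding.py is unchanged in this release and not shown. Purely as an illustration of the technique (an assumed sketch, not the packaged helper): perturb the 16 low bits that bfloat16 discards with uniform noise before truncating, so the rounding error of repeated small updates is zero-mean on average.

import torch

def copy_stochastic_bf16_(target_bf16: torch.Tensor, source_fp32: torch.Tensor) -> None:
    # Reinterpret the float32 result as int32, add noise to the low 16 bits,
    # then truncate those bits and write the value back as bfloat16.
    bits = source_fp32.contiguous().view(torch.int32)
    noise = torch.randint(0, 1 << 16, bits.shape, dtype=torch.int32, device=bits.device)
    rounded = (bits + noise) & -65536  # -65536 == 0xFFFF0000 as signed int32
    target_bf16.copy_(rounded.view(torch.float32))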
adv_optm-1.0.3.dist-info/RECORD
ADDED

@@ -0,0 +1,19 @@
+adv_optm/__init__.py,sha256=zL7hnbPAHt7w-0fZQld04Pt58F-aYaRAqz15e-RZh-Y,306
+adv_optm/optim/AdamW_adv.py,sha256=aTuYcJgd_EcZOrs6TDgBrBKw3wtU5LPzE5WvTBDDeEo,14317
+adv_optm/optim/Adopt_adv.py,sha256=lElmraSiIZiGu9W6ELXnIPZNEEYi1ZWuvuemgPZOixk,17484
+adv_optm/optim/Lion_Prodigy_adv.py,sha256=sGzhts9a6gHfCkuHTB5L9IrClo4c6UThzYYErBwqOaA,12844
+adv_optm/optim/Lion_adv.py,sha256=6G1CukJB_pC7l9HwFEuY1ydsNHZFabVmOvcHDsHHVuQ,8295
+adv_optm/optim/Prodigy_adv.py,sha256=8XUpu19BaBmHb-R9K3jgwySDbtVaLU1_Drtttc_zITs,22461
+adv_optm/optim/Simplified_AdEMAMix.py,sha256=tb3d6Cw_nGwcTzYUhDnKqyP7GzjD1hn8k4WqGG5lhmw,9813
+adv_optm/optim/__init__.py,sha256=pcP865H2j1tut2VfTUhzQh7V8TF_tzPjqFnjMfFed2k,382
+adv_optm/util/BF16_Stochastic_Rounding.py,sha256=Q5H0BcogmE4atP65dLoI21HKSf50lRdsBDfeF6v9Tbg,1548
+adv_optm/util/Effective_Shape.py,sha256=TBvIk1V8IuTbbBsxuekJA4e_v8JlR5Nujtut8RTWAm4,318
+adv_optm/util/NNMF.py,sha256=yRf5IP5Sjq0Uf0DxN0Q8NxEGSdD-f1ULziLVDOjY8K4,639
+adv_optm/util/One_Bit_Boolean.py,sha256=Wat49esdwohuN-OHOFMW8D0aOQgV9cP5Rl8z6yfmpos,1068
+adv_optm/util/OrthoGrad.py,sha256=NzInuBQGy_Ja__M1R9XbvqVaQ0fhGbtGgFE9YON7B3I,707
+adv_optm/util/__init__.py,sha256=qoyIF0jcLjs_vSEcsv36clw5LFNBEbifyXrrVxMH-G4,349
+adv_optm-1.0.3.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+adv_optm-1.0.3.dist-info/METADATA,sha256=Cx9bqS9VFt2nBey-H7GxVS0AXwNzTy0eW5NtSW6uXKk,8422
+adv_optm-1.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+adv_optm-1.0.3.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
+adv_optm-1.0.3.dist-info/RECORD,,
adv_optm-1.0.1.dist-info/RECORD
DELETED

@@ -1,19 +0,0 @@
-adv_optm/__init__.py,sha256=zRfL5MVYJYRNKJAwBSjRKCU6Xo5vW8RbFlTEENHpKxg,306
-adv_optm/optim/AdamW_adv.py,sha256=O0q35c5CO9l1qr9YW3SWBuvkw-x04Ns9T03ewLjD1Ok,14287
-adv_optm/optim/Adopt_adv.py,sha256=PlkO2jfjbw_aPwkRVIqdB9U-3xIfnvlkZmRMjpeoXfc,17454
-adv_optm/optim/Lion_Prodigy_adv.py,sha256=YcrfoZ7Sh0O7tj2OXdOzB3K3oOzGXmEg0z55_iSiPRg,12802
-adv_optm/optim/Lion_adv.py,sha256=uME0YTV5yTZs3bCBg9BQSF0PqlfJJE1BB2q5kWeh49w,8271
-adv_optm/optim/Prodigy_adv.py,sha256=pL5fOqBcraaQqnZn1tQ7f3Ph2SAJ9ZPN65LNTHlrys4,22431
-adv_optm/optim/Simplified_AdEMAMix.py,sha256=7vr413DsxuHTzh1G9mdXpyB8_1F00GW82iiuUD_jbyg,9783
-adv_optm/optim/__init__.py,sha256=pcP865H2j1tut2VfTUhzQh7V8TF_tzPjqFnjMfFed2k,382
-adv_optm/util/BF16_Stochastic_Rounding.py,sha256=Q5H0BcogmE4atP65dLoI21HKSf50lRdsBDfeF6v9Tbg,1548
-adv_optm/util/Effective_Shape.py,sha256=TBvIk1V8IuTbbBsxuekJA4e_v8JlR5Nujtut8RTWAm4,318
-adv_optm/util/NNMF.py,sha256=yRf5IP5Sjq0Uf0DxN0Q8NxEGSdD-f1ULziLVDOjY8K4,639
-adv_optm/util/One_Bit_Boolean.py,sha256=Wat49esdwohuN-OHOFMW8D0aOQgV9cP5Rl8z6yfmpos,1068
-adv_optm/util/OrthoGrad.py,sha256=NzInuBQGy_Ja__M1R9XbvqVaQ0fhGbtGgFE9YON7B3I,707
-adv_optm/util/__init__.py,sha256=qoyIF0jcLjs_vSEcsv36clw5LFNBEbifyXrrVxMH-G4,349
-adv_optm-1.0.1.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
-adv_optm-1.0.1.dist-info/METADATA,sha256=IP6zZKr9fGFNBm1D6yvzQr1bFCMQn4CVUcl4qd_yR8M,8422
-adv_optm-1.0.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-adv_optm-1.0.1.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
-adv_optm-1.0.1.dist-info/RECORD,,
{adv_optm-1.0.1.dist-info → adv_optm-1.0.3.dist-info}/WHEEL
File without changes

{adv_optm-1.0.1.dist-info → adv_optm-1.0.3.dist-info}/licenses/LICENSE
File without changes

{adv_optm-1.0.1.dist-info → adv_optm-1.0.3.dist-info}/top_level.txt
File without changes