adv-optm 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- adv_optm/__init__.py +1 -1
- adv_optm/optim/AdamW_adv.py +4 -4
- adv_optm/optim/Adopt_adv.py +4 -4
- adv_optm/optim/Lion_Prodigy_adv.py +5 -5
- adv_optm/optim/Lion_adv.py +3 -3
- adv_optm/optim/Prodigy_adv.py +4 -4
- adv_optm/optim/Simplified_AdEMAMix.py +4 -4
- {adv_optm-1.0.1.dist-info → adv_optm-1.0.3.dist-info}/METADATA +1 -1
- adv_optm-1.0.3.dist-info/RECORD +19 -0
- adv_optm-1.0.1.dist-info/RECORD +0 -19
- {adv_optm-1.0.1.dist-info → adv_optm-1.0.3.dist-info}/WHEEL +0 -0
- {adv_optm-1.0.1.dist-info → adv_optm-1.0.3.dist-info}/licenses/LICENSE +0 -0
- {adv_optm-1.0.1.dist-info → adv_optm-1.0.3.dist-info}/top_level.txt +0 -0
adv_optm/__init__.py
CHANGED
adv_optm/optim/AdamW_adv.py
CHANGED

@@ -33,7 +33,7 @@ class AdamW_adv(torch.optim.Optimizer):
    grams_moment (bool): whether to use Grams-style updates. (default: False)
    cautious_mask (bool): whether to use cautious masking to align the gradient's
        direction with the first moment's. (default: False)
-
+   orthogonal_gradient (bool): whether to use OrthoGrad. (default: False)
    use_AdEMAMix (bool): whether to enable the AdEMAMix feature. This adds
        a second, slow-moving average of the momentum (`mt_slow`) which is
        combined with the primary momentum (`mt`) to stabilize updates,

@@ -71,7 +71,7 @@ class AdamW_adv(torch.optim.Optimizer):
    use_atan2: bool = False,
    cautious_mask: bool = False,
    grams_moment: bool = False,
-
+   orthogonal_gradient: bool = False,
    use_AdEMAMix: bool = False,
    beta3_ema: float = 0.9999,
    alpha: float = 5.0,

@@ -93,7 +93,7 @@ class AdamW_adv(torch.optim.Optimizer):
    defaults = {
        "lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay,
        "vector_reshape": vector_reshape, "use_atan2": use_atan2,
-       "
+       "orthogonal_gradient": orthogonal_gradient, "use_bias_correction": use_bias_correction,
        "beta3_ema": beta3_ema, "alpha": alpha, "t_alpha": t_alpha,
    }
    self.stochastic_rounding = stochastic_rounding

@@ -123,7 +123,7 @@ class AdamW_adv(torch.optim.Optimizer):
    grad = p.grad
    if grad.dtype != torch.float32 and self.factored:
        grad = grad.float()
-   if group["
+   if group["orthogonal_gradient"]:
        grad = _orthogonalize_gradient(p, grad)
    state = self.state[p]
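The `orthogonal_gradient` flag introduced here (the removed counterparts are cut off in this view, but the change reads as a rename of the existing OrthoGrad switch) routes each gradient through `_orthogonalize_gradient` from adv_optm/util/OrthoGrad.py before any moments are updated. That helper is not part of this diff; the snippet below is only a rough sketch of the usual OrthoGrad projection, assuming the standard formulation: strip the component of the gradient parallel to the parameter vector, then rescale back to the original gradient norm.

import torch

def orthogonalize_gradient_sketch(p: torch.Tensor, grad: torch.Tensor, eps: float = 1e-30) -> torch.Tensor:
    # Hypothetical stand-in for adv_optm.util.OrthoGrad._orthogonalize_gradient.
    w, g = p.reshape(-1).float(), grad.reshape(-1).float()
    # Remove the radial component: g_orth = g - (<w, g> / <w, w>) * w
    g_orth = g - (torch.dot(w, g) / (torch.dot(w, w) + eps)) * w
    # Rescale so the projected gradient keeps the original norm.
    g_orth = g_orth * (g.norm() / (g_orth.norm() + eps))
    return g_orth.view_as(grad).to(grad.dtype)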
adv_optm/optim/Adopt_adv.py
CHANGED

@@ -40,7 +40,7 @@ class Adopt_adv(torch.optim.Optimizer):
        direction with the first moment's. (default: False)
    grams_moment (bool): whether to combine the gradient's direction with the
        first moment's magnitude (default: False).
-
+   orthogonal_gradient (bool): whether to use OrthoGrad. (default: False)
    use_AdEMAMix (bool): whether to enable the AdEMAMix feature. This adds
        a second, slow-moving average of the momentum (`mt_slow`) which is
        combined with the primary momentum (`mt`) to stabilize updates,

@@ -89,7 +89,7 @@ class Adopt_adv(torch.optim.Optimizer):
    use_atan2: bool = False,
    cautious_mask: bool = False,
    grams_moment: bool = False,
-
+   orthogonal_gradient: bool = False,
    use_AdEMAMix: bool = False,
    beta3_ema: float = 0.9999,
    alpha: float = 5.0,

@@ -131,7 +131,7 @@ class Adopt_adv(torch.optim.Optimizer):
    self.use_atan2 = use_atan2 and not Simplified_AdEMAMix
    self.cautious_mask = cautious_mask and not Simplified_AdEMAMix
    self.grams_moment = grams_moment and not Simplified_AdEMAMix
-   self.
+   self.orthogonal_gradient = orthogonal_gradient
    self.use_AdEMAMix = use_AdEMAMix and not Simplified_AdEMAMix
    self.Simplified_AdEMAMix = Simplified_AdEMAMix
    self.factored = nnmf_factor

@@ -152,7 +152,7 @@ class Adopt_adv(torch.optim.Optimizer):
    grad = p.grad
    if self.factored and grad.dtype != torch.float32:
        grad = grad.float()
-   if self.
+   if self.orthogonal_gradient:
        grad = _orthogonalize_gradient(p, grad)
    state = self.state[p]
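The `use_AdEMAMix` behaviour described in the docstrings above (a second, slow EMA `mt_slow` driven by `beta3_ema`, blended into the update with weight `alpha`) is untouched by this release and its code path is not shown here. As a minimal sketch of how such a mix is typically formed, with names mirroring the documented parameters and bias correction omitted (assumed, not the package's exact code):

import torch

def ademamix_numerator(mt, mt_slow, grad, beta1=0.9, beta3_ema=0.9999, alpha=5.0):
    # Fast EMA of the gradient, as in Adam.
    mt.mul_(beta1).add_(grad, alpha=1.0 - beta1)
    # Slow EMA with a beta very close to 1; it changes direction only gradually.
    mt_slow.mul_(beta3_ema).add_(grad, alpha=1.0 - beta3_ema)
    # The update numerator blends both momenta; alpha weights the slow term.
    return mt + alpha * mt_slow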
adv_optm/optim/Lion_Prodigy_adv.py
CHANGED

@@ -60,7 +60,7 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
    weight_decay: float = 0.0,
    vector_reshape: bool = True,
    stochastic_rounding: bool = True,
-
+   orthogonal_gradient: bool = False,
    cautious_mask: bool = False,
    clip_threshold: float = 0.0,
    nnmf_factor: bool = True,

@@ -85,7 +85,7 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
    betas=betas,
    weight_decay=weight_decay,
    vector_reshape=vector_reshape,
-
+   orthogonal_gradient=orthogonal_gradient,
    clip_threshold=clip_threshold,
    beta3=beta3, d=d0, d0=d0, d_max=d0, d_numerator=0.0, d_coef=d_coef,
    growth_rate=growth_rate, safeguard_warmup=safeguard_warmup, k=0, slice_p=slice_p,

@@ -146,7 +146,7 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
    if grad_norm > group["clip_threshold"]:
        clip_coef = group["clip_threshold"] / grad_norm
        grad.mul_(clip_coef)
-   if group["
+   if group["orthogonal_gradient"]:
        grad = _orthogonalize_gradient(p, grad)
    state = self.state[p]

@@ -195,7 +195,7 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
    exp_avg = exp_avg.float()

    # Compute update term c_t = β1*m_{t-1} + (1-β1)*g_t
-   signed_update = exp_avg.clone().mul_(self.beta1).add_(grad_reshaped, alpha=(1-self.beta1)).sign_()
+   signed_update = exp_avg.clone().mul_(self.beta1).add_(grad_reshaped, alpha=self.d * (1-self.beta1)).sign_()

    if self.cautious_mask:
        mask = (signed_update * grad_reshaped > 0).to(grad_reshaped.dtype)

@@ -222,7 +222,7 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
    # Compute update term and sign for the update
    if exp_avg.dtype != torch.float32 and self.factored:
        exp_avg = exp_avg.float()
-   signed_update = exp_avg.clone().mul_(self.beta1).add_(grad, alpha=(1-self.beta1)).sign_()
+   signed_update = exp_avg.clone().mul_(self.beta1).add_(grad, alpha=self.d * (1-self.beta1)).sign_()

    if self.cautious_mask:
        mask = (signed_update * grad > 0).to(grad.dtype)
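Besides the flag rename, the substantive change in Lion_Prodigy_adv is in the two `signed_update` lines above: the raw gradient's contribution is now pre-scaled by the Prodigy step-size estimate `self.d` before the sign is taken. A simplified sketch of that step follows, with the cautious mask from the surrounding context included; the parameter write and the momentum update at the end are assumed Prodigy-Lion conventions, not lines shown in this diff.

import torch

def lion_prodigy_step_sketch(p, grad, exp_avg, lr, d, beta1=0.9, beta2=0.99, cautious=False):
    # Assumes it runs under torch.no_grad(), as optimizer steps do.
    # 1.0.3 behaviour: the gradient term is scaled by the current d estimate.
    signed_update = exp_avg.clone().mul_(beta1).add_(grad, alpha=d * (1 - beta1)).sign_()
    if cautious:
        # Cautious masking: keep only components whose sign agrees with the gradient.
        mask = (signed_update * grad > 0).to(grad.dtype)
        signed_update = signed_update * mask
    # Assumed: the step and the momentum also scale with d (standard Prodigy-Lion form).
    p.add_(signed_update, alpha=-d * lr)
    exp_avg.mul_(beta2).add_(grad, alpha=d * (1 - beta2))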
adv_optm/optim/Lion_adv.py
CHANGED

@@ -43,7 +43,7 @@ class Lion_adv(torch.optim.Optimizer):
    weight_decay: float = 0.0,
    vector_reshape: bool = True,
    stochastic_rounding: bool = True,
-
+   orthogonal_gradient: bool = False,
    cautious_mask: bool = False,
    clip_threshold: float = 0.0,
    nnmf_factor: bool = True,

@@ -60,7 +60,7 @@ class Lion_adv(torch.optim.Optimizer):
    betas=betas,
    weight_decay=weight_decay,
    vector_reshape=vector_reshape,
-
+   orthogonal_gradient=orthogonal_gradient,
    clip_threshold=clip_threshold,
    )
    self.stochastic_rounding = stochastic_rounding

@@ -94,7 +94,7 @@ class Lion_adv(torch.optim.Optimizer):
    if grad_norm > group["clip_threshold"]:
        clip_coef = group["clip_threshold"] / grad_norm
        grad.mul_(clip_coef)
-   if group["
+   if group["orthogonal_gradient"]:
        grad = _orthogonalize_gradient(p, grad)
    state = self.state[p]
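For callers, the visible effect of the release is the constructor keyword itself: Lion_adv (and the other optimizers above) now accept `orthogonal_gradient`. A usage sketch, assuming the classes are re-exported from the package root as the RECORD's adv_optm/__init__.py suggests:

import torch
from adv_optm import Lion_adv  # assumed re-export; otherwise import from adv_optm.optim.Lion_adv

model = torch.nn.Linear(128, 64)
opt = Lion_adv(
    model.parameters(),
    lr=1e-4,
    orthogonal_gradient=True,  # the flag renamed/added in this release
    cautious_mask=True,
)

loss = model(torch.randn(8, 128)).pow(2).mean()
loss.backward()
opt.step()
opt.zero_grad()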
adv_optm/optim/Prodigy_adv.py
CHANGED

@@ -32,7 +32,7 @@ class Prodigy_adv(torch.optim.Optimizer):
    grams_moment (bool): whether to use Grams-style updates. (default: False)
    cautious_mask (bool): whether to use cautious masking to align the gradient's
        direction with the first moment's. (default: False)
-
+   orthogonal_gradient (bool): whether to use OrthoGrad. (default: False)
    use_AdEMAMix (bool): whether to enable the AdEMAMix feature. This adds
        a second, slow-moving average of the momentum (`mt_slow`) which is
        combined with the primary momentum (`mt`) to stabilize updates,

@@ -99,7 +99,7 @@ class Prodigy_adv(torch.optim.Optimizer):
    use_atan2: bool = False,
    cautious_mask: bool = False,
    grams_moment: bool = False,
-
+   orthogonal_gradient: bool = False,
    use_AdEMAMix: bool = False,
    beta3_ema: float = 0.9999,
    alpha: float = 5.0,

@@ -148,7 +148,7 @@ class Prodigy_adv(torch.optim.Optimizer):
    defaults = {
        "lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay,
        "vector_reshape": vector_reshape, "use_atan2": use_atan2,
-       "
+       "orthogonal_gradient": orthogonal_gradient,
        "beta3_ema": beta3_ema, "alpha": alpha, "t_alpha": t_alpha,
        "beta3": beta3, "d": d0, "d0": d0, "d_max": d0, "d_numerator": 0.0, "d_coef": d_coef,
        "growth_rate": growth_rate, "safeguard_warmup": safeguard_warmup, "k": 0, "slice_p": slice_p,

@@ -206,7 +206,7 @@ class Prodigy_adv(torch.optim.Optimizer):
    grad = p.grad
    if grad.dtype != torch.float32 and self.factored:
        grad = grad.float()
-   if group["
+   if group["orthogonal_gradient"]:
        grad = _orthogonalize_gradient(p, grad)
    state = self.state[p]
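The `grams_moment` option repeated in these docstrings is spelled out in the Adopt_adv description above: combine the gradient's direction with the first moment's magnitude. A one-line sketch of a Grams-style update term (assumed illustration, not the package's code):

import torch

def grams_update_sketch(grad: torch.Tensor, adam_update: torch.Tensor) -> torch.Tensor:
    # Take the magnitude of the usual Adam-like update, but force the
    # elementwise sign to follow the current gradient.
    return torch.sign(grad) * adam_update.abs()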
adv_optm/optim/Simplified_AdEMAMix.py
CHANGED

@@ -46,7 +46,7 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
        matrices to apply low-rank compression (default: True).
    stochastic_rounding (bool): whether to use stochastic
        rounding for BF16 parameter updates (default: True).
-
+   orthogonal_gradient (bool): whether to use OrthoGrad. (default: False)
    nnmf_factor (bool): whether to use the factorization or disable it to use
        the uncompressed optimizer. (default: False)
    """

@@ -64,7 +64,7 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
    use_bias_correction: bool = True,
    vector_reshape: bool = True,
    stochastic_rounding: bool = True,
-
+   orthogonal_gradient: bool = False,
    nnmf_factor: bool = False,
    ):
    if not (lr >= 0.0):

@@ -82,7 +82,7 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
        "lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay,
        "alpha_grad": alpha_grad, "beta1_warmup": beta1_warmup, "min_beta1": min_beta1,
        "vector_reshape": vector_reshape,
-       "
+       "orthogonal_gradient": orthogonal_gradient, "use_bias_correction": use_bias_correction,
    }
    self.stochastic_rounding = stochastic_rounding
    self.factored = nnmf_factor

@@ -108,7 +108,7 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
    grad = p.grad
    if grad.dtype != torch.float32 and self.factored:
        grad = grad.float()
-   if group["
+   if group["orthogonal_gradient"]:
        grad = _orthogonalize_gradient(p, grad)
    state = self.state[p]
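Simplified_AdEMAMix, like the other optimizers, documents stochastic rounding for BF16 parameter updates; the backing helper adv_optm/util/BF16_Stochastic_Rounding.py is unchanged in this release and not shown. Purely as an illustration of the technique (an assumed sketch, not the packaged helper): perturb the 16 low bits that bfloat16 discards with uniform noise before truncating, so the rounding error of repeated small updates is zero-mean on average.

import torch

def copy_stochastic_bf16_(target_bf16: torch.Tensor, source_fp32: torch.Tensor) -> None:
    # Reinterpret the float32 result as int32, add noise to the low 16 bits,
    # then truncate those bits and write the value back as bfloat16.
    bits = source_fp32.contiguous().view(torch.int32)
    noise = torch.randint(0, 1 << 16, bits.shape, dtype=torch.int32, device=bits.device)
    rounded = (bits + noise) & -65536  # -65536 == 0xFFFF0000 as signed int32
    target_bf16.copy_(rounded.view(torch.float32))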
adv_optm-1.0.3.dist-info/RECORD
ADDED

@@ -0,0 +1,19 @@
+adv_optm/__init__.py,sha256=zL7hnbPAHt7w-0fZQld04Pt58F-aYaRAqz15e-RZh-Y,306
+adv_optm/optim/AdamW_adv.py,sha256=aTuYcJgd_EcZOrs6TDgBrBKw3wtU5LPzE5WvTBDDeEo,14317
+adv_optm/optim/Adopt_adv.py,sha256=lElmraSiIZiGu9W6ELXnIPZNEEYi1ZWuvuemgPZOixk,17484
+adv_optm/optim/Lion_Prodigy_adv.py,sha256=sGzhts9a6gHfCkuHTB5L9IrClo4c6UThzYYErBwqOaA,12844
+adv_optm/optim/Lion_adv.py,sha256=6G1CukJB_pC7l9HwFEuY1ydsNHZFabVmOvcHDsHHVuQ,8295
+adv_optm/optim/Prodigy_adv.py,sha256=8XUpu19BaBmHb-R9K3jgwySDbtVaLU1_Drtttc_zITs,22461
+adv_optm/optim/Simplified_AdEMAMix.py,sha256=tb3d6Cw_nGwcTzYUhDnKqyP7GzjD1hn8k4WqGG5lhmw,9813
+adv_optm/optim/__init__.py,sha256=pcP865H2j1tut2VfTUhzQh7V8TF_tzPjqFnjMfFed2k,382
+adv_optm/util/BF16_Stochastic_Rounding.py,sha256=Q5H0BcogmE4atP65dLoI21HKSf50lRdsBDfeF6v9Tbg,1548
+adv_optm/util/Effective_Shape.py,sha256=TBvIk1V8IuTbbBsxuekJA4e_v8JlR5Nujtut8RTWAm4,318
+adv_optm/util/NNMF.py,sha256=yRf5IP5Sjq0Uf0DxN0Q8NxEGSdD-f1ULziLVDOjY8K4,639
+adv_optm/util/One_Bit_Boolean.py,sha256=Wat49esdwohuN-OHOFMW8D0aOQgV9cP5Rl8z6yfmpos,1068
+adv_optm/util/OrthoGrad.py,sha256=NzInuBQGy_Ja__M1R9XbvqVaQ0fhGbtGgFE9YON7B3I,707
+adv_optm/util/__init__.py,sha256=qoyIF0jcLjs_vSEcsv36clw5LFNBEbifyXrrVxMH-G4,349
+adv_optm-1.0.3.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+adv_optm-1.0.3.dist-info/METADATA,sha256=Cx9bqS9VFt2nBey-H7GxVS0AXwNzTy0eW5NtSW6uXKk,8422
+adv_optm-1.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+adv_optm-1.0.3.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
+adv_optm-1.0.3.dist-info/RECORD,,
adv_optm-1.0.1.dist-info/RECORD
DELETED

@@ -1,19 +0,0 @@
-adv_optm/__init__.py,sha256=zRfL5MVYJYRNKJAwBSjRKCU6Xo5vW8RbFlTEENHpKxg,306
-adv_optm/optim/AdamW_adv.py,sha256=O0q35c5CO9l1qr9YW3SWBuvkw-x04Ns9T03ewLjD1Ok,14287
-adv_optm/optim/Adopt_adv.py,sha256=PlkO2jfjbw_aPwkRVIqdB9U-3xIfnvlkZmRMjpeoXfc,17454
-adv_optm/optim/Lion_Prodigy_adv.py,sha256=YcrfoZ7Sh0O7tj2OXdOzB3K3oOzGXmEg0z55_iSiPRg,12802
-adv_optm/optim/Lion_adv.py,sha256=uME0YTV5yTZs3bCBg9BQSF0PqlfJJE1BB2q5kWeh49w,8271
-adv_optm/optim/Prodigy_adv.py,sha256=pL5fOqBcraaQqnZn1tQ7f3Ph2SAJ9ZPN65LNTHlrys4,22431
-adv_optm/optim/Simplified_AdEMAMix.py,sha256=7vr413DsxuHTzh1G9mdXpyB8_1F00GW82iiuUD_jbyg,9783
-adv_optm/optim/__init__.py,sha256=pcP865H2j1tut2VfTUhzQh7V8TF_tzPjqFnjMfFed2k,382
-adv_optm/util/BF16_Stochastic_Rounding.py,sha256=Q5H0BcogmE4atP65dLoI21HKSf50lRdsBDfeF6v9Tbg,1548
-adv_optm/util/Effective_Shape.py,sha256=TBvIk1V8IuTbbBsxuekJA4e_v8JlR5Nujtut8RTWAm4,318
-adv_optm/util/NNMF.py,sha256=yRf5IP5Sjq0Uf0DxN0Q8NxEGSdD-f1ULziLVDOjY8K4,639
-adv_optm/util/One_Bit_Boolean.py,sha256=Wat49esdwohuN-OHOFMW8D0aOQgV9cP5Rl8z6yfmpos,1068
-adv_optm/util/OrthoGrad.py,sha256=NzInuBQGy_Ja__M1R9XbvqVaQ0fhGbtGgFE9YON7B3I,707
-adv_optm/util/__init__.py,sha256=qoyIF0jcLjs_vSEcsv36clw5LFNBEbifyXrrVxMH-G4,349
-adv_optm-1.0.1.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
-adv_optm-1.0.1.dist-info/METADATA,sha256=IP6zZKr9fGFNBm1D6yvzQr1bFCMQn4CVUcl4qd_yR8M,8422
-adv_optm-1.0.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-adv_optm-1.0.1.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
-adv_optm-1.0.1.dist-info/RECORD,,
{adv_optm-1.0.1.dist-info → adv_optm-1.0.3.dist-info}/WHEEL
File without changes

{adv_optm-1.0.1.dist-info → adv_optm-1.0.3.dist-info}/licenses/LICENSE
File without changes

{adv_optm-1.0.1.dist-info → adv_optm-1.0.3.dist-info}/top_level.txt
File without changes