heavyball 2.0.0.dev0__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- heavyball/__init__.py +168 -29
- heavyball/chainable.py +165 -63
- heavyball/helpers.py +5 -1
- heavyball/utils.py +490 -124
- {heavyball-2.0.0.dev0.dist-info → heavyball-2.1.0.dist-info}/METADATA +19 -7
- heavyball-2.1.0.dist-info/RECORD +9 -0
- {heavyball-2.0.0.dev0.dist-info → heavyball-2.1.0.dist-info}/WHEEL +1 -1
- heavyball-2.0.0.dev0.dist-info/RECORD +0 -9
- {heavyball-2.0.0.dev0.dist-info → heavyball-2.1.0.dist-info}/licenses/LICENSE +0 -0
- {heavyball-2.0.0.dev0.dist-info → heavyball-2.1.0.dist-info}/top_level.txt +0 -0
heavyball/__init__.py
CHANGED
```diff
@@ -41,6 +41,34 @@ class SAMWrapper(torch.optim.Optimizer):
         self.wrapped_optimizer.zero_grad()
 
 
+class SGD(C.BaseOpt):
+    def __init__(
+        self,
+        params,
+        lr=0.0025,
+        beta=0.9,
+        weight_decay=0,
+        warmup_steps=0,
+        foreach: bool = True,
+        storage_dtype: str = "float32",
+        mars: bool = False,
+        caution: bool = False,
+        mars_gamma: float = 0.0025,
+        gradient_clipping: C.str_or_fn = C.use_default,
+        update_clipping: C.str_or_fn = C.use_default,
+        **kwargs,
+    ):
+        defaults = locals()
+        defaults.pop("self")
+        params = defaults.pop("params")
+        defaults.update(defaults.pop("kwargs"))
+
+        if kwargs:
+            utils.warn_once(f"Working with uncaptured keyword arguments: {kwargs}")
+
+        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, fns=(C.heavyball_momentum,))
+
+
 class ForeachAdamW(C.BaseOpt):
     def __init__(
         self,
```
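The new `SGD` class (heavy-ball momentum via `C.heavyball_momentum`) keeps the same constructor shape as the other optimizers in this file, so it should drop into an ordinary PyTorch training loop. A minimal usage sketch, assuming heavyball optimizers expose the standard `torch.optim.Optimizer` interface; the model and data below are placeholders:

```python
import torch
import heavyball

# Toy model; any torch.nn.Module works the same way.
model = torch.nn.Linear(16, 1)
opt = heavyball.SGD(model.parameters(), lr=0.0025, beta=0.9, weight_decay=0)

for _ in range(10):
    x = torch.randn(32, 16)
    loss = model(x).square().mean()
    opt.zero_grad()
    loss.backward()
    opt.step()
```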
```diff
@@ -69,7 +97,110 @@ class ForeachAdamW(C.BaseOpt):
         if kwargs:
             utils.warn_once(f"Working with uncaptured keyword arguments: {kwargs}")
 
-        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, C.update_by_adam)
+        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, fns=(C.update_by_adam,))
+
+
+class UnscaledAdamW(C.BaseOpt):
+    def __init__(
+        self,
+        params,
+        lr=0.0025,
+        betas=(0.9, 0.99),
+        eps=1e-8,
+        weight_decay=0,
+        warmup_steps=0,
+        foreach: bool = True,
+        storage_dtype: str = "float32",
+        mars: bool = False,
+        caution: bool = False,
+        mars_gamma: float = 0.0025,
+        gradient_clipping: C.str_or_fn = C.use_default,
+        update_clipping: C.str_or_fn = C.use_default,
+        palm: bool = C.use_default,
+        beta2_scale: float = 0.8,
+        **kwargs,
+    ):
+        defaults = locals()
+        defaults.pop("self")
+        params = defaults.pop("params")
+        defaults.update(defaults.pop("kwargs"))
+
+        if kwargs:
+            utils.warn_once(f"Working with uncaptured keyword arguments: {kwargs}")
+
+        super().__init__(
+            params, defaults, foreach, gradient_clipping, update_clipping, palm, fns=(C.scale_by_unscaled_adam,)
+        )
+
+
+class SUDSAdamW(C.BaseOpt):
+    def __init__(
+        self,
+        params,
+        lr=0.0025,
+        betas=(0.9, 0.99),
+        eps=1e-8,
+        weight_decay=0,
+        warmup_steps=0,
+        foreach: bool = True,
+        storage_dtype: str = "float32",
+        mars: bool = False,
+        caution: bool = False,
+        mars_gamma: float = 0.0025,
+        gradient_clipping: C.str_or_fn = C.use_default,
+        update_clipping: C.str_or_fn = C.use_default,
+        palm: bool = C.use_default,
+        beta2_scale: float = 0.8,
+        precond_lr: float = 1e-2,
+        **kwargs,
+    ):
+        defaults = locals()
+        defaults.pop("self")
+        params = defaults.pop("params")
+        defaults.update(defaults.pop("kwargs"))
+
+        if kwargs:
+            utils.warn_once(f"Working with uncaptured keyword arguments: {kwargs}")
+
+        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, fns=(C.scale_by_suds,))
+
+
+class ForeachAdamC(C.BaseOpt):
+    def __init__(
+        self,
+        params,
+        lr=0.0025,
+        betas=(0.9, 0.99),
+        eps=1e-8,
+        weight_decay=0,
+        max_lr: float | None = None,
+        warmup_steps=0,
+        foreach: bool = True,
+        storage_dtype: str = "float32",
+        mars: bool = False,
+        caution: bool = False,
+        mars_gamma: float = 0.0025,
+        gradient_clipping: C.str_or_fn = C.use_default,
+        update_clipping: C.str_or_fn = C.use_default,
+        palm: bool = C.use_default,
+        beta2_scale: float = 0.8,
+        **kwargs,
+    ):
+        if max_lr is None:
+            utils.warn_once(
+                "max_lr was not set. setting it to the current learning rate, under the assumption that it strictly decreases"
+            )
+            max_lr = lr
+
+        defaults = locals()
+        defaults.pop("self")
+        params = defaults.pop("params")
+        defaults.update(defaults.pop("kwargs"))
+
+        if kwargs:
+            utils.warn_once(f"Working with uncaptured keyword arguments: {kwargs}")
+
+        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, fns=(C.update_by_adamc,))
 
 
 class ForeachRMSprop(C.BaseOpt):
```
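Among the new classes, `ForeachAdamC` adds a `max_lr` argument; when it is omitted, the constructor warns and assumes the current `lr` is the peak of a strictly decreasing schedule. A hedged sketch of how it might be paired with a decaying schedule (the scheduler choice is illustrative and not part of the diff; it only assumes heavyball optimizers behave like standard `torch.optim.Optimizer` instances):

```python
import torch
import heavyball

model = torch.nn.Linear(16, 1)
# max_lr should be the peak learning rate of the schedule; here lr starts at that peak.
opt = heavyball.ForeachAdamC(model.parameters(), lr=1e-3, max_lr=1e-3, betas=(0.9, 0.99))
sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=1000)

for _ in range(1000):
    loss = model(torch.randn(8, 16)).square().mean()
    opt.zero_grad()
    loss.backward()
    opt.step()
    sched.step()
```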
```diff
@@ -113,7 +244,7 @@ class ForeachRMSprop(C.BaseOpt):
             gradient_clipping,
             update_clipping,
             palm,
-            C.scale_by_exp_avg_sq,
+            fns=(C.scale_by_exp_avg_sq,),
         )
 
 
@@ -154,8 +285,7 @@ class ForeachSFAdamW(C.ScheduleFree):
             gradient_clipping,
             update_clipping,
             palm,
-            C.scale_by_exp_avg_sq,
-            C.update_by_schedule_free,
+            fns=(C.scale_by_exp_avg_sq, C.update_by_schedule_free),
         )
 
 
```
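These hunks, and the matching ones further down, all make the same mechanical change: the chainable transforms are now passed to the base optimizer's `__init__` as an explicit `fns=` tuple instead of trailing positional arguments. Following that pattern, a user-defined composition might look like the sketch below. The class name and the particular transform pairing are hypothetical, and the `BaseOpt` signature is inferred from the calls in this diff rather than from separate documentation:

```python
from heavyball import chainable as C
from heavyball import utils


class SignAdamW(C.BaseOpt):  # hypothetical composition, mirroring the built-in classes above
    def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0,
                 warmup_steps=0, foreach: bool = True, storage_dtype: str = "float32",
                 mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025,
                 gradient_clipping: C.str_or_fn = C.use_default,
                 update_clipping: C.str_or_fn = C.use_default,
                 palm: bool = C.use_default, beta2_scale: float = 0.8, **kwargs):
        defaults = locals()
        defaults.pop("self")
        params = defaults.pop("params")
        defaults.update(defaults.pop("kwargs"))

        if kwargs:
            utils.warn_once(f"Working with uncaptured keyword arguments: {kwargs}")

        # Transforms run in order: second-moment scaling first, then the sign of the update.
        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm,
                         fns=(C.scale_by_exp_avg_sq, C.sign))
```

Whether this particular pairing trains well is untested; the point is only the new `fns=` keyword.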
```diff
@@ -197,8 +327,7 @@ class MSAMLaProp(C.MSAM):
             gradient_clipping,
             update_clipping,
             palm,
-            C.scale_by_exp_avg_sq,
-            C.update_by_msam,
+            fns=(C.scale_by_exp_avg_sq, C.update_by_msam),
         )
 
 
@@ -234,7 +363,7 @@ class ForeachADOPT(C.BaseOpt):
         if kwargs:
             utils.warn_once(f"Working with uncaptured keyword arguments: {kwargs}")
 
-        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, C.update_by_adopt)
+        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, fns=(C.update_by_adopt,))
 
 
 class ForeachMuon(C.BaseOpt):
@@ -256,6 +385,7 @@ class ForeachMuon(C.BaseOpt):
         palm: bool = C.use_default,
         beta2_scale: float = 0.8,
         nesterov: bool = True,
+        heavyball_momentum: bool = False,
         **kwargs,
     ):
         defaults = locals()
```
```diff
@@ -266,6 +396,16 @@ class ForeachMuon(C.BaseOpt):
         if kwargs:
             utils.warn_once(f"Working with uncaptured keyword arguments: {kwargs}")
 
+        if heavyball_momentum:
+            if nesterov:
+                ema = C.nesterov_momentum
+            else:
+                ema = C.heavyball_momentum
+        elif nesterov:
+            ema = C.nesterov_ema
+        else:
+            ema = C.exp_avg
+
         super().__init__(
             params,
             defaults,
@@ -273,8 +413,7 @@ class ForeachMuon(C.BaseOpt):
             gradient_clipping,
             update_clipping,
             palm,
-
-            C.orthogonalize_update,
+            fns=(ema, C.orthogonalize_update),
         )
 
 
```
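`ForeachMuon` now selects its momentum transform from the new `heavyball_momentum` flag together with the existing `nesterov` flag before orthogonalizing the update. A small sketch of the four combinations, taken directly from the branch added above (the model is a placeholder):

```python
import torch
import heavyball

model = torch.nn.Linear(8, 8)

# nesterov=True,  heavyball_momentum=False -> C.nesterov_ema       (default, as before)
# nesterov=False, heavyball_momentum=False -> C.exp_avg
# nesterov=True,  heavyball_momentum=True  -> C.nesterov_momentum
# nesterov=False, heavyball_momentum=True  -> C.heavyball_momentum
opt = heavyball.ForeachMuon(model.parameters(), nesterov=False, heavyball_momentum=True)
```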
```diff
@@ -306,7 +445,7 @@ class ForeachLaProp(C.BaseOpt):
         if kwargs:
             utils.warn_once(f"Working with uncaptured keyword arguments: {kwargs}")
 
-        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, C.update_by_laprop)
+        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, fns=(C.update_by_laprop,))
 
 
 class MuonLaProp(C.BaseOpt):
@@ -344,8 +483,7 @@ class MuonLaProp(C.BaseOpt):
             gradient_clipping,
             update_clipping,
             palm,
-            C.scale_by_laprop,
-            C.orthogonalize_update,
+            fns=(C.scale_by_laprop, C.orthogonalize_update),
         )
 
 
@@ -417,7 +555,7 @@ class ForeachSOAP(C.BaseOpt):
             gradient_clipping,
             update_clipping,
             palm,  #
-            C.scale_by_soap,
+            fns=(C.scale_by_soap,),
         )
 
 
@@ -456,8 +594,7 @@ class ForeachSignLaProp(C.BaseOpt):
             gradient_clipping,
             update_clipping,
             palm,
-            C.scale_by_laprop,
-            C.sign,
+            fns=(C.scale_by_laprop, C.sign),
         )
 
 
@@ -528,7 +665,7 @@ class ForeachSOLP(C.BaseOpt):
             gradient_clipping,
             update_clipping,
             palm,  #
-            functools.partial(C.scale_by_soap, inner="laprop"),
+            fns=(functools.partial(C.scale_by_soap, inner="laprop"),),
         )
 
 
@@ -580,8 +717,7 @@ class OrthoLaProp(C.BaseOpt):
             gradient_clipping,
             update_clipping,
             palm,
-            C.orthogonalize_grad_to_param,
-            C.scale_by_laprop,
+            fns=(C.orthogonalize_grad_to_param, C.scale_by_laprop),
         )
 
 
@@ -619,8 +755,7 @@ class LaPropOrtho(C.BaseOpt):
             gradient_clipping,
             update_clipping,
             palm,
-            C.scale_by_laprop,
-            C.orthogonalize_grad_to_param,
+            fns=(C.scale_by_laprop, C.orthogonalize_grad_to_param),
         )
 
 
```
```diff
@@ -683,6 +818,10 @@ class ForeachPSGDKron(C.BaseOpt):
         exp_avg_input = C.default(exp_avg_input, self.exp_avg_input)
         update_clipping = C.default(update_clipping, utils.trust_region_clip_)
         inverse_free = C.default(inverse_free, self.quad)
+        if inverse_free:
+            raise ValueError(
+                "inverse_free (i.e., PSGD-QUAD) is not supported at the moment. Consider using https://github.com/evanatyourservice/quad_torch"
+            )
 
         defaults = locals()
         defaults.pop("self")
```
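`ForeachPSGDKron` now rejects `inverse_free=True` outright instead of running the QUAD path (the `QUAD` subclass itself is removed in a later hunk), directing users to the external quad_torch project. A minimal sketch of the new behavior, assuming the constructor is otherwise called with its defaults:

```python
import torch
import heavyball

model = torch.nn.Linear(8, 8)
try:
    heavyball.ForeachPSGDKron(model.parameters(), inverse_free=True)
except ValueError as e:
    print(e)  # points at https://github.com/evanatyourservice/quad_torch
```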
```diff
@@ -703,8 +842,10 @@ class ForeachPSGDKron(C.BaseOpt):
             gradient_clipping,
             update_clipping,
             False,  #
-
-
+            fns=(
+                *(C.exp_avg,) * exp_avg_input,
+                functools.partial(C.scale_by_delayed_psgd if delayed else C.scale_by_psgd, cached=cached),
+            ),
         )
 
 
```
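The `*(C.exp_avg,) * exp_avg_input` expression inside the new `fns=` tuple uses Python's sequence repetition: multiplying a one-element tuple by a `bool` yields zero or one copies, so the exponential-moving-average stage is included only when `exp_avg_input` is truthy. A standalone illustration with stand-in functions:

```python
def exp_avg():  # stand-in for C.exp_avg
    pass


def scale_by_psgd():  # stand-in for the preconditioning stage
    pass


for exp_avg_input in (False, True):
    fns = (*(exp_avg,) * exp_avg_input, scale_by_psgd)
    print(exp_avg_input, [f.__name__ for f in fns])
# False ['scale_by_psgd']
# True ['exp_avg', 'scale_by_psgd']
```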
```diff
@@ -733,11 +874,6 @@ class NewtonHybrid2PSGDKron(ForeachCachedNewtonPSGD):
     hvp_interval = 2
 
 
-class QUAD(ForeachPSGDKron):
-    quad = True
-    cached = True
-
-
 class ForeachPSGDLRA(C.BaseOpt):
     """
     Originally from Evan Walters and Omead Pooladzandi, 2024
@@ -808,8 +944,7 @@ class ForeachPSGDLRA(C.BaseOpt):
             gradient_clipping,
             update_clipping,
             False,  #
-            *(C.exp_avg,) * exp_avg_input,
-            C.scale_by_delayed_psgd_lra if delayed else C.scale_by_psgd_lra,
+            fns=(*(C.exp_avg,) * exp_avg_input, C.scale_by_delayed_psgd_lra if delayed else C.scale_by_psgd_lra),
         )
 
 
@@ -846,6 +981,7 @@ SignLaProp = ForeachSignLaProp
 DelayedPSGDLRA = ForeachDelayedPSGDLRA
 PSGDLRA = ForeachPSGDLRA
 NewtonPSGDLRA = ForeachNewtonPSGDLRA
+NewtonPSGDKron = ForeachCachedNewtonPSGD
 
 __all__ = [
     "Muon",
```
```diff
@@ -892,4 +1028,7 @@ __all__ = [
     "NewtonHybrid2PSGDLRA",
     "NewtonHybrid2PSGDKron",
     "MSAMLaProp",
+    "NewtonPSGDKron",
+    "ForeachAdamC",
+    "SGD",
 ]
```
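With `SGD`, `ForeachAdamC`, and `NewtonPSGDKron` added to `__all__` (and `NewtonPSGDKron` aliased to `ForeachCachedNewtonPSGD` earlier in the diff), the new names can be imported directly from the package root. A quick sketch:

```python
from heavyball import SGD, ForeachAdamC, NewtonPSGDKron

print(NewtonPSGDKron.__name__)  # prints "ForeachCachedNewtonPSGD" because NewtonPSGDKron is an alias
```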