heavyball-1.6.3-py3-none-any.whl → heavyball-1.7.1-py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
heavyball/__init__.py CHANGED
@@ -1,4 +1,5 @@
 import functools
+import math
 from typing import Optional
 
 from . import chainable as C
@@ -6,10 +7,24 @@ from . import utils
 
 
 class ForeachAdamW(C.BaseOpt):
-    def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
-                 foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
-                 mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
-                 update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
+    def __init__(
+        self,
+        params,
+        lr=0.0025,
+        betas=(0.9, 0.99),
+        eps=1e-8,
+        weight_decay=0,
+        warmup_steps=0,
+        foreach: bool = True,
+        storage_dtype: str = "float32",
+        mars: bool = False,
+        caution: bool = False,
+        mars_gamma: float = 0.0025,
+        gradient_clipping: C.str_or_fn = C.use_default,
+        update_clipping: C.str_or_fn = C.use_default,
+        palm: bool = C.use_default,
+        beta2_scale: float = 0.8,
+    ):
         defaults = locals()
         defaults.pop("self")
         params = defaults.pop("params")
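Everything in the two hunks above is mechanical reformatting (one argument per line, double quotes) plus the new `import math`, which is used further down by ForeachPSGDLRA's rank default; the ForeachAdamW signature and defaults are unchanged. A minimal usage sketch, assuming torch is installed and heavyball's optimizers follow the standard torch.optim step/zero_grad interface (the model and loop are illustrative, not from the package):

import torch

from heavyball import ForeachAdamW

model = torch.nn.Linear(32, 2)
opt = ForeachAdamW(model.parameters(), lr=0.0025, betas=(0.9, 0.99))

for _ in range(3):
    loss = model(torch.randn(4, 32)).square().mean()
    loss.backward()  # populate .grad as usual
    opt.step()  # apply the AdamW update
    opt.zero_grad()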
@@ -21,26 +36,74 @@ class ForeachRMSprop(C.BaseOpt):
     Debiased RMSprop (not torch.optim.RMSprop)
     """
 
-    def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-6, weight_decay=0, warmup_steps=0, r=0.0,
-                 weight_lr_power=2.0, foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False,
-                 caution: bool = False, mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
-                 update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
+    def __init__(
+        self,
+        params,
+        lr=0.0025,
+        betas=(0.9, 0.99),
+        eps=1e-6,
+        weight_decay=0,
+        warmup_steps=0,
+        r=0.0,
+        weight_lr_power=2.0,
+        foreach: bool = True,
+        storage_dtype: str = "float32",
+        mars: bool = False,
+        caution: bool = False,
+        mars_gamma: float = 0.0025,
+        gradient_clipping: C.str_or_fn = C.use_default,
+        update_clipping: C.str_or_fn = C.use_default,
+        palm: bool = C.use_default,
+        beta2_scale: float = 0.8,
+    ):
         defaults = locals()
         defaults.pop("self")
         params = defaults.pop("params")
-        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, C.scale_by_exp_avg_sq)
+        super().__init__(
+            params,
+            defaults,
+            foreach,
+            gradient_clipping,
+            update_clipping,
+            palm,
+            C.scale_by_exp_avg_sq,
+        )
 
 
 class ForeachSFAdamW(C.ScheduleFree):
-    def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-6, weight_decay=0, warmup_steps=0, r=0.0,
-                 weight_lr_power=2.0, foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False,
-                 caution: bool = False, mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
-                 update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
+    def __init__(
+        self,
+        params,
+        lr=0.0025,
+        betas=(0.9, 0.99),
+        eps=1e-6,
+        weight_decay=0,
+        warmup_steps=0,
+        r=0.0,
+        weight_lr_power=2.0,
+        foreach: bool = True,
+        storage_dtype: str = "float32",
+        mars: bool = False,
+        caution: bool = False,
+        mars_gamma: float = 0.0025,
+        gradient_clipping: C.str_or_fn = C.use_default,
+        update_clipping: C.str_or_fn = C.use_default,
+        palm: bool = C.use_default,
+        beta2_scale: float = 0.8,
+    ):
         defaults = locals()
         defaults.pop("self")
         params = defaults.pop("params")
-        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, C.scale_by_exp_avg_sq,
-                         C.update_by_schedule_free)
+        super().__init__(
+            params,
+            defaults,
+            foreach,
+            gradient_clipping,
+            update_clipping,
+            palm,
+            C.scale_by_exp_avg_sq,
+            C.update_by_schedule_free,
+        )
 
 
 class PaLMForeachSFAdamW(ForeachSFAdamW):
@@ -48,10 +111,24 @@ class PaLMForeachSFAdamW(ForeachSFAdamW):
 
 
 class ForeachADOPT(C.BaseOpt):
-    def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
-                 foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
-                 mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
-                 update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
+    def __init__(
+        self,
+        params,
+        lr=0.0025,
+        betas=(0.9, 0.99),
+        eps=1e-8,
+        weight_decay=0,
+        warmup_steps=0,
+        foreach: bool = True,
+        storage_dtype: str = "float32",
+        mars: bool = False,
+        caution: bool = False,
+        mars_gamma: float = 0.0025,
+        gradient_clipping: C.str_or_fn = C.use_default,
+        update_clipping: C.str_or_fn = C.use_default,
+        palm: bool = C.use_default,
+        beta2_scale: float = 0.8,
+    ):
         defaults = locals()
         defaults.pop("self")
         params = defaults.pop("params")
@@ -59,23 +136,59 @@ class ForeachADOPT(C.BaseOpt):
 
 
 class ForeachMuon(C.BaseOpt):
-    def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
-                 foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
-                 mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
-                 update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8,
-                 nesterov: bool = True):
+    def __init__(
+        self,
+        params,
+        lr=0.0025,
+        betas=(0.9, 0.99),
+        eps=1e-8,
+        weight_decay=0,
+        warmup_steps=0,
+        foreach: bool = True,
+        storage_dtype: str = "float32",
+        mars: bool = False,
+        caution: bool = False,
+        mars_gamma: float = 0.0025,
+        gradient_clipping: C.str_or_fn = C.use_default,
+        update_clipping: C.str_or_fn = C.use_default,
+        palm: bool = C.use_default,
+        beta2_scale: float = 0.8,
+        nesterov: bool = True,
+    ):
         defaults = locals()
         defaults.pop("self")
         params = defaults.pop("params")
-        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm,
-                         C.nesterov_momentum if nesterov else C.heavyball_momentum, C.orthogonalize_update)
+        super().__init__(
+            params,
+            defaults,
+            foreach,
+            gradient_clipping,
+            update_clipping,
+            palm,
+            C.nesterov_momentum if nesterov else C.heavyball_momentum,
+            C.orthogonalize_update,
+        )
 
 
 class ForeachLaProp(C.BaseOpt):
-    def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
-                 foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
-                 mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
-                 update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
+    def __init__(
+        self,
+        params,
+        lr=0.0025,
+        betas=(0.9, 0.99),
+        eps=1e-8,
+        weight_decay=0,
+        warmup_steps=0,
+        foreach: bool = True,
+        storage_dtype: str = "float32",
+        mars: bool = False,
+        caution: bool = False,
+        mars_gamma: float = 0.0025,
+        gradient_clipping: C.str_or_fn = C.use_default,
+        update_clipping: C.str_or_fn = C.use_default,
+        palm: bool = C.use_default,
+        beta2_scale: float = 0.8,
+    ):
         defaults = locals()
         defaults.pop("self")
         params = defaults.pop("params")
@@ -83,15 +196,37 @@ class ForeachLaProp(C.BaseOpt):
 
 
 class MuonLaProp(C.BaseOpt):
-    def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
-                 foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
-                 mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
-                 update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
+    def __init__(
+        self,
+        params,
+        lr=0.0025,
+        betas=(0.9, 0.99),
+        eps=1e-8,
+        weight_decay=0,
+        warmup_steps=0,
+        foreach: bool = True,
+        storage_dtype: str = "float32",
+        mars: bool = False,
+        caution: bool = False,
+        mars_gamma: float = 0.0025,
+        gradient_clipping: C.str_or_fn = C.use_default,
+        update_clipping: C.str_or_fn = C.use_default,
+        palm: bool = C.use_default,
+        beta2_scale: float = 0.8,
+    ):
         defaults = locals()
         defaults.pop("self")
         params = defaults.pop("params")
-        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, C.scale_by_laprop,
-                         C.orthogonalize_update)
+        super().__init__(
+            params,
+            defaults,
+            foreach,
+            gradient_clipping,
+            update_clipping,
+            palm,
+            C.scale_by_laprop,
+            C.orthogonalize_update,
+        )
 
 
 class ForeachSOAP(C.BaseOpt):
@@ -105,16 +240,38 @@ class ForeachSOAP(C.BaseOpt):
     https://arxiv.org/abs/2409.11321
     https://github.com/nikhilvyas/SOAP
     """
+
     use_precond_schedule: bool = False
 
-    def __init__(self, params, lr: float = 3e-3, betas=(0.9, 0.95), shampoo_beta: float = 0.95, eps: float = 1e-8,
-                 weight_decay: float = 0.01, precondition_frequency: int = 2, max_precond_dim: int = 2048,  #
-                 merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
-                 correct_bias: bool = True, warmup_steps: int = 0, split: bool = False, foreach: bool = True,
-                 mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025, palm: bool = C.use_default,
-                 precond_scheduler=(1 / 3, 9), beta2_scale: float = 0.8, use_precond_schedule: bool = C.use_default,
-                 gradient_clipping: C.str_or_fn = C.use_default, update_clipping: C.str_or_fn = C.use_default,
-                 storage_dtype: str = 'float32', stochastic_schedule: bool = False):
+    def __init__(
+        self,
+        params,
+        lr: float = 3e-3,
+        betas=(0.9, 0.95),
+        shampoo_beta: float = 0.95,
+        eps: float = 1e-8,
+        weight_decay: float = 0.01,
+        precondition_frequency: int = 2,
+        max_precond_dim: int = 2048,  #
+        merge_dims: bool = True,
+        precondition_1d: bool = False,
+        normalize_grads: bool = False,
+        correct_bias: bool = True,
+        warmup_steps: int = 0,
+        split: bool = False,
+        foreach: bool = True,
+        mars: bool = False,
+        caution: bool = False,
+        mars_gamma: float = 0.0025,
+        palm: bool = C.use_default,
+        precond_scheduler=(1 / 3, 9),
+        beta2_scale: float = 0.8,
+        use_precond_schedule: bool = C.use_default,
+        gradient_clipping: C.str_or_fn = C.use_default,
+        update_clipping: C.str_or_fn = C.use_default,
+        storage_dtype: str = "float32",
+        stochastic_schedule: bool = False,
+    ):
         use_precond_schedule = C.default(use_precond_schedule, self.use_precond_schedule)
 
         defaults = locals()
@@ -122,24 +279,54 @@ class ForeachSOAP(C.BaseOpt):
         params = defaults.pop("params")
 
         if use_precond_schedule:
-            del defaults['precondition_frequency']
+            del defaults["precondition_frequency"]
             self.precond_schedule = utils.get_soap_precond_schedule(defaults.pop("precond_scheduler"))
         else:
-            del defaults['precond_scheduler']
+            del defaults["precond_scheduler"]
             self.precond_schedule = 1 / defaults.pop("precondition_frequency")
-        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm,  #
-                         C.scale_by_soap)
+        super().__init__(
+            params,
+            defaults,
+            foreach,
+            gradient_clipping,
+            update_clipping,
+            palm,  #
+            C.scale_by_soap,
+        )
 
 
 class ForeachSignLaProp(C.BaseOpt):
-    def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
-                 foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
-                 mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
-                 update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
+    def __init__(
+        self,
+        params,
+        lr=0.0025,
+        betas=(0.9, 0.99),
+        eps=1e-8,
+        weight_decay=0,
+        warmup_steps=0,
+        foreach: bool = True,
+        storage_dtype: str = "float32",
+        mars: bool = False,
+        caution: bool = False,
+        mars_gamma: float = 0.0025,
+        gradient_clipping: C.str_or_fn = C.use_default,
+        update_clipping: C.str_or_fn = C.use_default,
+        palm: bool = C.use_default,
+        beta2_scale: float = 0.8,
+    ):
         defaults = locals()
         defaults.pop("self")
         params = defaults.pop("params")
-        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, C.scale_by_laprop, C.sign)
+        super().__init__(
+            params,
+            defaults,
+            foreach,
+            gradient_clipping,
+            update_clipping,
+            palm,
+            C.scale_by_laprop,
+            C.sign,
+        )
 
 
 class ForeachSOLP(C.BaseOpt):
@@ -153,16 +340,38 @@ class ForeachSOLP(C.BaseOpt):
     https://arxiv.org/abs/2409.11321
     https://github.com/nikhilvyas/SOAP
     """
+
     use_precond_schedule: bool = False
 
-    def __init__(self, params, lr: float = 3e-3, betas=(0.9, 0.95), shampoo_beta: float = 0.95, eps: float = 1e-8,
-                 weight_decay: float = 0.01, precondition_frequency: int = 2, max_precond_dim: int = 2048,  #
-                 merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
-                 correct_bias: bool = True, warmup_steps: int = 0, split: bool = False, foreach: bool = True,
-                 mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025, palm: bool = C.use_default,
-                 precond_scheduler=(1 / 3, 9), beta2_scale: float = 0.8, use_precond_schedule: bool = C.use_default,
-                 gradient_clipping: C.str_or_fn = C.use_default, update_clipping: C.str_or_fn = C.use_default,
-                 storage_dtype: str = 'float32', stochastic_schedule: bool = False):
+    def __init__(
+        self,
+        params,
+        lr: float = 3e-3,
+        betas=(0.9, 0.95),
+        shampoo_beta: float = 0.95,
+        eps: float = 1e-8,
+        weight_decay: float = 0.01,
+        precondition_frequency: int = 2,
+        max_precond_dim: int = 2048,  #
+        merge_dims: bool = True,
+        precondition_1d: bool = False,
+        normalize_grads: bool = False,
+        correct_bias: bool = True,
+        warmup_steps: int = 0,
+        split: bool = False,
+        foreach: bool = True,
+        mars: bool = False,
+        caution: bool = False,
+        mars_gamma: float = 0.0025,
+        palm: bool = C.use_default,
+        precond_scheduler=(1 / 3, 9),
+        beta2_scale: float = 0.8,
+        use_precond_schedule: bool = C.use_default,
+        gradient_clipping: C.str_or_fn = C.use_default,
+        update_clipping: C.str_or_fn = C.use_default,
+        storage_dtype: str = "float32",
+        stochastic_schedule: bool = False,
+    ):
         use_precond_schedule = C.default(use_precond_schedule, self.use_precond_schedule)
 
         defaults = locals()
@@ -170,13 +379,20 @@ class ForeachSOLP(C.BaseOpt):
         params = defaults.pop("params")
 
         if use_precond_schedule:
-            del defaults['precondition_frequency']
+            del defaults["precondition_frequency"]
             self.precond_schedule = utils.get_soap_precond_schedule(defaults.pop("precond_scheduler"))
         else:
-            del defaults['precond_scheduler']
+            del defaults["precond_scheduler"]
             self.precond_schedule = 1 / defaults.pop("precondition_frequency")
-        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm,  #
-                         functools.partial(C.scale_by_soap, inner='laprop'))
+        super().__init__(
+            params,
+            defaults,
+            foreach,
+            gradient_clipping,
+            update_clipping,
+            palm,  #
+            functools.partial(C.scale_by_soap, inner="laprop"),
+        )
 
 
 class PaLMForeachSOAP(ForeachSOAP):
@@ -194,27 +410,71 @@ class PrecondSchedulePaLMForeachSOAP(ForeachSOAP):
 
 
 class OrthoLaProp(C.BaseOpt):
-    def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
-                 foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
-                 mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
-                 update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
+    def __init__(
+        self,
+        params,
+        lr=0.0025,
+        betas=(0.9, 0.99),
+        eps=1e-8,
+        weight_decay=0,
+        warmup_steps=0,
+        foreach: bool = True,
+        storage_dtype: str = "float32",
+        mars: bool = False,
+        caution: bool = False,
+        mars_gamma: float = 0.0025,
+        gradient_clipping: C.str_or_fn = C.use_default,
+        update_clipping: C.str_or_fn = C.use_default,
+        palm: bool = C.use_default,
+        beta2_scale: float = 0.8,
+    ):
         defaults = locals()
         defaults.pop("self")
         params = defaults.pop("params")
-        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm,
-                         C.orthogonalize_grad_to_param, C.scale_by_laprop)
+        super().__init__(
+            params,
+            defaults,
+            foreach,
+            gradient_clipping,
+            update_clipping,
+            palm,
+            C.orthogonalize_grad_to_param,
+            C.scale_by_laprop,
+        )
 
 
 class LaPropOrtho(C.BaseOpt):
-    def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
-                 foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
-                 mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
-                 update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
+    def __init__(
+        self,
+        params,
+        lr=0.0025,
+        betas=(0.9, 0.99),
+        eps=1e-8,
+        weight_decay=0,
+        warmup_steps=0,
+        foreach: bool = True,
+        storage_dtype: str = "float32",
+        mars: bool = False,
+        caution: bool = False,
+        mars_gamma: float = 0.0025,
+        gradient_clipping: C.str_or_fn = C.use_default,
+        update_clipping: C.str_or_fn = C.use_default,
+        palm: bool = C.use_default,
+        beta2_scale: float = 0.8,
+    ):
        defaults = locals()
         defaults.pop("self")
         params = defaults.pop("params")
-        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, C.scale_by_laprop,
-                         C.orthogonalize_grad_to_param)
+        super().__init__(
+            params,
+            defaults,
+            foreach,
+            gradient_clipping,
+            update_clipping,
+            palm,
+            C.scale_by_laprop,
+            C.orthogonalize_grad_to_param,
+        )
 
 
 class ForeachPSGDKron(C.BaseOpt):
@@ -228,20 +488,43 @@ class ForeachPSGDKron(C.BaseOpt):
     cached: bool = False
     exp_avg_input: bool = True
 
-    def __init__(self, params, lr=0.001, beta=0.9, weight_decay=0.0, preconditioner_update_probability=None,
-                 max_size_triangular=2048, min_ndim_triangular=2, memory_save_mode=None,
-                 momentum_into_precond_update=True, warmup_steps: int = 0, merge_dims: bool = False,
-                 split: bool = False, store_triu_as_line: bool = True, foreach: bool = True, q_dtype='float32',
-                 stochastic_schedule: bool = False, storage_dtype: str = 'float32', mars: bool = False,
-                 caution: bool = False, mars_gamma: float = 0.0025, delayed: Optional[bool] = C.use_default,
-                 cached: Optional[bool] = C.use_default, exp_avg_input: Optional[bool] = C.use_default,
-                 gradient_clipping: C.str_or_fn = C.use_default, update_clipping: C.str_or_fn = C.use_default,  #
-                 # expert parameters
-                 precond_init_scale=1.0, precond_lr=0.1):
+    def __init__(
+        self,
+        params,
+        lr=0.001,
+        beta=0.9,
+        weight_decay=0.0,
+        preconditioner_update_probability=None,
+        max_size_triangular=2048,
+        min_ndim_triangular=2,
+        memory_save_mode=None,
+        momentum_into_precond_update=True,
+        warmup_steps: int = 0,
+        merge_dims: bool = False,
+        split: bool = False,
+        store_triu_as_line: bool = True,
+        foreach: bool = True,
+        q_dtype="float32",
+        stochastic_schedule: bool = False,
+        storage_dtype: str = "float32",
+        mars: bool = False,
+        caution: bool = False,
+        mars_gamma: float = 0.0025,
+        delayed: Optional[bool] = C.use_default,
+        cached: Optional[bool] = C.use_default,
+        exp_avg_input: Optional[bool] = C.use_default,
+        gradient_clipping: C.str_or_fn = C.use_default,
+        update_clipping: C.str_or_fn = C.use_default,  #
+        # expert parameters
+        precond_init_scale=None,
+        precond_init_scale_scale=1,
+        precond_lr=0.1,
+    ):
         defaults = locals()
         defaults.pop("self")
-        self.precond_schedule = defaults.pop(
-            "preconditioner_update_probability") or utils.precond_update_prob_schedule()
+        self.precond_schedule = (
+            defaults.pop("preconditioner_update_probability") or utils.precond_update_prob_schedule()
+        )
         params = defaults.pop("params")
 
         delayed = C.default(delayed, self.delayed)
@@ -249,9 +532,16 @@ class ForeachPSGDKron(C.BaseOpt):
         exp_avg_input = C.default(exp_avg_input, self.exp_avg_input)
         update_clipping = C.default(update_clipping, utils.trust_region_clip_)
 
-        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, False,  #
-                         *(C.exp_avg,) * exp_avg_input,  #
-                         functools.partial(C.scale_by_delayed_psgd if delayed else C.scale_by_psgd, cached=cached))
+        super().__init__(
+            params,
+            defaults,
+            foreach,
+            gradient_clipping,
+            update_clipping,
+            False,  #
+            *(C.exp_avg,) * exp_avg_input,  #
+            functools.partial(C.scale_by_delayed_psgd if delayed else C.scale_by_psgd, cached=cached),
+        )
 
 
 class ForeachPurePSGD(ForeachPSGDKron):
275
565
  hessian_approx = True
276
566
 
277
567
 
568
+ class NewtonHybrid2PSGDKron(ForeachCachedNewtonPSGD):
569
+ hvp_interval = 2
570
+
571
+
572
+ class ForeachPSGDLRA(C.BaseOpt):
573
+ """
574
+ Originally from Evan Walters and Omead Pooladzandi, 2024
575
+ Modified under Creative Commons Attribution 4.0 International
576
+ Source available at https://github.com/evanatyourservice/kron_torch/blob/97a2b5ee8a1a4c29e4780bbf6c521e545189eff9/kron_torch/kron.py
577
+ """
578
+
579
+ delayed: bool = False
580
+ exp_avg_input: bool = True
581
+
582
+ def __init__(
583
+ self,
584
+ params,
585
+ lr=0.001,
586
+ beta=0.9,
587
+ weight_decay=0.0,
588
+ preconditioner_update_probability=None,
589
+ momentum_into_precond_update=True,
590
+ rank: Optional[int] = None,
591
+ warmup_steps: int = 0,
592
+ foreach: bool = True,
593
+ q_dtype="float32",
594
+ stochastic_schedule: bool = False,
595
+ storage_dtype: str = "float32",
596
+ mars: bool = False,
597
+ caution: bool = False,
598
+ mars_gamma: float = 0.0025,
599
+ delayed: Optional[bool] = C.use_default,
600
+ exp_avg_input: Optional[bool] = C.use_default,
601
+ gradient_clipping: C.str_or_fn = C.use_default,
602
+ update_clipping: C.str_or_fn = C.use_default,
603
+ eps: float = 1e-8, #
604
+ # expert parameters
605
+ precond_init_scale=None,
606
+ precond_init_scale_scale=1,
607
+ precond_lr=0.1,
608
+ ):
609
+ defaults = locals()
610
+ defaults.pop("self")
611
+ self.precond_schedule = (
612
+ defaults.pop("preconditioner_update_probability") or utils.precond_update_prob_schedule()
613
+ )
614
+ params = defaults.pop("params")
615
+
616
+ if rank is None:
617
+ utils.warn_once(
618
+ f"{rank=}. It will be set to log2(param_count). This requires `params` to be of type list. Currently, {type(params)=}"
619
+ )
620
+ params = list(params)
621
+ defaults["rank"] = round(math.log2(sum(p.numel() for p in params)))
622
+ utils.warn_once(f"rank was set to {defaults['rank']}")
623
+
624
+ delayed = C.default(delayed, self.delayed)
625
+ exp_avg_input = C.default(exp_avg_input, self.exp_avg_input)
626
+ update_clipping = C.default(update_clipping, utils.trust_region_clip_)
627
+
628
+ super().__init__(
629
+ params,
630
+ defaults,
631
+ foreach,
632
+ gradient_clipping,
633
+ update_clipping,
634
+ False, #
635
+ *(C.exp_avg,) * exp_avg_input, #
636
+ C.scale_by_delayed_psgd_lra if delayed else C.scale_by_psgd_lra,
637
+ )
638
+
639
+
640
+ class ForeachDelayedPSGDLRA(ForeachPSGDLRA):
641
+ delayed: bool = True
642
+
643
+
644
+ class ForeachNewtonPSGDLRA(ForeachPSGDLRA):
645
+ hessian_approx = True
646
+
647
+
648
+ class NewtonHybrid2PSGDLRA(ForeachNewtonPSGDLRA):
649
+ hvp_interval = 2
650
+
651
+
278
652
  PalmForEachSoap = PaLMForeachSOAP
279
653
  PaLMSOAP = PaLMForeachSOAP
280
654
  PaLMSFAdamW = PaLMForeachSFAdamW
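The hunk above adds the low-rank PSGD (LRA) family. When `rank` is None, ForeachPSGDLRA materializes `params` as a list and derives the rank from the total parameter count, warning once along the way. A self-contained sketch of that derivation, assuming torch and an illustrative model:

import math

import torch

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.Linear(64, 10))
params = list(model.parameters())

# Mirrors the diff: rank defaults to round(log2(total parameter count)).
total = sum(p.numel() for p in params)  # 64*64 + 64 + 64*10 + 10 = 4810
rank = round(math.log2(total))  # log2(4810) is about 12.23, rounds to 12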
@@ -293,11 +667,52 @@ CachedPSGDKron = ForeachCachedPSGDKron
 CachedDelayedPSGDKron = ForeachCachedDelayedPSGDKron
 Muon = ForeachMuon
 SignLaProp = ForeachSignLaProp
-
-__all__ = ["Muon", "RMSprop", "PrecondSchedulePaLMSOAP", "PSGDKron", "PurePSGD", "DelayedPSGD", "CachedPSGDKron",
-           "CachedDelayedPSGDKron", "PalmForEachSoap", "PaLMSOAP", "PaLMSFAdamW", "LaProp", "ADOPT",
-           "PrecondScheduleSOAP", "PrecondSchedulePaLMSOAP", 'RMSprop', 'MuonLaProp', 'ForeachSignLaProp'  #
-           "ForeachAdamW", "ForeachSFAdamW",
-           "ForeachLaProp", "ForeachADOPT", "ForeachSOAP", "ForeachPSGDKron", "ForeachPurePSGD", "ForeachDelayedPSGD",
-           "ForeachCachedPSGDKron", "ForeachCachedDelayedPSGDKron", "ForeachRMSprop", "ForeachMuon",
-           'ForeachCachedNewtonPSGD', 'OrthoLaProp', 'LaPropOrtho', 'SignLaProp']
+DelayedPSGDLRA = ForeachDelayedPSGDLRA
+PSGDLRA = ForeachPSGDLRA
+NewtonPSGDLRA = ForeachNewtonPSGDLRA
+
+__all__ = [
+    "Muon",
+    "RMSprop",
+    "PrecondSchedulePaLMSOAP",
+    "PSGDKron",
+    "PurePSGD",
+    "DelayedPSGD",
+    "CachedPSGDKron",
+    "CachedDelayedPSGDKron",
+    "PalmForEachSoap",
+    "PaLMSOAP",
+    "PaLMSFAdamW",
+    "LaProp",
+    "ADOPT",
+    "PrecondScheduleSOAP",
+    "PrecondSchedulePaLMSOAP",
+    "RMSprop",
+    "MuonLaProp",
+    "ForeachSignLaProp",
+    "ForeachDelayedPSGDLRA",
+    "ForeachPSGDLRA",
+    "ForeachPSGDLRA",
+    "ForeachNewtonPSGDLRA",  #
+    "ForeachAdamW",
+    "ForeachSFAdamW",
+    "ForeachLaProp",
+    "ForeachADOPT",
+    "ForeachSOAP",
+    "ForeachPSGDKron",
+    "ForeachPurePSGD",
+    "ForeachDelayedPSGD",
+    "ForeachCachedPSGDKron",
+    "ForeachCachedDelayedPSGDKron",
+    "ForeachRMSprop",
+    "ForeachMuon",
+    "ForeachCachedNewtonPSGD",
+    "OrthoLaProp",
+    "LaPropOrtho",
+    "SignLaProp",
+    "DelayedPSGD",
+    "PSGDLRA",
+    "NewtonPSGDLRA",
+    "NewtonHybrid2PSGDLRA",
+    "NewtonHybrid2PSGDKron",
+]
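With the three new aliases and the rebuilt `__all__`, the LRA and hybrid-Newton variants are importable from the package root. Note that `__all__` ships with a few duplicated entries ("RMSprop", "PrecondSchedulePaLMSOAP", "DelayedPSGD", "ForeachPSGDLRA"), reproduced verbatim above; duplicates in `__all__` are harmless. A quick sanity check, assuming heavyball 1.7.1 is installed:

import heavyball
from heavyball import NewtonHybrid2PSGDKron, NewtonHybrid2PSGDLRA, NewtonPSGDLRA, PSGDLRA

assert PSGDLRA is heavyball.ForeachPSGDLRA  # alias, not a subclass
assert NewtonHybrid2PSGDLRA.hvp_interval == 2  # class attribute set in the diff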