heavyball 1.5.2.tar.gz → 1.6.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {heavyball-1.5.2 → heavyball-1.6.0}/PKG-INFO +4 -6
  2. {heavyball-1.5.2 → heavyball-1.6.0}/README.md +1 -3
  3. {heavyball-1.5.2 → heavyball-1.6.0}/heavyball/__init__.py +73 -17
  4. {heavyball-1.5.2 → heavyball-1.6.0}/heavyball/chainable.py +76 -14
  5. {heavyball-1.5.2 → heavyball-1.6.0}/heavyball/utils.py +322 -175
  6. {heavyball-1.5.2 → heavyball-1.6.0}/heavyball.egg-info/PKG-INFO +4 -6
  7. {heavyball-1.5.2 → heavyball-1.6.0}/setup.py +3 -3
  8. {heavyball-1.5.2 → heavyball-1.6.0}/LICENSE +0 -0
  9. {heavyball-1.5.2 → heavyball-1.6.0}/heavyball.egg-info/SOURCES.txt +0 -0
  10. {heavyball-1.5.2 → heavyball-1.6.0}/heavyball.egg-info/dependency_links.txt +0 -0
  11. {heavyball-1.5.2 → heavyball-1.6.0}/heavyball.egg-info/requires.txt +0 -0
  12. {heavyball-1.5.2 → heavyball-1.6.0}/heavyball.egg-info/top_level.txt +0 -0
  13. {heavyball-1.5.2 → heavyball-1.6.0}/setup.cfg +0 -0
  14. {heavyball-1.5.2 → heavyball-1.6.0}/test/test_bf16_params.py +0 -0
  15. {heavyball-1.5.2 → heavyball-1.6.0}/test/test_bf16_q.py +0 -0
  16. {heavyball-1.5.2 → heavyball-1.6.0}/test/test_bf16_storage.py +0 -0
  17. {heavyball-1.5.2 → heavyball-1.6.0}/test/test_caution.py +0 -0
  18. {heavyball-1.5.2 → heavyball-1.6.0}/test/test_channels_last.py +0 -0
  19. {heavyball-1.5.2 → heavyball-1.6.0}/test/test_closure.py +0 -0
  20. {heavyball-1.5.2 → heavyball-1.6.0}/test/test_ema.py +0 -0
  21. {heavyball-1.5.2 → heavyball-1.6.0}/test/test_foreach.py +0 -0
  22. {heavyball-1.5.2 → heavyball-1.6.0}/test/test_hook.py +0 -0
  23. {heavyball-1.5.2 → heavyball-1.6.0}/test/test_mars.py +0 -0
  24. {heavyball-1.5.2 → heavyball-1.6.0}/test/test_memory.py +0 -0
  25. {heavyball-1.5.2 → heavyball-1.6.0}/test/test_merge.py +0 -0
  26. {heavyball-1.5.2 → heavyball-1.6.0}/test/test_no_grad.py +0 -0
  27. {heavyball-1.5.2 → heavyball-1.6.0}/test/test_psgd.py +0 -0
  28. {heavyball-1.5.2 → heavyball-1.6.0}/test/test_soap.py +0 -0
  29. {heavyball-1.5.2 → heavyball-1.6.0}/test/test_stochastic_updates.py +0 -0
@@ -1,9 +1,9 @@
  Metadata-Version: 2.1
  Name: heavyball
- Version: 1.5.2
+ Version: 1.6.0
  Summary: Efficient optimizers
- Home-page: https://github.com/clashluke/heavyball
- Author: Lucas Nestler
+ Home-page: https://github.com/HomebrewML/HeavyBall
+ Author: HeavyBall Authors
  Author-email: github.heavyball@nestler.sh
  License: BSD
  Classifier: Development Status :: 5 - Production/Stable
@@ -300,7 +300,7 @@ class ForeachSOAP(C.BaseOpt):
  def __init__(self, params, lr: float = 3e-3, betas=(0.9, 0.95), shampoo_beta: float = 0.95, eps: float = 1e-8,
  weight_decay: float = 0.01, precondition_frequency: int = 2, max_precond_dim: int = 2048, #
  merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
- data_format: str = "channels_first", correct_bias: bool = True, warmup_steps: int = 1,
+ correct_bias: bool = True, warmup_steps: int = 1,
  split: bool = False, foreach: bool = True, mars: bool = False, caution: bool = False,
  mars_gamma: float = 0.0025, palm: bool = C.use_default, precond_scheduler=(1 / 3, 9),
  beta2_scale: float = 0.8, use_precond_schedule: bool = C.use_default,
@@ -324,7 +324,6 @@ the second-order statistics of the gradients to accelerate convergence.
  * **`merge_dims`**: Whether to merge dimensions when forming the preconditioner.
  * **`precondition_1d`**: Whether to use a 1D preconditioner for 1D parameters.
  * **`normalize_grads`**: Whether to normalize gradients before applying SOAP.
- * **`data_format`**: `"channels_first"` or `"channels_last"`. Specifies the data format of the input tensors.
  * **`correct_bias`**: Enables/disables bias correction for the running averages.
  * **`warmup_steps`**: Number of steps for linear learning rate warmup.
  * **`split`**: Whether to split large dimensions when forming the preconditioner.
@@ -931,4 +930,3 @@ tasks. However, the best choice always depends on your specific model, dataset,
  * **`heavyball.utils`:** Remember to utilize the settings and functions in `heavyball.utils` (e.g., `set_torch`,
  `compile_mode`, `zeroth_power_mode`, clipping functions) to optimize performance and experiment with different
  configurations.
-
@@ -276,7 +276,7 @@ class ForeachSOAP(C.BaseOpt):
  def __init__(self, params, lr: float = 3e-3, betas=(0.9, 0.95), shampoo_beta: float = 0.95, eps: float = 1e-8,
  weight_decay: float = 0.01, precondition_frequency: int = 2, max_precond_dim: int = 2048, #
  merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
- data_format: str = "channels_first", correct_bias: bool = True, warmup_steps: int = 1,
+ correct_bias: bool = True, warmup_steps: int = 1,
  split: bool = False, foreach: bool = True, mars: bool = False, caution: bool = False,
  mars_gamma: float = 0.0025, palm: bool = C.use_default, precond_scheduler=(1 / 3, 9),
  beta2_scale: float = 0.8, use_precond_schedule: bool = C.use_default,
@@ -300,7 +300,6 @@ the second-order statistics of the gradients to accelerate convergence.
  * **`merge_dims`**: Whether to merge dimensions when forming the preconditioner.
  * **`precondition_1d`**: Whether to use a 1D preconditioner for 1D parameters.
  * **`normalize_grads`**: Whether to normalize gradients before applying SOAP.
- * **`data_format`**: `"channels_first"` or `"channels_last"`. Specifies the data format of the input tensors.
  * **`correct_bias`**: Enables/disables bias correction for the running averages.
  * **`warmup_steps`**: Number of steps for linear learning rate warmup.
  * **`split`**: Whether to split large dimensions when forming the preconditioner.
@@ -907,4 +906,3 @@ tasks. However, the best choice always depends on your specific model, dataset,
  * **`heavyball.utils`:** Remember to utilize the settings and functions in `heavyball.utils` (e.g., `set_torch`,
  `compile_mode`, `zeroth_power_mode`, clipping functions) to optimize performance and experiment with different
  configurations.
-
@@ -104,23 +104,17 @@ class ForeachSOAP(C.BaseOpt):
  Nikhil Vyas, Depen Morwani, Rosie Zhao, Itai Shapira, David Brandfonbrener, Lucas Janson, Sham Kakade
  https://arxiv.org/abs/2409.11321
  https://github.com/nikhilvyas/SOAP
-
- ScheduleFree:
- The Road Less Scheduled
- Aaron Defazio, Xingyu Alice Yang, Harsh Mehta, Konstantin Mishchenko, Ahmed Khaled, Ashok Cutkosky
- https://arxiv.org/abs/2405.15682
- https://github.com/facebookresearch/schedule_free
  """
  use_precond_schedule: bool = False

  def __init__(self, params, lr: float = 3e-3, betas=(0.9, 0.95), shampoo_beta: float = 0.95, eps: float = 1e-8,
  weight_decay: float = 0.01, precondition_frequency: int = 2, max_precond_dim: int = 2048, #
  merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
- data_format: str = "channels_first", correct_bias: bool = True, warmup_steps: int = 0,
- split: bool = False, foreach: bool = True, mars: bool = False, caution: bool = False,
- mars_gamma: float = 0.0025, palm: bool = C.use_default, precond_scheduler=(1 / 3, 9),
- beta2_scale: float = 0.8, use_precond_schedule: bool = C.use_default,
- gradient_clipping: C.str_or_fn = C.use_default, update_clipping: C.str_or_fn = C.use_default):
+ correct_bias: bool = True, warmup_steps: int = 0, split: bool = False, foreach: bool = True,
+ mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025, palm: bool = C.use_default,
+ precond_scheduler=(1 / 3, 9), beta2_scale: float = 0.8, use_precond_schedule: bool = C.use_default,
+ gradient_clipping: C.str_or_fn = C.use_default, update_clipping: C.str_or_fn = C.use_default,
+ storage_dtype: str = 'float32', stochastic_schedule: bool = False):
  use_precond_schedule = C.default(use_precond_schedule, self.use_precond_schedule)

  defaults = locals()
@@ -137,6 +131,54 @@ class ForeachSOAP(C.BaseOpt):
  C.scale_by_soap)


+ class ForeachSignLaProp(C.BaseOpt):
+ def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
+ foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
+ mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
+ update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
+ defaults = locals()
+ defaults.pop("self")
+ params = defaults.pop("params")
+ super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, C.scale_by_laprop, C.sign)
+
+
+ class ForeachSOLP(C.BaseOpt):
+ """
+ ForeachSOLP
+
+ Sources:
+ Baseline SOAP:
+ SOAP: Improving and Stabilizing Shampoo using Adam
+ Nikhil Vyas, Depen Morwani, Rosie Zhao, Itai Shapira, David Brandfonbrener, Lucas Janson, Sham Kakade
+ https://arxiv.org/abs/2409.11321
+ https://github.com/nikhilvyas/SOAP
+ """
+ use_precond_schedule: bool = False
+
+ def __init__(self, params, lr: float = 3e-3, betas=(0.9, 0.95), shampoo_beta: float = 0.95, eps: float = 1e-8,
+ weight_decay: float = 0.01, precondition_frequency: int = 2, max_precond_dim: int = 2048, #
+ merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
+ correct_bias: bool = True, warmup_steps: int = 0, split: bool = False, foreach: bool = True,
+ mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025, palm: bool = C.use_default,
+ precond_scheduler=(1 / 3, 9), beta2_scale: float = 0.8, use_precond_schedule: bool = C.use_default,
+ gradient_clipping: C.str_or_fn = C.use_default, update_clipping: C.str_or_fn = C.use_default,
+ storage_dtype: str = 'float32', stochastic_schedule: bool = False):
+ use_precond_schedule = C.default(use_precond_schedule, self.use_precond_schedule)
+
+ defaults = locals()
+ defaults.pop("self")
+ params = defaults.pop("params")
+
+ if use_precond_schedule:
+ del defaults['precondition_frequency']
+ self.precond_schedule = utils.get_soap_precond_schedule(defaults.pop("precond_scheduler"))
+ else:
+ del defaults['precond_scheduler']
+ self.precond_schedule = 1 / defaults.pop("precondition_frequency")
+ super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, #
+ functools.partial(C.scale_by_soap, inner='laprop'))
+
+
  class PaLMForeachSOAP(ForeachSOAP):
  use_precond_schedule: bool = False
  palm: bool = True
@@ -163,6 +205,18 @@ class OrthoLaProp(C.BaseOpt):
  C.orthogonalize_grad_to_param, C.scale_by_laprop)


+ class LaPropOrtho(C.BaseOpt):
+ def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
+ foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
+ mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
+ update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
+ defaults = locals()
+ defaults.pop("self")
+ params = defaults.pop("params")
+ super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, C.scale_by_laprop,
+ C.orthogonalize_grad_to_param)
+
+
  class ForeachPSGDKron(C.BaseOpt):
  """
  Originally from Evan Walters and Omead Pooladzandi, 2024
@@ -178,10 +232,10 @@ class ForeachPSGDKron(C.BaseOpt):
  max_size_triangular=2048, min_ndim_triangular=2, memory_save_mode=None,
  momentum_into_precond_update=True, warmup_steps: int = 0, merge_dims: bool = False,
  split: bool = False, store_triu_as_line: bool = True, foreach: bool = True, q_dtype='float32',
- stochastic_schedule: bool = True, storage_dtype: str = 'float32', mars: bool = False,
+ stochastic_schedule: bool = False, storage_dtype: str = 'float32', mars: bool = False,
  caution: bool = False, mars_gamma: float = 0.0025, delayed: Optional[bool] = C.use_default,
  cached: Optional[bool] = C.use_default, exp_avg_input: Optional[bool] = C.use_default,
- gradient_clipping: C.str_or_fn = C.use_default, update_clipping: C.str_or_fn = C.use_default, #
+ gradient_clipping: C.str_or_fn = C.use_default, update_clipping: C.str_or_fn = C.use_default, #
  # expert parameters
  precond_init_scale=1.0, precond_lr=0.1):
  defaults = locals()
@@ -238,10 +292,12 @@ DelayedPSGD = ForeachDelayedPSGD
  CachedPSGDKron = ForeachCachedPSGDKron
  CachedDelayedPSGDKron = ForeachCachedDelayedPSGDKron
  Muon = ForeachMuon
+ SignLaProp = ForeachSignLaProp

  __all__ = ["Muon", "RMSprop", "PrecondSchedulePaLMSOAP", "PSGDKron", "PurePSGD", "DelayedPSGD", "CachedPSGDKron",
  "CachedDelayedPSGDKron", "PalmForEachSoap", "PaLMSOAP", "PaLMSFAdamW", "LaProp", "ADOPT",
- "PrecondScheduleSOAP", "PrecondSchedulePaLMSOAP", 'RMSprop', 'MuonLaProp', #
- "ForeachAdamW", "ForeachSFAdamW", "ForeachLaProp", "ForeachADOPT", "ForeachSOAP", "ForeachPSGDKron",
- "ForeachPurePSGD", "ForeachDelayedPSGD", "ForeachCachedPSGDKron", "ForeachCachedDelayedPSGDKron",
- "ForeachRMSprop", "ForeachMuon", 'ForeachCachedNewtonPSGD']
+ "PrecondScheduleSOAP", "PrecondSchedulePaLMSOAP", 'RMSprop', 'MuonLaProp', 'ForeachSignLaProp' #
+ "ForeachAdamW", "ForeachSFAdamW",
+ "ForeachLaProp", "ForeachADOPT", "ForeachSOAP", "ForeachPSGDKron", "ForeachPurePSGD", "ForeachDelayedPSGD",
+ "ForeachCachedPSGDKron", "ForeachCachedDelayedPSGDKron", "ForeachRMSprop", "ForeachMuon",
+ 'ForeachCachedNewtonPSGD', 'OrthoLaProp', 'LaPropOrtho', 'SignLaProp']
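As orientation for the new public classes added to heavyball/__init__.py above (ForeachSignLaProp, ForeachSOLP, LaPropOrtho, and the SignLaProp alias), here is a minimal usage sketch. It assumes only the constructor signatures shown in this diff plus the usual PyTorch optimizer step/zero_grad interface; the model and training loop are placeholders, not part of the package.

    import torch
    import heavyball

    model = torch.nn.Linear(16, 4)  # placeholder module, not from the package
    opt = heavyball.ForeachSignLaProp(model.parameters(), lr=0.0025, betas=(0.9, 0.99))
    # ForeachSOLP mirrors ForeachSOAP's arguments but runs LaProp as the inner optimizer:
    # opt = heavyball.ForeachSOLP(model.parameters(), lr=3e-3, precondition_frequency=2)

    for _ in range(3):  # toy loop, only to show the optimizer interface
        opt.zero_grad()
        loss = model(torch.randn(8, 16)).square().mean()
        loss.backward()
        opt.step()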
@@ -1,6 +1,6 @@
  import functools
  import random
- from typing import Optional, Union, Literal
+ from typing import Optional, Union, Literal, List

  import torch

@@ -127,7 +127,7 @@ def zero_guard(*names):


  def copy_guard(index, *names):
- return functools.partial(CopyGuard, index=index, names=names, )
+ return functools.partial(CopyGuard, index=index, names=names)


  def general_guard(*names, init_fn, skip_first: bool = True):
@@ -152,6 +152,22 @@ def exp_avg(group, update, grad, param, exp_avg):
  return utils.scale_by_exp_avg_(exp_avg, update, utils.beta_debias(utils.get_beta1(group), group["step"]))


+ @zero_guard('exp_avg')
+ @no_state
+ def weight_decay_to_ema(group, update, grad, param, exp_avg):
+ utils.weight_decay_to_ema_(exp_avg, update, utils.beta_debias(group['ema_beta'], group['step']),
+ group['weight_decay_to_ema'] * group['lr'])
+ return update
+
+
+ @zero_guard('exp_avg')
+ @no_state
+ def l1_weight_decay_to_ema(group, update, grad, param, exp_avg):
+ utils.l1_weight_decay_to_ema_(exp_avg, update, utils.beta_debias(group['ema_beta'], group['step']),
+ group['weight_decay_to_ema'] * group['lr'])
+ return update
+
+
  @zero_guard("exp_avg_sq")
  @no_state
  def scale_by_exp_avg_sq(group, update, grad, param, exp_avg_sq):
@@ -206,14 +222,15 @@ def update_by_schedule_free(group, update, grad, param, z):
  @no_state
  def update_by_adopt(group, update, grad, param, exp_avg, exp_avg_sq):
  if group['step'] == 1:
- utils.exp_avg_sq_(exp_avg_sq, update, 0, 1)
+ utils.scale_by_exp_avg_sq_(exp_avg_sq, update, 0, group['eps'])
  raise SkipUpdate

  if group['step'] == 2:
  update = utils.promote(update)
  easq = utils.promote(exp_avg_sq)
  [utils.set_(ea, u / easq_.sqrt().clamp_(min=group['eps'])) for ea, u, easq_ in zip(exp_avg, update, easq)]
- utils.exp_avg_sq_(exp_avg_sq, update, utils.beta_debias(utils.get_beta2(group), group['step']), 1)
+ utils.scale_by_exp_avg_sq_(exp_avg_sq, update, utils.beta_debias(utils.get_beta2(group), group['step']),
+ group['eps'])
  raise SkipUpdate

  utils.fused_adopt_(param, update, grad, exp_avg_sq, exp_avg, utils.get_beta1(group), utils.get_beta2(group),
@@ -225,21 +242,22 @@ def update_by_adopt(group, update, grad, param, exp_avg, exp_avg_sq):
  @no_state
  def scale_by_adopt(group, update, grad, param, exp_avg, exp_avg_sq):
  if group['step'] == 1:
- utils.exp_avg_sq_(exp_avg_sq, update, 0, 1)
+ utils.scale_by_exp_avg_sq_(exp_avg_sq, update, 0, group['eps'])
  raise SkipUpdate

  if group['step'] == 2:
  update = utils.promote(update)
  easq = utils.promote(exp_avg_sq)
  [utils.set_(ea, u / easq_.sqrt().clamp_(min=group['eps'])) for ea, u, easq_ in zip(exp_avg, update, easq)]
- utils.exp_avg_sq_(exp_avg_sq, update, utils.beta_debias(utils.get_beta2(group), group['step']), 1)
+ utils.scale_by_exp_avg_sq_(exp_avg_sq, update, utils.beta_debias(utils.get_beta2(group), group['step']),
+ group['eps'])
  raise SkipUpdate

  return utils.adopt(update, exp_avg_sq, exp_avg, utils.get_beta1(group), utils.get_beta2(group), group['step'] - 2)


- def _init_soap(state, group, update, grad, param):
- utils.init_preconditioner(grad, state, utils.get_beta2(group), group['max_precond_dim'], group['precondition_1d'])
+ def _init_soap(state, group, update, grad, param, inner: str = ''):
+ utils.init_preconditioner(grad, state, group['max_precond_dim'], group['precondition_1d'])


  def _init_psgd(state, group, update, grad, param, cached: bool = False, prob: Optional[callable] = None):
@@ -295,6 +313,25 @@ def nesterov_momentum(group, updates, grads, params, momentum):
  return utils.nesterov_momentum(momentum, updates, utils.get_beta1(group))


+ @zero_guard('momentum')
+ @no_state
+ def nesterov_ema(group, updates, grads, params, momentum): # equivalent to Grokfast
+ return utils.nesterov_ema(momentum, updates, utils.get_beta1(group))
+
+
+ def _store_std(state, group, update, grad, param):
+ state['init_std'] = torch.std(grad, dim=0)
+
+
+ @general_guard("init_std", init_fn=_store_std)
+ @no_state
+ def mup_approx(group, updates, grads, params, init_std):
+ _updates = [(u, i) for u, i in zip(updates, init_std) if u.ndim > 1]
+ _updates, _init_std = zip(*_updates)
+ utils.stochastic_multiply_(_updates, _init_std)
+ return updates
+
+
  @zero_guard("momentum")
  @no_state
  def heavyball_momentum(group, updates, grads, params, momentum):
@@ -308,15 +345,16 @@ _optim_fns = {'adam': utils.adam_, 'laprop': utils.laprop_}
  @general_guard("Q", "GG", init_fn=_init_soap)
  @no_state
  def scale_by_soap(group, update, grad, param, exp_avg, exp_avg_sq, Q, GG, inner: str = 'adam'):
- update = utils.promote(update)
+ update = utils.promote(update) # Promote to highest precision if needed

  grad_projected = [utils.project(u, q, False) for u, q in zip(update, Q)]
  fn = _optim_fns[inner]
- precond = fn(exp_avg, exp_avg_sq, grad_projected, utils.get_beta1(group), utils.get_beta2(group), group['step'])
+ precond = fn(exp_avg, exp_avg_sq, grad_projected, utils.get_beta1(group), utils.get_beta2(group), group['step'] - 1,
+ group['eps'])
  precond = [utils.project(p, q, True) for p, q in zip(precond, Q)]

- for u, q, gg, eas in zip(update, Q, GG, exp_avg_sq):
- utils.update_preconditioner(u, q, gg, eas, group['max_precond_dim'], group['precondition_1d'],
+ for u, q, gg, ea in zip(update, Q, GG, exp_avg):
+ utils.update_preconditioner(u, q, gg, ea, group['max_precond_dim'], group['precondition_1d'],
  utils.beta_debias(group['shampoo_beta'], group['step']),
  group['is_preconditioning'])
  return precond
@@ -414,6 +452,11 @@ def update_by_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: st
  raise SkipUpdate


+ @no_state
+ def sign(group, update, grad, param, graft: bool = True):
+ return utils.sign_(update, graft)
+
+
  @general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd, skip_first=False)
  @no_state_no_foreach
  def update_by_delayed_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str, cached: bool = False,
@@ -439,8 +482,7 @@ def apply_to_idx(fn, idx):
  return _fn


- def chain(state: Union[callable, dict], group, grad, param, *fns):
- update = [torch.clone(g, memory_format=torch.preserve_format) for g in grad]
+ def _inner_chain(state, group, update, grad, param, *fns):
  skip_update = False
  for fn in fns:
  try:
@@ -450,10 +492,30 @@ def chain(state: Union[callable, dict], group, grad, param, *fns):
  continue
  if update is None:
  break
+ return update, skip_update
+
+
+ def chain(state: Union[callable, dict], group, grad, param, *fns):
+ update = [torch.clone(g, memory_format=torch.preserve_format) for g in grad]
+ update, skip_update = _inner_chain(state, group, update, grad, param, *fns)
  if not skip_update and update is not None:
  utils.update_param_(param, update, group['lr'], group['weight_decay'], caution=group['caution'], grad=grad)


+ def create_branch(branches: List[List[callable]], merge_fn: callable):
+ def _branch(state, group, update, grad, param):
+ outputs = []
+ for branch in branches:
+ branch_update = [torch.clone(u, memory_format=torch.preserve_format) for u in update]
+ branch_update, skip_update = _inner_chain(state, group, branch_update, grad, param, *branch)
+ if skip_update:
+ raise ValueError("Branches should not skip updates")
+ outputs.append(branch_update)
+ return merge_fn(outputs)
+
+ return _branch
+
+
  class ChainOpt(utils.StatefulOptimizer):
  promote: bool = False
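The create_branch helper added above runs several independent chains of transform functions on clones of the running update via _inner_chain and then combines the branch outputs with a caller-supplied merge_fn. Below is a minimal sketch of how it might be wired up; it assumes only the names introduced in this diff (create_branch, scale_by_laprop, nesterov_ema), while the averaging merge function and the particular pair of branches are illustrative, not from the package.

    from heavyball import chainable as C

    def mean_merge(outputs):
        # outputs holds one list of per-parameter update tensors per branch;
        # average them element-wise across branches (illustrative choice)
        return [sum(tensors) / len(tensors) for tensors in zip(*outputs)]

    # each inner list is one branch's chain of chainable transforms
    branch_fn = C.create_branch([[C.scale_by_laprop], [C.nesterov_ema]], mean_merge)
    # branch_fn(state, group, update, grad, param) then behaves like any other
    # chainable transform, returning the merged update for the rest of the chain.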