heavyball 1.5.2.tar.gz → 1.5.3.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {heavyball-1.5.2 → heavyball-1.5.3}/PKG-INFO +1 -1
- {heavyball-1.5.2 → heavyball-1.5.3}/heavyball/__init__.py +14 -1
- {heavyball-1.5.2 → heavyball-1.5.3}/heavyball/chainable.py +63 -4
- {heavyball-1.5.2 → heavyball-1.5.3}/heavyball/utils.py +86 -11
- {heavyball-1.5.2 → heavyball-1.5.3}/heavyball.egg-info/PKG-INFO +1 -1
- {heavyball-1.5.2 → heavyball-1.5.3}/setup.py +1 -1
- {heavyball-1.5.2 → heavyball-1.5.3}/LICENSE +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/README.md +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/heavyball.egg-info/SOURCES.txt +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/heavyball.egg-info/dependency_links.txt +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/heavyball.egg-info/requires.txt +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/heavyball.egg-info/top_level.txt +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/setup.cfg +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/test/test_bf16_params.py +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/test/test_bf16_q.py +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/test/test_bf16_storage.py +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/test/test_caution.py +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/test/test_channels_last.py +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/test/test_closure.py +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/test/test_ema.py +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/test/test_foreach.py +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/test/test_hook.py +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/test/test_mars.py +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/test/test_memory.py +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/test/test_merge.py +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/test/test_no_grad.py +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/test/test_psgd.py +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/test/test_soap.py +0 -0
- {heavyball-1.5.2 → heavyball-1.5.3}/test/test_stochastic_updates.py +0 -0
--- heavyball-1.5.2/heavyball/__init__.py
+++ heavyball-1.5.3/heavyball/__init__.py
@@ -163,6 +163,19 @@ class OrthoLaProp(C.BaseOpt):
                          C.orthogonalize_grad_to_param, C.scale_by_laprop)
 
 
+
+class LaPropOrtho(C.BaseOpt):
+    def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
+                 foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
+                 mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
+                 update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
+        defaults = locals()
+        defaults.pop("self")
+        params = defaults.pop("params")
+        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm,
+                         C.scale_by_laprop, C.orthogonalize_grad_to_param)
+
+
 class ForeachPSGDKron(C.BaseOpt):
     """
     Originally from Evan Walters and Omead Pooladzandi, 2024
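For context, a minimal usage sketch of the new LaPropOrtho optimizer (not part of the diff; argument values are illustrative, and it assumes the usual torch.optim step/zero_grad interface that heavyball optimizers expose):

import torch
import heavyball

model = torch.nn.Linear(16, 16)
opt = heavyball.LaPropOrtho(model.parameters(), lr=0.0025, betas=(0.9, 0.99))

loss = model(torch.randn(4, 16)).square().mean()
loss.backward()
opt.step()
opt.zero_grad()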
@@ -244,4 +257,4 @@ __all__ = ["Muon", "RMSprop", "PrecondSchedulePaLMSOAP", "PSGDKron", "PurePSGD",
           "PrecondScheduleSOAP", "PrecondSchedulePaLMSOAP", 'RMSprop', 'MuonLaProp', #
           "ForeachAdamW", "ForeachSFAdamW", "ForeachLaProp", "ForeachADOPT", "ForeachSOAP", "ForeachPSGDKron",
           "ForeachPurePSGD", "ForeachDelayedPSGD", "ForeachCachedPSGDKron", "ForeachCachedDelayedPSGDKron",
-          "ForeachRMSprop", "ForeachMuon", 'ForeachCachedNewtonPSGD']
+          "ForeachRMSprop", "ForeachMuon", 'ForeachCachedNewtonPSGD', 'OrthoLaProp', 'LaPropOrtho']
--- heavyball-1.5.2/heavyball/chainable.py
+++ heavyball-1.5.3/heavyball/chainable.py
@@ -1,6 +1,6 @@
 import functools
 import random
-from typing import Optional, Union, Literal
+from typing import Optional, Union, Literal, List
 
 import torch
 
@@ -152,6 +152,22 @@ def exp_avg(group, update, grad, param, exp_avg):
     return utils.scale_by_exp_avg_(exp_avg, update, utils.beta_debias(utils.get_beta1(group), group["step"]))
 
 
+@zero_guard('exp_avg')
+@no_state
+def weight_decay_to_ema(group, update, grad, param, exp_avg):
+    utils.weight_decay_to_ema_(exp_avg, update, utils.beta_debias(group['ema_beta'], group['step']),
+                               group['weight_decay_to_ema'] * group['lr'])
+    return update
+
+
+@zero_guard('exp_avg')
+@no_state
+def l1_weight_decay_to_ema(group, update, grad, param, exp_avg):
+    utils.l1_weight_decay_to_ema_(exp_avg, update, utils.beta_debias(group['ema_beta'], group['step']),
+                                  group['weight_decay_to_ema'] * group['lr'])
+    return update
+
+
 @zero_guard("exp_avg_sq")
 @no_state
 def scale_by_exp_avg_sq(group, update, grad, param, exp_avg_sq):
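Both new transforms decay the weights toward an exponential moving average of themselves instead of toward zero. A rough sketch of the intent, assuming a single float32 parameter and ignoring the stochastic-rounding helpers (the exact interpolation weights depend on the _lerp32 convention in utils.py; the names below are illustrative, not from the package):

import torch

p = torch.randn(10)                   # parameter
ema = torch.zeros_like(p)             # EMA buffer (the 'exp_avg' state above)
ema_beta, weight_decay_to_ema, lr = 0.99, 0.1, 1e-3

ema = ema * ema_beta + p * (1 - ema_beta)        # running average of the parameter
p = p - (p - ema) * weight_decay_to_ema * lr     # pull the parameter toward its EMA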
@@ -295,6 +311,25 @@ def nesterov_momentum(group, updates, grads, params, momentum):
     return utils.nesterov_momentum(momentum, updates, utils.get_beta1(group))
 
 
+@zero_guard('momentum')
+@no_state
+def nesterov_ema(group, updates, grads, params, momentum):  # equivalent to Grokfast
+    return utils.nesterov_ema(momentum, updates, utils.get_beta1(group))
+
+
+def _store_std(state, group, update, grad, param):
+    state['init_std'] = torch.std(grad, dim=0)
+
+
+@general_guard("init_std", init_fn=_store_std)
+@no_state
+def mup_approx(group, updates, grads, params, init_std):
+    _updates = [(u, i) for u, i in zip(updates, init_std) if u.ndim > 1]
+    _updates, _init_std = zip(*_updates)
+    utils.stochastic_multiply_(_updates, _init_std)
+    return updates
+
+
 @zero_guard("momentum")
 @no_state
 def heavyball_momentum(group, updates, grads, params, momentum):
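mup_approx appears to rescale every update with two or more dimensions by the per-output standard deviation of the gradient captured once by _store_std, a rough μP-style width correction. A hypothetical sketch of the effect on one matrix-shaped update (names are illustrative, not the package API):

import torch

grad0 = torch.randn(64, 32)            # first gradient seen for this parameter
init_std = grad0.std(dim=0)            # stored once as 'init_std', shape (32,)

update = torch.randn(64, 32)
update = update * init_std             # later steps rescale the update by that std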
@@ -312,7 +347,7 @@ def scale_by_soap(group, update, grad, param, exp_avg, exp_avg_sq, Q, GG, inner:
 
     grad_projected = [utils.project(u, q, False) for u, q in zip(update, Q)]
     fn = _optim_fns[inner]
-    precond = fn(exp_avg, exp_avg_sq, grad_projected, utils.get_beta1(group), utils.get_beta2(group), group['step'])
+    precond = fn(exp_avg, exp_avg_sq, grad_projected, utils.get_beta1(group), utils.get_beta2(group), group['step'], group['eps'])
     precond = [utils.project(p, q, True) for p, q in zip(precond, Q)]
 
     for u, q, gg, eas in zip(update, Q, GG, exp_avg_sq):
@@ -414,6 +449,11 @@ def update_by_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: st
     raise SkipUpdate
 
 
+@no_state
+def sign(group, update, grad, param, graft: bool = True):
+    return utils.sign_(update, graft)
+
+
 @general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd, skip_first=False)
 @no_state_no_foreach
 def update_by_delayed_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str, cached: bool = False,
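The new sign transform applies a sign-SGD style update; with graft=True it rescales the sign direction back to the norm of the original update via _compilable_grafting in utils.py. A hypothetical illustration of that grafting step:

import torch

g = torch.randn(10)
direction = g.sign()
# same formula as _compilable_grafting(magnitude, direction): keep g's norm, sign-only direction
grafted = direction * (g.norm() / direction.norm().clamp(min=1e-6))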
@@ -439,8 +479,7 @@ def apply_to_idx(fn, idx):
     return _fn
 
 
-def chain(state: Union[callable, dict], group, grad, param, *fns):
-    update = [torch.clone(g, memory_format=torch.preserve_format) for g in grad]
+def _inner_chain(state, group, update, grad, param, *fns):
     skip_update = False
     for fn in fns:
         try:
@@ -450,10 +489,30 @@ def chain(state: Union[callable, dict], group, grad, param, *fns):
             continue
         if update is None:
             break
+    return update, skip_update
+
+
+def chain(state: Union[callable, dict], group, grad, param, *fns):
+    update = [torch.clone(g, memory_format=torch.preserve_format) for g in grad]
+    update, skip_update = _inner_chain(state, group, update, grad, param, *fns)
     if not skip_update and update is not None:
         utils.update_param_(param, update, group['lr'], group['weight_decay'], caution=group['caution'], grad=grad)
 
 
+def create_branch(branches: List[List[callable]], merge_fn: callable):
+    def _branch(state, group, update, grad, param):
+        outputs = []
+        for branch in branches:
+            branch_update = [torch.clone(g, memory_format=torch.preserve_format) for u in update]
+            branch_update, skip_update = _inner_chain(state, group, branch_update, grad, param, *branch)
+            if skip_update:
+                raise ValueError("Branches should not skip updates")
+            outputs.append(branch_update)
+        return merge_fn(outputs)
+
+    return _branch
+
+
 class ChainOpt(utils.StatefulOptimizer):
     promote: bool = False
 
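chain is now split into _inner_chain (runs the transform list) and the outer chain (applies the final parameter update), which lets create_branch run several transform chains on copies of the same update and merge the results. A hypothetical sketch of assembling a branch; the merge function and the choice of transforms are illustrative, not an API documented by this diff:

import heavyball.chainable as C

def mean_merge(outputs):
    # average the per-tensor updates produced by the branches
    return [sum(us) / len(us) for us in zip(*outputs)]

branch = C.create_branch([[C.scale_by_laprop], [C.sign]], mean_merge)
# `branch` has the (state, group, update, grad, param) shape of other chainable
# transforms and could be placed where a transform list is accepted.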
--- heavyball-1.5.2/heavyball/utils.py
+++ heavyball-1.5.3/heavyball/utils.py
@@ -317,6 +317,19 @@ def nesterov_momentum(state, grad, beta):
     return grad
 
 
+@decorator_knowngood
+def _compilable_nesterov_ema_(state, grad, beta):
+    ema32 = _lerp32(state, grad, beta)
+    stochastic_add_(grad, ema32, 1)
+
+
+def nesterov_ema(state, grad, beta):
+    state, grad = list_guard(state, grad)
+    beta = scalar_guard(beta, state[0])
+    _compilable_nesterov_ema_(state, grad, beta)
+    return grad
+
+
 def _compilable_grafting(magnitude, direction):
     return direction * (magnitude.norm() / direction.norm().clamp(min=1e-6))
 
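nesterov_ema maintains an EMA of the gradient and adds it back onto the current gradient, which is why the chainable wrapper above calls it Grokfast-equivalent. A rough sketch of the arithmetic, assuming plain float32 tensors (the real code routes through _lerp32 and stochastic_add_ for stochastic-rounding support):

import torch

grad = torch.randn(10)
ema = torch.zeros_like(grad)
beta = 0.98

ema = ema * beta + grad * (1 - beta)   # update the gradient EMA
grad = grad + ema                      # amplify the slow, low-frequency component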
@@ -509,6 +522,19 @@ def stochastic_add_(x: List[Tensor], y: List[Tensor], alpha: Union[float, int, T
     _compilable_stochastic_add_(x, y, alpha)
 
 
+@decorator_knowngood
+def _compilable_stochastic_multiply_(x: List[Tensor], y: List[Tensor]):
+    for x_, y_ in zip(x, y):
+        x32 = promote(x_)
+        y32 = promote(y_)
+        copy_stochastic_(x_, x32 * y32)
+
+
+def stochastic_multiply_(x: List[Tensor], y: List[Tensor]):
+    x, y = list_guard(x, y)
+    _compilable_stochastic_multiply_(x, y)
+
+
 @decorator
 def compute_ggt(grad, GG, max_precond_dim, precondition_1d, beta):
     if grad.dim() == 1 and (not precondition_1d or grad.shape[0] > max_precond_dim):
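stochastic_multiply_ mirrors the existing stochastic_add_: the operands are promoted to float32, multiplied elementwise, and written back with stochastic rounding, so it stays usable with bfloat16 state. A hypothetical call:

import torch
from heavyball.utils import stochastic_multiply_

x = [torch.randn(8, dtype=torch.bfloat16)]
y = [torch.rand(8, dtype=torch.bfloat16)]
stochastic_multiply_(x, y)   # x[0] is updated in place with x * y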
@@ -783,7 +809,7 @@ def _compilable_adam_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: Lis
 
 
 def adam_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: List[Tensor], beta1: float, beta2: float, step: int,
-          eps: float):
+          eps: float = 1e-8):
     exp_avg, exp_avg_sq, grad = map(list_guard, (exp_avg, exp_avg_sq, grad))
     beta1, beta2, step, eps = scalar_guard(beta1, beta2, step, eps, exp_avg[0])
     _compilable_adam_(exp_avg, exp_avg_sq, grad, beta1, beta2, step, eps)
@@ -815,23 +841,23 @@ def fused_adam_(y: List[Tensor], exp_avg: List[Tensor], exp_avg_sq: List[Tensor]
 
 @decorator_knowngood
 def _compilable_laprop_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: List[Tensor], beta1: Tensor,
-                        beta2: Tensor, step: Tensor):
+                        beta2: Tensor, step: Tensor, eps: Tensor):
     beta1 = beta_debias(beta1, step)
     beta2 = beta_debias(beta2, step)
 
     gp32 = list(map(promote, grad))
 
-    denom = exp_avg_sq_(exp_avg_sq, gp32, beta2,
+    denom = exp_avg_sq_(exp_avg_sq, gp32, beta2, eps)
     gp32 = torch._foreach_div(gp32, denom)
     gp32 = _lerp32(exp_avg, gp32, beta1)
 
     copy_stochastic_list_(grad, gp32)
 
 
-def laprop_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: List[Tensor], beta1: float, beta2: float, step: int):
+def laprop_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: List[Tensor], beta1: float, beta2: float, step: int, eps: float = 1e-8):
     exp_avg, exp_avg_sq, grad = list_guard(exp_avg, exp_avg_sq, grad)
-    beta1, beta2, step = scalar_guard(beta1, beta2, step, exp_avg[0])
-    _compilable_laprop_(exp_avg, exp_avg_sq, grad, beta1, beta2, step)
+    beta1, beta2, step, eps = scalar_guard(beta1, beta2, step, exp_avg[0], eps)
+    _compilable_laprop_(exp_avg, exp_avg_sq, grad, beta1, beta2, step, eps)
     return grad
 
 
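With this change laprop_ accepts an explicit eps for the second-moment denominator, matching adam_, and scale_by_soap in chainable.py now forwards group['eps'] to whichever inner function it selects. A hypothetical direct call (values are illustrative):

import torch
from heavyball.utils import laprop_

g = [torch.randn(8)]
exp_avg = [torch.zeros(8)]
exp_avg_sq = [torch.zeros(8)]
update = laprop_(exp_avg, exp_avg_sq, g, beta1=0.9, beta2=0.99, step=1, eps=1e-12)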
@@ -970,6 +996,10 @@ def get_soap_precond_schedule(precond_scheduler):
     return _inner
 
 
+def _max_idx(x: List[int]):
+    return len(x) - 1 - np.argmax(x[::-1])  # we want to start counting from the back, as torch is fan-out/fan-in
+
+
 def init_Q_exprs(t, scale, max_size, min_ndim_triangular, memory_save_mode, dtype=None):
     """For a scalar or tensor t, we initialize its preconditioner Q and
     reusable einsum expressions for updating Q and preconditioning gradient.
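_max_idx returns the index of the largest entry, breaking ties toward the last occurrence (the comment about fan-out/fan-in refers to torch's weight layout). A small worked example, restating the helper so it runs standalone:

import numpy as np

def _max_idx(x):
    return len(x) - 1 - np.argmax(x[::-1])

_max_idx([128, 256, 64])    # -> 1, the unique maximum
_max_idx([256, 64, 256])    # -> 2, ties resolved toward the trailing dimension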
@@ -992,17 +1022,20 @@ def init_Q_exprs(t, scale, max_size, min_ndim_triangular, memory_save_mode, dtyp
 
     scale = scale ** (1 / len(shape))
 
+    dim_diag = [False for _ in shape]
     if memory_save_mode is None:
-
+        pass
     elif memory_save_mode == "one_diag":
-
-
-
+        dim_diag[_max_idx(shape)] = True
+    elif memory_save_mode == "smart_one_diag":
+        sorted_shape = sorted(shape)
+        if len(shape) >= 2 and sorted_shape[-1] > sorted_shape[-2]:
+            dim_diag[_max_idx(shape)] = True
 elif memory_save_mode == "all_diag":
-
+
     elif memory_save_mode == "all_diag":
         dim_diag = [True for _ in shape]
     else:
         raise ValueError(f"Invalid memory_save_mode: {memory_save_mode}, must be one of "
-                         "[None, 'one_diag', 'all_diag']")
+                         "[None, 'one_diag', 'all_diag', 'smart_one_diag']")
 
     Q = []
     piece1A, piece2A, piece3A = ([], "", "")
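The new 'smart_one_diag' mode only falls back to a diagonal preconditioner for the largest dimension when that dimension is strictly larger than the next one, so square-ish tensors keep full Kronecker factors. A quick illustration of the condition (a hypothetical helper, not a package function):

def smart_one_diag_dims(shape):
    # mirrors the branch added above
    dim_diag = [False for _ in shape]
    sorted_shape = sorted(shape)
    if len(shape) >= 2 and sorted_shape[-1] > sorted_shape[-2]:
        dim_diag[shape.index(max(shape))] = True
    return dim_diag

smart_one_diag_dims((1024, 256))   # -> [True, False]: the larger dim gets a diagonal factor
smart_one_diag_dims((512, 512))    # -> [False, False]: square tensors keep full factors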
@@ -1221,6 +1254,48 @@ def identity(x):
     return x
 
 
+@decorator_knowngood
+def _compilable_weight_decay_to_ema_(p, ema, ema_decay, weight_decay):
+    ema32 = _lerp32(ema, p, ema_decay)
+    _lerp32(p, ema32, 1 - weight_decay)
+
+
+def weight_decay_to_ema_(p, ema, ema_decay, weight_decay):
+    p, ema = list_guard(p, ema)
+    ema_decay, weight_decay = scalar_guard(ema_decay, weight_decay, p[0])
+    _compilable_weight_decay_to_ema_(p, ema, ema_decay, weight_decay)
+
+
+@decorator_knowngood
+def _compilable_l1_weight_decay_to_ema_(p, ema, ema_deacy, weight_decay):
+    ema32 = _lerp32(ema, p, ema_deacy)
+    for p_, e_ in zip(p, ema32):
+        p32 = promote(p)
+        p32 = p32 + (p32 - e_).sign() * weight_decay
+        copy_stochastic_(p_, p32)
+
+
+def l1_weight_decay_to_ema_(p, ema, ema_decay, weight_decay):
+    p, ema = list_guard(p, ema)
+    ema_decay, weight_decay = scalar_guard(ema_decay, weight_decay, p[0])
+    _compilable_l1_weight_decay_to_ema_(p, ema, ema_decay, weight_decay)
+
+
+@decorator_knowngood
+def _compilable_sign_(grad: List[Tensor], graft: bool):
+    for g_ in grad:
+        gs = g_.sign()
+        if graft:
+            gs = _compilable_grafting(g_, gs)
+        copy_stochastic_(g_, gs)
+
+
+def sign_(grad: List[Tensor], graft: bool = True):
+    grad = list_guard(grad)
+    _compilable_sign_(grad, graft)
+    return grad
+
+
 @decorator_knowngood
 def _compilable_trust_region_clip_(grad, lerp, scale):
     # (sgn(x) * log(1 + |x|) * 0.1 + tanh(x) * 0.9).clamp_(min=-2, max=2)