heavyball 1.0.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
heavyball/__init__.py CHANGED
@@ -59,6 +59,19 @@ class ForeachADOPT(C.BaseOpt):
         super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, C.update_by_adopt)
 
 
+class ForeachMuon(C.BaseOpt):
+    def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
+                 foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
+                 mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
+                 update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8,
+                 nesterov: bool = True):
+        defaults = dict(lr=lr, betas=betas, eps=eps, k=0, warmup_steps=warmup_steps, train_mode=True, weight_sum=0.0,
+                        lr_max=-1.0, weight_decay=weight_decay, storage_dtype=storage_dtype, mars=mars, caution=caution,
+                        mars_gamma=mars_gamma, beta2_scale=beta2_scale)
+        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm,
+                         C.nesterov_momentum if nesterov else C.heavyball_momentum, C.orthogonalize_update)
+
+
 class ForeachLaProp(C.BaseOpt):
     def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
                  foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
@@ -200,10 +213,11 @@ PurePSGD = ForeachPurePSGD
 DelayedPSGD = ForeachDelayedPSGD
 CachedPSGDKron = ForeachCachedPSGDKron
 CachedDelayedPSGDKron = ForeachCachedDelayedPSGDKron
+Muon = ForeachMuon
 
-__all__ = ["PrecondSchedulePaLMSOAP", "PSGDKron", "PurePSGD", "DelayedPSGD", "CachedPSGDKron", "CachedDelayedPSGDKron",
-           "PalmForEachSoap", "PaLMSOAP", "PaLMSFAdamW", "LaProp", "ADOPT", "PrecondScheduleSOAP",
-           "PrecondSchedulePaLMSOAP", 'RMSprop', #
+__all__ = ["Muon","RMSprop", "PrecondSchedulePaLMSOAP", "PSGDKron", "PurePSGD", "DelayedPSGD", "CachedPSGDKron",
+           "CachedDelayedPSGDKron", "PalmForEachSoap", "PaLMSOAP", "PaLMSFAdamW", "LaProp", "ADOPT",
+           "PrecondScheduleSOAP", "PrecondSchedulePaLMSOAP", 'RMSprop', #
            "ForeachAdamW", "ForeachSFAdamW", "ForeachLaProp", "ForeachADOPT", "ForeachSOAP", "ForeachPSGDKron",
            "ForeachPurePSGD", "ForeachDelayedPSGD", "ForeachCachedPSGDKron", "ForeachCachedDelayedPSGDKron",
-           "ForeachRMSprop"]
+           "ForeachRMSprop", "ForeachMuon"]
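The hunks above add a Muon-style optimizer and export it both as `ForeachMuon` and under the `Muon` alias. A minimal usage sketch follows; the model, data, and hyperparameter choices are illustrative assumptions, only the constructor signature comes from the diff.

import torch
import heavyball

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 10))
# `nesterov=True` selects C.nesterov_momentum (else C.heavyball_momentum); updates of
# 2D+ parameters are then orthogonalized via C.orthogonalize_update.
opt = heavyball.Muon(model.parameters(), lr=0.0025, betas=(0.9, 0.99), nesterov=True)

for _ in range(10):
    x = torch.randn(32, 64)
    loss = model(x).square().mean()
    opt.zero_grad()
    loss.backward()
    opt.step()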
heavyball/chainable.py CHANGED
@@ -33,6 +33,23 @@ def _guard_in_state(state, key, template_fn):
     return state[key]
 
 
+class FunctionTransform:
+    def __init__(self, fn):
+        self.fn = fn
+        self.fn_name = self.get_fn().__name__
+
+    def __call__(self, state, group, update, grad, param, *args, **kwargs):
+        raise NotImplementedError
+
+    def get_fn(self):
+        if hasattr(self.fn, 'get_fn'):
+            return self.fn.get_fn()
+        return self.fn
+
+    def val_name(self, name):
+        return f"{self.fn_name}_{name}"
+
+
 def _zero_guard(state, key, ref, dtype):
     return _guard_in_state(state, key,
                            lambda: torch.zeros_like(ref, dtype=torch.float32, memory_format=torch.preserve_format))
@@ -43,61 +60,77 @@ def _storage_dtype(group):
     return getattr(torch, dtype)
 
 
-def zero_guard(*names):
-    def _outer(fn):
-        def _fn(state, group, update, grad, param, *args, **kwargs):
-            vars = [[_zero_guard(state(p), name, p, _storage_dtype(group)) for p in param] for name in names]
-            return fn(state, group, update, grad, param, *args, *vars, **kwargs)
+class ZeroGuard(FunctionTransform):
+    def __init__(self, fn, names):
+        super().__init__(fn)
+        self.names = names
 
-        return _fn
+    def __call__(self, state, group, update, grad, param, *args, **kwargs):
+        vars = [[_zero_guard(state(p), self.val_name(name), p, _storage_dtype(group)) for p in param]  #
+                for name in self.names]
+        return self.fn(state, group, update, grad, param, *args, *vars, **kwargs)
 
-    return _outer
 
+class CopyGuard(FunctionTransform):
+    def __init__(self, fn, index, names):
+        super().__init__(fn)
+        self.index = index
+        self.names = names
 
-def copy_guard(index, *names):
-    def _outer(fn):
-        def _fn(state, group, update, grad, param, *args, **kwargs):
-            val = [update, grad, param, *args][index]
-            vars = [[_guard_in_state(state(p), name, lambda: torch.clone(v)) for p, v in zip(param, val)]  #
-                    for name in names]
-            return fn(state, group, update, grad, param, *args, *vars, **kwargs)
+    def __call__(self, state, group, update, grad, param, *args, **kwargs):
+        val = [update, grad, param, *args][self.index]
+        vars = [[_guard_in_state(state(p), self.val_name(name), lambda: torch.clone(v)) for p, v in zip(param, val)]  #
+                for name in self.names]
+        return self.fn(state, group, update, grad, param, *args, *vars, **kwargs)
 
-        return _fn
 
-    return _outer
+class GeneralGuard(FunctionTransform):  # We can't guard against reuse in the general case
+    def __init__(self, fn, names, init_fn):
+        super().__init__(fn)
+        self.names = names
+        self.init_fn = init_fn
 
+    def __call__(self, state, group, update, grad, param, *args, **kwargs):
+        vars = []
+        skip_update = False
+        for p, g, u in zip(param, grad, update):
+            st = state(p)
+            skip_update |= _inplace_guard_(st, self.names, lambda: self.init_fn(st, group, u, g, p, **kwargs))
+            vars.append([st[name] if isinstance(name, str) else st.get(name[0], name[1]) for name in self.names])
+        if skip_update:
+            raise SkipUpdate
+        return self.fn(state, group, update, grad, param, *args, *zip(*vars), **kwargs)
 
-def general_guard(*names, init_fn):
-    def _outer(fn):
-        def _fn(state, group, update, grad, param, *args, **kwargs):
-            vars = []
-            skip_update = False
-            for p, g, u in zip(param, grad, update):
-                st = state(p)
-                skip_update |= _inplace_guard_(st, names, lambda: init_fn(st, group, u, g, p, **kwargs))
-                vars.append([st[name] if isinstance(name, str) else st.get(name[0], name[1]) for name in names])
-            if skip_update:
-                raise SkipUpdate
-            return fn(state, group, update, grad, param, *args, *zip(*vars), **kwargs)
 
-        return _fn
+class NoState(FunctionTransform):
+    def __call__(self, state, group, update, grad, param, *args, **kwargs):
+        return self.fn(group, update, grad, param, *args, **kwargs)
 
-    return _outer
 
+class NoStateNoForeach(FunctionTransform):
+    def __call__(self, state, group, update, grad, param, *args, **kwargs):
+        for a in zip(update, grad, param, *args):
+            return self.fn(group, *a, **kwargs)
 
-def no_state(fn):
-    def _fn(state, *args, **kwargs):
-        return fn(*args, **kwargs)
 
-    return _fn
+def zero_guard(*names):
+    return functools.partial(ZeroGuard, names=names)
 
 
-def no_state_no_foreach(fn):
-    def _fn(state, group, *args, **kwargs):
-        for a in zip(*args):
-            return fn(group, *a, **kwargs)
+def copy_guard(index, *names):
+    return functools.partial(CopyGuard, index=index, names=names)
+
+
+def general_guard(*names, init_fn):
+    return functools.partial(GeneralGuard, names=names, init_fn=init_fn)
 
-    return _fn
+
+def no_state(fn):
+    return NoState(fn)
+
+
+def no_state_no_foreach(fn):
+    return NoStateNoForeach(fn)
 
 
 class SkipUpdate(ValueError):
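The guard decorators above are now thin factories over `FunctionTransform` subclasses instead of nested closures. A small sketch of what a decorated transform expands to; `my_exp_avg` is an illustrative name, while the decorators and classes are the ones from the diff.

import heavyball.chainable as C
import heavyball.utils as utils


@C.zero_guard("exp_avg")   # -> functools.partial(ZeroGuard, names=("exp_avg",))
@C.no_state                # -> NoState(my_exp_avg)
def my_exp_avg(group, update, grad, param, exp_avg):
    # update/grad/param/exp_avg arrive as matched lists of tensors
    return utils.scale_by_exp_avg_(exp_avg, update, utils.get_beta1(group))


# The stack is equivalent to wrapping by hand:
#     my_exp_avg = C.ZeroGuard(C.NoState(my_exp_avg), names=("exp_avg",))
# FunctionTransform.val_name() prefixes buffer keys with the wrapped function's name,
# so the state entry becomes "my_exp_avg_exp_avg" rather than a bare "exp_avg".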
@@ -107,18 +140,14 @@ class SkipUpdate(ValueError):
 @zero_guard("exp_avg")
 @no_state
 def exp_avg(group, update, grad, param, exp_avg):
-    utils.stochastic_lerp_(exp_avg, update, utils.beta_debias(utils.get_beta1(group), group["step"]))
-    return exp_avg
+    return utils.scale_by_exp_avg_(exp_avg, update, utils.beta_debias(utils.get_beta1(group), group["step"]))
 
 
 @zero_guard("exp_avg_sq")
 @no_state
 def scale_by_exp_avg_sq(group, update, grad, param, exp_avg_sq):
-    out = utils.scale_by_exp_avg_sq_(exp_avg_sq, update, utils.beta_debias(utils.get_beta2(group), group["step"]),
-                                     group['eps'])
-    if group['step'] == 1:
-        raise SkipUpdate
-    return out
+    return utils.scale_by_exp_avg_sq_(exp_avg_sq, update, utils.beta_debias(utils.get_beta2(group), group["step"]),
+                                      group['eps'])
 
 
 @zero_guard("exp_avg", "exp_avg_sq")
@@ -232,6 +261,29 @@ def precond_schedule(group, prob: Union[callable, float, None] = None, name: str
     raise ValueError("No preconditioner update schedule specified.")
 
 
+@no_state_no_foreach
+def orthogonalize_update(group, update, grad, param):
+    if update.dim() == 1:
+        return update
+    original_shape = update.shape
+    # doing it this way, as tmp and update are not guaranteed to share memory address or layout
+    tmp = update.flatten(1, -1)
+    utils.inplace_orthogonal_(tmp, utils.zeroth_power_mode, tmp)
+    return tmp.reshape(original_shape)
+
+
+@zero_guard("momentum")
+@no_state
+def nesterov_momentum(group, updates, grads, params, momentum):
+    return utils.nesterov_momentum(momentum, updates, utils.get_beta1(group))
+
+
+@zero_guard("momentum")
+@no_state
+def heavyball_momentum(group, updates, grads, params, momentum):
+    return utils.heavyball_momentum(momentum, updates, utils.get_beta1(group))
+
+
 @zero_guard("exp_avg", "exp_avg_sq")
 @general_guard("Q", "GG", init_fn=_init_soap)
 @no_state
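Together with `ForeachMuon` in `__init__.py`, these transforms spell out the Muon-style update: accumulate (Nesterov) momentum, then orthogonalize the two-dimensional update. A rough single-matrix sketch of the composed step, purely illustrative: heavyball runs this through `chain()` with per-parameter state and picks QR, Newton-Schulz, or SVD based on `zeroth_power_mode` and the matrix shape, whereas the sketch uses SVD for brevity.

import torch


def muon_step_sketch(weight, grad, momentum_buf, lr=0.0025, beta=0.9):
    # nesterov_momentum: m <- beta * m + g, update <- g + beta * m
    momentum_buf.mul_(beta).add_(grad)
    update = grad + beta * momentum_buf
    # orthogonalize_update: flatten to 2D and replace the update with an
    # (approximately) orthogonal matrix of the same shape
    if update.dim() > 1:
        mat = update.flatten(1, -1)
        u, _, vh = torch.linalg.svd(mat, full_matrices=False)
        update = (u @ vh).reshape(update.shape)
    weight.add_(update, alpha=-lr)
    return weight


w, g, m = torch.randn(64, 128), torch.randn(64, 128), torch.zeros(64, 128)
muon_step_sketch(w, g, m)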
@@ -296,9 +348,12 @@ def _fused_cached_psgd_precond_grad(group, grad, param, cached, cache_expr, expr
 @no_state_no_foreach
 def scale_by_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str, cached: bool = False,
                   prob: Optional[callable] = None):
+    old = update
+    update = update.to(memory_format=torch.contiguous_format)
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
     _update_psgd_precond(group, param, update, Q_mat, Q, exprs, prob)
-    return _cached_psgd_precond_grad(False, cache_expr, exprs, update, Q_mat, Q_cache)
+    out = _cached_psgd_precond_grad(False, cache_expr, exprs, update, Q_mat, Q_cache)
+    return torch.as_strided(out, old.shape, old.stride())
 
 
 @general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd)
@@ -346,7 +401,7 @@ def apply_to_idx(fn, idx):
 
 
 def chain(state: Union[callable, dict], group, grad, param, *fns):
-    update = grad
+    update = [torch.clone(g, memory_format=torch.preserve_format) for g in grad]
     skip_update = False
     for fn in fns:
         try:
@@ -375,7 +430,10 @@ class ChainOpt(utils.StatefulOptimizer):
         else:
            group['lr'] = -group['base_lr']
 
-        p, g = zip(*list(self.split_p_and_g_in_group(group, should_promote=False, beta1=utils.get_beta1(group))))
+        vals = list(self.split_p_and_g_in_group(group, should_promote=False, beta1=utils.get_beta1(group)))
+        if not vals:
+            return
+        p, g = zip(*vals)
 
         if not group['foreach'] or len(p) == 1:
             for param, grad in zip(p, g):
heavyball/utils.py CHANGED
@@ -23,7 +23,7 @@ from torch.utils._pytree import tree_map
 compile_mode = "max-autotune-no-cudagraphs"
 dynamic = False
 compile_mode_recommended_to_none = None
-zeroth_power_mode = 'qr'  # 'qr' is baseline, 'newtonschulz' converges better and faster, 'eigh' is perfect but slow
+zeroth_power_mode = 'qr'  # 'qr' is baseline, 'newtonschulz' converges better and faster
 tiny_bf16 = torch.finfo(torch.bfloat16).tiny
 
 
@@ -91,11 +91,9 @@ def schedule_free_(lr: float, weight_lr_power: float, weight_sum: float, beta1:
         ckp1 = weight / weight_sum
     except ZeroDivisionError:
         ckp1 = 0
-    ckp1 = 0
 
-    # These operations update y in-place,
-    # without computing x explicitly.
-    lr, ckp1, beta1 = scalar_guard(lr, parameters[0]), scalar_guard(ckp1, parameters[0]), scalar_guard(beta1, parameters[0])
+    grad, parameters, z = list_guard(grad, parameters, z)
+    lr, ckp1, beta1 = scalar_guard(lr, ckp1, beta1, grad[0])
     _compilable_schedule_free_(parameters, z, ckp1, grad, lr, beta1, decay)
     return weight_sum
 
@@ -179,28 +177,43 @@ def _compilable_exp_avg_sq_(state: List[Tensor], grad: List[Tensor], beta2: Tens
 
 
 def exp_avg_sq_(state, grad, beta2, eps, out=None):
-    state, grad, out = list_guard(state), list_guard(grad), list_guard(out)
-    beta2, eps = scalar_guard(beta2, state[0]), scalar_guard(eps, state[0])
+    state, grad, out = list_guard(state, grad, out)
+    beta2, eps = scalar_guard(beta2, eps, state[0])
     return _compilable_exp_avg_sq_(state, grad, beta2, eps, out)
 
 
 @decorator_knowngood
-def _compilable_scale_by_exp_avg_sq_(state: List[Tensor], grad: List[Tensor], beta2: Tensor, eps: Tensor,
-                                     out: List[Optional[Tensor]]):
+def _compilable_scale_by_exp_avg_sq_(state: List[Tensor], grad: List[Tensor], beta2: Tensor, eps: Tensor):
     s32, g32 = [list(map(promote, x)) for x in (state, grad)]
     torch._foreach_mul_(s32, beta2)
     [s.addcmul_(g, g, value=1 - beta2) for s, g in zip(s32, g32)]
     denom = torch._foreach_sqrt(s32)
     [d.clamp_(min=eps) for d in denom]
-    out = torch._foreach_div(g32, denom)
+    out = torch._foreach_div_(g32, denom)
     copy_stochastic_list_(state, s32)
-    return stochastic_round_list_(grad, out)
+    copy_stochastic_list_(grad, out)
 
 
-def scale_by_exp_avg_sq_(grad, exp_avg_sq, beta2, eps):
-    grad, exp_avg_sq = list_guard(grad), list_guard(exp_avg_sq)
-    beta2, eps = scalar_guard(beta2, grad[0]), scalar_guard(eps, grad[0])
-    return _compilable_scale_by_exp_avg_sq_(grad, exp_avg_sq, beta2, eps, grad)
+def scale_by_exp_avg_sq_(exp_avg_sq, grad, beta2, eps):
+    grad, exp_avg_sq = list_guard(grad, exp_avg_sq)
+    beta2, eps = scalar_guard(beta2, eps, grad[0])
+    _compilable_scale_by_exp_avg_sq_(exp_avg_sq, grad, beta2, eps)
+    return grad
+
+
+@decorator_knowngood
+def _compilable_exp_avg_(state, grad, beta):
+    s32, g32 = [list(map(promote, x)) for x in (state, grad)]
+    [s.lerp_(g, beta) for s, g in zip(s32, g32)]
+    copy_stochastic_list_(state, s32)
+    copy_stochastic_list_(grad, s32)
+
+
+def scale_by_exp_avg_(state, grad, beta):
+    state, grad = list_guard(state, grad)
+    beta = scalar_guard(beta, state[0])
+    _compilable_exp_avg_(state, grad, beta)
+    return grad
 
 
 @decorator_knowngood
@@ -219,7 +232,7 @@ def adaptive_gradient_clipping_(parameters: List[Tensor], gradients: List[Tensor
                                 minimum: float = 1e-3, eps: float = 1e-8):
     if clip_val <= 0:
         return gradients
-    parameters, gradients = list_guard(parameters), list_guard(gradients)
+    parameters, gradients = list_guard(parameters, gradients)
     clip_val = scalar_guard(clip_val, parameters[0])
     return _compilable_agc_(parameters, gradients, clip_val, minimum, eps)
 
@@ -254,33 +267,29 @@ def set_torch():
 
 
 @decorator
-def zeropower_via_newtonschulz5(G, init, steps=2, eps=1e-7):
+def zeropower_via_newtonschulz5(G, steps=5, eps=1e-7):
     """
-    Modified from "modded-nanogpt" under the MIT license:
-    Original: https://github.com/KellerJordan/modded-nanogpt/blob/a0dcbfdd9a0617d091d5123cfc354745428e40d3/train_gpt2.py
-
     Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
     quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
     of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
     zero even beyond the point where the iteration no longer converges all the way to one everywhere
     on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
-    where S' is diagonal with S_{ii}' \sim Uniform(0.5, 1.5), which turns out not to hurt model
+    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.
     """
     assert len(G.shape) == 2
     a, b, c = (3.4445, -4.7750, 2.0315)
-    X = G.float()
-    init = init / (init.norm() + eps)  # ensure top singular value <= 1
-    X = X / (X.norm() + eps)  # ensure top singular value <= 1
+    X = G.bfloat16()
+    X /= (X.norm() + eps)  # ensure top singular value <= 1
     if G.size(0) > G.size(1):
         X = X.T
     for _ in range(steps):
-        A = X @ X.T  # preconditioner
-        B = A @ init
-        init = X = a * init + b * B + c * A @ B
+        A = X @ X.T
+        B = b * A + c * A @ A  # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
+        X = a * X + B @ X
     if G.size(0) > G.size(1):
         X = X.T
-    return X
+    return X.to(G.dtype)
 
 
 def ortho(x):
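The kernel above now runs the quintic iteration in bfloat16, defaults to five steps, and no longer threads an `init` matrix through the loop. A standalone restatement (float32 for readability; coefficients copied from the diff) that checks how close the result is to an orthogonal factor:

import torch


def newtonschulz5(G: torch.Tensor, steps: int = 5, eps: float = 1e-7) -> torch.Tensor:
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G / (G.norm() + eps)          # ensure the top singular value is <= 1
    if G.size(0) > G.size(1):
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A
        X = a * X + B @ X
    if G.size(0) > G.size(1):
        X = X.T
    return X


G = torch.randn(128, 64)
X = newtonschulz5(G)
# The docstring promises roughly U S' V^T with S' ~ Uniform(0.5, 1.5), so the
# singular values of X should cluster near 1 regardless of the spectrum of G.
print(torch.linalg.svdvals(X).aminmax())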
@@ -292,6 +301,53 @@ def ortho(x):
     raise NotImplementedError(f"Unknown zeroth_power_mode: {zeroth_power_mode}")
 
 
+@decorator_knowngood
+def _compilable_heavyball_momentum_(state, grad, beta):
+    s32, g32 = [list(map(promote, x)) for x in (state, grad)]
+    torch._foreach_mul_(s32, beta)
+    torch._foreach_add_(s32, g32)
+    copy_stochastic_list_(state, s32)
+    copy_stochastic_list_(grad, s32)
+
+
+@decorator_knowngood
+def _compilable_nesterov_momentum_(state, grad, beta):
+    s32, g32 = [list(map(promote, x)) for x in (state, grad)]
+    torch._foreach_mul_(s32, beta)
+    torch._foreach_add_(s32, g32)
+    [g.add_(s, alpha=beta) for g, s in zip(g32, s32)]
+    copy_stochastic_list_(state, s32)
+    copy_stochastic_list_(grad, g32)
+
+
+def heavyball_momentum(state, grad, beta):
+    state, grad = list_guard(state, grad)
+    beta = scalar_guard(beta, state[0])
+    _compilable_heavyball_momentum_(state, grad, beta)
+    return grad
+
+
+def nesterov_momentum(state, grad, beta):
+    state, grad = list_guard(state, grad)
+    beta = scalar_guard(beta, state[0])
+    _compilable_nesterov_momentum_(state, grad, beta)
+    return grad
+
+
+@decorator_knowngood
+def inplace_orthogonal_(x, mode, out):
+    if mode == 'newtonschulz' or x.shape[0] != x.shape[1]:
+        y = zeropower_via_newtonschulz5(x, 5)
+    elif mode == 'qr':
+        y = torch.linalg.qr(x).Q
+    elif mode == 'svd':
+        u, s, v = torch.linalg.svd(x)
+        y = u @ v.T
+    else:
+        raise NotImplementedError(f"Unknown zeroth_power_mode: {mode}")
+    set_(out, y)
+
+
 def get_orthogonal_matrix_QR(GG, Q, exp_avg_sq):
     """
     Computes the eigenbases of the preconditioner using one round of power iteration
@@ -322,17 +378,7 @@ def get_orthogonal_matrix_QR(GG, Q, exp_avg_sq):
         est_eig = torch.einsum('ij,ij->j', o, tmp)
         sort_idx = torch.argsort(est_eig, descending=True)
         indices.append(sort_idx)
-        if zeroth_power_mode == 'eigh':
-            set_(q, torch.linalg.eigh(m)[1])
-        elif zeroth_power_mode.startswith('newtonschulz'):
-            iterations = zeroth_power_mode[len('newtonschulz'):]
-            if iterations == '':
-                iterations = 10
-            else:
-                iterations = int(iterations)
-            set_(q, zeropower_via_newtonschulz5(m, o[:, sort_idx], iterations))
-        else:
-            set_(q, ortho(tmp[:, sort_idx]))
+        inplace_orthogonal_(tmp[:, sort_idx], zeroth_power_mode, q)
 
     indices = tuple(slice(None) if ind is None else ind.view(*(1,) * i, -1, *(1,) * (exp_avg_sq.dim() - i - 1))  #
                     for i, ind in enumerate(indices))
@@ -407,7 +453,6 @@ def get_beta1(group):
 
 
 def get_beta2(group):
-    beta = None
     if 'beta2_scale' in group:
         step = max(group.get("step", 1), 1)
         return 1 - step ** -group['beta2_scale']
@@ -417,23 +462,36 @@ def get_beta2(group):
 
 
 def stochastic_lerp_(x: List[Tensor], y: List[Tensor], a: Union[float, int, Tensor]):
-    x, y = list_guard(x), list_guard(y)
+    x, y = list_guard(x, y)
     a = scalar_guard(a, x[0])
     _compilable_stochastic_lerp_(x, y, a)
 
 
-def list_guard(x):
-    if isinstance(x, (list, tuple)):
-        return x
-    return [x]
+def list_guard(*xs):
+    out = []
+    for x in xs:
+        if isinstance(x, (list, tuple)):
+            out.append(x)
+        else:
+            out.append([x])
+    if len(xs) == 1:
+        return out[0]
+    return out
 
 
-def scalar_guard(x, ref):
-    if isinstance(x, float):
-        return torch.empty((), dtype=torch.float32, device=ref.device).fill_(x)
-    if isinstance(x, int):
-        return torch.empty((), dtype=torch.int64, device=ref.device).fill_(x)
-    return x
+def scalar_guard(*args):
+    *xs, ref = args
+    out = []
+    for x in xs:
+        if isinstance(x, float):
+            out.append(torch.empty((), dtype=torch.float32, device=ref.device).fill_(x))
+        elif isinstance(x, int):
+            out.append(torch.empty((), dtype=torch.int64, device=ref.device).fill_(x))
+        else:
+            out.append(x)
+    if len(xs) == 1:
+        return out[0]
+    return out
 
 
 @decorator_knowngood
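Both helpers are now variadic, normalizing several arguments in one call and unwrapping the result only when a single argument is passed. A torch-free restatement of `list_guard` to show the calling convention (the in-package version is above; `scalar_guard` behaves analogously, with the last positional argument acting as the reference tensor):

from typing import Any, List, Union


def list_guard(*xs: Any) -> Union[list, List[list]]:
    out = [x if isinstance(x, (list, tuple)) else [x] for x in xs]
    return out[0] if len(xs) == 1 else out


single = list_guard(1.0)            # one argument  -> one list: [1.0]
a, b = list_guard(1.0, [2.0, 3.0])  # several       -> list of normalized lists
assert single == [1.0] and a == [1.0] and b == [2.0, 3.0]
# e.g. beta1, beta2, step = scalar_guard(beta1, beta2, step, exp_avg[0])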
@@ -446,7 +504,7 @@ def _compilable_stochastic_add_(x: List[Tensor], y: List[Tensor], alpha: Union[f
 
 
 def stochastic_add_(x: List[Tensor], y: List[Tensor], alpha: Union[float, int, Tensor]):
-    x, y = list_guard(x), list_guard(y)
+    x, y = list_guard(x, y)
     alpha = scalar_guard(alpha, x[0])
     _compilable_stochastic_add_(x, y, alpha)
 
@@ -695,14 +753,14 @@ def _compilable_adam_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: Lis
 
     copy_stochastic_list_(exp_avg, exp_avg32)
     copy_stochastic_list_(exp_avg_sq, exp_avg_sq32)
-    return stochastic_round_list_(exp_avg, u32)
+    copy_stochastic_list_(grad, u32)
 
 
 def adam_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: List[Tensor], beta1: float, beta2: float, step: int):
     exp_avg, exp_avg_sq, grad = map(list_guard, (exp_avg, exp_avg_sq, grad))
-    beta1, beta2, step = scalar_guard(beta1, exp_avg[0]), scalar_guard(beta2, exp_avg[0]), scalar_guard(step,
-                                                                                                        exp_avg[0])
-    return _compilable_adam_(exp_avg, exp_avg_sq, grad, beta1, beta2, step)
+    beta1, beta2, step = scalar_guard(beta1, beta2, step, exp_avg[0])
+    _compilable_adam_(exp_avg, exp_avg_sq, grad, beta1, beta2, step)
+    return grad
 
 
 @decorator_knowngood
@@ -725,32 +783,32 @@ def _fused_compilable_adam_(y: List[Tensor], exp_avg: List[Tensor], exp_avg_sq:
 
 def fused_adam_(y: List[Tensor], exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: List[Tensor], beta1: float,
                 beta2: float, step: int, lr: float, eps: float, decay: float, caution: bool):
-    y, exp_avg, exp_avg_sq, grad = map(list_guard, (y, exp_avg, exp_avg_sq, grad))
-    beta1, beta2, step, lr = [scalar_guard (x, y[0]) for x in (beta1, beta2, step, lr)]
+    y, exp_avg, exp_avg_sq, grad = list_guard(y, exp_avg, exp_avg_sq, grad)
+    beta1, beta2, step, lr = scalar_guard(beta1, beta2, step, lr, y[0])
     return _fused_compilable_adam_(y, exp_avg, exp_avg_sq, grad, beta1, beta2, step, decay, lr, eps, caution)
 
 
 @decorator_knowngood
-def _compilable_laprop_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad_projected: List[Tensor], beta1: Tensor,
+def _compilable_laprop_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: List[Tensor], beta1: Tensor,
                         beta2: Tensor, step: Tensor):
     beta1 = beta_debias(beta1, step)
     beta2 = beta_debias(beta2, step)
 
-    gp32, exp_avg_sq32 = [list(map(promote, x)) for x in [grad_projected, exp_avg_sq]]
+    gp32, exp_avg_sq32 = [list(map(promote, x)) for x in [grad, exp_avg_sq]]
 
     denom = exp_avg_sq_(exp_avg_sq32, gp32, beta2, 1e-8)
     gp32 = torch._foreach_div(gp32, denom)
     stochastic_lerp_(exp_avg, gp32, 1 - beta1)
 
     copy_stochastic_list_(exp_avg_sq, exp_avg_sq32)
+    copy_stochastic_list_(grad, exp_avg)
 
 
-def laprop_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad_projected: List[Tensor], beta1: float, beta2: float,
-            step: int):
-    exp_avg, exp_avg_sq, grad_projected = list_guard(exp_avg), list_guard(exp_avg_sq), list_guard(grad_projected)
-    beta1, beta, step = scalar_guard(beta1, exp_avg[0]), scalar_guard(beta2, exp_avg[0]), scalar_guard(step, exp_avg[0])
-    _compilable_laprop_(exp_avg, exp_avg_sq, grad_projected, beta1, beta2, step)
-    return exp_avg
+def laprop_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: List[Tensor], beta1: float, beta2: float, step: int):
+    exp_avg, exp_avg_sq, grad = list_guard(exp_avg, exp_avg_sq, grad)
+    beta1, beta2, step = scalar_guard(beta1, beta2, step, exp_avg[0])
+    _compilable_laprop_(exp_avg, exp_avg_sq, grad, beta1, beta2, step)
+    return grad
 
 
 @decorator_knowngood
@@ -770,11 +828,11 @@ def _fused_compilable_laprop_(y: List[Tensor], exp_avg: List[Tensor], exp_avg_sq
     copy_stochastic_list_(exp_avg_sq, exp_avg_sq32)
 
 
-def fused_laprop_(y: List[Tensor], exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad_projected: List[Tensor],
-                  beta1: float, beta2: float, step: int, lr: float, decay: float, caution: bool):
-    y, exp_avg, exp_avg_sq, grad_projected = map(list_guard, (y, exp_avg, exp_avg_sq, grad_projected))
-    beta1, beta2, step, lr = [scalar_guard (x, y[0]) for x in (beta1, beta2, step, lr)]
-    _fused_compilable_laprop_(y, exp_avg, exp_avg_sq, grad_projected, beta1, beta2, step, lr, decay, caution)
+def fused_laprop_(y: List[Tensor], exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: List[Tensor], beta1: float,
                  beta2: float, step: int, lr: float, decay: float, caution: bool):
+    exp_avg, exp_avg_sq, grad, y = list_guard(exp_avg, exp_avg_sq, grad, y)
+    beta1, beta2, step, lr = scalar_guard(beta1, beta2, step, lr, exp_avg[0])
+    _fused_compilable_laprop_(y, exp_avg, exp_avg_sq, grad, beta1, beta2, step, lr, decay, caution)
 
 
 @decorator_knowngood
@@ -797,8 +855,8 @@ def _fused_compilable_adopt_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, l
 
 
 def fused_adopt_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution):
-    y, grad, exp_avg_sq, exp_avg = list_guard(y), list_guard(grad), list_guard(exp_avg_sq), list_guard(exp_avg)
-    beta1, beta2, step, lr = [scalar_guard (x, y[0]) for x in (beta1, beta2, step, lr)]
+    exp_avg, exp_avg_sq, grad, y = list_guard(exp_avg, exp_avg_sq, grad, y)
+    beta1, beta2, step, lr = scalar_guard(beta1, beta2, step, lr, exp_avg[0])
     _fused_compilable_adopt_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution)
 
 
@@ -819,14 +877,14 @@ def _compilable_adopt_(grad, exp_avg_sq, exp_avg, beta1, beta2, step):
 
     copy_stochastic_list_(exp_avg, exp_avg32)
     copy_stochastic_list_(exp_avg_sq, exp_avg_sq32)
-
-    return update
+    copy_stochastic_list_(grad, update)
 
 
 def adopt(grad, exp_avg_sq, exp_avg, beta1, beta2, step):
-    grad, exp_avg_sq, exp_avg = list_guard(grad), list_guard(exp_avg_sq), list_guard(exp_avg)
-    beta1, beta2, step = scalar_guard(beta1, grad[0]), scalar_guard(beta2, grad[0]), scalar_guard(step, grad[0])
-    return _compilable_adopt_(grad, exp_avg_sq, exp_avg, beta1, beta2, step)
+    exp_avg, exp_avg_sq, grad = list_guard(exp_avg, exp_avg_sq, grad, y)
+    beta1, beta2, step = scalar_guard(beta1, beta2, step, lr, exp_avg[0])
+    _compilable_adopt_(grad, exp_avg_sq, exp_avg, beta1, beta2, step)
+    return grad
 
 
 def stochastic_round_list_(ref: List[Tensor], source: List[Tensor]):
@@ -877,7 +935,7 @@ def _compilable_update_(p: List[Tensor], u: List[Tensor], decay: Tensor, add_fn:
 
 def update_param_(param: List[Tensor], update: List[Tensor], lr: float, decay: float, add_fn: callable = None,
                   caution: bool = False, grad: List[Tensor] = None):
-    param, update, grad = list_guard(param), list_guard(update), list_guard(grad)
+    param, update, grad = list_guard(param, update, grad)
     lr = scalar_guard(lr, param[0])
     if not caution:
         grad = [None] * len(param)
@@ -983,11 +1041,15 @@ def psgd_balance_Q(Q_in):
 
 
 def psgd_calc_A_and_conjB(exprA, G, Q):
+    V = torch.randn(G.shape, dtype=G.dtype, device=G.device)
+    eps = scalar_guard(math.sqrt(torch.finfo(torch.float32).eps), G)
+    eps *= G.norm() / G.numel()
+    G += V * eps
     md = min_dtype(Q + [G])
    A = torch.einsum(exprA, *[q.to(md) for q in Q], G.to(md)).to(G.dtype)
     order = G.dim()
     p = list(range(order))
-    conjB = torch.randn(G.shape[1:] + G.shape[:1], dtype=promote(G.dtype), device=G.device)
+    conjB = torch.permute(V, p[1:] + p[:1]).to(promote(G.dtype))
     Q = [promote(q) for q in Q]
     for i, q in enumerate(Q):
         if q.dim() <= 1:
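The PSGD probe is no longer an independent random tensor: a Gaussian perturbation `V` is added to `G`, scaled by `sqrt(eps_float32) * ||G|| / numel(G)`, and the same `V` (axes rotated) seeds `conjB`. A standalone sketch of just that scaling, with an illustrative gradient tensor:

import math

import torch

G = torch.randn(32, 16)
V = torch.randn_like(G)
eps = math.sqrt(torch.finfo(torch.float32).eps) * G.norm() / G.numel()
G_noised = G + V * eps                      # tiny, gradient-scaled perturbation
conjB_seed = V.permute(1, 0).contiguous()   # same V with axes rotated, as for conjB
print(float(eps), G_noised.shape, conjB_seed.shape)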
@@ -1134,7 +1196,7 @@ def _compilable_trust_region_clip_(grad, lerp: float = 0.9, scale: float = 1.5):
 
 
 def trust_region_clip_(grad, lerp=0.9, scale=1.5):
     grad = list_guard(grad)
-    lerp, scale = scalar_guard(lerp, grad[0]), scalar_guard(scale, grad[0])
+    lerp, scale = scalar_guard(lerp, scale, grad[0])
     return _compilable_trust_region_clip_(grad, lerp, scale)
 
heavyball-1.0.0.dist-info/METADATA → heavyball-1.1.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: heavyball
-Version: 1.0.0
+Version: 1.1.1
 Summary: Efficient optimizers
 Home-page: https://github.com/clashluke/heavyball
 Author: Lucas Nestler
@@ -132,5 +132,5 @@ It has several handy functions:
 
 * `set_torch()` sets pytorch optimization settings (TF32, opt_einsum, benchmark, ...)
 * `compile_mode`, a string passed as-is to `torch.compile(mode=compile_mode)` in all compiled heavyball calls; `compile_mode=None` disables torch_compile
-* `zeroth_power_mode`, a string determining whether to use QR, newtonschulz{iterations}, or svd or eigh to approximate
-  the eigenvectors. Eigh has the highest precision and cost
+* `zeroth_power_mode`, a string determining whether to use QR, newtonschulz, or svd or eigh to approximate
+  the eigenvectors.
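A short configuration sketch for the module-level knobs described above; the chosen values are illustrative, and the attributes live in `heavyball.utils` per the diff.

import torch
import heavyball
import heavyball.utils

heavyball.utils.set_torch()                          # TF32, benchmark mode, opt_einsum, ...
heavyball.utils.compile_mode = None                  # disable torch.compile in heavyball kernels
heavyball.utils.zeroth_power_mode = 'newtonschulz'   # Newton-Schulz instead of the QR default

opt = heavyball.ForeachAdamW([torch.nn.Parameter(torch.zeros(4, 4))], lr=1e-3)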
heavyball-1.1.1.dist-info/RECORD ADDED
@@ -0,0 +1,8 @@
+heavyball/__init__.py,sha256=ZPadmK5Q9hGrBAeNYldMUw8vciy9dCDL3d7Zk9erC3E,12702
+heavyball/chainable.py,sha256=_KVV_bA_WYbyaiGDOaoQMHv-IM9jbgZx_cwzmRiKxl8,20321
+heavyball/utils.py,sha256=F4nwiyDCTGg3uU5XAaX9_p11qf5Uiw4WRseMbuLfq0Y,47223
+heavyball-1.1.1.dist-info/LICENSE,sha256=CGdGJim64YifGmUVPaeyRsxkvyExtClswhRNIp8FY_U,1322
+heavyball-1.1.1.dist-info/METADATA,sha256=uLpyF5G1Stjxxj23qmxpnk3ViUd0M55Q0wIDGDY58qk,12022
+heavyball-1.1.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+heavyball-1.1.1.dist-info/top_level.txt,sha256=SzCxSVg_qCUPA4kZObW3Zyo4v-d_mMOD-p7a-WXTl2E,10
+heavyball-1.1.1.dist-info/RECORD,,
heavyball-1.0.0.dist-info/RECORD REMOVED
@@ -1,8 +0,0 @@
-heavyball/__init__.py,sha256=1QPYBIH8amnk3-_rKe6L9FJ0rkV5wVNRr7Yw9BXjIYI,11636
-heavyball/chainable.py,sha256=cp-tpetPr4CNN9xJ85JSo89JYC5BWUygoE6dnET6tmc,18141
-heavyball/utils.py,sha256=qUoB9EIxl7GUyLkV5a5JAKOD6TvPc1FNsqyUbJ-HY6o,46343
-heavyball-1.0.0.dist-info/LICENSE,sha256=CGdGJim64YifGmUVPaeyRsxkvyExtClswhRNIp8FY_U,1322
-heavyball-1.0.0.dist-info/METADATA,sha256=9C2btIxngp26TRCJFU6B8ftkWQt1rfZZC10rkAhaORw,12074
-heavyball-1.0.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-heavyball-1.0.0.dist-info/top_level.txt,sha256=SzCxSVg_qCUPA4kZObW3Zyo4v-d_mMOD-p7a-WXTl2E,10
-heavyball-1.0.0.dist-info/RECORD,,