heavyball 1.3.1.tar.gz → 1.4.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {heavyball-1.3.1 → heavyball-1.4.0}/PKG-INFO +1 -1
- {heavyball-1.3.1 → heavyball-1.4.0}/heavyball/chainable.py +39 -23
- {heavyball-1.3.1 → heavyball-1.4.0}/heavyball/utils.py +19 -19
- {heavyball-1.3.1 → heavyball-1.4.0}/heavyball.egg-info/PKG-INFO +1 -1
- {heavyball-1.3.1 → heavyball-1.4.0}/setup.py +1 -1
- {heavyball-1.3.1 → heavyball-1.4.0}/LICENSE +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/README.md +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/heavyball/__init__.py +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/heavyball.egg-info/SOURCES.txt +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/heavyball.egg-info/dependency_links.txt +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/heavyball.egg-info/requires.txt +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/heavyball.egg-info/top_level.txt +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/setup.cfg +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/test/test_bf16_params.py +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/test/test_bf16_q.py +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/test/test_bf16_storage.py +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/test/test_caution.py +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/test/test_channels_last.py +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/test/test_closure.py +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/test/test_ema.py +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/test/test_foreach.py +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/test/test_hook.py +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/test/test_mars.py +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/test/test_memory.py +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/test/test_merge.py +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/test/test_no_grad.py +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/test/test_psgd.py +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/test/test_soap.py +0 -0
- {heavyball-1.3.1 → heavyball-1.4.0}/test/test_stochastic_updates.py +0 -0
{heavyball-1.3.1 → heavyball-1.4.0}/heavyball/chainable.py

@@ -1,6 +1,5 @@
 import functools
 import random
-import warnings
 from typing import Optional, Union, Literal
 
 import torch
@@ -85,10 +84,11 @@ class CopyGuard(FunctionTransform):
 
 
 class GeneralGuard(FunctionTransform): # We can't guard against reuse in the general case
-    def __init__(self, fn, names, init_fn):
+    def __init__(self, fn, names, init_fn, skip_first: bool = True):
         super().__init__(fn)
         self.names = names
         self.init_fn = init_fn
+        self.skip_first = skip_first
 
     def __call__(self, state, group, update, grad, param, *args, **kwargs):
         vars = []
@@ -97,7 +97,7 @@ class GeneralGuard(FunctionTransform): # We can't guard against reuse in the ge
             st = state(p)
             skip_update |= _inplace_guard_(st, self.names, lambda: self.init_fn(st, group, u, g, p, **kwargs))
             vars.append([st[name] if isinstance(name, str) else st.get(name[0], name[1]) for name in self.names])
-        if skip_update:
+        if skip_update and self.skip_first:
            raise SkipUpdate
         return self.fn(state, group, update, grad, param, *args, *zip(*vars), **kwargs)
 
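
The new `skip_first` flag changes what `GeneralGuard` does on the step that first initialises its state: with the old behaviour (`skip_first=True`) that step raises `SkipUpdate`; with `skip_first=False`, which the PSGD transforms later in this diff opt into, the freshly initialised state is used right away. A minimal, self-contained sketch of that decision; the names below are stand-ins, not heavyball's internals:

```python
class SkipUpdate(Exception):
    """Stand-in for heavyball's SkipUpdate control-flow exception."""


def guard(state_was_just_created: bool, skip_first: bool = True) -> str:
    # Mirrors `if skip_update and self.skip_first: raise SkipUpdate`:
    # a brand-new state only aborts the step when skip_first is True.
    if state_was_just_created and skip_first:
        raise SkipUpdate
    return "update applied"


print(guard(True, skip_first=False))   # update applied
try:
    guard(True, skip_first=True)
except SkipUpdate:
    print("first step skipped")
```
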
@@ -109,8 +109,17 @@ class NoState(FunctionTransform):
 
 class NoStateNoForeach(FunctionTransform):
     def __call__(self, state, group, update, grad, param, *args, **kwargs):
+        updates = []
+        skip_update = False
         for a in zip(update, grad, param, *args):
-
+            try:
+                updates.append(self.fn(group, *a, **kwargs))
+            except SkipUpdate:
+                skip_update = True
+                pass
+        if skip_update:
+            raise SkipUpdate
+        return updates
 
 
 def zero_guard(*names):
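
For context, a small sketch of the behaviour the rewritten `NoStateNoForeach.__call__` implements: every element is processed, a `SkipUpdate` raised for one element no longer aborts the loop, and the exception is only re-raised once all elements have been visited. `SkipUpdate` and `double_or_skip` here are stand-ins rather than heavyball's own definitions:

```python
class SkipUpdate(Exception):
    """Stand-in for heavyball's SkipUpdate control-flow exception."""


def call_elementwise(fn, elements):
    # Same control flow as the new NoStateNoForeach.__call__: collect results,
    # remember that some element asked to skip, and re-raise only at the end.
    updates = []
    skip_update = False
    for a in elements:
        try:
            updates.append(fn(a))
        except SkipUpdate:
            skip_update = True
    if skip_update:
        raise SkipUpdate
    return updates


def double_or_skip(x):
    if x is None:
        raise SkipUpdate
    return 2 * x


try:
    call_elementwise(double_or_skip, [1, None, 3])
except SkipUpdate:
    print("skipped, but elements 1 and 3 were still transformed")
```
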
@@ -118,11 +127,11 @@ def zero_guard(*names):
 
 
 def copy_guard(index, *names):
-    return functools.partial(CopyGuard, index=index, names=names)
+    return functools.partial(CopyGuard, index=index, names=names,)
 
 
-def general_guard(*names, init_fn):
-    return functools.partial(GeneralGuard, names=names, init_fn=init_fn)
+def general_guard(*names, init_fn, skip_first: bool = True):
+    return functools.partial(GeneralGuard, names=names, init_fn=init_fn, skip_first=skip_first)
 
 
 def no_state(fn):
@@ -311,18 +320,18 @@ def _update_psgd_precond(cached, Q_cache, group, param, grad, Q_mat, Q, exprs, p
     if prob is None:
         prob = utils.precond_update_prob_schedule()
     if not precond_schedule(group, prob, name=f"cumulative_prob_{id(Q)}"):
-        return
+        return Q_mat
 
-    Q = [utils.promote(q_) for q_ in Q]
     utils.psgd_update_precond(Q_mat, exprs, grad, group['precond_lr'], Q, group['store_triu_as_line'])
 
-    if grad.dim() > 1 and precond_schedule(group, balance_probability, "
+    if grad.dim() > 1 and precond_schedule(group, balance_probability, f"balance_prob_{id(Q)}"):
         if group['store_triu_as_line']:
             utils.psgd_balance_Q([q_ for _, q_ in Q])
         else:
             utils.psgd_balance_Q(Q)
 
-    _update_psgd_cache(cached, Q_cache, Q_mat)
+    return _update_psgd_cache(cached, Q_cache, Q_mat)
+
 
 def _update_psgd_cache(cached, Q_cache, q):
     if not cached:
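
The other functional change in this hunk is the return value: `_update_psgd_precond` now always hands back a usable `Q_mat` (the input when the probabilistic schedule decides to skip, the refreshed cache otherwise) instead of returning `None` on the skip path. A hedged sketch of that calling convention, with illustrative names only:

```python
import random


def maybe_refresh(q_mat, refresh, prob=0.1, rng=random.random):
    # The caller always gets matrices back, whether or not the refresh ran,
    # so `q_mat = maybe_refresh(q_mat, ...)` is safe on every step.
    if rng() > prob:          # schedule says "not this step"
        return q_mat
    return refresh(q_mat)     # schedule fired: return the refreshed matrices


q_mat = [[1.0]]
q_mat = maybe_refresh(q_mat, refresh=lambda q: [[0.5 * v for v in row] for row in q])
print(q_mat)
```
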
@@ -351,44 +360,47 @@ def _fused_cached_psgd_precond_grad(group, grad, param, cached, cache_expr, expr
                                     group['caution'], *Q_mat)
 
 
-@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd)
+@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd, skip_first=False)
 @no_state_no_foreach
 def scale_by_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str, cached: bool = False,
                   prob: Optional[callable] = None):
-    old = update
     update = update.to(memory_format=torch.contiguous_format)
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
-    _update_psgd_precond(cached, Q_cache, group, param,
+    Q_mat = _update_psgd_precond(cached, Q_cache, group, param,
+                                 update if group['momentum_into_precond_update'] else grad, Q_mat, Q, exprs, prob)
     return _cached_psgd_precond_grad(cached, cache_expr, exprs, update, Q_mat, Q_cache)
 
 
-@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd)
+@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd, skip_first=False)
 @no_state_no_foreach
 def scale_by_delayed_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str, cached: bool = False,
                           prob: Optional[callable] = None):
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
     precond = _cached_psgd_precond_grad(cached, cache_expr, exprs, update, Q_mat, Q_cache)
-    _update_psgd_precond(cached, Q_cache, group, param,
+    _update_psgd_precond(cached, Q_cache, group, param, update if group['momentum_into_precond_update'] else grad,
+                         Q_mat, Q, exprs, prob)
     return precond
 
 
-@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd)
+@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd, skip_first=False)
 @no_state_no_foreach
 def update_by_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str, cached: bool = False,
                    prob: Optional[callable] = None):
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
-    _update_psgd_precond(cached, Q_cache, group, param,
+    Q_mat = _update_psgd_precond(cached, Q_cache, group, param,
+                                 update if group['momentum_into_precond_update'] else grad, Q_mat, Q, exprs, prob)
     _fused_cached_psgd_precond_grad(group, update, param, cached, cache_expr, exprs, update, Q_mat, Q_cache)
     raise SkipUpdate
 
 
-@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd)
+@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd, skip_first=False)
 @no_state_no_foreach
 def update_by_delayed_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str, cached: bool = False,
                            prob: Optional[callable] = None):
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
     _fused_cached_psgd_precond_grad(group, update, param, cached, cache_expr, exprs, update, Q_mat, Q_cache)
-    _update_psgd_precond(cached, Q_cache, group, param,
+    _update_psgd_precond(cached, Q_cache, group, param, update if group['momentum_into_precond_update'] else grad,
+                         Q_mat, Q, exprs, prob)
     raise SkipUpdate
 
 
@@ -422,7 +434,6 @@ def chain(state: Union[callable, dict], group, grad, param, *fns):
 
 
 class ChainOpt(utils.StatefulOptimizer):
-    compile_step: bool = False
     promote: bool = False
 
     def __init__(self, params, defaults, foreach: bool, *fns):
@@ -432,6 +443,10 @@ class ChainOpt(utils.StatefulOptimizer):
     def _step(self, group):
         if 'base_lr' not in group:
             group['base_lr'] = group['lr']
+        if 'prev_lr' in group and group['prev_lr'] != group['lr']:
+            utils.warn_once(f'Learning rate changed between steps. This is an experimental feature and '
+                            f'only supported with foreach=True (currently foreach={group["foreach"]}).')
+            group['base_lr'] = group['lr']
 
         vals = list(self.split_p_and_g_in_group(group, should_promote=self.promote, beta1=utils.get_beta1(group)))
         if not vals:
@@ -451,9 +466,10 @@ class ChainOpt(utils.StatefulOptimizer):
         group['step'] = state['step'] = step = step + 1
 
         if group['warmup_steps'] and step < group['warmup_steps']:
-            group['lr'] = group['base_lr'] * step / group['warmup_steps']
+            group['prev_lr'] = group['lr'] = group['base_lr'] * step / group['warmup_steps']
+
         else:
-            group['lr'] = group['base_lr']
+            group['prev_lr'] = group['lr'] = group['base_lr']
 
         if not group['foreach'] or len(p) == 1:
             for param, grad in zip(p, g):
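
For reference, the arithmetic behind these two assignments: during warmup the effective learning rate ramps linearly from `base_lr / warmup_steps` up to `base_lr`, and `prev_lr` simply records whatever was last written to `group['lr']` so a user-driven change can be detected (and warned about) on the next step. A minimal sketch of the schedule:

```python
def effective_lr(base_lr: float, step: int, warmup_steps: int) -> float:
    # Linear warmup as in ChainOpt._step, then a constant base_lr afterwards.
    if warmup_steps and step < warmup_steps:
        return base_lr * step / warmup_steps
    return base_lr


# base_lr=1e-3, warmup_steps=100:
for step in (1, 50, 100, 200):
    print(step, effective_lr(1e-3, step, 100))
# 1 -> 1e-05, 50 -> 5e-04, 100 and later -> 1e-03
```
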
{heavyball-1.3.1 → heavyball-1.4.0}/heavyball/utils.py

@@ -193,12 +193,12 @@ def scale_by_exp_avg_sq_(exp_avg_sq, grad, beta2, eps):
     return grad
 
 
-# TODO: This lerp was fucked - check other lerps
 @decorator_knowngood
 def _compilable_exp_avg_(state, grad, beta):
-
-
-
+    for s, g in zip(state, grad):
+        lerped = s.lerp(g, 1 - beta)
+        copy_stochastic_(s, lerped)
+        copy_stochastic_(g, lerped)
 
 
 def scale_by_exp_avg_(state, grad, beta):
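
A plain-tensor sketch of what the rewritten body computes: the exponential moving average s <- beta * s + (1 - beta) * g via `lerp`, written back to both the state and the gradient buffers. heavyball routes those writes through `copy_stochastic_` (presumably a stochastic-rounding copy for low-precision state); the sketch uses an ordinary `copy_` instead:

```python
import torch


def exp_avg_(state, grad, beta):
    # s.lerp(g, 1 - beta) == beta * s + (1 - beta) * g; the result lands in
    # both buffers, so the "grad" slot now carries the smoothed value too.
    for s, g in zip(state, grad):
        lerped = s.lerp(g, 1 - beta)
        s.copy_(lerped)
        g.copy_(lerped)


s, g = [torch.zeros(3)], [torch.ones(3)]
exp_avg_(s, g, beta=0.9)
print(s[0], g[0])  # both tensor([0.1000, 0.1000, 0.1000])
```
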
@@ -592,6 +592,7 @@ def project(grad, Q, back: bool):
 
 class StatefulOptimizer(torch.optim.Optimizer):
     ema_decay: float = 0.001
+    compile_step: bool = False
 
     def __init__(self, params, defaults, foreach: bool = True, use_ema: bool = False):
         super().__init__(params, {**defaults, 'foreach': foreach})
@@ -637,6 +638,10 @@ class StatefulOptimizer(torch.optim.Optimizer):
 
             p.grad = None
 
+            if self.compile_step:
+                yield p, grad
+                continue
+
             p_views = merge_group(group, p)
             if grad is not None:
                 grad = merge_group(group, grad)
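
A short sketch of the control flow this adds: when `compile_step` is set, parameters and gradients are yielded untouched and the view-merging path below is bypassed. Apart from `compile_step`, the names here are illustrative rather than heavyball's API:

```python
def split_params_and_grads(pairs, compile_step=False, merge=lambda t: t):
    # With compile_step=True the generator short-circuits before any
    # merge/reshape work, mirroring the `yield p, grad; continue` branch.
    for p, g in pairs:
        if compile_step:
            yield p, g
            continue
        yield merge(p), (merge(g) if g is not None else None)


print(list(split_params_and_grads([("p", "g")], compile_step=True)))
```
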
@@ -1030,7 +1035,7 @@ def psgd_calc_A_and_conjB(exprA, G, Q):
     V = torch.randn(G.shape, dtype=G.dtype, device=G.device)
     eps = scalar_guard(math.sqrt(torch.finfo(torch.float32).eps), G)
     eps *= G.norm() / G.numel()
-    G
+    G = G + V * eps
     md = min_dtype(Q + [G])
     A = torch.einsum(exprA, *[q.to(md) for q in Q], G.to(md)).to(G.dtype)
     order = G.dim()
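
For context, the surrounding lines add a small Gaussian perturbation to `G` before the einsum; the change here makes that addition out-of-place (`G = G + V * eps`). A self-contained sketch of the same scaling, with `scalar_guard` dropped:

```python
import math

import torch


def jitter(G: torch.Tensor) -> torch.Tensor:
    # Noise magnitude: sqrt(float32 eps) * ||G|| / numel(G), as in the context
    # lines above; building a new tensor leaves the caller's G untouched.
    V = torch.randn(G.shape, dtype=G.dtype, device=G.device)
    eps = math.sqrt(torch.finfo(torch.float32).eps) * G.norm() / G.numel()
    return G + V * eps


print(jitter(torch.ones(4, 4)).shape)
```
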
@@ -1078,26 +1083,20 @@ def psgd_update_precond(Q, exprs, G, precond_lr, oq, store_triu_as_line):
         term1 = promote(torch.einsum(exprG, A, A))
         term2 = promote(torch.einsum(exprG, conjB, conjB))
 
-        term2
-        term1 *= 2  # 2a
-        if term1.dtype == term2.dtype:
-            term1 -= term2  # 2a - (a + b) == a - b
-        else:
-            term1 = term1 - term2
+        term1, term2 = term1 - term2, term1 + term2
 
         term1 *= precond_lr
         norm = term2.norm(float('inf'))
         if q.dim() < 2:
-            term1 *= q.to(term1.dtype)
-            term1 /= norm.clamp_(min=tiny_bf16)
+            term1 *= q.to(term1.dtype) / norm.clamp_(min=tiny_bf16)
         else:
             torch.triu(term1, out=term1)
-            term1 /= psgd_lb(term2, norm).clamp_(tiny_bf16)
-            torch.
+            term1 /= torch.where(norm > 0, psgd_lb(term2, norm), norm).clamp_(tiny_bf16)
+            term1 = torch.mm(term1, q)
         if store_triu_as_line:
             term1 = triu_to_line([term1])[0][1]
             o = o[1]
-        stochastic_add_(
+        stochastic_add_(o, term1, -1)
 
 
 @decorator_knowngood
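
The branchy block on the left reduced, per its own comment, to `2a - (a + b) == a - b`; the replacement produces the difference and the sum in one simultaneous assignment (the difference is then scaled by `precond_lr` for the Q update, the sum feeds the infinity-norm bound). A tiny numeric illustration with made-up values:

```python
import torch

a = torch.tensor([3.0, 5.0])  # stand-in for einsum(exprG, A, A)
b = torch.tensor([1.0, 2.0])  # stand-in for einsum(exprG, conjB, conjB)

# New form: both quantities from one simultaneous assignment.
term1, term2 = a - b, a + b
print(term1, term2)  # tensor([2., 3.]) tensor([4., 7.])

# Identity the removed comment relied on: 2a - (a + b) == a - b.
assert torch.equal(2 * a - (a + b), a - b)
```
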
@@ -1162,7 +1161,7 @@ def mu_law_compress(x, mu=127.0):
     """
     x = list_guard(x)
     mu = scalar_guard(mu, x[0])
-
+    _compilable_mu_law_compress_(x, mu)
     return x
 
 
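
The fix here restores the call that actually performs the compression. As a reference only (heavyball's in-place `_compilable_mu_law_compress_` may differ in detail), the textbook mu-law companding formula looks like this:

```python
import math

import torch


def mu_law_reference(x: torch.Tensor, mu: float = 127.0) -> torch.Tensor:
    # Standard mu-law companding: sign(x) * log(1 + mu*|x|) / log(1 + mu).
    return torch.sign(x) * torch.log1p(mu * x.abs()) / math.log1p(mu)


print(mu_law_reference(torch.tensor([-1.0, 0.0, 0.25, 1.0])))
```
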
@@ -1191,7 +1190,7 @@ def a_law_compress(x, A=87.6):
     """
     x = list_guard(x)
     A = scalar_guard(A, x[0])
-
+    _compilable_a_law_compress_(x, A)
     return x
 
 
@@ -1295,6 +1294,7 @@ def _compilable_fused_precond_grad_cached_(expr: str, ea: Tensor, param, lr, gra
 
 
 def fused_precond_grad_cached_(expr: str, ea: Tensor, param, lr, grad, decay, caution, *cached_q: Tensor):
+
     lr = scalar_guard(lr, param[0])
     _compilable_fused_precond_grad_cached_(expr, ea, param, lr, grad, decay, caution, *cached_q)
 
@@ -1310,7 +1310,7 @@ def psgd_precond_grad(expr: str, ea: Tensor, *preconds: Tensor):
 
 @decorator_knowngood
 def _compilable_fused_psgd_precond_grad(expr: str, ea: Tensor, param, lr, grad, decay, caution, *preconds: Tensor):
-    precond = psgd_precond_grad(expr,
+    precond = psgd_precond_grad(expr, ea, *preconds)
     update_param_(param, precond, lr, decay, caution=caution, grad=grad)
 
 