heavyball 1.3.0__tar.gz → 1.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {heavyball-1.3.0 → heavyball-1.4.0}/PKG-INFO +1 -1
- {heavyball-1.3.0 → heavyball-1.4.0}/heavyball/chainable.py +91 -37
- {heavyball-1.3.0 → heavyball-1.4.0}/heavyball/utils.py +85 -52
- {heavyball-1.3.0 → heavyball-1.4.0}/heavyball.egg-info/PKG-INFO +1 -1
- {heavyball-1.3.0 → heavyball-1.4.0}/setup.py +1 -1
- {heavyball-1.3.0 → heavyball-1.4.0}/LICENSE +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/README.md +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/heavyball/__init__.py +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/heavyball.egg-info/SOURCES.txt +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/heavyball.egg-info/dependency_links.txt +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/heavyball.egg-info/requires.txt +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/heavyball.egg-info/top_level.txt +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/setup.cfg +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/test/test_bf16_params.py +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/test/test_bf16_q.py +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/test/test_bf16_storage.py +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/test/test_caution.py +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/test/test_channels_last.py +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/test/test_closure.py +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/test/test_ema.py +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/test/test_foreach.py +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/test/test_hook.py +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/test/test_mars.py +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/test/test_memory.py +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/test/test_merge.py +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/test/test_no_grad.py +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/test/test_psgd.py +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/test/test_soap.py +0 -0
- {heavyball-1.3.0 → heavyball-1.4.0}/test/test_stochastic_updates.py +0 -0
@@ -1,6 +1,5 @@
 import functools
 import random
-import warnings
 from typing import Optional, Union, Literal
 
 import torch
@@ -85,10 +84,11 @@ class CopyGuard(FunctionTransform):
 
 
 class GeneralGuard(FunctionTransform):  # We can't guard against reuse in the general case
-    def __init__(self, fn, names, init_fn):
+    def __init__(self, fn, names, init_fn, skip_first: bool = True):
         super().__init__(fn)
         self.names = names
         self.init_fn = init_fn
+        self.skip_first = skip_first
 
     def __call__(self, state, group, update, grad, param, *args, **kwargs):
         vars = []
@@ -97,7 +97,7 @@ class GeneralGuard(FunctionTransform):  # We can't guard against reuse in the general case
             st = state(p)
             skip_update |= _inplace_guard_(st, self.names, lambda: self.init_fn(st, group, u, g, p, **kwargs))
             vars.append([st[name] if isinstance(name, str) else st.get(name[0], name[1]) for name in self.names])
-        if skip_update:
+        if skip_update and self.skip_first:
            raise SkipUpdate
        return self.fn(state, group, update, grad, param, *args, *zip(*vars), **kwargs)
 
@@ -109,8 +109,17 @@ class NoState(FunctionTransform):
 
 class NoStateNoForeach(FunctionTransform):
     def __call__(self, state, group, update, grad, param, *args, **kwargs):
+        updates = []
+        skip_update = False
         for a in zip(update, grad, param, *args):
-
+            try:
+                updates.append(self.fn(group, *a, **kwargs))
+            except SkipUpdate:
+                skip_update = True
+                pass
+        if skip_update:
+            raise SkipUpdate
+        return updates
 
 
 def zero_guard(*names):
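
For orientation, the reworked NoStateNoForeach loop above applies the wrapped function per tensor tuple, collects the results, and only re-raises SkipUpdate once every tensor has been visited. A minimal standalone sketch of that control flow (hypothetical step_one/apply_all names, not heavyball code):

class SkipUpdate(Exception):
    """Signals that a transform produced no update for this input."""


def step_one(x: float) -> float:
    # Hypothetical per-tensor transform: skip non-finite inputs.
    if x != x:  # NaN check without importing math
        raise SkipUpdate
    return 0.5 * x


def apply_all(xs):
    # Collect whatever succeeded; defer the skip until the whole loop ran,
    # mirroring the updates/skip_update bookkeeping in the hunk above.
    outs, skipped = [], False
    for x in xs:
        try:
            outs.append(step_one(x))
        except SkipUpdate:
            skipped = True
    if skipped:
        raise SkipUpdate
    return outs


print(apply_all([1.0, 2.0]))  # [0.5, 1.0]
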
@@ -118,11 +127,11 @@ def zero_guard(*names):
 
 
 def copy_guard(index, *names):
-    return functools.partial(CopyGuard, index=index, names=names)
+    return functools.partial(CopyGuard, index=index, names=names,)
 
 
-def general_guard(*names, init_fn):
-    return functools.partial(GeneralGuard, names=names, init_fn=init_fn)
+def general_guard(*names, init_fn, skip_first: bool = True):
+    return functools.partial(GeneralGuard, names=names, init_fn=init_fn, skip_first=skip_first)
 
 
 def no_state(fn):
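
As a reminder of the pattern being extended here, these guard factories only pre-bind keyword arguments with functools.partial; the guard class is instantiated later around the wrapped function. A small illustrative sketch (simplified Guard class, not the heavyball implementation):

import functools


class Guard:
    def __init__(self, fn, names, init_fn, skip_first=True):
        self.fn, self.names, self.init_fn, self.skip_first = fn, names, init_fn, skip_first


def general_guard(*names, init_fn, skip_first=True):
    # Everything except `fn` is bound now; the guard is built later around fn.
    return functools.partial(Guard, names=names, init_fn=init_fn, skip_first=skip_first)


make_guard = general_guard("Q", "exprs", init_fn=dict, skip_first=False)
guard = make_guard(fn=print)
print(guard.names, guard.skip_first)  # ('Q', 'exprs') False
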
@@ -307,21 +316,22 @@ def scale_by_soap(group, update, grad, param, exp_avg, exp_avg_sq, Q, GG, inner:
     return precond
 
 
-def _update_psgd_precond(group, param, grad, Q_mat, Q, exprs, prob: Optional[callable] = None):
+def _update_psgd_precond(cached, Q_cache, group, param, grad, Q_mat, Q, exprs, prob: Optional[callable] = None):
     if prob is None:
         prob = utils.precond_update_prob_schedule()
     if not precond_schedule(group, prob, name=f"cumulative_prob_{id(Q)}"):
-        return
+        return Q_mat
 
-    Q = [utils.promote(q_) for q_ in Q]
     utils.psgd_update_precond(Q_mat, exprs, grad, group['precond_lr'], Q, group['store_triu_as_line'])
 
-    if grad.dim() > 1 and precond_schedule(group, balance_probability, "
+    if grad.dim() > 1 and precond_schedule(group, balance_probability, f"balance_prob_{id(Q)}"):
         if group['store_triu_as_line']:
             utils.psgd_balance_Q([q_ for _, q_ in Q])
         else:
             utils.psgd_balance_Q(Q)
 
+    return _update_psgd_cache(cached, Q_cache, Q_mat)
+
 
 def _update_psgd_cache(cached, Q_cache, q):
     if not cached:
@@ -350,44 +360,47 @@ def _fused_cached_psgd_precond_grad(group, grad, param, cached, cache_expr, expr
                                     group['caution'], *Q_mat)
 
 
-@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd)
+@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd, skip_first=False)
 @no_state_no_foreach
 def scale_by_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str, cached: bool = False,
                   prob: Optional[callable] = None):
-    old = update
     update = update.to(memory_format=torch.contiguous_format)
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
-    _update_psgd_precond(
+    Q_mat = _update_psgd_precond(cached, Q_cache, group, param,
+                                 update if group['momentum_into_precond_update'] else grad, Q_mat, Q, exprs, prob)
     return _cached_psgd_precond_grad(cached, cache_expr, exprs, update, Q_mat, Q_cache)
 
 
-@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd)
+@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd, skip_first=False)
 @no_state_no_foreach
 def scale_by_delayed_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str, cached: bool = False,
                           prob: Optional[callable] = None):
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
     precond = _cached_psgd_precond_grad(cached, cache_expr, exprs, update, Q_mat, Q_cache)
-    _update_psgd_precond(group, param, update
+    _update_psgd_precond(cached, Q_cache, group, param, update if group['momentum_into_precond_update'] else grad,
+                         Q_mat, Q, exprs, prob)
     return precond
 
 
-@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd)
+@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd, skip_first=False)
 @no_state_no_foreach
 def update_by_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str, cached: bool = False,
                    prob: Optional[callable] = None):
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
-    _update_psgd_precond(
+    Q_mat = _update_psgd_precond(cached, Q_cache, group, param,
+                                 update if group['momentum_into_precond_update'] else grad, Q_mat, Q, exprs, prob)
     _fused_cached_psgd_precond_grad(group, update, param, cached, cache_expr, exprs, update, Q_mat, Q_cache)
     raise SkipUpdate
 
 
-@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd)
+@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd, skip_first=False)
 @no_state_no_foreach
 def update_by_delayed_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str, cached: bool = False,
                            prob: Optional[callable] = None):
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
     _fused_cached_psgd_precond_grad(group, update, param, cached, cache_expr, exprs, update, Q_mat, Q_cache)
-    _update_psgd_precond(group, param, update
+    _update_psgd_precond(cached, Q_cache, group, param, update if group['momentum_into_precond_update'] else grad,
+                         Q_mat, Q, exprs, prob)
     raise SkipUpdate
 
 
@@ -421,7 +434,7 @@ def chain(state: Union[callable, dict], group, grad, param, *fns):
 
 
 class ChainOpt(utils.StatefulOptimizer):
-
+    promote: bool = False
 
     def __init__(self, params, defaults, foreach: bool, *fns):
         super().__init__(params, defaults, foreach)
@@ -430,8 +443,12 @@ class ChainOpt(utils.StatefulOptimizer):
     def _step(self, group):
         if 'base_lr' not in group:
             group['base_lr'] = group['lr']
+        if 'prev_lr' in group and group['prev_lr'] != group['lr']:
+            utils.warn_once(f'Learning rate changed between steps. This is an experimental feature and '
+                            f'only supported with foreach=True (currently foreach={group["foreach"]}).')
+            group['base_lr'] = group['lr']
 
-        vals = list(self.split_p_and_g_in_group(group, should_promote=
+        vals = list(self.split_p_and_g_in_group(group, should_promote=self.promote, beta1=utils.get_beta1(group)))
         if not vals:
             return
         p, g = zip(*vals)
@@ -449,9 +466,10 @@ class ChainOpt(utils.StatefulOptimizer):
         group['step'] = state['step'] = step = step + 1
 
         if group['warmup_steps'] and step < group['warmup_steps']:
-            group['lr'] = group['base_lr'] * step / group['warmup_steps']
+            group['prev_lr'] = group['lr'] = group['base_lr'] * step / group['warmup_steps']
+
         else:
-            group['lr'] = group['base_lr']
+            group['prev_lr'] = group['lr'] = group['base_lr']
 
         if not group['foreach'] or len(p) == 1:
             for param, grad in zip(p, g):
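
The warmup branch above is plain linear scaling of the stored base learning rate; a small worked example with made-up numbers:

def warmup_lr(base_lr: float, step: int, warmup_steps: int) -> float:
    # Linear warmup as in the hunk above: base_lr * step / warmup_steps,
    # falling back to base_lr once warmup is done (or disabled).
    if warmup_steps and step < warmup_steps:
        return base_lr * step / warmup_steps
    return base_lr


print(warmup_lr(1e-3, 25, 100))   # 0.00025
print(warmup_lr(1e-3, 100, 100))  # 0.001
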
@@ -486,36 +504,72 @@ _scale_to_update_map = {scale_by_delayed_psgd.get_fn(): update_by_delayed_psgd,
                         scale_by_adam.get_fn(): update_by_adam,  #
                         scale_by_laprop.get_fn(): update_by_laprop,  #
                         scale_by_adopt.get_fn(): update_by_adopt}
+_scale_to_update_map_inv = {update_by_delayed_psgd.get_fn(): scale_by_delayed_psgd,  #
+                            update_by_psgd.get_fn(): scale_by_psgd,  #
+                            update_by_adam.get_fn(): scale_by_adam,  #
+                            update_by_laprop.get_fn(): scale_by_laprop,  #
+                            update_by_adopt.get_fn(): scale_by_adopt}
 
 
 class BaseOpt(ChainOpt):
+    """
+    Base Optimizer
+
+    compile_step: bool = False
+        Whether to change some internals to try to make the optimizer compilable
+        This does not compile the step by itself and breaks some optimizers loudly (e.g. SOAP)
+
+    promote: bool = False
+        Whether to promote the gradients to fp32 before applying the optimizer
+        Improves update quality for low-precision parameters, but increases costs
+        Compiling the optimizer step would reduce memory and compute. Alternatively, `foreach=False` decreases memory at the cost of runtime
+
+    gradient_clipping: str_or_fn = None
+        The function to use for clipping the incoming gradients, before any other transformations.
+        This is syntactic sugar, equivalent to manually passing the function as the first element of the optimizer chain.
+
+    update_clipping: str_or_fn = None
+        The function to use for clipping the outgoing updates before applying them, after all other transformations.
+        This will turn off
+        This is syntactic sugar, equivalent to manually passing the function as the last element of the optimizer chain.
+
+    """
+
     gradient_clipping: str_or_fn = None
     update_clipping: str_or_fn = None
     palm: bool = False
     auto_fuse: bool = True
 
     def __init__(self, params, defaults, foreach: bool, gradient_clipping: str_or_fn, update_clipping: str_or_fn,
-                 palm: bool = use_default, *fns, compile_step: bool = use_default):
+                 palm: bool = use_default, *fns, compile_step: bool = use_default, promote: bool = use_default):
+        if not fns:
+            raise ValueError("No functions provided. If that's on purpose (SGD-like), use `identity`")
+
+        args, kwargs = None, None
+        fn = fns[-1]
+        if isinstance(fn, functools.partial):
+            fn, args, kwargs = fn.func, fn.args, fn.keywords
+        if isinstance(fn, FunctionTransform):
+            fn = fn.get_fn()
+
         if default(update_clipping, self.update_clipping) is None:
-            if
-                args, kwargs = None, None
-                fn = fns[-1]
-                if isinstance(fn, functools.partial):
-                    fn, args, kwargs = fn.func, fn.args, fn.keywords
-                if isinstance(fn, FunctionTransform):
-                    fn = fn.get_fn()
+            if self.auto_fuse:
                 if fn in _scale_to_update_map:
                     fn = _scale_to_update_map[fn]
                     if args is not None:
                         fn = functools.partial(fn, *args, **kwargs)
                     fns = tuple(fns)[:-1] + (fn,)
-
-        if
-            raise ValueError("
-
-
+        elif fn in _scale_to_update_map_inv:
+            if not self.auto_fuse:
+                raise ValueError("update_clipping is currently not compatible with update_by_* functions. "
+                                 "Manually select scale_by_* functions or set auto_fuse=True.")
+            fn = _scale_to_update_map_inv[fn]
+            if args is not None:
+                fn = functools.partial(fn, *args, **kwargs)
+            fns = tuple(fns)[:-1] + (fn,)
 
         self.compile_step = default(compile_step, self.compile_step)
+        self.promote = default(promote, self.promote)
         if default(palm, self.palm):
             fns = (palm_beta2,) + fns
         if default(gradient_clipping, self.gradient_clipping) is not None:
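
The constructor change above unwraps the last chain element once (through functools.partial and FunctionTransform), then either fuses a scale_by_* into its update_by_* twin when no update clipping is set, or maps an update_by_* back to scale_by_* so the clipping function can see the raw update. A self-contained sketch of that decision with stand-in functions (not the real transforms):

def scale_by_adam(): ...
def update_by_adam(): ...

SCALE_TO_UPDATE = {scale_by_adam: update_by_adam}
UPDATE_TO_SCALE = {update_by_adam: scale_by_adam}


def resolve_last_fn(fn, update_clipping=None, auto_fuse=True):
    # No update clipping: prefer the fused update_by_* variant when allowed.
    if update_clipping is None:
        if auto_fuse and fn in SCALE_TO_UPDATE:
            return SCALE_TO_UPDATE[fn]
        return fn
    # Update clipping needs the unfused scale_by_* output to clip.
    if fn in UPDATE_TO_SCALE:
        if not auto_fuse:
            raise ValueError("update_clipping is not compatible with update_by_* functions")
        return UPDATE_TO_SCALE[fn]
    return fn


print(resolve_last_fn(scale_by_adam).__name__)                         # update_by_adam
print(resolve_last_fn(update_by_adam, update_clipping="l2").__name__)  # scale_by_adam
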
@@ -193,12 +193,12 @@ def scale_by_exp_avg_sq_(exp_avg_sq, grad, beta2, eps):
     return grad
 
 
-# TODO: This lerp was fucked - check other lerps
 @decorator_knowngood
 def _compilable_exp_avg_(state, grad, beta):
-
-
-
+    for s, g in zip(state, grad):
+        lerped = s.lerp(g, 1 - beta)
+        copy_stochastic_(s, lerped)
+        copy_stochastic_(g, lerped)
 
 
 def scale_by_exp_avg_(state, grad, beta):
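
For reference, s.lerp(g, 1 - beta) in the new _compilable_exp_avg_ body is the standard exponential moving average beta * s + (1 - beta) * g; a scalar check (plain floats, not the tensor code):

def ema(s: float, g: float, beta: float) -> float:
    # lerp(s, g, 1 - beta) == s + (1 - beta) * (g - s) == beta * s + (1 - beta) * g
    return s + (1 - beta) * (g - s)


print(round(ema(0.0, 1.0, 0.9), 6))  # 0.1 — the new value pulls the average by (1 - beta)
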
@@ -592,6 +592,7 @@ def project(grad, Q, back: bool):
 
 class StatefulOptimizer(torch.optim.Optimizer):
     ema_decay: float = 0.001
+    compile_step: bool = False
 
     def __init__(self, params, defaults, foreach: bool = True, use_ema: bool = False):
         super().__init__(params, {**defaults, 'foreach': foreach})
@@ -637,6 +638,10 @@ class StatefulOptimizer(torch.optim.Optimizer):
 
             p.grad = None
 
+            if self.compile_step:
+                yield p, grad
+                continue
+
             p_views = merge_group(group, p)
             if grad is not None:
                 grad = merge_group(group, grad)
@@ -1030,7 +1035,7 @@ def psgd_calc_A_and_conjB(exprA, G, Q):
     V = torch.randn(G.shape, dtype=G.dtype, device=G.device)
     eps = scalar_guard(math.sqrt(torch.finfo(torch.float32).eps), G)
     eps *= G.norm() / G.numel()
-    G
+    G = G + V * eps
     md = min_dtype(Q + [G])
     A = torch.einsum(exprA, *[q.to(md) for q in Q], G.to(md)).to(G.dtype)
     order = G.dim()
@@ -1078,88 +1083,115 @@ def psgd_update_precond(Q, exprs, G, precond_lr, oq, store_triu_as_line):
     term1 = promote(torch.einsum(exprG, A, A))
     term2 = promote(torch.einsum(exprG, conjB, conjB))
 
-    term2
-    term1 *= 2  # 2a
-    if term1.dtype == term2.dtype:
-        term1 -= term2  # 2a - (a + b) == a - b
-    else:
-        term1 = term1 - term2
+    term1, term2 = term1 - term2, term1 + term2
 
     term1 *= precond_lr
     norm = term2.norm(float('inf'))
     if q.dim() < 2:
-        term1 *= q.to(term1.dtype)
-        term1 /= norm.clamp_(min=tiny_bf16)
+        term1 *= q.to(term1.dtype) / norm.clamp_(min=tiny_bf16)
     else:
         torch.triu(term1, out=term1)
-        term1 /= psgd_lb(term2, norm).clamp_(tiny_bf16)
-        torch.
+        term1 /= torch.where(norm > 0, psgd_lb(term2, norm), norm).clamp_(tiny_bf16)
+        term1 = torch.mm(term1, q)
     if store_triu_as_line:
         term1 = triu_to_line([term1])[0][1]
         o = o[1]
-    stochastic_add_(
+    stochastic_add_(o, term1, -1)
 
 
 @decorator_knowngood
-def _compilable_l2_clip_(x):
+def _compilable_l2_clip_(x, clip_at):
     ref = x
     x = list(map(promote, x))
     norm = torch._foreach_norm(x)
-    torch._foreach_maximum_(norm,
+    torch._foreach_maximum_(norm, clip_at)
     out = torch._foreach_div(x, norm)
     return stochastic_round_list_(ref, out)
 
 
-def
+def l2_normalization_(x, clip_at: float = 1e-8):
     x = list_guard(x)
-    return _compilable_l2_clip_(x)
+    return _compilable_l2_clip_(x, clip_at)
+
+
+def l2_clip_(x, clip_at: float = 1.):
+    x = list_guard(x)
+    return _compilable_l2_clip_(x, clip_at)
 
 
 @decorator_knowngood
-def _compilable_rmsnorm_clip_(x):
+def _compilable_rmsnorm_clip_(x, clip_at):
     x = list(map(promote, x))
     norm = torch._foreach_norm(x)
     norm = [n.div_(x_.numel() ** 0.5) for n, x_ in zip(norm, x)]
-    torch._foreach_maximum_(norm,
+    torch._foreach_maximum_(norm, clip_at)
     return torch._foreach_div(x, norm)
 
 
-def rmsnorm_clip_(x):
+def rmsnorm_clip_(x, clip_at: float = 1.0):
     x = list_guard(x)
-    return _compilable_rmsnorm_clip_(x)
+    return _compilable_rmsnorm_clip_(x, clip_at)
 
 
-def
+def rmsnorm_normalize_(x, clip_at: float = 1e-6):
+    x = list_guard(x)
+    return _compilable_rmsnorm_clip_(x, clip_at)
+
+
+@decorator_knowngood
+def _compilable_mu_law_compress_(x, mu):
     """
-
+    original at https://github.com/opooladz/modded-nanogpt-psgd/blob/dc7c78082ac15fbf326f1bacd9e0ead0a2b45908/kron_mu.py
+    """
+
+    for x_ in x:
+        xa = promote(x_.abs()) * mu
+        xa = xa.log1p()
+        xa = xa / math.log1p(mu)
+        xa = xa.copysign(x_)
+        copy_stochastic_(x_, xa)
 
+
+def mu_law_compress(x, mu=127.0):
+    """
     μ-law compression
     Args:
         x: Input tensor
         mu: Compression parameter (default 127.0 for behavior similar to trust_region=1.5)
     """
-
-
-
-
-    return [xa_.copysign_(x_) for x_, xa_ in zip(x, xa)]
+    x = list_guard(x)
+    mu = scalar_guard(mu, x[0])
+    _compilable_mu_law_compress_(x, mu)
+    return x
 
 
-
+@decorator_knowngood
+def _compilable_a_law_compress_(x, A):
+    """
+    original at https://github.com/opooladz/modded-nanogpt-psgd/blob/dc7c78082ac15fbf326f1bacd9e0ead0a2b45908/kron_mu.py
     """
-
+    for x_ in x:
+        xa = promote(x_.abs()) * A
+        xa = torch.where(xa < 1, xa, 1 + xa.log())
+        xa = xa.copysign(x_)
+        xa = xa * (1 / (1 + math.log(A)))
+        copy_stochastic_(x_, xa)
 
+
+def a_law_compress(x, A=87.6):
+    """
     A-law compression
     Args:
         x: Input tensor
         A: Compression parameter (default 87.6 - European PCM standard)
+    :param x:
+    :param A:
+    :return:
     """
-
-
-
-
-    torch._foreach_mul_(xa, 1 / (1 + math.log(A)))
-    return xa
+    x = list_guard(x)
+    A = scalar_guard(A, x[0])
+    _compilable_a_law_compress_(x, A)
+    return x
 
 
 def identity(x):
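
The two new compressors follow the classic μ-law and A-law companding formulas; a scalar restatement using only the math module (it mirrors the tensor code above, but is not taken from the package):

import math


def mu_law(x: float, mu: float = 127.0) -> float:
    # sign(x) * log(1 + mu * |x|) / log(1 + mu)
    return math.copysign(math.log1p(mu * abs(x)) / math.log1p(mu), x)


def a_law(x: float, a: float = 87.6) -> float:
    # sign(x) * (A|x| if A|x| < 1 else 1 + log(A|x|)) / (1 + log(A))
    ax = a * abs(x)
    mag = ax if ax < 1 else 1 + math.log(ax)
    return math.copysign(mag / (1 + math.log(a)), x)


print(round(mu_law(0.5), 3))  # ≈ 0.859
print(round(a_law(0.5), 3))   # ≈ 0.873
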
@@ -1167,24 +1199,24 @@ def identity(x):
 
 
 @decorator_knowngood
-def _compilable_trust_region_clip_(grad, lerp
+def _compilable_trust_region_clip_(grad, lerp, scale):
     # (sgn(x) * log(1 + |x|) * 0.1 + tanh(x) * 0.9).clamp_(min=-2, max=2)
-
-
-
-
-
-
-
-
-
-    return [stochastic_round_(grad, g32) for grad, g32 in zip(grad, g32)]
+    for x_ in grad:
+        x = promote(x_)
+        x = x / scale
+        tanh = x.tanh()
+        x = x.abs().log1p()
+        x = x.copysign(tanh) * (1 - lerp) + tanh * lerp
+        x = x * scale
+        x = x.clamp(min=-2, max=2)
+        copy_stochastic_(x_, x)
 
 
 def trust_region_clip_(grad, lerp=0.9, scale=1.5):
     grad = list_guard(grad)
     lerp, scale = scalar_guard(lerp, scale, grad[0])
-
+    _compilable_trust_region_clip_(grad, lerp, scale)
+    return grad
 
 
 @decorator
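
The rewritten trust-region clip divides by scale, blends a signed log1p curve with tanh using lerp, rescales, and clamps to [-2, 2], as the inline comment suggests; a scalar restatement (plain math, not the foreach/tensor code):

import math


def trust_region(x: float, lerp: float = 0.9, scale: float = 1.5) -> float:
    # scale * ((1 - lerp) * sign(x) * log1p(|x / scale|) + lerp * tanh(x / scale)), clamped to [-2, 2]
    xs = x / scale
    soft = math.copysign(math.log1p(abs(xs)), math.tanh(xs))
    mixed = soft * (1 - lerp) + math.tanh(xs) * lerp
    return max(-2.0, min(2.0, mixed * scale))


print(round(trust_region(10.0), 3))  # ≈ 1.656 — large inputs are squashed, then capped at ±2
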
@@ -1262,6 +1294,7 @@ def _compilable_fused_precond_grad_cached_(expr: str, ea: Tensor, param, lr, gra
 
 
 def fused_precond_grad_cached_(expr: str, ea: Tensor, param, lr, grad, decay, caution, *cached_q: Tensor):
+
     lr = scalar_guard(lr, param[0])
     _compilable_fused_precond_grad_cached_(expr, ea, param, lr, grad, decay, caution, *cached_q)
 
@@ -1277,7 +1310,7 @@ def psgd_precond_grad(expr: str, ea: Tensor, *preconds: Tensor):
 
 @decorator_knowngood
 def _compilable_fused_psgd_precond_grad(expr: str, ea: Tensor, param, lr, grad, decay, caution, *preconds: Tensor):
-    precond = psgd_precond_grad(expr,
+    precond = psgd_precond_grad(expr, ea, *preconds)
     update_param_(param, precond, lr, decay, caution=caution, grad=grad)
 
 
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|