heavyball 1.3.0__tar.gz → 1.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {heavyball-1.3.0 → heavyball-1.3.1}/PKG-INFO +1 -1
- {heavyball-1.3.0 → heavyball-1.3.1}/heavyball/chainable.py +57 -19
- {heavyball-1.3.0 → heavyball-1.3.1}/heavyball/utils.py +68 -35
- {heavyball-1.3.0 → heavyball-1.3.1}/heavyball.egg-info/PKG-INFO +1 -1
- {heavyball-1.3.0 → heavyball-1.3.1}/setup.py +1 -1
- {heavyball-1.3.0 → heavyball-1.3.1}/LICENSE +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/README.md +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/heavyball/__init__.py +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/heavyball.egg-info/SOURCES.txt +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/heavyball.egg-info/dependency_links.txt +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/heavyball.egg-info/requires.txt +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/heavyball.egg-info/top_level.txt +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/setup.cfg +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/test/test_bf16_params.py +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/test/test_bf16_q.py +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/test/test_bf16_storage.py +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/test/test_caution.py +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/test/test_channels_last.py +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/test/test_closure.py +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/test/test_ema.py +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/test/test_foreach.py +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/test/test_hook.py +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/test/test_mars.py +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/test/test_memory.py +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/test/test_merge.py +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/test/test_no_grad.py +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/test/test_psgd.py +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/test/test_soap.py +0 -0
- {heavyball-1.3.0 → heavyball-1.3.1}/test/test_stochastic_updates.py +0 -0
{heavyball-1.3.0 → heavyball-1.3.1}/heavyball/chainable.py

```diff
@@ -307,7 +307,7 @@ def scale_by_soap(group, update, grad, param, exp_avg, exp_avg_sq, Q, GG, inner:
     return precond
 
 
-def _update_psgd_precond(group, param, grad, Q_mat, Q, exprs, prob: Optional[callable] = None):
+def _update_psgd_precond(cached, Q_cache, group, param, grad, Q_mat, Q, exprs, prob: Optional[callable] = None):
     if prob is None:
         prob = utils.precond_update_prob_schedule()
     if not precond_schedule(group, prob, name=f"cumulative_prob_{id(Q)}"):
```
```diff
@@ -322,6 +322,7 @@ def _update_psgd_precond(group, param, grad, Q_mat, Q, exprs, prob: Optional[cal
     else:
         utils.psgd_balance_Q(Q)
 
+    _update_psgd_cache(cached, Q_cache, Q_mat)
 
 def _update_psgd_cache(cached, Q_cache, q):
     if not cached:
```
```diff
@@ -357,7 +358,7 @@ def scale_by_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str
     old = update
     update = update.to(memory_format=torch.contiguous_format)
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
-    _update_psgd_precond(group, param, grad, Q_mat, Q, exprs, prob)
+    _update_psgd_precond(cached, Q_cache, group, param, grad, Q_mat, Q, exprs, prob)
     return _cached_psgd_precond_grad(cached, cache_expr, exprs, update, Q_mat, Q_cache)
 
 
```
```diff
@@ -367,7 +368,7 @@ def scale_by_delayed_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_e
                           prob: Optional[callable] = None):
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
     precond = _cached_psgd_precond_grad(cached, cache_expr, exprs, update, Q_mat, Q_cache)
-    _update_psgd_precond(group, param, grad, Q_mat, Q, exprs, prob)
+    _update_psgd_precond(cached, Q_cache, group, param, grad, Q_mat, Q, exprs, prob)
     return precond
 
 
```
```diff
@@ -376,7 +377,7 @@ def scale_by_delayed_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_e
 def update_by_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str, cached: bool = False,
                    prob: Optional[callable] = None):
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
-    _update_psgd_precond(group, param, grad, Q_mat, Q, exprs, prob)
+    _update_psgd_precond(cached, Q_cache, group, param, grad, Q_mat, Q, exprs, prob)
     _fused_cached_psgd_precond_grad(group, update, param, cached, cache_expr, exprs, update, Q_mat, Q_cache)
     raise SkipUpdate
 
```
```diff
@@ -387,7 +388,7 @@ def update_by_delayed_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_
                            prob: Optional[callable] = None):
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
     _fused_cached_psgd_precond_grad(group, update, param, cached, cache_expr, exprs, update, Q_mat, Q_cache)
-    _update_psgd_precond(group, param, grad, Q_mat, Q, exprs, prob)
+    _update_psgd_precond(cached, Q_cache, group, param, grad, Q_mat, Q, exprs, prob)
     raise SkipUpdate
 
 
```
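Taken together, these hunks thread `cached` and `Q_cache` through every `_update_psgd_precond` call site, so the function can refresh the cached preconditioner itself (the new `_update_psgd_cache` call) whenever `Q` changes, instead of letting the cache drift out of sync. The four call sites differ only in ordering. A toy sketch of the two orderings, with hypothetical `fit_precond`/`apply_precond` stand-ins rather than heavyball's real internals:

```python
# Hypothetical stand-ins for _update_psgd_precond / _cached_psgd_precond_grad.
def fit_precond(state, grad):
    state["q"] = 0.9 * state["q"] + 0.1 / (abs(grad) + 1.0)  # toy fit of Q
    state["cache"] = state["q"] ** 2                         # refresh cache with the new Q

def apply_precond(state, update):
    return state["cache"] * update

def scale_psgd(update, state):           # scale_by_psgd ordering
    fit_precond(state, update)           # update Q (and its cache) first ...
    return apply_precond(state, update)  # ... then precondition with fresh values

def scale_delayed_psgd(update, state):   # scale_by_delayed_psgd ordering
    out = apply_precond(state, update)   # precondition with last step's Q ...
    fit_precond(state, update)           # ... then refit for the next step
    return out

state = {"q": 1.0, "cache": 1.0}
print(scale_psgd(0.5, state), scale_delayed_psgd(0.5, state))
```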
```diff
@@ -422,6 +423,7 @@ def chain(state: Union[callable, dict], group, grad, param, *fns):
 
 class ChainOpt(utils.StatefulOptimizer):
     compile_step: bool = False
+    promote: bool = False
 
     def __init__(self, params, defaults, foreach: bool, *fns):
         super().__init__(params, defaults, foreach)
```
```diff
@@ -431,7 +433,7 @@ class ChainOpt(utils.StatefulOptimizer):
         if 'base_lr' not in group:
             group['base_lr'] = group['lr']
 
-        vals = list(self.split_p_and_g_in_group(group, should_promote=False, beta1=utils.get_beta1(group)))
+        vals = list(self.split_p_and_g_in_group(group, should_promote=self.promote, beta1=utils.get_beta1(group)))
         if not vals:
             return
         p, g = zip(*vals)
```
```diff
@@ -486,36 +488,72 @@ _scale_to_update_map = {scale_by_delayed_psgd.get_fn(): update_by_delayed_psgd,
                         scale_by_adam.get_fn(): update_by_adam,  #
                         scale_by_laprop.get_fn(): update_by_laprop,  #
                         scale_by_adopt.get_fn(): update_by_adopt}
+_scale_to_update_map_inv = {update_by_delayed_psgd.get_fn(): scale_by_delayed_psgd,  #
+                            update_by_psgd.get_fn(): scale_by_psgd,  #
+                            update_by_adam.get_fn(): scale_by_adam,  #
+                            update_by_laprop.get_fn(): scale_by_laprop,  #
+                            update_by_adopt.get_fn(): scale_by_adopt}
 
 
 class BaseOpt(ChainOpt):
+    """
+    Base Optimizer
+
+    compile_step: bool = False
+    Whether to change some internals to try to make the optimizer compilable
+    This does not compile the step by itself and breaks some optimizers loudly (e.g. SOAP)
+
+    promote: bool = False
+    Whether to promote the gradients to fp32 before applying the optimizer
+    Improves update quality for low-precision parameters, but increases costs
+    Compiling the optimizer step would reduce memory and compute. Alternatively, `foreach=False` decreases memory at the cost of runtime
+
+    gradient_clipping: str_or_fn = None
+    The function to use for clipping the incoming gradients, before any other transformations.
+    This is syntactic sugar, equivalent to manually passing the function as the first element of the optimizer chain.
+
+    update_clipping: str_or_fn = None
+    The function to use for clipping the outgoing updates before applying them, after all other transformations.
+    This will turn off
+    This is syntactic sugar, equivalent to manually passing the function as the last element of the optimizer chain.
+
+    """
+
     gradient_clipping: str_or_fn = None
     update_clipping: str_or_fn = None
     palm: bool = False
     auto_fuse: bool = True
 
     def __init__(self, params, defaults, foreach: bool, gradient_clipping: str_or_fn, update_clipping: str_or_fn,
-                 palm: bool = use_default, *fns, compile_step: bool = use_default):
+                 palm: bool = use_default, *fns, compile_step: bool = use_default, promote: bool = use_default):
+        if not fns:
+            raise ValueError("No functions provided. If that's on purpose (SGD-like), use `identity`")
+
+        args, kwargs = None, None
+        fn = fns[-1]
+        if isinstance(fn, functools.partial):
+            fn, args, kwargs = fn.func, fn.args, fn.keywords
+        if isinstance(fn, FunctionTransform):
+            fn = fn.get_fn()
+
         if default(update_clipping, self.update_clipping) is None:
-            if self.auto_fuse:
-                args, kwargs = None, None
-                fn = fns[-1]
-                if isinstance(fn, functools.partial):
-                    fn, args, kwargs = fn.func, fn.args, fn.keywords
-                if isinstance(fn, FunctionTransform):
-                    fn = fn.get_fn()
+            if self.auto_fuse:
                 if fn in _scale_to_update_map:
                     fn = _scale_to_update_map[fn]
                     if args is not None:
                         fn = functools.partial(fn, *args, **kwargs)
                     fns = tuple(fns)[:-1] + (fn,)
-
-        if not fns:
-            raise ValueError("No functions provided. If that's on purpose (SGD-like), use `identity`")
-
-
+        elif fn in _scale_to_update_map_inv:
+            if not self.auto_fuse:
+                raise ValueError("update_clipping is currently not compatible with update_by_* functions. "
+                                 "Manually select scale_by_* functions or set auto_fuse=True.")
+            fn = _scale_to_update_map_inv[fn]
+            if args is not None:
+                fn = functools.partial(fn, *args, **kwargs)
+            fns = tuple(fns)[:-1] + (fn,)
 
         self.compile_step = default(compile_step, self.compile_step)
+        self.promote = default(promote, self.promote)
         if default(palm, self.palm):
             fns = (palm_beta2,) + fns
         if default(gradient_clipping, self.gradient_clipping) is not None:
```
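The constructor rework is easier to follow outside the diff: the last transform of the chain is unwrapped once up front (through `functools.partial` and `FunctionTransform`), then either fused via `_scale_to_update_map` when there is no `update_clipping`, or unfused via the new `_scale_to_update_map_inv` when `update_clipping` must observe the final update. A standalone sketch of that unwrap-and-remap control flow, using toy stand-in transforms (only the logic mirrors the diff; none of this is heavyball's API):

```python
import functools

def scale_by_adam(*a, **k): ...      # toy stand-ins; heavyball's real transforms
def update_by_adam(*a, **k): ...     # are FunctionTransform objects

_scale_to_update_map = {scale_by_adam: update_by_adam}
_scale_to_update_map_inv = {update_by_adam: scale_by_adam}

def remap_tail(fns: tuple, update_clipping, auto_fuse: bool = True) -> tuple:
    args, kwargs = None, None
    fn = fns[-1]
    if isinstance(fn, functools.partial):          # unwrap, remembering bound args
        fn, args, kwargs = fn.func, fn.args, fn.keywords
    if update_clipping is None:
        if auto_fuse and fn in _scale_to_update_map:
            fn = _scale_to_update_map[fn]          # fuse: apply the update in-place
    elif fn in _scale_to_update_map_inv:
        if not auto_fuse:
            raise ValueError("update_clipping is not compatible with update_by_* functions")
        fn = _scale_to_update_map_inv[fn]          # unfuse so clipping sees the update
    if args is not None:
        fn = functools.partial(fn, *args, **kwargs)  # re-wrap with the original args
    return fns[:-1] + (fn,)

print(remap_tail((scale_by_adam,), update_clipping=None))        # -> (update_by_adam,)
print(remap_tail((update_by_adam,), update_clipping="l2_clip"))  # -> (scale_by_adam,)
```

The new `promote` flag threads through the same constructor into `split_p_and_g_in_group` (hunk above): when enabled, gradients are upcast to fp32 before the chain runs, which the new docstring notes improves update quality for low-precision parameters at extra cost.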
{heavyball-1.3.0 → heavyball-1.3.1}/heavyball/utils.py

```diff
@@ -1101,65 +1101,98 @@ def psgd_update_precond(Q, exprs, G, precond_lr, oq, store_triu_as_line):
 
 
 @decorator_knowngood
-def _compilable_l2_clip_(x):
+def _compilable_l2_clip_(x, clip_at):
     ref = x
     x = list(map(promote, x))
     norm = torch._foreach_norm(x)
-    torch._foreach_maximum_(norm, 1e-8)
+    torch._foreach_maximum_(norm, clip_at)
     out = torch._foreach_div(x, norm)
     return stochastic_round_list_(ref, out)
 
 
-def l2_clip_(x):
+def l2_normalization_(x, clip_at: float = 1e-8):
     x = list_guard(x)
-    return _compilable_l2_clip_(x)
+    return _compilable_l2_clip_(x, clip_at)
+
+
+def l2_clip_(x, clip_at: float = 1.):
+    x = list_guard(x)
+    return _compilable_l2_clip_(x, clip_at)
 
 
```
```diff
 @decorator_knowngood
-def _compilable_rmsnorm_clip_(x):
+def _compilable_rmsnorm_clip_(x, clip_at):
     x = list(map(promote, x))
     norm = torch._foreach_norm(x)
     norm = [n.div_(x_.numel() ** 0.5) for n, x_ in zip(norm, x)]
-    torch._foreach_maximum_(norm, 1e-6)
+    torch._foreach_maximum_(norm, clip_at)
     return torch._foreach_div(x, norm)
 
 
-def rmsnorm_clip_(x):
+def rmsnorm_clip_(x, clip_at: float = 1.0):
     x = list_guard(x)
-    return _compilable_rmsnorm_clip_(x)
+    return _compilable_rmsnorm_clip_(x, clip_at)
 
 
-def mu_law_compress(x, mu=127.0):
+def rmsnorm_normalize_(x, clip_at: float = 1e-6):
+    x = list_guard(x)
+    return _compilable_rmsnorm_clip_(x, clip_at)
+
+
```
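The `clip_at` floor is what separates the new clip/normalize pairs: `l2_clip_` defaults to 1, so tensors whose norm is already below 1 pass through unchanged, while `l2_normalization_` floors at 1e-8, so effectively every tensor is rescaled to unit norm; the RMS variants do the same after dividing the norm by `numel() ** 0.5`. A plain-PyTorch sketch of the shared floor-and-divide semantics (not heavyball's foreach/stochastic-rounding path):

```python
import torch

def l2_scale(x: torch.Tensor, clip_at: float) -> torch.Tensor:
    return x / x.norm().clamp(min=clip_at)  # divide by max(norm, clip_at)

g = torch.tensor([0.3, 0.4])           # norm = 0.5
print(l2_scale(g, clip_at=1.0))        # clip:      unchanged (norm already <= 1)
print(l2_scale(g, clip_at=1e-8))       # normalize: rescaled to unit norm
print(l2_scale(10 * g, clip_at=1.0))   # clip:      rescaled down to norm 1
```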
```diff
+@decorator_knowngood
+def _compilable_mu_law_compress_(x, mu):
+    """
+    original at https://github.com/opooladz/modded-nanogpt-psgd/blob/dc7c78082ac15fbf326f1bacd9e0ead0a2b45908/kron_mu.py
     """
-    Foreach version of https://github.com/opooladz/modded-nanogpt-psgd/blob/dc7c78082ac15fbf326f1bacd9e0ead0a2b45908/kron_mu.py
 
+    for x_ in x:
+        xa = promote(x_.abs()) * mu
+        xa = xa.log1p()
+        xa = xa / math.log1p(mu)
+        xa = xa.copysign(x_)
+        copy_stochastic_(x_, xa)
+
+
+def mu_law_compress(x, mu=127.0):
+    """
     μ-law compression
     Args:
         x: Input tensor
         mu: Compression parameter (default 127.0 for behavior similar to trust_region=1.5)
     """
-
-
-
-
-    return [xa_.copysign_(x_) for x_, xa_ in zip(x, xa)]
+    x = list_guard(x)
+    mu = scalar_guard(mu, x[0])
+    _compilable_mu_law_compress(x, mu)
+    return x
 
 
```
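The loop body implements μ-law companding, mapping x to sign(x) * log1p(mu * |x|) / log1p(mu), which compresses large magnitudes while fixing the endpoints at ±1. A plain-PyTorch recomputation of that formula (without heavyball's `promote`/`copy_stochastic_` machinery):

```python
import math
import torch

def mu_law(x: torch.Tensor, mu: float = 127.0) -> torch.Tensor:
    # sign(x) * log1p(mu * |x|) / log1p(mu); maps [-1, 1] onto [-1, 1]
    return (x.abs().mul(mu).log1p() / math.log1p(mu)).copysign(x)

x = torch.tensor([-1.0, -0.1, 0.0, 0.1, 1.0])
print(mu_law(x))  # endpoints stay at ±1, small magnitudes are boosted
```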
```diff
-def a_law_compress(x, A=87.6):
+@decorator_knowngood
+def _compilable_a_law_compress_(x, A):
+    """
+    original at https://github.com/opooladz/modded-nanogpt-psgd/blob/dc7c78082ac15fbf326f1bacd9e0ead0a2b45908/kron_mu.py
     """
-    Foreach version of https://github.com/opooladz/modded-nanogpt-psgd/blob/dc7c78082ac15fbf326f1bacd9e0ead0a2b45908/kron_mu.py
+    for x_ in x:
+        xa = promote(x_.abs()) * A
+        xa = torch.where(xa < 1, xa, 1 + xa.log())
+        xa = xa.copysign(x_)
+        xa = xa * (1 / (1 + math.log(A)))
+        copy_stochastic_(x_, xa)
 
+
+def a_law_compress(x, A=87.6):
+    """
     A-law compression
     Args:
         x: Input tensor
         A: Compression parameter (default 87.6 - European PCM standard)
+    :param x:
+    :param A:
+    :return:
     """
-
-
-
-
-    torch._foreach_mul_(xa, 1 / (1 + math.log(A)))
-    return xa
+    x = list_guard(x)
+    A = scalar_guard(A, x[0])
+    _compilable_a_law_compress(x, A)
+    return x
 
 
 def identity(x):
```
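A-law companding is piecewise: linear for A|x| < 1, logarithmic above, scaled by 1 / (1 + log A). The same formula as the loop above, restated in plain PyTorch:

```python
import math
import torch

def a_law(x: torch.Tensor, A: float = 87.6) -> torch.Tensor:
    # sign(x) * (A|x| if A|x| < 1 else 1 + log(A|x|)) / (1 + log A)
    xa = x.abs() * A
    xa = torch.where(xa < 1, xa, 1 + xa.log())
    return (xa / (1 + math.log(A))).copysign(x)

x = torch.tensor([0.005, 0.5, 1.0])
print(a_law(x))  # linear below |x| = 1/A, logarithmic above, a_law(1) = 1
```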
```diff
@@ -1167,24 +1200,24 @@ def identity(x):
 
 
 @decorator_knowngood
-def _compilable_trust_region_clip_(grad, lerp=0.9, scale=1.5):
+def _compilable_trust_region_clip_(grad, lerp, scale):
     # (sgn(x) * log(1 + |x|) * 0.1 + tanh(x) * 0.9).clamp_(min=-2, max=2)
-
-
-
-
-
-
-
-
-
-    return [stochastic_round_(grad, g32) for grad, g32 in zip(grad, g32)]
+    for x_ in grad:
+        x = promote(x_)
+        x = x / scale
+        tanh = x.tanh()
+        x = x.abs().log1p()
+        x = x.copysign(tanh) * (1 - lerp) + tanh * lerp
+        x = x * scale
+        x = x.clamp(min=-2, max=2)
+        copy_stochastic_(x_, x)
 
 
 def trust_region_clip_(grad, lerp=0.9, scale=1.5):
     grad = list_guard(grad)
     lerp, scale = scalar_guard(lerp, scale, grad[0])
-    return _compilable_trust_region_clip_(grad, lerp, scale)
+    _compilable_trust_region_clip_(grad, lerp, scale)
+    return grad
 
 
 @decorator
```
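The rewritten kernel blends a log-compressed magnitude with tanh (weighted by `lerp`), rescales by `scale`, and clamps to ±2, matching the comment in the hunk. A standalone per-tensor recomputation (plain PyTorch, omitting heavyball's `promote` and stochastic rounding):

```python
import torch

def trust_region(x: torch.Tensor, lerp: float = 0.9, scale: float = 1.5) -> torch.Tensor:
    x = x / scale
    tanh = x.tanh()
    # blend sign(x) * log1p(|x|) with tanh(x): 10% log-compression, 90% tanh by default
    x = x.abs().log1p().copysign(tanh) * (1 - lerp) + tanh * lerp
    return (x * scale).clamp(min=-2, max=2)

g = torch.tensor([-10.0, -1.0, 0.1, 10.0])
print(trust_region(g))  # large entries are squashed, small ones pass nearly unchanged
```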