heavyball 0.23.0__tar.gz → 0.23.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. {heavyball-0.23.0 → heavyball-0.23.3}/PKG-INFO +1 -1
  2. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball/delayed_psgd.py +2 -2
  3. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball/foreach_adamw.py +2 -2
  4. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball/foreach_adopt.py +2 -2
  5. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball/foreach_laprop.py +2 -2
  6. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball/foreach_sfadamw.py +2 -3
  7. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball/palm_foreach_sfadamw.py +2 -2
  8. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball/precond_schedule_sfpsoap.py +2 -2
  9. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball/schedule_free_palm_foreach_soap.py +3 -3
  10. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball/utils.py +45 -22
  11. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball.egg-info/PKG-INFO +1 -1
  12. {heavyball-0.23.0 → heavyball-0.23.3}/setup.py +1 -1
  13. {heavyball-0.23.0 → heavyball-0.23.3}/test/test_mars.py +3 -3
  14. {heavyball-0.23.0 → heavyball-0.23.3}/LICENSE +0 -0
  15. {heavyball-0.23.0 → heavyball-0.23.3}/README.md +0 -0
  16. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball/__init__.py +0 -0
  17. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball/cached_delayed_psgd_kron.py +0 -0
  18. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball/cached_psgd_kron.py +0 -0
  19. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball/foreach_soap.py +0 -0
  20. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball/p_adam.py +0 -0
  21. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball/palm_foreach_soap.py +0 -0
  22. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball/precond_schedule_foreach_soap.py +0 -0
  23. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball/precond_schedule_palm_foreach_soap.py +0 -0
  24. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball/psgd_kron.py +0 -0
  25. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball/pure_psgd.py +0 -0
  26. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball.egg-info/SOURCES.txt +0 -0
  27. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball.egg-info/dependency_links.txt +0 -0
  28. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball.egg-info/requires.txt +0 -0
  29. {heavyball-0.23.0 → heavyball-0.23.3}/heavyball.egg-info/top_level.txt +0 -0
  30. {heavyball-0.23.0 → heavyball-0.23.3}/setup.cfg +0 -0
  31. {heavyball-0.23.0 → heavyball-0.23.3}/test/test_bf16_params.py +0 -0
  32. {heavyball-0.23.0 → heavyball-0.23.3}/test/test_bf16_q.py +0 -0
  33. {heavyball-0.23.0 → heavyball-0.23.3}/test/test_bf16_storage.py +0 -0
  34. {heavyball-0.23.0 → heavyball-0.23.3}/test/test_caution.py +0 -0
  35. {heavyball-0.23.0 → heavyball-0.23.3}/test/test_closure.py +0 -0
  36. {heavyball-0.23.0 → heavyball-0.23.3}/test/test_ema.py +0 -0
  37. {heavyball-0.23.0 → heavyball-0.23.3}/test/test_foreach.py +0 -0
  38. {heavyball-0.23.0 → heavyball-0.23.3}/test/test_memory.py +0 -0
  39. {heavyball-0.23.0 → heavyball-0.23.3}/test/test_merge.py +0 -0
  40. {heavyball-0.23.0 → heavyball-0.23.3}/test/test_no_grad.py +0 -0
  41. {heavyball-0.23.0 → heavyball-0.23.3}/test/test_psgd.py +0 -0
  42. {heavyball-0.23.0 → heavyball-0.23.3}/test/test_soap.py +0 -0
  43. {heavyball-0.23.0 → heavyball-0.23.3}/test/test_stochastic_updates.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: heavyball
-Version: 0.23.0
+Version: 0.23.3
 Summary: Efficient optimizers
 Home-page: https://github.com/clashluke/heavyball
 Author: Lucas Nestler
heavyball/delayed_psgd.py
@@ -8,10 +8,10 @@ import torch
 from heavyball.utils import stochastic_lerp_, beta_debias, stochastic_add_
 
 from .utils import update_param_, warmup, psgd_precond_grad, init_Q_exprs, trust_region_clip_, PSGDBase, \
-    triu_to_line, line_to_triu, promote, _compilable_update_
+    triu_to_line, line_to_triu, promote, _compilable_update_, decorator_knowngood
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+@decorator_knowngood
 def _compilable_psgd_precond_grad_(q, exprs, ea, p, lr, weight_decay, clip_fn, caution, grad):
     new = psgd_precond_grad(False, exprs, ea, *q)
     _compilable_update_([p], clip_fn([new]), weight_decay, stochastic_add_, lr, caution, [grad])
heavyball/foreach_adamw.py
@@ -2,10 +2,10 @@ import torch
 import torch.optim
 from heavyball.utils import copy_stochastic_list_
 
-from .utils import warmup, exp_avg_sq_, beta_debias, update_param_, StatefulOptimizer, promote
+from .utils import warmup, exp_avg_sq_, beta_debias, update_param_, StatefulOptimizer, promote, decorator_knowngood
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+@decorator_knowngood
 def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution):
     g32, exp_avg32, exp_avg_sq32 = [list(map(promote, x)) for x in [grad, exp_avg, exp_avg_sq]]
 
heavyball/foreach_adopt.py
@@ -2,10 +2,10 @@ import torch
 import torch.optim
 from heavyball.utils import copy_stochastic_list_
 
-from .utils import warmup, beta_debias, update_param_, StatefulOptimizer, promote
+from .utils import warmup, beta_debias, update_param_, StatefulOptimizer, promote, decorator_knowngood
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+@decorator_knowngood
 def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution):
     g32, exp_avg32, exp_avg_sq32 = [list(map(promote, x)) for x in [grad, exp_avg, exp_avg_sq]]
     update_param_(y, exp_avg, lr, decay, caution=caution, grad=g32)
heavyball/foreach_laprop.py
@@ -1,10 +1,10 @@
 import torch
 import torch.optim
 
-from .utils import warmup, exp_avg_sq_, beta_debias, update_param_, StatefulOptimizer, promote, copy_stochastic_list_
+from .utils import warmup, exp_avg_sq_, beta_debias, update_param_, StatefulOptimizer, promote, copy_stochastic_list_, decorator_knowngood
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+@decorator_knowngood
 def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution):
     g32, exp_avg32, exp_avg_sq32 = [list(map(promote, x)) for x in [grad, exp_avg, exp_avg_sq]]
 
heavyball/foreach_sfadamw.py
@@ -1,11 +1,10 @@
 import torch
 import torch.optim
-from heavyball.utils import get_ckp1, copy_stochastic_list_
 
-from .utils import warmup, ScheduleFree, exp_avg_sq_, beta_debias, promote, _compilable_schedule_free_
+from .utils import get_ckp1, copy_stochastic_list_, warmup, ScheduleFree, exp_avg_sq_, beta_debias, promote, _compilable_schedule_free_, decorator_knowngood
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+@decorator_knowngood
 def _compilable_step_(y, grad, exp_avg_sq, z, beta1, beta2, step, ckp1, eps, decay, lr):
     old_debiased2 = beta_debias(beta2, step)
 
heavyball/palm_foreach_sfadamw.py
@@ -2,10 +2,10 @@ import torch
 import torch.optim
 
 from .utils import warmup, ScheduleFree, exp_avg_sq_, beta_debias, get_ckp1, promote, \
-    _compilable_schedule_free_, copy_stochastic_list_
+    _compilable_schedule_free_, copy_stochastic_list_, decorator_knowngood
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+@decorator_knowngood
 def _compilable_step_(y, grad, exp_avg_sq, z, beta1, beta2, step, ckp1, eps, decay, lr):
     old_debiased2 = beta_debias(beta2, step)
 
heavyball/precond_schedule_sfpsoap.py
@@ -4,10 +4,10 @@ import torch
 
 from .utils import init_preconditioner, update_preconditioner, project, set_, adaptive_gradient_clipping_, exp_avg_sq_, \
     beta_debias, schedule_free_, warmup, ScheduleFree, precond_schedule, copy_stochastic_list_, \
-    promote
+    promote, decorator_knowngood
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+@decorator_knowngood
 def _compilable_exp_avg_sq_(exp_avg_sq, grad_projected, old_debiased2, eps):
     eas32, gp32 = [list(map(promote, x)) for x in (exp_avg_sq, grad_projected)]
     denom = exp_avg_sq_(eas32, gp32, old_debiased2, eps)
heavyball/schedule_free_palm_foreach_soap.py
@@ -1,13 +1,13 @@
 import random
 
 import torch
-from heavyball.utils import mars_correction
 
 from .utils import init_preconditioner, update_preconditioner, project, set_, adaptive_gradient_clipping_, exp_avg_sq_, \
-    beta_debias, schedule_free_, warmup, ScheduleFree, copy_stochastic_list_, promote
+    beta_debias, schedule_free_, warmup, ScheduleFree, copy_stochastic_list_, promote, decorator_knowngood, \
+    mars_correction
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+@decorator_knowngood
 def _compilable_exp_avg_sq_(exp_avg_sq, grad_projected, old_debiased2, eps):
     eas32, gp32 = [list(map(promote, x)) for x in (exp_avg_sq, grad_projected)]
     denom = exp_avg_sq_(eas32, gp32, old_debiased2, eps)
heavyball/utils.py
@@ -10,8 +10,11 @@ import torch
 from torch import Tensor
 from torch.backends import cudnn, opt_einsum
 from torch.utils._pytree import tree_map
+from torch._dynamo.exc import TorchDynamoException
 
-compile_mode = None
+compile_mode = "max-autotune-no-cudagraphs"
+dynamic = False
+compile_mode_recommended_to_none = None
 zeroth_power_mode = 'qr'  # 'qr' is baseline, 'newtonschulz' converges better and faster, 'eigh' is perfect but slow
 
 
@@ -24,7 +27,22 @@ def decorator(func):
             return func(*args, **kwargs)
         nonlocal compiled
         if compiled is None:
-            compiled = torch.compile(func, fullgraph=True, dynamic=False, mode=compile_mode)
+            compiled = torch.compile(func, fullgraph=True, dynamic=dynamic, mode=compile_mode_recommended_to_none)
+        return compiled(*args, **kwargs)
+
+    return _fn
+
+
+def decorator_knowngood(func):
+    compiled = None
+
+    @functools.wraps(func)
+    def _fn(*args, **kwargs):
+        if compile_mode is None:
+            return func(*args, **kwargs)
+        nonlocal compiled
+        if compiled is None:
+            compiled = torch.compile(func, fullgraph=True, dynamic=dynamic, mode=compile_mode)
+        return compiled(*args, **kwargs)
 
     return _fn
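Editor's note (not part of the diff): the new module-level switches are read lazily inside the decorator, so downstream code can flip them before the first call to any decorated helper. A minimal usage sketch, assuming only the names introduced in this hunk; fused_add_ is a hypothetical example function:

    import torch
    import heavyball.utils as hbu

    # Setting compile_mode to None makes every @decorator_knowngood helper run eagerly;
    # the default "max-autotune-no-cudagraphs" compiles once, on the first invocation.
    hbu.compile_mode = None

    @hbu.decorator_knowngood
    def fused_add_(xs, ys, alpha):
        # hypothetical example: a tiny foreach-style kernel worth compiling
        for x, y in zip(xs, ys):
            x.add_(y, alpha=alpha)

    fused_add_([torch.zeros(4)], [torch.ones(4)], 0.5)  # runs eagerly here because compile_mode is None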
heavyball/utils.py
@@ -39,7 +57,7 @@ def warmup(lr: float, step: int, warmup_steps: int):
     return lr * step / warmup_steps
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+@decorator_knowngood
 def _compilable_schedule_free_(p: List[Tensor], z: List[Tensor], ckp1: Tensor, grad: List[Tensor], lr: Tensor, beta1: Tensor):
     p32, z32, g32 = [promote(x) for x in (p, z, grad)]
     for p_, z_, g_ in zip(p32, z32, g32):
@@ -139,7 +157,7 @@ def beta_debias(beta, step):
     return 1 - (1 - beta) / (1 - beta ** step)
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+@decorator_knowngood
 def _compilable_exp_avg_sq_(state: List[Tensor], grad: List[Tensor], beta2: Tensor, eps: Tensor, out: List[Optional[Tensor]]):
     torch._foreach_mul_(state, beta2)
     [s.addcmul_(g, g, value=1 - beta2) for s, g in zip(state, grad)]
@@ -175,7 +193,7 @@ def adaptive_gradient_clipping_(parameters: List[Tensor], gradients: List[Tensor
 def is_compiling():
     try:
         return torch.compiler.is_compiling()
-    except AttributeError:
+    except TorchDynamoException:
         return True
 
 
@@ -339,7 +357,7 @@ def get_orthogonal_matrix(mat):
     return final
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+@decorator_knowngood
 def _compilable_stochastic_lerp_(x: List[Tensor], y: List[Tensor], a: Union[float, int, Tensor]):
     for x_, y_ in zip(x, y):
         x32 = promote(x_)
@@ -368,7 +386,7 @@ def scalar_guard(x, ref):
     return x
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+@decorator_knowngood
 def _compilable_stochastic_add_(x: List[Tensor], y: List[Tensor], alpha: Union[float, int, Tensor]):
     for x_, y_ in zip(x, y):
         x32 = promote(x_)
@@ -595,7 +613,9 @@ class StatefulOptimizer(torch.optim.Optimizer):
         else:
             with torch.enable_grad():
                 loss = closure()
-        with torch.no_grad():
+
+        # we assume that parameters are constant and that there are no excessive recompiles
+        with torch.no_grad(), torch._dynamo.utils.disable_cache_limit():
             for top_group in self.param_groups:
                 for group in self.get_groups(top_group):
                     self._step(group)
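Editor's note (not part of the diff): Dynamo normally stops recompiling a code object once torch._dynamo.config.cache_size_limit is hit, which matters here because each parameter shape can trigger its own compile. The hunk above lifts that cap for the duration of the step. A brief standalone sketch of the same pattern, assuming the private torch._dynamo.utils.disable_cache_limit helper remains available:

    import torch

    @torch.compile(fullgraph=True, dynamic=False)
    def scaled_add_(x, y, alpha):
        x.add_(y, alpha=alpha)

    xs = [torch.zeros(n) for n in (8, 16, 32, 64)]
    with torch.no_grad(), torch._dynamo.utils.disable_cache_limit():
        # each new static shape triggers a recompile; without the context manager,
        # torch._dynamo.config.cache_size_limit would eventually cut compilation off
        for x in xs:
            scaled_add_(x, torch.ones_like(x), 0.1)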
heavyball/utils.py
@@ -643,7 +663,7 @@ def copy_stochastic_list_(target: List[Tensor], source: List[Tensor]):
         copy_stochastic_(t, s)
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+@decorator_knowngood
 def _compilable_exp_avg_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: List[Tensor],
                          grad_projected: List[Tensor], beta1: Tensor, beta2: Tensor, step: Tensor):
     beta1 = beta_debias(beta1, step)
@@ -667,7 +687,7 @@ def exp_avg_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: List[Tensor]
     return denom
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+@decorator_knowngood
 def _compilable_copy_stochastic_(target: Tensor, source: Tensor):
     """Taken as-is from https://github.com/pytorch/pytorch/issues/120376#issuecomment-1974828905"""
     # create a random 16 bit integer
@@ -686,12 +706,12 @@ def _compilable_copy_stochastic_(target: Tensor, source: Tensor):
 def copy_stochastic_(target: Tensor, source: Tensor):
     if not is_compiling() and target.data_ptr() == source.data_ptr():
         return
-    if target.dtype != torch.bfloat16 or source.dtype not in (torch.float16, torch.float32, torch.float64):
-        set_(target, source)
-    _compilable_copy_stochastic_(target, source)
+    if target.dtype == torch.bfloat16 and source.dtype in (torch.float16, torch.float32, torch.float64):
+        _compilable_copy_stochastic_(target, source.float())
+    set_(target, source)
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+@decorator_knowngood
 def _compilable_update_(p: List[Tensor], u: List[Tensor], decay: Tensor, add_fn: callable, lr: Tensor, caution: bool,
                         g: List[Optional[Tensor]]):
     u = [u_.view_as(p_) for u_, p_ in zip(u, p)]
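Editor's note (not part of the diff): the reworked copy_stochastic_ above now routes bfloat16 targets through _compilable_copy_stochastic_ before the plain set_. As a minimal standalone sketch of the underlying bit trick (following the PyTorch issue linked in the docstring; stochastic_bf16_copy_ is a hypothetical name, not heavyball's API):

    import torch

    def stochastic_bf16_copy_(target: torch.Tensor, source: torch.Tensor) -> None:
        """Copy a float32 `source` into a bfloat16 `target` with stochastic rounding."""
        # random 16-bit payload for the mantissa bits that bfloat16 truncation discards
        noise = torch.randint_like(source, dtype=torch.int32, low=0, high=1 << 16)
        bits = noise.add_(source.view(dtype=torch.int32))
        bits.bitwise_and_(-65536)  # keep only the upper 16 bits (0xFFFF0000 as a signed int32)
        target.copy_(bits.view(dtype=torch.float32))

    t = torch.zeros(3, dtype=torch.bfloat16)
    stochastic_bf16_copy_(t, torch.tensor([0.1, 1e-3, 3.14159]))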
heavyball/utils.py
@@ -852,7 +872,7 @@ def psgd_lb(A, max_abs):
     return x
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+@decorator_knowngood
 def psgd_update_precond(Q, exprs, G, precond_lr, tiny, oq, store_triu_as_line):
     """Update Kronecker product preconditioner Q with pair (V, G)."""
     exprA, exprGs, _ = exprs
@@ -885,7 +905,7 @@ def psgd_update_precond(Q, exprs, G, precond_lr, tiny, oq, store_triu_as_line):
         stochastic_add_([o], [term1], -1)
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+@decorator_knowngood
 def psgd_precond_grad(inplace: bool, exprs: str, grad: Tensor, *preconds: Tensor):
     """Precondition gradient G with preconditioner Q."""
     md = min_dtype(preconds)
@@ -1030,12 +1050,15 @@ class PSGDBase(StatefulOptimizer):
 
 
 # TODO: Figure out why this sometimes crashes
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+#@decorator_knowngood
 def _compilable_precond_grad_cached_(ea: Tensor, expr: str, param: Tensor, lr: Tensor, weight_decay: Tensor,
                                      clip_fn: callable, caution: bool, grad: Optional[Tensor], *cached_q: Tensor):
-    md = min_dtype(cached_q + [ea])
-    new = torch.einsum(expr, *[c_.to(md) for c_ in cached_q], ea.to(md)).to(torch.float32)
-    update_param_([param], clip_fn([new]), lr, weight_decay, caution=caution, grad=grad)
+    md = min_dtype(list(cached_q) + [ea])
+    args = [q.to(md) for q in cached_q]
+    args = args + [ea.to(md)]
+    new = torch.einsum(expr, *args)
+    new = new.to(torch.float32)
+    _compilable_update_([param], clip_fn([new]), weight_decay, stochastic_add_, lr, caution, [grad])
 
 
 def precond_grad_cached_(cached_q: List[Tensor], ea: Tensor, expr: str, param: Tensor, lr: float, weight_decay: float,
@@ -1044,7 +1067,7 @@ def precond_grad_cached_(cached_q: List[Tensor], ea: Tensor, expr: str, param: T
     _compilable_precond_grad_cached_(ea, expr, param, lr, weight_decay, clip_fn, caution, grad, *cached_q)
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+@decorator_knowngood
 def _compilable_mars_correction_(g: Tensor, old_g: Tensor, a: Tensor):
     g_copy = [g_.clone() for g_ in g]
     _compilable_stochastic_lerp_(g, old_g, a)
@@ -1058,7 +1081,7 @@ def mars_correction(g, old_g, beta1, gamma):
    _compilable_mars_correction_(g, old_g, a)
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+@decorator_knowngood
 def _compilable_cautioning_(g: Tensor, update: Tensor):
     mask = (g * update) > 0
     update.masked_fill_(~mask, 0)
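Editor's note (not part of the diff): the hunk above only swaps the decorator, but _compilable_cautioning_ is what backs the `caution` flag threaded through the update helpers earlier in this file. A tiny out-of-place restatement of the mask, with apply_caution_ as a hypothetical name (heavyball's compiled version operates in place):

    import torch

    def apply_caution_(grad: torch.Tensor, update: torch.Tensor) -> torch.Tensor:
        # zero every update entry whose sign disagrees with the gradient's
        mask = (grad * update) > 0
        return update.masked_fill(~mask, 0)

    g = torch.tensor([1.0, -2.0, 3.0])
    u = torch.tensor([0.5, 0.5, -0.5])
    print(apply_caution_(g, u))  # tensor([0.5000, 0.0000, 0.0000])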
heavyball.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: heavyball
-Version: 0.23.0
+Version: 0.23.3
 Summary: Efficient optimizers
 Home-page: https://github.com/clashluke/heavyball
 Author: Lucas Nestler
setup.py
@@ -10,7 +10,7 @@ setuptools.setup(
     name='heavyball',
     license='BSD',
     description='Efficient optimizers',
-    version='0.23.0',
+    version='0.23.3',
     long_description=README,
     url='https://github.com/clashluke/heavyball',
     packages=setuptools.find_packages(),
test/test_mars.py
@@ -12,7 +12,7 @@ config.cache_size_limit = 128
 
 @pytest.mark.parametrize("opt", heavyball.__all__)
 @pytest.mark.parametrize("size,depth", [(128, 2)])
-def test_mars(opt, size, depth: int, iterations: int = 16384, outer_iterations: int = 2):
+def test_mars(opt, size, depth: int, iterations: int = 16384, outer_iterations: int = 1):
     set_torch()
     opt = getattr(heavyball, opt)
     if ScheduleFree in opt.__mro__:
@@ -27,11 +27,11 @@ def test_mars(opt, size, depth: int, iterations: int = 16384, outer_iterations:
         losses.append([])
 
     for i in range(outer_iterations):
-        model = nn.Sequential(*[nn.Linear(size, size) for _ in range(depth)]).cuda().double()
+        model = nn.Sequential(*[nn.Linear(size, size) for _ in range(depth)]).cuda()
         o = get_optim(opt, model.parameters(), lr=1e-5, mars=mars)
 
         for _ in range(iterations):
-            loss = model(torch.randn((1024, size), device='cuda', dtype=torch.double)).square().mean()
+            loss = model(torch.randn((1024, size), device='cuda')).square().mean()
            loss.backward()
            o.step()
            o.zero_grad()