heavyball 0.23.3__tar.gz → 0.24.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {heavyball-0.23.3 → heavyball-0.24.0}/PKG-INFO +1 -1
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball/cached_delayed_psgd_kron.py +1 -1
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball/cached_psgd_kron.py +1 -1
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball/delayed_psgd.py +1 -1
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball/foreach_adamw.py +2 -2
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball/foreach_adopt.py +2 -2
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball/foreach_laprop.py +2 -2
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball/foreach_sfadamw.py +2 -2
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball/foreach_soap.py +2 -2
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball/p_adam.py +2 -2
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball/palm_foreach_sfadamw.py +2 -2
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball/palm_foreach_soap.py +2 -2
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball/precond_schedule_foreach_soap.py +2 -2
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball/precond_schedule_palm_foreach_soap.py +2 -2
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball/precond_schedule_sfpsoap.py +2 -2
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball/psgd_kron.py +1 -1
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball/schedule_free_palm_foreach_soap.py +1 -1
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball/utils.py +13 -11
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball.egg-info/PKG-INFO +1 -1
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball.egg-info/SOURCES.txt +1 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/setup.py +1 -1
- heavyball-0.24.0/test/test_channels_last.py +50 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/LICENSE +0 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/README.md +0 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball/__init__.py +0 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball/pure_psgd.py +0 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball.egg-info/dependency_links.txt +0 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball.egg-info/requires.txt +0 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/heavyball.egg-info/top_level.txt +0 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/setup.cfg +0 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/test/test_bf16_params.py +0 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/test/test_bf16_q.py +0 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/test/test_bf16_storage.py +0 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/test/test_caution.py +0 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/test/test_closure.py +0 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/test/test_ema.py +0 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/test/test_foreach.py +0 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/test/test_mars.py +0 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/test/test_memory.py +0 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/test/test_merge.py +0 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/test/test_no_grad.py +0 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/test/test_psgd.py +0 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/test/test_soap.py +0 -0
- {heavyball-0.23.3 → heavyball-0.24.0}/test/test_stochastic_updates.py +0 -0
heavyball/cached_delayed_psgd_kron.py
@@ -86,7 +86,7 @@ class ForeachCachedDelayedPSGDKron(PSGDBase):
             state = self.state_(p)

             if 'Q' not in state:
-                state["exp_avg"] = torch.zeros_like(g, dtype=storage_dtype)
+                state["exp_avg"] = torch.zeros_like(g, dtype=storage_dtype, memory_format=torch.preserve_format)
                 Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular, min_ndim_triangular,
                                                  memory_save_mode, dtype=q_dtype)
                 state['Q'] = triu_to_line(Q) if store_triu_as_line else Q
heavyball/cached_psgd_kron.py
@@ -83,7 +83,7 @@ class ForeachCachedPSGDKron(PSGDBase):
             state = self.state_(p)

             if 'Q' not in state:
-                state["exp_avg"] = torch.zeros_like(g, dtype=storage_dtype)
+                state["exp_avg"] = torch.zeros_like(g, dtype=storage_dtype, memory_format=torch.preserve_format)
                 Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular, min_ndim_triangular,
                                                  memory_save_mode, dtype=q_dtype)
                 state['Q'] = triu_to_line(Q) if store_triu_as_line else Q
heavyball/delayed_psgd.py
@@ -89,7 +89,7 @@ class ForeachDelayedPSGD(PSGDBase):
             state = self.state_(p)

             if 'Q' not in state:
-                state["exp_avg"] = torch.zeros_like(g, dtype=storage_dtype)
+                state["exp_avg"] = torch.zeros_like(g, dtype=storage_dtype, memory_format=torch.preserve_format)
                 Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular, min_ndim_triangular,
                                                  memory_save_mode, dtype=q_dtype)
                 state["Q"] = triu_to_line(Q) if store_triu_as_line else Q
heavyball/foreach_adamw.py
@@ -45,8 +45,8 @@ class ForeachAdamW(StatefulOptimizer):

         for p in active_p:
             if 'exp_avg' not in self.state_(p):
-                self.state_(p)['exp_avg'] = torch.zeros_like(p.data, dtype=storage_dtype)
-                self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=storage_dtype)
+                self.state_(p)['exp_avg'] = torch.zeros_like(p.data, dtype=storage_dtype, memory_format=torch.preserve_format)
+                self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=storage_dtype, memory_format=torch.preserve_format)

         y, grad, exp_avg_sq, exp_avg = zip(
             *[(p.data, p.grad, self.state_(p)['exp_avg_sq'], self.state_(p)['exp_avg']) for p in active_p])
heavyball/foreach_adopt.py
@@ -51,8 +51,8 @@ class ForeachADOPT(StatefulOptimizer):

         for p in active_p:
             if 'exp_avg' not in self.state_(p):
-                self.state_(p)['exp_avg'] = torch.zeros_like(p.data, dtype=storage_dtype)
-                self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=storage_dtype)
+                self.state_(p)['exp_avg'] = torch.zeros_like(p.data, dtype=storage_dtype, memory_format=torch.preserve_format)
+                self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=storage_dtype, memory_format=torch.preserve_format)

         y, grad, exp_avg_sq, exp_avg = zip(
             *[(p.data, p.grad, self.state_(p)['exp_avg_sq'], self.state_(p)['exp_avg']) for p in active_p])
heavyball/foreach_laprop.py
@@ -47,8 +47,8 @@ class ForeachLaProp(StatefulOptimizer):

         for p in active_p:
             if 'exp_avg' not in self.state_(p):
-                self.state_(p)['exp_avg'] = torch.zeros_like(p.data, dtype=storage_dtype)
-                self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=storage_dtype)
+                self.state_(p)['exp_avg'] = torch.zeros_like(p.data, dtype=storage_dtype, memory_format=torch.preserve_format)
+                self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=storage_dtype, memory_format=torch.preserve_format)

         y, grad, exp_avg_sq, exp_avg = zip(
             *[(p.data, p.grad, self.state_(p)['exp_avg_sq'], self.state_(p)['exp_avg']) #
heavyball/foreach_sfadamw.py
@@ -50,8 +50,8 @@ class ForeachSFAdamW(ScheduleFree):

         for p in active_p:
             if 'z' not in self.state_(p):
-                self.state_(p)['z'] = torch.clone(p.data)
-                self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=storage_dtype)
+                self.state_(p)['z'] = torch.clone(p.data, memory_format=torch.preserve_format)
+                self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=storage_dtype, memory_format=torch.preserve_format)

         y, grad, exp_avg_sq, z = zip(*[(p.data, p.grad, self.state_(p)['exp_avg_sq'], self.state_(p)['z']) #
                                        for p in active_p])
heavyball/foreach_soap.py
@@ -48,8 +48,8 @@ class ForeachSOAP(StatefulOptimizer):
             step = state['step'] = state.get("step", -1) + 1

             if "exp_avg" not in state:
-                state["exp_avg"] = torch.zeros_like(g, dtype=torch.float32)
-                state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
+                state["exp_avg"] = torch.zeros_like(g, dtype=torch.float32, memory_format=torch.preserve_format)
+                state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32, memory_format=torch.preserve_format)
                 init_preconditioner(g, state, max_precond_dim, precondition_1d)
                 update_preconditioner(g, state, max_precond_dim, precondition_1d, 0, True)
                 continue  # first step is skipped so that we never use the current gradients in the projection.
heavyball/p_adam.py
@@ -81,8 +81,8 @@ class ForeachPaLMPAdam(PSGDBase):
             state = self.state_(p)

             if 'Q' not in state:
-                state['exp_avg'] = torch.zeros_like(g, dtype=storage_dtype)
-                state['exp_avg_sq'] = torch.zeros_like(g, dtype=storage_dtype)
+                state['exp_avg'] = torch.zeros_like(g, dtype=storage_dtype, memory_format=torch.preserve_format)
+                state['exp_avg_sq'] = torch.zeros_like(g, dtype=storage_dtype, memory_format=torch.preserve_format)
                 Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular, min_ndim_triangular,
                                                  memory_save_mode, dtype=q_dtype)
                 state['Q'] = triu_to_line(Q) if store_triu_as_line else Q
heavyball/palm_foreach_sfadamw.py
@@ -54,8 +54,8 @@ class PaLMForeachSFAdamW(ScheduleFree):

         for p in active_p:
             if 'z' not in self.state_(p):
-                self.state_(p)['z'] = torch.clone(p.data)
-                self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=storage_dtype)
+                self.state_(p)['z'] = torch.clone(p.data, memory_format=torch.preserve_format)
+                self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=storage_dtype, memory_format=torch.preserve_format)

         # Decay the first moment running average coefficient
         beta2 = 1 - (k + 1) ** -group['beta2_scale']
heavyball/palm_foreach_soap.py
@@ -56,8 +56,8 @@ class PaLMForeachSOAP(StatefulOptimizer):
             step = state['step'] = state.get("step", -1) + 1

             if "exp_avg" not in state:
-                state["exp_avg"] = torch.zeros_like(g, dtype=torch.float32)
-                state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
+                state["exp_avg"] = torch.zeros_like(g, dtype=torch.float32, memory_format=torch.preserve_format)
+                state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32, memory_format=torch.preserve_format)
                 init_preconditioner(g, state, max_precond_dim, precondition_1d)
                 update_preconditioner(g, state, max_precond_dim, precondition_1d, 0, True)
                 continue  # first step is skipped so that we never use the current gradients in the projection.
heavyball/precond_schedule_foreach_soap.py
@@ -50,8 +50,8 @@ class PrecondScheduleForeachSOAP(StatefulOptimizer):
             step = state['step'] = state.get("step", -1) + 1

             if "exp_avg" not in state:
-                state["exp_avg"] = torch.zeros_like(g, dtype=torch.float32)
-                state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
+                state["exp_avg"] = torch.zeros_like(g, dtype=torch.float32, memory_format=torch.preserve_format)
+                state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32, memory_format=torch.preserve_format)
                 init_preconditioner(g, state, max_precond_dim, precondition_1d)
                 update_preconditioner(g, state, max_precond_dim, precondition_1d, 0, True)
                 continue  # first step is skipped so that we never use the current gradients in the projection.
heavyball/precond_schedule_palm_foreach_soap.py
@@ -58,8 +58,8 @@ class PrecondSchedulePaLMForeachSOAP(StatefulOptimizer):
             step = state['step'] = state.get("step", -1) + 1

             if "exp_avg" not in state:
-                state["exp_avg"] = torch.zeros_like(g, dtype=torch.float32)
-                state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
+                state["exp_avg"] = torch.zeros_like(g, dtype=torch.float32, memory_format=torch.preserve_format)
+                state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32, memory_format=torch.preserve_format)
                 init_preconditioner(g, state, max_precond_dim, precondition_1d)
                 update_preconditioner(g, state, max_precond_dim, precondition_1d, 0, True)
                 continue  # first step is skipped so that we never use the current gradients in the projection.
heavyball/precond_schedule_sfpsoap.py
@@ -96,8 +96,8 @@ class PrecondScheduleSFPaLMSOAP(ScheduleFree):
             state = self.state_(p)

             if "z" not in state:
-                state["z"] = torch.clone(p.data)
-                state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
+                state["z"] = torch.clone(p.data, memory_format=torch.preserve_format)
+                state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32, memory_format=torch.preserve_format)
                 init_preconditioner(g, state, max_precond_dim, precondition_1d)
                 update_preconditioner(g, state, max_precond_dim, precondition_1d, 0, True)
                 continue  # first step is skipped so that we never use the current gradients in the projection.
heavyball/psgd_kron.py
@@ -84,7 +84,7 @@ class ForeachPSGDKron(PSGDBase):
             state = self.state_(p)

             if 'Q' not in state:
-                state["exp_avg"] = torch.zeros_like(g, dtype=storage_dtype)
+                state["exp_avg"] = torch.zeros_like(g, dtype=storage_dtype, memory_format=torch.preserve_format)
                 Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular, min_ndim_triangular,
                                                  memory_save_mode, dtype=q_dtype)
                 state['Q'] = triu_to_line(Q) if store_triu_as_line else Q
heavyball/schedule_free_palm_foreach_soap.py
@@ -90,7 +90,7 @@ class SFPaLMForeachSOAP(ScheduleFree):

             if "z" not in state:
                 state["z"] = torch.clone(p).float()
-                state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
+                state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32, memory_format=torch.preserve_format)
                 if mars:
                     state['mars_prev_grad'] = g.clone()
                 init_preconditioner(g, state, max_precond_dim, precondition_1d)
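Note on the recurring change above: every optimizer now passes memory_format=torch.preserve_format when allocating its state buffers (exp_avg, exp_avg_sq, z), so those buffers keep the memory layout of the parameter or gradient they are created from, e.g. channels_last for convolutional weights. A minimal standalone sketch of that behavior (not part of the package) is:

import torch

# A 4D tensor converted to channels_last, as a conv weight is after model.to(memory_format=torch.channels_last).
p = torch.randn(8, 16, 4, 4).to(memory_format=torch.channels_last)

# preserve_format makes the new buffer copy the source tensor's layout.
buf = torch.zeros_like(p, dtype=torch.float32, memory_format=torch.preserve_format)
assert buf.is_contiguous(memory_format=torch.channels_last)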
heavyball/utils.py
@@ -8,9 +8,9 @@ from typing import List, Optional, Tuple, Callable, Union
 import numpy as np
 import torch
 from torch import Tensor
+from torch._dynamo.exc import TorchDynamoException
 from torch.backends import cudnn, opt_einsum
 from torch.utils._pytree import tree_map
-from torch._dynamo.exc import TorchDynamoException

 compile_mode = "max-autotune-no-cudagraphs"
 dynamic = False
@@ -23,26 +23,26 @@ def decorator(func):

     @functools.wraps(func)
     def _fn(*args, **kwargs):
-        if compile_mode is None:
+        if is_compiling() or compile_mode is None:
             return func(*args, **kwargs)
         nonlocal compiled
         if compiled is None:
-            compiled = torch.compile(
+            compiled = torch.compile(fullgraph=True, dynamic=dynamic, mode=compile_mode_recommended_to_none)(func)
         return compiled(*args, **kwargs)

     return _fn


-def decorator_knowngood(func):
+def decorator_knowngood(func: Callable):
     compiled = None

     @functools.wraps(func)
     def _fn(*args, **kwargs):
-        if compile_mode is None:
+        if is_compiling() or compile_mode is None:
             return func(*args, **kwargs)
         nonlocal compiled
         if compiled is None:
-            compiled = torch.compile(
+            compiled = torch.compile(fullgraph=True, dynamic=dynamic, mode=compile_mode)(func)
         return compiled(*args, **kwargs)

     return _fn
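The decorator hunk above adds an is_compiling() guard and inlines the torch.compile call. A rough standalone sketch of the pattern (the name compile_once is illustrative, and torch.compiler.is_compiling assumes PyTorch 2.1+; heavyball's own is_compiling helper may differ): compile the wrapped function once, lazily, and fall back to eager when compilation is disabled or when already inside a compile trace, so the wrapper never triggers nested compilation.

import functools
import torch

compile_mode = "max-autotune-no-cudagraphs"  # mirrors the module-level setting in utils.py


def compile_once(func):  # illustrative name, not the package's decorator
    compiled = None

    @functools.wraps(func)
    def _fn(*args, **kwargs):
        nonlocal compiled
        # Run eagerly if compilation is disabled or if we are already being traced.
        if torch.compiler.is_compiling() or compile_mode is None:
            return func(*args, **kwargs)
        if compiled is None:
            # torch.compile without a function returns a decorator; apply it once and cache the result.
            compiled = torch.compile(fullgraph=True, mode=compile_mode)(func)
        return compiled(*args, **kwargs)

    return _fn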
@@ -58,12 +58,13 @@ def warmup(lr: float, step: int, warmup_steps: int):


 @decorator_knowngood
-def _compilable_schedule_free_(p: List[Tensor], z: List[Tensor], ckp1: Tensor, grad: List[Tensor], lr: Tensor,
-
+def _compilable_schedule_free_(p: List[Tensor], z: List[Tensor], ckp1: Tensor, grad: List[Tensor], lr: Tensor,
+                               beta1: Tensor):
+    p32, z32, g32 = [list(map(promote, x)) for x in (p, z, grad)]
     for p_, z_, g_ in zip(p32, z32, g32):
         p_.lerp_(z_, ckp1)
         p_.add_(g_, alpha=lr * (beta1 * (1 - ckp1) - 1))
-        z_.
+        z_.add_(g_, alpha=-lr)
     copy_stochastic_list_(p, p32)
     copy_stochastic_list_(z, z32)

@@ -158,7 +159,8 @@ def beta_debias(beta, step):


 @decorator_knowngood
-def _compilable_exp_avg_sq_(state: List[Tensor], grad: List[Tensor], beta2: Tensor, eps: Tensor,
+def _compilable_exp_avg_sq_(state: List[Tensor], grad: List[Tensor], beta2: Tensor, eps: Tensor,
+                            out: List[Optional[Tensor]]):
     torch._foreach_mul_(state, beta2)
     [s.addcmul_(g, g, value=1 - beta2) for s, g in zip(state, grad)]
     denom = torch._foreach_sqrt(state)
@@ -1050,7 +1052,7 @@ class PSGDBase(StatefulOptimizer):


 # TODO: Figure out why this sometimes crashes
-
+# @decorator_knowngood
 def _compilable_precond_grad_cached_(ea: Tensor, expr: str, param: Tensor, lr: Tensor, weight_decay: Tensor,
                                      clip_fn: callable, caution: bool, grad: Optional[Tensor], *cached_q: Tensor):
     md = min_dtype(list(cached_q) + [ea])
heavyball-0.24.0/test/test_channels_last.py (new file)
@@ -0,0 +1,50 @@
+import os
+
+os.environ["TORCH_LOGS"] = "+recompiles"
+
+import heavyball
+import heavyball.utils
+import pytest
+import torch
+from benchmark.utils import get_optim
+from heavyball.utils import clean, set_torch
+from torch import nn
+from torch._dynamo import config
+
+config.cache_size_limit = 128
+
+
+@pytest.mark.parametrize("opt", heavyball.__all__)
+@pytest.mark.parametrize("size,depth", [(128, 1)])
+def test_foreach(opt, size, depth: int, iterations: int = 32, outer_iterations: int = 1):
+    set_torch()
+    opt = getattr(heavyball, opt)
+
+    peaks = []
+    losses = []
+
+    for is_channels_last in [False, True]:
+        torch.manual_seed(0x2131290)
+        peaks.append([])
+        losses.append([])
+
+        for i in range(outer_iterations):
+            model = nn.Sequential(*[nn.Conv2d(size, size, 1) for _ in range(depth)]).cuda()
+            if is_channels_last:
+                model.to(memory_format=torch.channels_last)
+
+            o = get_optim(opt, model.parameters(), lr=1e-3, weight_decay=1e-4, warmup_steps=16)
+
+            for _ in range(iterations):
+                loss = model(torch.randn((1024, size, 1, 1), device='cuda')).square().mean()
+                loss.backward()
+                o.step()
+                o.zero_grad()
+                losses[-1].append(loss.detach())
+
+            del model, o
+            clean()
+
+    for i, (l0, l1) in enumerate(zip(*losses)):
+        print(i, l0.item(), l1.item())
+        assert torch.allclose(l0.float(), l1.float(), rtol=0.1)
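The new test drives every optimizer in heavyball.__all__ through a small convolutional model with and without channels_last and asserts the two loss curves stay within roughly 10% of each other (rtol=0.1). A hedged usage sketch of the same idea for a single optimizer (ForeachAdamW is taken from the file list above; the torch.optim-style constructor and a CUDA device are assumptions):

import torch
from torch import nn
import heavyball

# Convert the model to channels_last; optimizer state should now follow that layout.
model = nn.Conv2d(8, 8, 1).cuda().to(memory_format=torch.channels_last)
opt = heavyball.ForeachAdamW(model.parameters(), lr=1e-3)  # assumed torch.optim-style signature

for _ in range(4):
    loss = model(torch.randn(16, 8, 4, 4, device='cuda')).square().mean()
    loss.backward()
    opt.step()
    opt.zero_grad()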