heavyball 0.23.4__tar.gz → 0.24.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. {heavyball-0.23.4 → heavyball-0.24.1}/PKG-INFO +1 -1
  2. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball/cached_delayed_psgd_kron.py +1 -1
  3. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball/cached_psgd_kron.py +1 -1
  4. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball/delayed_psgd.py +1 -1
  5. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball/foreach_adamw.py +2 -2
  6. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball/foreach_adopt.py +2 -2
  7. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball/foreach_laprop.py +2 -2
  8. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball/foreach_sfadamw.py +2 -2
  9. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball/foreach_soap.py +2 -2
  10. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball/p_adam.py +2 -2
  11. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball/palm_foreach_sfadamw.py +2 -2
  12. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball/palm_foreach_soap.py +2 -2
  13. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball/precond_schedule_foreach_soap.py +2 -2
  14. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball/precond_schedule_palm_foreach_soap.py +2 -2
  15. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball/precond_schedule_sfpsoap.py +2 -2
  16. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball/psgd_kron.py +1 -1
  17. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball/schedule_free_palm_foreach_soap.py +1 -1
  18. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball/utils.py +3 -3
  19. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball.egg-info/PKG-INFO +1 -1
  20. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball.egg-info/SOURCES.txt +1 -0
  21. {heavyball-0.23.4 → heavyball-0.24.1}/setup.py +1 -1
  22. heavyball-0.24.1/test/test_channels_last.py +51 -0
  23. {heavyball-0.23.4 → heavyball-0.24.1}/LICENSE +0 -0
  24. {heavyball-0.23.4 → heavyball-0.24.1}/README.md +0 -0
  25. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball/__init__.py +0 -0
  26. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball/pure_psgd.py +0 -0
  27. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball.egg-info/dependency_links.txt +0 -0
  28. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball.egg-info/requires.txt +0 -0
  29. {heavyball-0.23.4 → heavyball-0.24.1}/heavyball.egg-info/top_level.txt +0 -0
  30. {heavyball-0.23.4 → heavyball-0.24.1}/setup.cfg +0 -0
  31. {heavyball-0.23.4 → heavyball-0.24.1}/test/test_bf16_params.py +0 -0
  32. {heavyball-0.23.4 → heavyball-0.24.1}/test/test_bf16_q.py +0 -0
  33. {heavyball-0.23.4 → heavyball-0.24.1}/test/test_bf16_storage.py +0 -0
  34. {heavyball-0.23.4 → heavyball-0.24.1}/test/test_caution.py +0 -0
  35. {heavyball-0.23.4 → heavyball-0.24.1}/test/test_closure.py +0 -0
  36. {heavyball-0.23.4 → heavyball-0.24.1}/test/test_ema.py +0 -0
  37. {heavyball-0.23.4 → heavyball-0.24.1}/test/test_foreach.py +0 -0
  38. {heavyball-0.23.4 → heavyball-0.24.1}/test/test_mars.py +0 -0
  39. {heavyball-0.23.4 → heavyball-0.24.1}/test/test_memory.py +0 -0
  40. {heavyball-0.23.4 → heavyball-0.24.1}/test/test_merge.py +0 -0
  41. {heavyball-0.23.4 → heavyball-0.24.1}/test/test_no_grad.py +0 -0
  42. {heavyball-0.23.4 → heavyball-0.24.1}/test/test_psgd.py +0 -0
  43. {heavyball-0.23.4 → heavyball-0.24.1}/test/test_soap.py +0 -0
  44. {heavyball-0.23.4 → heavyball-0.24.1}/test/test_stochastic_updates.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: heavyball
-Version: 0.23.4
+Version: 0.24.1
 Summary: Efficient optimizers
 Home-page: https://github.com/clashluke/heavyball
 Author: Lucas Nestler
heavyball/cached_delayed_psgd_kron.py
@@ -86,7 +86,7 @@ class ForeachCachedDelayedPSGDKron(PSGDBase):
 state = self.state_(p)
 
 if 'Q' not in state:
-    state["exp_avg"] = torch.zeros_like(g, dtype=storage_dtype)
+    state["exp_avg"] = torch.zeros_like(g, dtype=storage_dtype, memory_format=torch.preserve_format)
     Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular, min_ndim_triangular,
         memory_save_mode, dtype=q_dtype)
     state['Q'] = triu_to_line(Q) if store_triu_as_line else Q
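The `memory_format=torch.preserve_format` argument above is added to every optimizer-state allocation in the hunks that follow. A minimal sketch in plain PyTorch (not heavyball code) of what the explicit argument guarantees: a state buffer allocated from a channels_last tensor keeps the channels_last strides of its source, so the state updates operate on buffers whose layout matches the parameter and gradient.

import torch

# Assumed toy shapes; with memory_format=torch.preserve_format the new buffer
# inherits the channels_last strides of the tensor it is created from.
p = torch.randn(8, 3, 16, 16).to(memory_format=torch.channels_last)
buf = torch.zeros_like(p, dtype=torch.float32, memory_format=torch.preserve_format)

assert p.is_contiguous(memory_format=torch.channels_last)
assert buf.is_contiguous(memory_format=torch.channels_last)
print(p.stride(), buf.stride())  # both NHWC-style: (768, 1, 48, 3)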
heavyball/cached_psgd_kron.py
@@ -83,7 +83,7 @@ class ForeachCachedPSGDKron(PSGDBase):
 state = self.state_(p)
 
 if 'Q' not in state:
-    state["exp_avg"] = torch.zeros_like(g, dtype=storage_dtype)
+    state["exp_avg"] = torch.zeros_like(g, dtype=storage_dtype, memory_format=torch.preserve_format)
     Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular, min_ndim_triangular,
         memory_save_mode, dtype=q_dtype)
     state['Q'] = triu_to_line(Q) if store_triu_as_line else Q
heavyball/delayed_psgd.py
@@ -89,7 +89,7 @@ class ForeachDelayedPSGD(PSGDBase):
 state = self.state_(p)
 
 if 'Q' not in state:
-    state["exp_avg"] = torch.zeros_like(g, dtype=storage_dtype)
+    state["exp_avg"] = torch.zeros_like(g, dtype=storage_dtype, memory_format=torch.preserve_format)
     Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular, min_ndim_triangular,
         memory_save_mode, dtype=q_dtype)
     state["Q"] = triu_to_line(Q) if store_triu_as_line else Q
heavyball/foreach_adamw.py
@@ -45,8 +45,8 @@ class ForeachAdamW(StatefulOptimizer):
 
 for p in active_p:
     if 'exp_avg' not in self.state_(p):
-        self.state_(p)['exp_avg'] = torch.zeros_like(p.data, dtype=storage_dtype)
-        self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=storage_dtype)
+        self.state_(p)['exp_avg'] = torch.zeros_like(p.data, dtype=storage_dtype, memory_format=torch.preserve_format)
+        self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=storage_dtype, memory_format=torch.preserve_format)
 
 y, grad, exp_avg_sq, exp_avg = zip(
     *[(p.data, p.grad, self.state_(p)['exp_avg_sq'], self.state_(p)['exp_avg']) for p in active_p])
heavyball/foreach_adopt.py
@@ -51,8 +51,8 @@ class ForeachADOPT(StatefulOptimizer):
 
 for p in active_p:
     if 'exp_avg' not in self.state_(p):
-        self.state_(p)['exp_avg'] = torch.zeros_like(p.data, dtype=storage_dtype)
-        self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=storage_dtype)
+        self.state_(p)['exp_avg'] = torch.zeros_like(p.data, dtype=storage_dtype, memory_format=torch.preserve_format)
+        self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=storage_dtype, memory_format=torch.preserve_format)
 
 y, grad, exp_avg_sq, exp_avg = zip(
     *[(p.data, p.grad, self.state_(p)['exp_avg_sq'], self.state_(p)['exp_avg']) for p in active_p])
heavyball/foreach_laprop.py
@@ -47,8 +47,8 @@ class ForeachLaProp(StatefulOptimizer):
 
 for p in active_p:
     if 'exp_avg' not in self.state_(p):
-        self.state_(p)['exp_avg'] = torch.zeros_like(p.data, dtype=storage_dtype)
-        self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=storage_dtype)
+        self.state_(p)['exp_avg'] = torch.zeros_like(p.data, dtype=storage_dtype, memory_format=torch.preserve_format)
+        self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=storage_dtype, memory_format=torch.preserve_format)
 
 y, grad, exp_avg_sq, exp_avg = zip(
     *[(p.data, p.grad, self.state_(p)['exp_avg_sq'], self.state_(p)['exp_avg']) #
heavyball/foreach_sfadamw.py
@@ -50,8 +50,8 @@ class ForeachSFAdamW(ScheduleFree):
 
 for p in active_p:
     if 'z' not in self.state_(p):
-        self.state_(p)['z'] = torch.clone(p.data)
-        self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=storage_dtype)
+        self.state_(p)['z'] = torch.clone(p.data, memory_format=torch.preserve_format)
+        self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=storage_dtype, memory_format=torch.preserve_format)
 
 y, grad, exp_avg_sq, z = zip(*[(p.data, p.grad, self.state_(p)['exp_avg_sq'], self.state_(p)['z']) #
     for p in active_p])
heavyball/foreach_soap.py
@@ -48,8 +48,8 @@ class ForeachSOAP(StatefulOptimizer):
 step = state['step'] = state.get("step", -1) + 1
 
 if "exp_avg" not in state:
-    state["exp_avg"] = torch.zeros_like(g, dtype=torch.float32)
-    state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
+    state["exp_avg"] = torch.zeros_like(g, dtype=torch.float32, memory_format=torch.preserve_format)
+    state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32, memory_format=torch.preserve_format)
     init_preconditioner(g, state, max_precond_dim, precondition_1d)
     update_preconditioner(g, state, max_precond_dim, precondition_1d, 0, True)
     continue  # first step is skipped so that we never use the current gradients in the projection.
heavyball/p_adam.py
@@ -81,8 +81,8 @@ class ForeachPaLMPAdam(PSGDBase):
 state = self.state_(p)
 
 if 'Q' not in state:
-    state['exp_avg'] = torch.zeros_like(g, dtype=storage_dtype)
-    state['exp_avg_sq'] = torch.zeros_like(g, dtype=storage_dtype)
+    state['exp_avg'] = torch.zeros_like(g, dtype=storage_dtype, memory_format=torch.preserve_format)
+    state['exp_avg_sq'] = torch.zeros_like(g, dtype=storage_dtype, memory_format=torch.preserve_format)
     Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular, min_ndim_triangular,
         memory_save_mode, dtype=q_dtype)
     state['Q'] = triu_to_line(Q) if store_triu_as_line else Q
heavyball/palm_foreach_sfadamw.py
@@ -54,8 +54,8 @@ class PaLMForeachSFAdamW(ScheduleFree):
 
 for p in active_p:
     if 'z' not in self.state_(p):
-        self.state_(p)['z'] = torch.clone(p.data)
-        self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=storage_dtype)
+        self.state_(p)['z'] = torch.clone(p.data, memory_format=torch.preserve_format)
+        self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=storage_dtype, memory_format=torch.preserve_format)
 
 # Decay the first moment running average coefficient
 beta2 = 1 - (k + 1) ** -group['beta2_scale']
heavyball/palm_foreach_soap.py
@@ -56,8 +56,8 @@ class PaLMForeachSOAP(StatefulOptimizer):
 step = state['step'] = state.get("step", -1) + 1
 
 if "exp_avg" not in state:
-    state["exp_avg"] = torch.zeros_like(g, dtype=torch.float32)
-    state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
+    state["exp_avg"] = torch.zeros_like(g, dtype=torch.float32, memory_format=torch.preserve_format)
+    state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32, memory_format=torch.preserve_format)
     init_preconditioner(g, state, max_precond_dim, precondition_1d)
     update_preconditioner(g, state, max_precond_dim, precondition_1d, 0, True)
     continue  # first step is skipped so that we never use the current gradients in the projection.
heavyball/precond_schedule_foreach_soap.py
@@ -50,8 +50,8 @@ class PrecondScheduleForeachSOAP(StatefulOptimizer):
 step = state['step'] = state.get("step", -1) + 1
 
 if "exp_avg" not in state:
-    state["exp_avg"] = torch.zeros_like(g, dtype=torch.float32)
-    state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
+    state["exp_avg"] = torch.zeros_like(g, dtype=torch.float32, memory_format=torch.preserve_format)
+    state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32, memory_format=torch.preserve_format)
     init_preconditioner(g, state, max_precond_dim, precondition_1d)
     update_preconditioner(g, state, max_precond_dim, precondition_1d, 0, True)
     continue  # first step is skipped so that we never use the current gradients in the projection.
heavyball/precond_schedule_palm_foreach_soap.py
@@ -58,8 +58,8 @@ class PrecondSchedulePaLMForeachSOAP(StatefulOptimizer):
 step = state['step'] = state.get("step", -1) + 1
 
 if "exp_avg" not in state:
-    state["exp_avg"] = torch.zeros_like(g, dtype=torch.float32)
-    state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
+    state["exp_avg"] = torch.zeros_like(g, dtype=torch.float32, memory_format=torch.preserve_format)
+    state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32, memory_format=torch.preserve_format)
     init_preconditioner(g, state, max_precond_dim, precondition_1d)
     update_preconditioner(g, state, max_precond_dim, precondition_1d, 0, True)
     continue  # first step is skipped so that we never use the current gradients in the projection.
heavyball/precond_schedule_sfpsoap.py
@@ -96,8 +96,8 @@ class PrecondScheduleSFPaLMSOAP(ScheduleFree):
 state = self.state_(p)
 
 if "z" not in state:
-    state["z"] = torch.clone(p.data)
-    state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
+    state["z"] = torch.clone(p.data, memory_format=torch.preserve_format)
+    state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32, memory_format=torch.preserve_format)
     init_preconditioner(g, state, max_precond_dim, precondition_1d)
     update_preconditioner(g, state, max_precond_dim, precondition_1d, 0, True)
     continue  # first step is skipped so that we never use the current gradients in the projection.
heavyball/psgd_kron.py
@@ -84,7 +84,7 @@ class ForeachPSGDKron(PSGDBase):
 state = self.state_(p)
 
 if 'Q' not in state:
-    state["exp_avg"] = torch.zeros_like(g, dtype=storage_dtype)
+    state["exp_avg"] = torch.zeros_like(g, dtype=storage_dtype, memory_format=torch.preserve_format)
     Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular, min_ndim_triangular,
         memory_save_mode, dtype=q_dtype)
     state['Q'] = triu_to_line(Q) if store_triu_as_line else Q
heavyball/schedule_free_palm_foreach_soap.py
@@ -90,7 +90,7 @@ class SFPaLMForeachSOAP(ScheduleFree):
 
 if "z" not in state:
     state["z"] = torch.clone(p).float()
-    state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
+    state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32, memory_format=torch.preserve_format)
     if mars:
         state['mars_prev_grad'] = g.clone()
     init_preconditioner(g, state, max_precond_dim, precondition_1d)
heavyball/utils.py
@@ -60,11 +60,11 @@ def warmup(lr: float, step: int, warmup_steps: int):
 @decorator_knowngood
 def _compilable_schedule_free_(p: List[Tensor], z: List[Tensor], ckp1: Tensor, grad: List[Tensor], lr: Tensor,
                                beta1: Tensor):
-    p32, z32, g32 = [promote(x) for x in (p, z, grad)]
+    p32, z32, g32 = [list(map(promote, x)) for x in (p, z, grad)]
     for p_, z_, g_ in zip(p32, z32, g32):
         p_.lerp_(z_, ckp1)
         p_.add_(g_, alpha=lr * (beta1 * (1 - ckp1) - 1))
-        z_.add(g_, alpha=-lr)
+        z_.add_(g_, alpha=-lr)
     copy_stochastic_list_(p, p32)
     copy_stochastic_list_(z, z32)
 
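This hunk contains two behavioural fixes: `promote` is now mapped over every tensor inside the lists instead of being called on the lists themselves, and the update of `z` uses the in-place `add_`. A toy illustration in plain PyTorch (not heavyball code) of why the in-place call matters: `Tensor.add` returns a new tensor and leaves its input untouched, so the schedule-free `z` iterate was previously never moved.

import torch

z = torch.zeros(3)
g = torch.ones(3)
lr = 0.1

z.add(g, alpha=-lr)   # out-of-place: the result is discarded, z stays all zeros
print(z)              # tensor([0., 0., 0.])

z.add_(g, alpha=-lr)  # in-place: z really becomes z - lr * g
print(z)              # tensor([-0.1000, -0.1000, -0.1000])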
heavyball/utils.py
@@ -134,7 +134,7 @@ def dim_merger(grad, max_precond_dim, split: bool = False):
 if curr_shape > 1 or len(new_shape) == 0:
     new_shape.append(curr_shape)
 
-new_grad = grad.view(new_shape)
+new_grad = grad.reshape(new_shape)  # needs to be .reshape() due to channels_last
 if not split:
     return new_grad
 
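A short sketch with assumed shapes (not heavyball code) of why `dim_merger` needs `.reshape()` here: a channels_last gradient is not contiguous in the default NCHW order, so `.view()` with the merged shape raises, whereas `.reshape()` falls back to copying into a contiguous buffer when a true view is impossible.

import torch

g = torch.randn(8, 3, 16, 16).to(memory_format=torch.channels_last)

try:
    g.view(8, -1)          # fails: channels_last strides cannot express this view
except RuntimeError as err:
    print("view failed:", err)

merged = g.reshape(8, -1)  # works: copies to a contiguous layout when required
print(merged.shape)        # torch.Size([8, 768])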
heavyball.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: heavyball
-Version: 0.23.4
+Version: 0.24.1
 Summary: Efficient optimizers
 Home-page: https://github.com/clashluke/heavyball
 Author: Lucas Nestler
heavyball.egg-info/SOURCES.txt
@@ -29,6 +29,7 @@ test/test_bf16_params.py
 test/test_bf16_q.py
 test/test_bf16_storage.py
 test/test_caution.py
+test/test_channels_last.py
 test/test_closure.py
 test/test_ema.py
 test/test_foreach.py
setup.py
@@ -10,7 +10,7 @@ setuptools.setup(
     name='heavyball',
     license='BSD',
     description='Efficient optimizers',
-    version='0.23.4',
+    version='0.24.1',
     long_description=README,
     url='https://github.com/clashluke/heavyball',
     packages=setuptools.find_packages(),
heavyball-0.24.1/test/test_channels_last.py
@@ -0,0 +1,51 @@
+import os
+
+os.environ["TORCH_LOGS"] = "+recompiles"
+
+import heavyball
+import heavyball.utils
+import pytest
+import torch
+from benchmark.utils import get_optim
+from heavyball.utils import clean, set_torch
+from torch import nn
+from torch._dynamo import config
+
+heavyball.utils.compile_mode = 'default'
+config.cache_size_limit = 128
+
+
+@pytest.mark.parametrize("opt", heavyball.__all__)
+@pytest.mark.parametrize("size,depth", [(128, 1)])
+def test_foreach(opt, size, depth: int, iterations: int = 32, outer_iterations: int = 1):
+    set_torch()
+    opt = getattr(heavyball, opt)
+
+    peaks = []
+    losses = []
+
+    for is_channels_last in [False, True]:
+        torch.manual_seed(0x2131290)
+        peaks.append([])
+        losses.append([])
+
+        for i in range(outer_iterations):
+            model = nn.Sequential(*[nn.Conv2d(size, size, 3) for _ in range(depth)]).cuda()
+            if is_channels_last:
+                model.to(memory_format=torch.channels_last)
+
+            o = get_optim(opt, model.parameters(), lr=1e-3, weight_decay=1e-4, warmup_steps=16)
+
+            for _ in range(iterations):
+                loss = model(torch.randn((1024, size, 4, 4), device='cuda')).square().mean()
+                loss.backward()
+                o.step()
+                o.zero_grad()
+                losses[-1].append(loss.detach())
+
+            del model, o
+            clean()
+
+    for i, (l0, l1) in enumerate(zip(*losses)):
+        print(i, l0.item(), l1.item())
+        assert torch.allclose(l0.float(), l1.float(), rtol=0.1)
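The new test runs every exported optimizer over a small Conv2d stack twice, once contiguous and once converted to channels_last, and asserts that the two loss trajectories agree within rtol=0.1. A hedged sketch (not part of the package) of the conversion the test exercises — moving a module to channels_last changes the strides of its 4D parameters, which is exactly the layout the state buffers above must now preserve:

import torch
from torch import nn

conv = nn.Conv2d(128, 128, 3)
print(conv.weight.is_contiguous(memory_format=torch.channels_last))  # False

# Module.to(memory_format=...) converts 4D parameters to channels_last in place.
conv.to(memory_format=torch.channels_last)
print(conv.weight.is_contiguous(memory_format=torch.channels_last))  # True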