heavyball 0.21.7__py3-none-any.whl → 0.22.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@ import random
 import torch
 
 from .utils import init_preconditioner, update_preconditioner, project, beta_debias, exp_avg_, update_param_, \
-    precond_schedule, set_, split_p_and_g_in_group, StatefulOptimizer
+    precond_schedule, set_, StatefulOptimizer
 
 
 class PrecondSchedulePaLMForeachSOAP(StatefulOptimizer):
@@ -33,14 +33,15 @@ class PrecondSchedulePaLMForeachSOAP(StatefulOptimizer):
                  merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
                  data_format: str = "channels_first", correct_bias: bool = True, warmup_steps: int = 1,
                  precond_scheduler=(1 / 3, 9), betas=(None, None), beta2_scale: float = 0.8, split: bool = False,
-                 foreach: bool = True):
+                 foreach: bool = True, mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025):
         if betas[0] is not None:
             beta = betas[0]
         defaults = {"lr": lr, "beta": beta, "shampoo_beta": shampoo_beta, "eps": eps, "weight_decay": weight_decay,
                     "precondition_frequency": precondition_frequency, "max_precond_dim": max_precond_dim,
                     "merge_dims": merge_dims, "precondition_1d": precondition_1d, "normalize_grads": normalize_grads,
                     "correct_bias": correct_bias, 'warmup_steps': warmup_steps, 'precond_scheduler': precond_scheduler,
-                    'beta2_scale': beta2_scale, 'split': split}
+                    'beta2_scale': beta2_scale, 'split': split, 'mars': mars, 'caution': caution,
+                    'mars_gamma': mars_gamma}
         super().__init__(params, defaults, foreach)
         self._data_format = data_format
         self.rng = random.Random(0x120983109)
@@ -52,7 +53,7 @@ class PrecondSchedulePaLMForeachSOAP(StatefulOptimizer):
         max_precond_dim = group['max_precond_dim']
         precondition_1d = group['precondition_1d']
 
-        for p, g in split_p_and_g_in_group(group):
+        for p, g in self.split_p_and_g_in_group(group, beta1=group['beta']):
             state = self.state_(p)
             step = state['step'] = state.get("step", -1) + 1
 
@@ -86,6 +87,8 @@ class PrecondSchedulePaLMForeachSOAP(StatefulOptimizer):
         denom = exp_avg_(exp_avg, exp_avg_sq, grad, grad_projected, beta1, beta2, step_tensor)
 
         update_precond = precond_schedule(step, group['precond_scheduler'], self.rng)
+        step_size = -group["lr"] * min(step / group['warmup_steps'], 1)
+
         for p, g, ea, d in zip(p_list, grad, exp_avg, denom):
             state = self.state_(p)
             # Projecting the exponential moving average of gradients to the eigenbases of Shampoo's preconditioner
@@ -96,10 +99,7 @@ class PrecondSchedulePaLMForeachSOAP(StatefulOptimizer):
             # to the original space
             # CANT DO /= HERE AS EXP_AVG MAY POINT TO THE BUFFER
             exp_avg_projected = exp_avg_projected / d
-            set_(d, project(exp_avg_projected, state['Q'], True))
+            precond = project(exp_avg_projected, state['Q'], True)
 
             update_preconditioner(g, state, max_precond_dim, precondition_1d, old_debiased2, update_precond)
-
-            # Why does this have to be rebiased here?
-            step_size = -group["lr"] * min(step / group['warmup_steps'], 1)
-        update_param_(p_list, denom, step_size, group["weight_decay"])
+            update_param_([p], [precond], step_size, group["weight_decay"], caution=group['caution'], grad=[g])
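
Note (not part of the diff): the hunks above add three constructor keywords (`mars`, `caution`, `mars_gamma`) and move the parameter update inside the per-parameter loop, with the warmup-scaled step size now computed once before the loop. A minimal usage sketch of the new keywords, assuming `PrecondSchedulePaLMForeachSOAP` is importable from the top-level `heavyball` package as in earlier releases:

```python
# Hypothetical usage of the new 0.22.0 keyword arguments; model and values are made up.
import torch
from heavyball import PrecondSchedulePaLMForeachSOAP  # import path assumed

model = torch.nn.Linear(16, 4)
opt = PrecondSchedulePaLMForeachSOAP(model.parameters(), lr=1e-3,
                                     mars=True, caution=True, mars_gamma=0.0025)

loss = model(torch.randn(8, 16)).square().mean()
loss.backward()
opt.step()
opt.zero_grad()
```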
@@ -3,11 +3,11 @@ import random
 import torch
 
 from .utils import init_preconditioner, update_preconditioner, project, set_, adaptive_gradient_clipping_, exp_avg_sq_, \
-    beta_debias, schedule_free_, warmup, ScheduleFree, precond_schedule, split_p_and_g_in_group, copy_stochastic_list_, \
+    beta_debias, schedule_free_, warmup, ScheduleFree, precond_schedule, copy_stochastic_list_, \
     promote
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=True)
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
 def _compilable_exp_avg_sq_(exp_avg_sq, grad_projected, old_debiased2, eps):
     eas32, gp32 = [list(map(promote, x)) for x in (exp_avg_sq, grad_projected)]
     denom = exp_avg_sq_(eas32, gp32, old_debiased2, eps)
@@ -52,15 +52,20 @@ class PrecondScheduleSFPaLMSOAP(ScheduleFree):
                  merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
                  data_format: str = "channels_first", correct_bias: bool = True, warmup_steps: int = 1, r=0.0,
                  weight_lr_power=2.0, gradient_clip_val: float = 0.1, precond_scheduler=(1 / 3, 9), betas=(None, None),
-                 split: bool = False, foreach: bool = True):
+                 split: bool = False, foreach: bool = True, mars: bool = False, caution: bool = False,
+                 mars_gamma: float = 0.0025):
         if betas[0] is not None:
             beta = betas[0]
+
+        assert not caution, "Caution is not implemented in ScheduleFree optimizers"
+
         defaults = {"lr": lr, "beta": beta, "beta2_scale": beta2_scale, "eps": eps, "weight_decay": weight_decay,
                     "precondition_frequency": precondition_frequency, "max_precond_dim": max_precond_dim,
                     "merge_dims": merge_dims, "precondition_1d": precondition_1d, "normalize_grads": normalize_grads,
                     "correct_bias": correct_bias, 'warmup_steps': warmup_steps, 'r': r,
                     'weight_lr_power': weight_lr_power, 'train_mode': True, 'step': -1, 'weight_sum': 0,
-                    'gradient_clip_val': gradient_clip_val, 'precond_scheduler': precond_scheduler, 'split': split}
+                    'gradient_clip_val': gradient_clip_val, 'precond_scheduler': precond_scheduler, 'split': split,
+                    'mars': mars, 'caution': caution, 'mars_gamma': mars_gamma}
         super().__init__(params, defaults, foreach)
         self._data_format = data_format
         self.rng = random.Random(0x120983109)
@@ -87,7 +92,7 @@ class PrecondScheduleSFPaLMSOAP(ScheduleFree):
         # adaptive gradient clipping
         adaptive_gradient_clipping_(p_list, grad, group["gradient_clip_val"], eps=group["eps"])
 
-        for p, g in split_p_and_g_in_group(group):
+        for p, g in self.split_p_and_g_in_group(group, beta1=group['beta']):
             state = self.state_(p)
 
             if "z" not in state:
heavyball/psgd_kron.py CHANGED
@@ -9,7 +9,7 @@ from typing import Optional
 import torch
 
 from .utils import update_param_, warmup, psgd_precond_grad, init_Q_exprs, trust_region_clip_, PSGDBase, \
-    split_p_and_g_in_group, line_to_triu, triu_to_line, promote, stochastic_lerp_, beta_debias
+    line_to_triu, triu_to_line, promote, stochastic_lerp_, beta_debias
 
 
 class ForeachPSGDKron(PSGDBase):
@@ -40,7 +40,8 @@ class ForeachPSGDKron(PSGDBase):
                  momentum_into_precond_update=True, warmup_steps: int = 1, merge_dims: bool = False,
                  split: bool = False, clip_fn: Optional[callable] = None, store_triu_as_line: bool = True,
                  foreach: bool = True, q_dtype='float32', stochastic_schedule: bool = True,
-                 storage_dtype: str = 'float32', #
+                 storage_dtype: str = 'float32', mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025,
+                 #
                  # expert parameters
                  precond_init_scale=1.0, precond_lr=0.1):
         if not 0.0 <= lr:
@@ -57,7 +58,9 @@ class ForeachPSGDKron(PSGDBase):
                         min_ndim_triangular=min_ndim_triangular, memory_save_mode=memory_save_mode,
                         momentum_into_precond_update=momentum_into_precond_update, precond_lr=precond_lr,
                         precond_init_scale=precond_init_scale, step=0, warmup_steps=warmup_steps, merge_dims=merge_dims,
-                        split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype, storage_dtype=storage_dtype)
+                        split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype,
+                        storage_dtype=storage_dtype,
+                        mars=mars, caution=caution, mars_gamma=mars_gamma)
         super().__init__(params, defaults, foreach, stochastic_schedule, clip_fn, preconditioner_update_probability)
 
     def _step(self, group):
@@ -77,7 +80,7 @@ class ForeachPSGDKron(PSGDBase):
 
         vals = []
 
-        for p, g in split_p_and_g_in_group(group, should_promote=False):
+        for p, g in self.split_p_and_g_in_group(group, should_promote=False, beta1=beta):
             state = self.state_(p)
 
             if 'Q' not in state:
@@ -114,4 +117,4 @@ class ForeachPSGDKron(PSGDBase):
             self.do_update(group, [p], [ea if momentum_into_precond_update else g], [q32], precond_lr, [q_orig],
                            store_triu_as_line)
             g = psgd_precond_grad(q, self.state_(p)["exprs"], ea)
-            update_param_([p], self.clip_fn([g]), lr, weight_decay)
+            update_param_([p], self.clip_fn([g]), lr, weight_decay, caution=group['caution'], grad=[g])
heavyball/pure_psgd.py CHANGED
@@ -5,9 +5,9 @@ Source available at https://github.com/evanatyourservice/kron_torch/blob/97a2b5e
 """
 
 import torch
-
 from heavyball.utils import identity
-from .utils import update_param_, warmup, psgd_precond_grad, init_Q_exprs, PSGDBase, split_p_and_g_in_group, \
+
+from .utils import update_param_, warmup, psgd_precond_grad, init_Q_exprs, PSGDBase, \
     line_to_triu, triu_to_line, promote
 
 
@@ -38,7 +38,8 @@ class ForeachPurePSGD(PSGDBase):
                  max_size_triangular=2048, min_ndim_triangular=2, memory_save_mode=None,
                  momentum_into_precond_update=True, warmup_steps: int = 1, merge_dims: bool = False,
                  split: bool = False, clip_fn: callable = None, store_triu_as_line: bool = True, foreach: bool = True,
-                 q_dtype='float32', stochastic_schedule: bool = True, #
+                 q_dtype='float32', stochastic_schedule: bool = True, mars: bool = False, caution: bool = False,
+                 mars_gamma: float = 0.0025, #
                  # expert parameters
                  precond_init_scale=1.0, precond_lr=0.1):
         if not 0.0 <= lr:
@@ -49,11 +50,14 @@ class ForeachPurePSGD(PSGDBase):
         if clip_fn is None:
             clip_fn = identity
 
+        assert not mars, "MARS is not supported in this optimizer"
+
         defaults = dict(lr=lr, weight_decay=weight_decay, max_size_triangular=max_size_triangular,
                         min_ndim_triangular=min_ndim_triangular, memory_save_mode=memory_save_mode,
                         momentum_into_precond_update=momentum_into_precond_update, precond_lr=precond_lr,
                         precond_init_scale=precond_init_scale, step=0, warmup_steps=warmup_steps, merge_dims=merge_dims,
-                        split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype)
+                        split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype, mars=mars, caution=caution,
+                        mars_gamma=mars_gamma)
         super().__init__(params, defaults, foreach, stochastic_schedule, clip_fn, preconditioner_update_probability)
 
     def _step(self, group):
@@ -70,7 +74,7 @@ class ForeachPurePSGD(PSGDBase):
 
         vals = []
 
-        for p, g in split_p_and_g_in_group(group, should_promote=False):
+        for p, g in self.split_p_and_g_in_group(group, should_promote=False, beta1=0.0):
             state = self.state_(p)
 
             if 'Q' not in state:
@@ -98,4 +102,4 @@ class ForeachPurePSGD(PSGDBase):
             q32 = [promote(q_) for q_ in q]
             self.do_update(group, [p], [g], [q32], precond_lr, [q_orig], store_triu_as_line)
             psgd_precond_grad(q, self.state_(p)["exprs"], g, inplace=True)
-            update_param_([p], self.clip_fn([g]), lr, weight_decay)
+            update_param_([p], self.clip_fn([g]), lr, weight_decay, caution=group['caution'], grad=[g])
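
Note (not part of the diff): `ForeachPurePSGD` takes the same keywords but asserts that MARS stays disabled, and its parameter loop passes `beta1=0.0` so no correction is applied. A short sketch of the constructor-time check:

```python
# Sketch: mars=True is rejected by the assert added above; the default mars=False is fine.
import torch
from heavyball.pure_psgd import ForeachPurePSGD

params = [torch.nn.Parameter(torch.randn(4, 4))]
try:
    ForeachPurePSGD(params, mars=True)
except AssertionError as err:
    print(err)  # MARS is not supported in this optimizer
```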
@@ -1,12 +1,13 @@
 import random
 
 import torch
+from heavyball.utils import mars_correction
 
 from .utils import init_preconditioner, update_preconditioner, project, set_, adaptive_gradient_clipping_, exp_avg_sq_, \
-    beta_debias, schedule_free_, warmup, ScheduleFree, split_p_and_g_in_group, copy_stochastic_list_, promote
+    beta_debias, schedule_free_, warmup, ScheduleFree, copy_stochastic_list_, promote
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=True)
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
 def _compilable_exp_avg_sq_(exp_avg_sq, grad_projected, old_debiased2, eps):
     eas32, gp32 = [list(map(promote, x)) for x in (exp_avg_sq, grad_projected)]
     denom = exp_avg_sq_(eas32, gp32, old_debiased2, eps)
@@ -44,15 +45,19 @@ class SFPaLMForeachSOAP(ScheduleFree):
                  merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
                  data_format: str = "channels_first", correct_bias: bool = True, warmup_steps: int = 1, r=0.0,
                  weight_lr_power=2.0, gradient_clip_val: float = 0.1, betas=(None, None), split: bool = False,
-                 foreach: bool = True):
+                 foreach: bool = True, mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025):
         if betas[0] is not None:
             beta = betas[0]
+
+        assert not caution, "Caution is not implemented in ScheduleFree optimizers"
+
         defaults = {"lr": lr, "beta": beta, "beta2_scale": beta2_scale, "eps": eps, "weight_decay": weight_decay,
                     "precondition_frequency": precondition_frequency, "max_precond_dim": max_precond_dim,
                     "merge_dims": merge_dims, "precondition_1d": precondition_1d, "normalize_grads": normalize_grads,
                     "correct_bias": correct_bias, 'warmup_steps': warmup_steps, 'r': r,
                     'weight_lr_power': weight_lr_power, 'train_mode': True, 'step': -1,
-                    'gradient_clip_val': gradient_clip_val, 'weight_sum': 0, 'split': split}
+                    'gradient_clip_val': gradient_clip_val, 'weight_sum': 0, 'split': split, 'mars': mars,
+                    'caution': caution, 'mars_gamma': mars_gamma}
         super().__init__(params, defaults, foreach)
         self._data_format = data_format
         self.rng = random.Random(0x120983109)
@@ -61,6 +66,7 @@ class SFPaLMForeachSOAP(ScheduleFree):
         vals = []
         max_precond_dim = group['max_precond_dim']
         precondition_1d = group['precondition_1d']
+        mars = group['mars']
 
         step = group['step'] = group.get("step", 0) + 1
 
@@ -79,12 +85,14 @@ class SFPaLMForeachSOAP(ScheduleFree):
 
         vals = []
 
-        for p, g in split_p_and_g_in_group(group):
+        for p, g in self.split_p_and_g_in_group(group, beta1=group['beta']):
            state = self.state_(p)
 
            if "z" not in state:
                state["z"] = torch.clone(p).float()
                state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
+               if mars:
+                   state['mars_prev_grad'] = g.clone()
                init_preconditioner(g, state, max_precond_dim, precondition_1d)
                update_preconditioner(g, state, max_precond_dim, precondition_1d, 0, True)
                continue  # first step is skipped so that we never use the current gradients in the projection.
heavyball/utils.py CHANGED
@@ -38,7 +38,7 @@ def warmup(lr: float, step: int, warmup_steps: int):
     return lr * step / warmup_steps
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=True)
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
 def _compilable_schedule_free_(p, z, ckp1, grad, lr, beta1):
     p32 = promote(p)
     z32 = promote(z)
@@ -141,19 +141,27 @@ def beta_debias(beta, step):
     return 1 - (1 - beta) / (1 - beta ** step)
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=True)
-def exp_avg_sq_(state, grad, beta2, eps, out=None):
-    if isinstance(state, torch.Tensor):
-        state.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
-        return torch.sqrt(state, out=out).clamp_(min=eps)
-
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+def _compilable_exp_avg_sq_(state, grad, beta2, eps, out=None):
     torch._foreach_mul_(state, beta2)
     [s.addcmul_(g, g, value=1 - beta2) for s, g in zip(state, grad)]
     denom = torch._foreach_sqrt(state)
-    torch._foreach_maximum_(denom, eps)
+    [denom.clamp_(min=eps) for denom in denom]
+    if out is not None:
+        copy_stochastic_list_(out, denom)
+        return out
+
     return denom
 
 
+def exp_avg_sq_(state, grad, beta2, eps, out=None):
+    state, grad = list_guard(state), list_guard(grad)
+    if not isinstance(beta2, torch.Tensor):
+        beta2 = torch.empty((), dtype=torch.float32, device=state[0].device).fill_(beta2)
+    if not isinstance(eps, torch.Tensor):
+        eps = torch.empty((), dtype=torch.float32, device=state[0].device).fill_(eps)
+    return _compilable_exp_avg_sq_(state, grad, beta2, eps, out)
+
 def adaptive_gradient_clipping_(parameters: List[torch.Tensor], gradients: List[torch.Tensor], clip_val: float,
                                 minimum: float = 1e-3, eps: float = 1e-8):
     if clip_val <= 0:
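
Note (not part of the diff): the hunk above follows a pattern used throughout this release: the `torch.compile`'d kernel only ever sees lists of tensors, and a thin Python wrapper normalises its inputs first (`list_guard` for bare tensors, 0-dim tensors for scalar hyperparameters). A minimal sketch of calling the new wrapper directly; shapes and values are made up:

```python
# Sketch: exp_avg_sq_ now accepts bare floats for beta2/eps and wraps them itself.
import torch
from heavyball.utils import exp_avg_sq_

exp_avg_sq = [torch.zeros(4)]
grad = [torch.randn(4)]
denom = exp_avg_sq_(exp_avg_sq, grad, 0.95, 1e-8)  # beta2 and eps become 0-dim tensors
print(denom[0])  # sqrt of the updated second moment, clamped at eps
```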
@@ -168,12 +176,19 @@ def adaptive_gradient_clipping_(parameters: List[torch.Tensor], gradients: List[
     torch._foreach_mul_(gradients, p_norm)
 
 
+def is_compiling():
+    try:
+        return torch.compiler.is_compiling()
+    except AttributeError:
+        return True
+
+
 def set_(dst: torch.Tensor, src: torch.Tensor):
-    if not torch.compiler.is_compiling() and src.data_ptr() == dst.data_ptr():
+    if not is_compiling() and src.data_ptr() == dst.data_ptr():
         return
     if src.shape != dst.shape:
         src = src.reshape_as(dst)
-    if not torch.compiler.is_compiling() and src.is_contiguous() and dst.is_contiguous() and src.dtype == dst.dtype:
+    if not is_compiling() and src.is_contiguous() and dst.is_contiguous() and src.dtype == dst.dtype:
         dst.set_(src)
     else:
         dst.copy_(src)
@@ -328,7 +343,7 @@ def get_orthogonal_matrix(mat):
     return final
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=True)
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
 def _compilable_stochastic_lerp_(x: List[torch.Tensor], y: List[torch.Tensor], a: Union[float, int, torch.Tensor]):
     for x_, y_ in zip(x, y):
         x32 = promote(x_)
@@ -338,12 +353,19 @@ def _compilable_stochastic_lerp_(x: List[torch.Tensor], y: List[torch.Tensor], a
 
 
 def stochastic_lerp_(x: List[torch.Tensor], y: List[torch.Tensor], a: Union[float, int, torch.Tensor]):
+    x, y = list_guard(x), list_guard(y)
     if not isinstance(a, torch.Tensor):
         a = torch.empty((), dtype=torch.float32, device=x[0].device).fill_(a)
     _compilable_stochastic_lerp_(x, y, a)
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=True)
+def list_guard(x):
+    if isinstance(x, (list, tuple)):
+        return x
+    return [x]
+
+
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
 def _compilable_stochastic_add_(x: List[torch.Tensor], y: List[torch.Tensor], alpha: Union[float, int, torch.Tensor]):
     for x_, y_ in zip(x, y):
         x32 = promote(x_)
@@ -353,6 +375,7 @@ def _compilable_stochastic_add_(x: List[torch.Tensor], y: List[torch.Tensor], al
 
 
 def stochastic_add_(x: List[torch.Tensor], y: List[torch.Tensor], alpha: Union[float, int, torch.Tensor]):
+    x, y = list_guard(x), list_guard(y)
     if not isinstance(alpha, torch.Tensor):
         alpha = torch.empty((), dtype=torch.float32, device=x[0].device).fill_(alpha)
     _compilable_stochastic_add_(x, y, alpha)
@@ -463,6 +486,43 @@ class StatefulOptimizer(torch.optim.Optimizer):
     def state_(self, arg: torch.Tensor):
         return self.state[self.key(arg)]
 
+    def mars_correct_list(self, group, p_list, g_list, mars_gamma, beta):
+        for p, g in zip(p_list, g_list):
+            state = self.state_(p)
+            if 'mars_old_grad' not in state:
+                state['mars_old_grad'] = torch.zeros_like(g)
+        old_gs = [self.state_(p)['mars_old_grad'] for p in p_list]
+        mars_correction(g_list, old_gs, mars_gamma, beta)
+
+    def split_p_and_g_in_group(self, group: dict, skip_none: bool = True, should_promote: bool = True,
+                               beta1: float = -1.0):
+        for p in group["params"]:
+            if skip_none and p.grad is None:
+                continue
+
+            if p.grad is None:
+                grad = None
+            else:
+                if should_promote:
+                    grad = promote(p.grad)
+                else:
+                    grad = p.grad
+                if beta1 >= 0 and group.get('mars', False):
+                    self.mars_correct_list(group, [p], [grad], group['mars_gamma'], beta1)
+
+            p.grad = None
+
+            p_views = merge_group(group, p)
+            if grad is not None:
+                grad = merge_group(group, grad)
+            if isinstance(p_views, torch.Tensor):
+                yield p_views, grad
+                continue
+            if grad is None:
+                yield from zip(p_views, [None] * len(p_views))
+                continue
+            yield from zip(p_views, grad)
+
     def state_size(self) -> int:
         total_bytes = 0
 
@@ -472,7 +532,7 @@ class StatefulOptimizer(torch.optim.Optimizer):
                 total_bytes += x.numel() * x.element_size()
 
         for group in self.param_groups:
-            for p, _ in split_p_and_g_in_group(group, skip_none=False):
+            for p, _ in self.split_p_and_g_in_group(group, skip_none=False):
                 tree_map(_add, self.state_(p))
         return total_bytes
 
@@ -581,7 +641,7 @@ def copy_stochastic_list_(target: List[torch.Tensor], source: List[torch.Tensor]
         copy_stochastic_(t, s)
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=True)
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
 def _compilable_exp_avg_(exp_avg, exp_avg_sq, grad, grad_projected, beta1, beta2, step):
     beta1 = beta_debias(beta1, step)
     beta2 = beta_debias(beta2, step)
@@ -625,22 +685,24 @@ def _compilable_copy_stochastic_(target: torch.Tensor, source: torch.Tensor):
 
 
 def copy_stochastic_(target: torch.Tensor, source: torch.Tensor):
-    if not torch.compiler.is_compiling() and target.data_ptr() == source.data_ptr():
+    if not is_compiling() and target.data_ptr() == source.data_ptr():
         return
     if target.dtype != torch.bfloat16 or source.dtype not in (torch.float16, torch.float32, torch.float64):
         set_(target, source)
     _compilable_copy_stochastic_(target, source)
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=True)
-def _compilable_update_(p, u, decay, add_fn, lr):
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+def _compilable_update_(p, u, decay, add_fn, lr, caution, g):
     u = [u_.view_as(p_) for u_, p_ in zip(u, p)]
-    p32, u32 = [list(map(promote, x)) for x in [p, u]]
+    p32, u32, g32 = [list(map(promote, x)) for x in [p, u, g]]
 
     if decay > 0:
         torch._foreach_mul_(p32, 1 - decay * lr)
 
-    for p32_, u32_ in zip(p32, u32):  # lr is data-dependent -> can't compile a foreach
+    for p32_, u32_, g32_ in zip(p32, u32, g32):  # lr is data-dependent -> can't compile a foreach
+        if caution:
+            _compilable_cautioning_(g32_, u32_)
         if add_fn is None:
             p32_.add_(u32_, alpha=lr)
         else:
@@ -650,9 +712,12 @@ def _compilable_update_(p, u, decay, add_fn, lr):
 
 
 def update_param_(param: List[torch.Tensor], update: List[torch.Tensor], lr: float, decay: float,
-                  add_fn: callable = None):
+                  add_fn: callable = None, caution: bool = False, grad: List[torch.Tensor] = None):
     lr_tensor = torch.empty((), dtype=torch.float32, device=param[0].device).fill_(lr)
-    _compilable_update_(param, update, decay, add_fn, lr_tensor)
+    param, update, grad = list_guard(param), list_guard(update), list_guard(grad)
+    if not caution:
+        grad = [None] * len(param)
+    _compilable_update_(param, update, decay, add_fn, lr_tensor, caution, grad)
 
 
 def precond_schedule(step, precond_scheduler, rng):
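
Note (not part of the diff): `update_param_` now optionally applies the cautious-optimizer mask before the step. When `caution=True` the raw gradient must be supplied; update entries whose sign disagrees with the gradient are zeroed and the survivors are rescaled by the inverse of the kept fraction (see `_compilable_cautioning_` further down). A small self-contained sketch with made-up numbers:

```python
# Sketch: with caution=True, only update entries agreeing in sign with grad survive.
import torch
from heavyball.utils import update_param_

p = [torch.zeros(4)]
update = [torch.tensor([1., -1., 1., -1.])]
grad = [torch.tensor([1., 1., -1., -1.])]  # agrees with update at indices 0 and 3
update_param_(p, update, lr=0.1, decay=0.0, caution=True, grad=grad)
print(p[0])  # ~[0.2, 0.0, 0.0, -0.2]: masked entries dropped, kept ones scaled by 2x
```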
@@ -788,7 +853,7 @@ def psgd_lb(A, max_abs):
     return x
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=True)
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
 def psgd_update_precond(Q, exprs, G, precond_lr, tiny, oq, store_triu_as_line):
     """Update Kronecker product preconditioner Q with pair (V, G)."""
     exprA, exprGs, _ = exprs
@@ -821,7 +886,7 @@ def psgd_update_precond(Q, exprs, G, precond_lr, tiny, oq, store_triu_as_line):
         stochastic_add_([o], [term1], -1)
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=True)
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
 def psgd_precond_grad(Q, exprs, G, inplace: bool = False):
     """Precondition gradient G with preconditioner Q."""
     md = min_dtype(Q)
@@ -965,18 +1030,45 @@ class PSGDBase(StatefulOptimizer):
             psgd_balance_Q(q)
 
 
-#@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
-def _compilable_precond_grad_cached_(cached_q, ea, expr, param, lr, weight_decay, clip_fn):
+# TODO: Figure out why this sometimes crashes
+# @torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+def _compilable_precond_grad_cached_(cached_q, ea, expr, param, lr, weight_decay, clip_fn, caution, grad):
     md = min_dtype(cached_q + [ea])
     new = torch.einsum(expr, *[c_.to(md) for c_ in cached_q], ea.to(md)).to(torch.float32)
-    update_param_([param], clip_fn([new]), lr, weight_decay)
+    update_param_([param], clip_fn([new]), lr, weight_decay, caution=caution, grad=grad)
 
 
 def precond_grad_cached_(cached_q: List[torch.Tensor], ea: torch.Tensor, expr: str, param: torch.Tensor, lr: float,
-                         weight_decay: float, clip_fn):
+                         weight_decay: float, clip_fn, caution, grad):
     if isinstance(lr, float):
         lr = torch.empty((), dtype=torch.float32, device=param.device).fill_(lr)
-    _compilable_precond_grad_cached_(cached_q, ea, expr, param, lr, weight_decay, clip_fn)
+    _compilable_precond_grad_cached_(cached_q, ea, expr, param, lr, weight_decay, clip_fn, caution, grad)
+
+
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+def _compilable_mars_correction_(g, old_g, a):
+    g_copy = [g_.clone() for g_ in g]
+    _compilable_stochastic_lerp_(g, old_g, a)
+    copy_stochastic_list_(old_g, g_copy)
+
+
+def mars_correction(g, old_g, beta1, gamma):
+    a = -gamma * beta1 / (1 - beta1)
+    g, old_g = list_guard(g), list_guard(old_g)
+    a = torch.empty((), dtype=torch.float32, device=g[0].device).fill_(a)
+    _compilable_mars_correction_(g, old_g, a)
+
+
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+def _compilable_cautioning_(g, update):
+    mask = (g * update) > 0
+    update.masked_fill_(~mask, 0)
+    scale = mask.numel() / mask.sum().clamp(min=1)
+    update.mul_(scale)
+
+
+def caution(g, update):
+    _compilable_cautioning_(g, update)
 
 
 def precond_update_prob_schedule(max_prob=1.0, min_prob=0.03, decay=0.001, flat_start=250):
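
Note (not part of the diff): the in-place `torch.lerp` with weight `-a` used by `_compilable_mars_correction_` is algebraically the extrapolation `g + a * (g - g_prev)` with `a = gamma * beta1 / (1 - beta1)`, using the parameter names of `mars_correction` above; the old-gradient buffer is then overwritten with the uncorrected gradient. A tiny numerical check of that identity, with example values only:

```python
# Check: lerp(g, g_prev, -a) == g + a * (g - g_prev) for the coefficient used above.
import torch

g = torch.randn(6)
g_prev = torch.randn(6)
a = 0.0025 * 0.9 / (1 - 0.9)  # gamma * beta1 / (1 - beta1), illustrative values
assert torch.allclose(torch.lerp(g, g_prev, -a), g + a * (g - g_prev), atol=1e-6)
```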
@@ -1013,29 +1105,3 @@ def merge_group(group, *tensors):
         append_or_extend(out, dim_merger(t, group['max_size_triangular'] if 'max_size_triangular' in group else group[
             'max_precond_dim'], group.get('split', False)))
     return out
-
-
-def split_p_and_g_in_group(group: dict, skip_none: bool = True, should_promote: bool = True):
-    for p in group["params"]:
-        if skip_none and p.grad is None:
-            continue
-
-        if p.grad is None:
-            grad = None
-        else:
-            if should_promote:
-                grad = promote(p.grad)
-            else:
-                grad = p.grad
-        p.grad = None
-
-        p_views = merge_group(group, p)
-        if grad is not None:
-            grad = merge_group(group, grad)
-        if isinstance(p_views, torch.Tensor):
-            yield p_views, grad
-            continue
-        if grad is None:
-            yield from zip(p_views, [None] * len(p_views))
-            continue
-        yield from zip(p_views, grad)
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: heavyball
-Version: 0.21.7
+Version: 0.22.0
 Summary: Efficient optimizers
 Home-page: https://github.com/clashluke/heavyball
 Author: Lucas Nestler
@@ -32,7 +32,7 @@ A simple package of efficient optimizers
 The goal is not to thrive for completeness, full maintenance or abstraction, but instead to provide a simple
 largely static alternative to `torch.optim` with more and better optimizers.
 
-Currently (2024-11-22, 0.21.0), the recommended stable optimizer is `PrecondSchedulePaLMSOAP` (see below). The
+Currently (2024-11-26, 0.22.0), the recommended stable optimizer is `PrecondSchedulePaLMSOAP` (see below). The
 recommended experimental optimizer is `DelayedPSGDKron` ([tuning guide](docs/psgd_efficiency.md)).
 
 ## Features
@@ -0,0 +1,24 @@
+heavyball/__init__.py,sha256=icHYN-MGsmHkLUlHCMcZkOlwY7GT63_ayR_a5iPKmzM,2226
+heavyball/cached_delayed_psgd_kron.py,sha256=n3wIOhrop0Ls4MZ0kXpwGuImp1jzPs6VGdxIlPyoYdQ,6827
+heavyball/cached_psgd_kron.py,sha256=KCLsfvj9qh_2FNwRTdWM3zjnt2oGHfsf4Y341rPcceI,6778
+heavyball/delayed_psgd.py,sha256=CaG17zqorLsCSDeGEePOyb6n9ugv8W6gyRQqeQNq-e8,6272
+heavyball/foreach_adamw.py,sha256=uawSbGGUD2E1RtcwspP83yQNElERdGX-diqCI5e8FqE,2825
+heavyball/foreach_adopt.py,sha256=DFEaPswVzdHcbxC-mirsf_okM_HR6r34PDUTty5CrUE,3547
+heavyball/foreach_laprop.py,sha256=J4Vms0nAOMh3GQtAOPyrYOe5WtpzokVv25b9oDnwc2A,2833
+heavyball/foreach_sfadamw.py,sha256=HWbLekY5BloHDIgrN2J0a7IolZCt8Ah2xkLAU_-5oSc,3079
+heavyball/foreach_soap.py,sha256=7B_dP2Hm_xqwpBQiPYkv_c6eoRnU1dV2VZfvSoa4uJ8,4729
+heavyball/p_adam.py,sha256=F-id4qOkAaDTJaKTSNhSsonX-Js5IzIu1Bdj1S4qE2g,6306
+heavyball/palm_foreach_sfadamw.py,sha256=E8raxrBIkSmTEGFzwnfWxKwDJjBQE2vdsmyqfc8aL_A,3375
+heavyball/palm_foreach_soap.py,sha256=IknGm_CzrqDIFEoCkejxjoZ4sfIy6RSoInqlMUOYLB4,6156
+heavyball/precond_schedule_foreach_soap.py,sha256=bJ2ifPFa8zEP9GO8eBpqZzsmP7p_iQkkCkllNeEMHPU,4892
+heavyball/precond_schedule_palm_foreach_soap.py,sha256=4dT9f134-Faq2KuCMCHzMtrkMO-es5p_DYS1of5yF-s,6428
+heavyball/precond_schedule_sfpsoap.py,sha256=FOR-axwlkSN7IHZWYYUVFfjSFCLxc_NdiTlb-n5gmgs,7530
+heavyball/psgd_kron.py,sha256=achB23mQUT3F00IGhjjVf_8YW7VOTHR6YdoCDRyWxsI,6039
+heavyball/pure_psgd.py,sha256=dbYgkunFFA6EsO6fJEhaJRxTH0smi7qLX3Np9XTQ9E4,5079
+heavyball/schedule_free_palm_foreach_soap.py,sha256=0WT_gvTKymqLQzYT6ewDgCmpDq-HgMAewipw1QvyQYA,7267
+heavyball/utils.py,sha256=TVpyev0oL4a78px4cvtaGoGPJqfpfTKE-xBWkRCmzkw,39785
+heavyball-0.22.0.dist-info/LICENSE,sha256=CGdGJim64YifGmUVPaeyRsxkvyExtClswhRNIp8FY_U,1322
+heavyball-0.22.0.dist-info/METADATA,sha256=LqVR3tUgxpk21zsmKxfJAQCKLPzmaQz2PQiKvlvpQe8,11926
+heavyball-0.22.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+heavyball-0.22.0.dist-info/top_level.txt,sha256=SzCxSVg_qCUPA4kZObW3Zyo4v-d_mMOD-p7a-WXTl2E,10
+heavyball-0.22.0.dist-info/RECORD,,
@@ -1,24 +0,0 @@
-heavyball/__init__.py,sha256=iqP428JWwwx-XDOZ0nUdbCkOLEyfoqVyWZLQLAcwxaw,2214
-heavyball/cached_delayed_psgd_kron.py,sha256=Nyxl-G-o6greKwDN-vLiw5W02GXO2LRvknc0OzvzFnE,6674
-heavyball/cached_psgd_kron.py,sha256=HzD6se0AYb-W5hpydUxcR9uqrpe_54PBwgL1VWX3DHU,6592
-heavyball/delayed_psgd.py,sha256=m4c-OvcLMrRxSAPYs2l6Up21uCyF2kvHvpcnfe3nzGs,6212
-heavyball/foreach_adamw.py,sha256=Rb5U80cgUcEqlEbUU250UTWdoqA7nyiqkV5w1U4bWX4,2445
-heavyball/foreach_adopt.py,sha256=ecdi1fKg9i087OGjtKWVbE_DD6Yf4pvpzv4ELCcusvQ,3211
-heavyball/foreach_laprop.py,sha256=vi6C_gfjXxw5uN0KHgzxI9itUI1dcgOf3ufoO_VVMp0,2471
-heavyball/foreach_sfadamw.py,sha256=rLZORmCIMu9G09FdDgMSiI6pNq34IVoxsPVWtmeDdbQ,2753
-heavyball/foreach_soap.py,sha256=4mWSMWYTdjgiXiboI5DwdigecruDtNGKylGAFAVhCRA,4562
-heavyball/p_adam.py,sha256=Xyxsavwtw-t0OyTHitYQXZSmF9UJlMDzDAURge-MbbQ,6047
-heavyball/palm_foreach_sfadamw.py,sha256=JbNrcoquBGGUI5XNMFouDjpNurVHUW9DbX1A3tSrtno,3025
-heavyball/palm_foreach_soap.py,sha256=GzAwM8kOt1X0QCmUZDTdHwPxbJwjH8ic43dyAK5BYCA,6015
-heavyball/precond_schedule_foreach_soap.py,sha256=HcObXLfSNN_lKNb4nmC6tkdHcqDIMNX6hILpHKScqLc,4744
-heavyball/precond_schedule_palm_foreach_soap.py,sha256=xZ7CJvIfdu2RNAZt2g1S7Xb0Jyy1hNC4MowOFU3nWkk,6283
-heavyball/precond_schedule_sfpsoap.py,sha256=PNneiOkrRyV1yIZn91lPmYofd1_OiLqJTDy75RLpXJk,7273
-heavyball/psgd_kron.py,sha256=RSLJi5FnSmYjvYBufNDClnYRm-eN_Kpa1Ar2tNP6-X0,5824
-heavyball/pure_psgd.py,sha256=LZK0qmvZkBF8g00evaVLtW-sIUJmdoxag1K7O26AqEo,4820
-heavyball/schedule_free_palm_foreach_soap.py,sha256=_AqrnChY6iDlQUkF2YUxS7eLjSWCIuvEUOHvMHVM1yY,6873
-heavyball/utils.py,sha256=lyJRL-j_-LW6nVcbYTOrzcMACT3lIBmYioMseqgiexk,37211
-heavyball-0.21.7.dist-info/LICENSE,sha256=CGdGJim64YifGmUVPaeyRsxkvyExtClswhRNIp8FY_U,1322
-heavyball-0.21.7.dist-info/METADATA,sha256=rO1SbkbCdLKf2SD9aTxb_oDuQSZTOq1uh4NmcsjBt4g,11926
-heavyball-0.21.7.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-heavyball-0.21.7.dist-info/top_level.txt,sha256=SzCxSVg_qCUPA4kZObW3Zyo4v-d_mMOD-p7a-WXTl2E,10
-heavyball-0.21.7.dist-info/RECORD,,