heavyball 0.21.8__tar.gz → 0.23.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {heavyball-0.21.8 → heavyball-0.23.0}/PKG-INFO +2 -2
- {heavyball-0.21.8 → heavyball-0.23.0}/README.md +1 -1
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball/__init__.py +6 -5
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball/cached_delayed_psgd_kron.py +6 -5
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball/cached_psgd_kron.py +7 -5
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball/delayed_psgd.py +14 -11
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball/foreach_adamw.py +14 -7
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball/foreach_adopt.py +11 -6
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball/foreach_laprop.py +12 -6
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball/foreach_sfadamw.py +10 -3
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball/foreach_soap.py +10 -8
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball/p_adam.py +11 -9
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball/palm_foreach_sfadamw.py +11 -3
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball/palm_foreach_soap.py +8 -9
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball/precond_schedule_foreach_soap.py +10 -8
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball/precond_schedule_palm_foreach_soap.py +9 -9
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball/precond_schedule_sfpsoap.py +10 -5
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball/psgd_kron.py +9 -6
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball/pure_psgd.py +11 -7
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball/schedule_free_palm_foreach_soap.py +13 -5
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball/utils.py +171 -106
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball.egg-info/PKG-INFO +2 -2
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball.egg-info/SOURCES.txt +2 -0
- {heavyball-0.21.8 → heavyball-0.23.0}/setup.py +1 -1
- {heavyball-0.21.8 → heavyball-0.23.0}/test/test_bf16_params.py +4 -14
- {heavyball-0.21.8 → heavyball-0.23.0}/test/test_bf16_q.py +0 -8
- {heavyball-0.21.8 → heavyball-0.23.0}/test/test_bf16_storage.py +0 -6
- heavyball-0.23.0/test/test_caution.py +41 -0
- heavyball-0.23.0/test/test_mars.py +45 -0
- {heavyball-0.21.8 → heavyball-0.23.0}/LICENSE +0 -0
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball.egg-info/dependency_links.txt +0 -0
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball.egg-info/requires.txt +0 -0
- {heavyball-0.21.8 → heavyball-0.23.0}/heavyball.egg-info/top_level.txt +0 -0
- {heavyball-0.21.8 → heavyball-0.23.0}/setup.cfg +0 -0
- {heavyball-0.21.8 → heavyball-0.23.0}/test/test_closure.py +0 -0
- {heavyball-0.21.8 → heavyball-0.23.0}/test/test_ema.py +0 -0
- {heavyball-0.21.8 → heavyball-0.23.0}/test/test_foreach.py +0 -0
- {heavyball-0.21.8 → heavyball-0.23.0}/test/test_memory.py +0 -0
- {heavyball-0.21.8 → heavyball-0.23.0}/test/test_merge.py +0 -0
- {heavyball-0.21.8 → heavyball-0.23.0}/test/test_no_grad.py +0 -0
- {heavyball-0.21.8 → heavyball-0.23.0}/test/test_psgd.py +0 -0
- {heavyball-0.21.8 → heavyball-0.23.0}/test/test_soap.py +0 -0
- {heavyball-0.21.8 → heavyball-0.23.0}/test/test_stochastic_updates.py +0 -0
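
The headline change in 0.23.0 is a set of new optimizer options — `mars`, `mars_gamma`, and `caution` — threaded through every optimizer's constructor and parameter-group defaults, together with the new `test_caution.py` and `test_mars.py` tests. A minimal usage sketch follows; the toy model, training loop, and hyperparameter values are illustrative assumptions, while the keyword names and defaults are taken from the signatures in the hunks below.

```python
import torch
import heavyball

model = torch.nn.Linear(16, 16)

# mars and caution default to False, mars_gamma to 0.0025, in the new signatures;
# both features are opt-in.
opt = heavyball.ForeachAdamW(model.parameters(), lr=1e-3,
                             mars=True, mars_gamma=0.0025, caution=True)

for _ in range(10):
    opt.zero_grad()
    loss = model(torch.randn(4, 16)).square().mean()
    loss.backward()
    opt.step()
```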
{heavyball-0.21.8 → heavyball-0.23.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: heavyball
-Version: 0.21.8
+Version: 0.23.0
 Summary: Efficient optimizers
 Home-page: https://github.com/clashluke/heavyball
 Author: Lucas Nestler

@@ -32,7 +32,7 @@ A simple package of efficient optimizers
 The goal is not to thrive for completeness, full maintenance or abstraction, but instead to provide a simple
 largely static alternative to `torch.optim` with more and better optimizers.
 
-Currently (2024-11-
+Currently (2024-11-26, 0.22.1), the recommended stable optimizer is `PrecondSchedulePaLMSOAP` (see below). The
 recommended experimental optimizer is `DelayedPSGDKron` ([tuning guide](docs/psgd_efficiency.md)).
 
 ## Features
{heavyball-0.21.8 → heavyball-0.23.0}/README.md

@@ -8,7 +8,7 @@ A simple package of efficient optimizers
 The goal is not to thrive for completeness, full maintenance or abstraction, but instead to provide a simple
 largely static alternative to `torch.optim` with more and better optimizers.
 
-Currently (2024-11-
+Currently (2024-11-26, 0.22.1), the recommended stable optimizer is `PrecondSchedulePaLMSOAP` (see below). The
 recommended experimental optimizer is `DelayedPSGDKron` ([tuning guide](docs/psgd_efficiency.md)).
 
 ## Features
{heavyball-0.21.8 → heavyball-0.23.0}/heavyball/__init__.py

@@ -1,3 +1,4 @@
+from .cached_delayed_psgd_kron import ForeachCachedDelayedPSGDKron
 from .cached_psgd_kron import ForeachCachedPSGDKron
 from .delayed_psgd import ForeachDelayedPSGD
 from .foreach_adamw import ForeachAdamW

@@ -14,7 +15,6 @@ from .precond_schedule_sfpsoap import PrecondScheduleSFPaLMSOAP
 from .psgd_kron import ForeachPSGDKron
 from .pure_psgd import ForeachPurePSGD
 from .schedule_free_palm_foreach_soap import SFPaLMForeachSOAP
-from .cached_delayed_psgd_kron import ForeachCachedDelayedPSGDKron
 
 PalmForEachSoap = PaLMForeachSOAP
 

@@ -39,7 +39,8 @@ CachedDelayedPSGDKron = ForeachCachedDelayedPSGDKron
 __all__ = ['PalmForEachSoap', 'PaLMForeachSFAdamW', 'PaLMForeachSOAP', 'SFPaLMForeachSOAP', 'PrecondScheduleSFPaLMSOAP',
            'ForeachSOAP', 'ForeachSFAdamW', 'ForeachLaProp', 'ForeachADOPT', 'PrecondScheduleForeachSOAP',
            'PrecondSchedulePaLMForeachSOAP', 'ForeachPSGDKron', 'ForeachAdamW', 'ForeachPurePSGD', 'ForeachPaLMPAdam',
-           'ForeachDelayedPSGD', 'ForeachCachedPSGDKron', 'ForeachCachedDelayedPSGDKron',
-
-           '
-           '
+           'ForeachDelayedPSGD', 'ForeachCachedPSGDKron', 'ForeachCachedDelayedPSGDKron',
+           #
+           'PaLMSOAP', 'PaLMSFAdamW', 'PaLMSFSoap', 'PaLMSFAdamW', 'PrecondScheduleSFPaLMSOAP', 'SOAP', 'SFAdamW',
+           'LaProp', 'ADOPT', 'PSGDKron', 'AdamW', 'PurePSGD', 'PaLMPAdam', 'DelayedPSGD', 'CachedPSGDKron',
+           'CachedDelayedPSGDKron', 'PrecondScheduleSOAP', 'PrecondSchedulePaLMSOAP']
{heavyball-0.21.8 → heavyball-0.23.0}/heavyball/cached_delayed_psgd_kron.py

@@ -9,7 +9,7 @@ from typing import Optional
 import torch
 from heavyball.utils import min_dtype, precond_grad_cached_
 
-from .utils import update_param_, warmup, init_Q_exprs, trust_region_clip_, PSGDBase,
+from .utils import update_param_, warmup, init_Q_exprs, trust_region_clip_, PSGDBase, \
     line_to_triu, triu_to_line, einsum_base, promote, stochastic_lerp_, beta_debias, precond_grad_cached_
 
 

@@ -43,7 +43,8 @@ class ForeachCachedDelayedPSGDKron(PSGDBase):
                  momentum_into_precond_update=True, warmup_steps: int = 1, merge_dims: bool = False,
                  split: bool = False, clip_fn: Optional[callable] = None, store_triu_as_line: bool = True,
                  foreach: bool = True, q_dtype='float32', stochastic_schedule: bool = True,
-                 storage_dtype: str = 'float32',
+                 storage_dtype: str = 'float32', mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025,
+                 #
                  # expert parameters
                  precond_init_scale=1.0, precond_lr=0.1):
         if not 0.0 <= lr:

@@ -61,7 +62,7 @@ class ForeachCachedDelayedPSGDKron(PSGDBase):
                         momentum_into_precond_update=momentum_into_precond_update, precond_lr=precond_lr,
                         precond_init_scale=precond_init_scale, step=0, warmup_steps=warmup_steps, merge_dims=merge_dims,
                         split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype,
-                        storage_dtype=storage_dtype)
+                        storage_dtype=storage_dtype, caution=caution, mars_gamma=mars_gamma, mars=mars)
         super().__init__(params, defaults, foreach, stochastic_schedule, clip_fn, preconditioner_update_probability)
 
     def _step(self, group):

@@ -81,7 +82,7 @@ class ForeachCachedDelayedPSGDKron(PSGDBase):
 
         vals = []
 
-        for p, g in split_p_and_g_in_group(group, should_promote=False):
+        for p, g in self.split_p_and_g_in_group(group, should_promote=False, beta1=beta):
            state = self.state_(p)
 
            if 'Q' not in state:

@@ -120,7 +121,7 @@ class ForeachCachedDelayedPSGDKron(PSGDBase):
            q_orig = Q_list.pop(0)
            ea = exp_avg_list.pop(0)
 
-           precond_grad_cached_(cached_q, ea, self.state_(p)['cache_expr'], p, lr, weight_decay, self.clip_fn)
+           precond_grad_cached_(cached_q, ea, self.state_(p)['cache_expr'], p, lr, weight_decay, self.clip_fn, group['caution'], g)
 
            if should_update:
                q = line_to_triu(q_orig) if store_triu_as_line else q_orig
{heavyball-0.21.8 → heavyball-0.23.0}/heavyball/cached_psgd_kron.py

@@ -8,7 +8,7 @@ from typing import Optional
 
 import torch
 
-from .utils import update_param_, warmup, init_Q_exprs, trust_region_clip_, PSGDBase,
+from .utils import update_param_, warmup, init_Q_exprs, trust_region_clip_, PSGDBase, \
     line_to_triu, triu_to_line, einsum_base, promote, stochastic_lerp_, beta_debias, precond_grad_cached_
 
 

@@ -40,7 +40,8 @@ class ForeachCachedPSGDKron(PSGDBase):
                  momentum_into_precond_update=True, warmup_steps: int = 1, merge_dims: bool = False,
                  split: bool = False, clip_fn: Optional[callable] = None, store_triu_as_line: bool = True,
                  foreach: bool = True, q_dtype='float32', stochastic_schedule: bool = True,
-                 storage_dtype: str = 'float32',
+                 storage_dtype: str = 'float32', mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025,
+                 #
                  # expert parameters
                  precond_init_scale=1.0, precond_lr=0.1):
         if not 0.0 <= lr:

@@ -58,7 +59,7 @@ class ForeachCachedPSGDKron(PSGDBase):
                         momentum_into_precond_update=momentum_into_precond_update, precond_lr=precond_lr,
                         precond_init_scale=precond_init_scale, step=0, warmup_steps=warmup_steps, merge_dims=merge_dims,
                         split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype,
-                        storage_dtype=storage_dtype)
+                        storage_dtype=storage_dtype, caution=caution, mars_gamma=mars_gamma, mars=mars)
         super().__init__(params, defaults, foreach, stochastic_schedule, clip_fn, preconditioner_update_probability)
 
     def _step(self, group):

@@ -78,7 +79,7 @@ class ForeachCachedPSGDKron(PSGDBase):
 
         vals = []
 
-        for p, g in split_p_and_g_in_group(group, should_promote=False):
+        for p, g in self.split_p_and_g_in_group(group, should_promote=False, beta1=beta):
            state = self.state_(p)
 
            if 'Q' not in state:

@@ -128,4 +129,5 @@ class ForeachCachedPSGDKron(PSGDBase):
            else:
                torch.mul(q_.conj(), q_, out=c_)
 
-           precond_grad_cached_(cached_q, ea, self.state_(p)['cache_expr'], p, lr, weight_decay, self.clip_fn)
+           precond_grad_cached_(cached_q, ea, self.state_(p)['cache_expr'], p, lr, weight_decay, self.clip_fn,
+                                group['caution'], g)
{heavyball-0.21.8 → heavyball-0.23.0}/heavyball/delayed_psgd.py

@@ -5,17 +5,16 @@ Source available at https://github.com/evanatyourservice/kron_torch/blob/97a2b5e
 """
 
 import torch
-from heavyball.utils import stochastic_lerp_, beta_debias
+from heavyball.utils import stochastic_lerp_, beta_debias, stochastic_add_
 
 from .utils import update_param_, warmup, psgd_precond_grad, init_Q_exprs, trust_region_clip_, PSGDBase, \
-
+    triu_to_line, line_to_triu, promote, _compilable_update_
+
 
-# TODO: E1123 00:51:55.423000 159394 site-packages/torch/_guards.py:283] [5/0] Error while creating guard:
-# E1123 00:51:55.423000 159394 site-packages/torch/_guards.py:283] [5/0] Name: "G['psgd_precond_grad'].__defaults__[0]"
 @torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
-def _compilable_psgd_precond_grad_(q, exprs, ea, p, lr, weight_decay, clip_fn):
-    new = psgd_precond_grad(
-
+def _compilable_psgd_precond_grad_(q, exprs, ea, p, lr, weight_decay, clip_fn, caution, grad):
+    new = psgd_precond_grad(False, exprs, ea, *q)
+    _compilable_update_([p], clip_fn([new]), weight_decay, stochastic_add_, lr, caution, [grad])
 
 
 class ForeachDelayedPSGD(PSGDBase):

@@ -46,7 +45,8 @@ class ForeachDelayedPSGD(PSGDBase):
                  max_size_triangular=2048, min_ndim_triangular=2, memory_save_mode=None,
                  momentum_into_precond_update=True, warmup_steps: int = 1, merge_dims: bool = False,
                  split: bool = False, clip_fn: callable = None, store_triu_as_line: bool = True, foreach: bool = True,
-                 q_dtype='float32', stochastic_schedule: bool = True, storage_dtype: str = 'float32',
+                 q_dtype='float32', stochastic_schedule: bool = True, storage_dtype: str = 'float32',
+                 mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025, #
                  # expert parameters
                  precond_init_scale=1.0, precond_lr=0.1):
         if not 0.0 <= lr:

@@ -63,7 +63,9 @@ class ForeachDelayedPSGD(PSGDBase):
                         min_ndim_triangular=min_ndim_triangular, memory_save_mode=memory_save_mode,
                         momentum_into_precond_update=momentum_into_precond_update, precond_lr=precond_lr,
                         precond_init_scale=precond_init_scale, step=0, warmup_steps=warmup_steps, merge_dims=merge_dims,
-                        split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype,
+                        split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype,
+                        storage_dtype=storage_dtype,
+                        caution=caution, mars_gamma=mars_gamma, mars=mars)
         super().__init__(params, defaults, foreach, stochastic_schedule, clip_fn, preconditioner_update_probability)
 
     def _step(self, group):

@@ -83,7 +85,7 @@ class ForeachDelayedPSGD(PSGDBase):
 
         vals = []
 
-        for p, g in split_p_and_g_in_group(group, should_promote=False):
+        for p, g in self.split_p_and_g_in_group(group, should_promote=False, beta1=beta):
            state = self.state_(p)
 
            if 'Q' not in state:

@@ -112,7 +114,8 @@ class ForeachDelayedPSGD(PSGDBase):
            q_orig = Q_list.pop(0)
            ea = exp_avg_list.pop(0)
            q = line_to_triu(q_orig) if store_triu_as_line else q_orig
-           _compilable_psgd_precond_grad_(q, self.state_(p)["exprs"], ea, p, lr, weight_decay, self.clip_fn)
+           _compilable_psgd_precond_grad_(q, self.state_(p)["exprs"][-1], ea, p, lr, weight_decay, self.clip_fn, group['caution'],
+                                          g)
            if should_update:
                q32 = [promote(q_) for q_ in q]
                self.do_update(group, [p], [ea if momentum_into_precond_update else g], [q32], precond_lr, [q_orig],
{heavyball-0.21.8 → heavyball-0.23.0}/heavyball/foreach_adamw.py

@@ -1,18 +1,19 @@
 import torch
 import torch.optim
-
 from heavyball.utils import copy_stochastic_list_
+
 from .utils import warmup, exp_avg_sq_, beta_debias, update_param_, StatefulOptimizer, promote
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=
-def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay):
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution):
     g32, exp_avg32, exp_avg_sq32 = [list(map(promote, x)) for x in [grad, exp_avg, exp_avg_sq]]
 
     torch._foreach_lerp_(exp_avg32, g32, 1 - beta_debias(beta1, step + 1))
     denom = list(exp_avg_sq_(exp_avg_sq32, g32, beta_debias(beta2, step + 1), eps))
 
-    update_param_(y, exp_avg32, lr, decay, lambda p, e, l: p.addcdiv_(e, denom.pop(0), value=l))
+    update_param_(y, exp_avg32, lr, decay, lambda p, e, l: p.addcdiv_(e, denom.pop(0), value=l), caution=caution,
+                  grad=g32)
 
     copy_stochastic_list_(exp_avg, exp_avg32)
     copy_stochastic_list_(exp_avg_sq, exp_avg_sq32)

@@ -20,9 +21,11 @@ def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution):
 
 class ForeachAdamW(StatefulOptimizer):
     def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
-                 foreach: bool = True, storage_dtype: str = 'float32'):
+                 foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
+                 mars_gamma: float = 0.0025):
         defaults = dict(lr=lr, betas=betas, eps=eps, k=0, warmup_steps=warmup_steps, train_mode=True, weight_sum=0.0,
-                        lr_max=-1.0, weight_decay=weight_decay, storage_dtype=storage_dtype)
+                        lr_max=-1.0, weight_decay=weight_decay, storage_dtype=storage_dtype, mars=mars, caution=caution,
+                        mars_gamma=mars_gamma)
         super().__init__(params, defaults, foreach)
 
     def _step(self, group):

@@ -48,9 +51,13 @@ class ForeachAdamW(StatefulOptimizer):
         y, grad, exp_avg_sq, exp_avg = zip(
             *[(p.data, p.grad, self.state_(p)['exp_avg_sq'], self.state_(p)['exp_avg']) for p in active_p])
 
+        if group['mars']:
+            self.mars_correct_list(group, y, grad, group['mars_gamma'], group['betas'][0])
+
         lr = -warmup(group['lr'], k + 1, group['warmup_steps'])
         lr = torch.empty((), dtype=torch.float32, device=y[0].device).fill_(lr)
         step = torch.empty((), dtype=torch.int32, device=y[0].device).fill_(k)
-        _compilable_step_(y, grad, exp_avg_sq, exp_avg, group['betas'][0], group['betas'][1], step, lr, eps, decay)
+        _compilable_step_(y, grad, exp_avg_sq, exp_avg, group['betas'][0], group['betas'][1], step, lr, eps, decay,
+                          group['caution'])
 
         group['k'] = k + 1
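
Note how `caution` is threaded from the constructor through `_compilable_step_` into `update_param_(..., caution=caution, grad=g32)`. The masking itself lives in `heavyball/utils.py` (changed in this release but not included in this excerpt). A hedged sketch of what a cautious update mask typically does, assuming the standard "Cautious Optimizers" formulation of dropping update components whose sign disagrees with the current gradient:

```python
import torch

def cautious_mask_(update: torch.Tensor, grad: torch.Tensor) -> torch.Tensor:
    """Illustrative only: keep update entries that agree in sign with the gradient."""
    mask = (update * grad > 0).to(update.dtype)   # 1 where update and gradient point the same way
    mask /= mask.mean().clamp(min=1e-3)           # rescale so the overall update magnitude is roughly preserved
    return update.mul_(mask)
```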
{heavyball-0.21.8 → heavyball-0.23.0}/heavyball/foreach_adopt.py

@@ -5,10 +5,10 @@ from heavyball.utils import copy_stochastic_list_
 from .utils import warmup, beta_debias, update_param_, StatefulOptimizer, promote
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=
-def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay):
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution):
     g32, exp_avg32, exp_avg_sq32 = [list(map(promote, x)) for x in [grad, exp_avg, exp_avg_sq]]
-    update_param_(y, exp_avg, lr, decay)
+    update_param_(y, exp_avg, lr, decay, caution=caution, grad=g32)
 
     beta1 = beta_debias(beta1, step)
     denom = torch._foreach_sqrt(exp_avg_sq32)

@@ -27,9 +27,11 @@ def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution):
 class ForeachADOPT(StatefulOptimizer):
 
     def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
-                 foreach: bool = True, storage_dtype: str = 'float32'):
+                 foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
+                 mars_gamma: float = 0.0025):
         defaults = dict(lr=lr, betas=betas, eps=eps, k=0, warmup_steps=warmup_steps, train_mode=True, weight_sum=0.0,
-                        lr_max=-1.0, weight_decay=weight_decay, storage_dtype=storage_dtype)
+                        lr_max=-1.0, weight_decay=weight_decay, storage_dtype=storage_dtype, mars=mars, caution=caution,
+                        mars_gamma=mars_gamma)
         super().__init__(params, defaults, foreach)
 
     def _step(self, group):

@@ -57,11 +59,14 @@ class ForeachADOPT(StatefulOptimizer):
 
         group['k'] = k + 1
 
+        if group['mars']:
+            self.mars_correct_list(group, y, grad, group['mars_gamma'], group['betas'][0])
+
         if k > 1:
             lr = -warmup(group['lr'], k - 1, group['warmup_steps'])
             lr = torch.empty((), dtype=torch.float32, device=y[0].device).fill_(lr)
             k = torch.empty((), dtype=torch.int32, device=y[0].device).fill_(k)
-            _compilable_step_(y, grad, exp_avg_sq, exp_avg, group['betas'][0], group['betas'][1], k, lr, eps, decay)
+            _compilable_step_(y, grad, exp_avg_sq, exp_avg, group['betas'][0], group['betas'][1], k, lr, eps, decay, group['caution'])
             return
 
         grad = [promote(g) for g in grad]
{heavyball-0.21.8 → heavyball-0.23.0}/heavyball/foreach_laprop.py

@@ -4,8 +4,8 @@ import torch.optim
 from .utils import warmup, exp_avg_sq_, beta_debias, update_param_, StatefulOptimizer, promote, copy_stochastic_list_
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=
-def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay):
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution):
     g32, exp_avg32, exp_avg_sq32 = [list(map(promote, x)) for x in [grad, exp_avg, exp_avg_sq]]
 
     denom = exp_avg_sq_(exp_avg_sq32, g32, beta_debias(beta2, step), eps)

@@ -14,7 +14,7 @@ def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution):
     torch._foreach_mul_(exp_avg32, beta1)
     [ea32.addcdiv_(g, d, value=1 - beta1) for ea32, g, d in zip(exp_avg32, g32, denom)]
 
-    update_param_(y, exp_avg32, lr, decay)
+    update_param_(y, exp_avg32, lr, decay, caution=caution, grad=g32)
 
     copy_stochastic_list_(exp_avg, exp_avg32)
     copy_stochastic_list_(exp_avg_sq, exp_avg_sq32)

@@ -23,9 +23,11 @@ def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution):
 class ForeachLaProp(StatefulOptimizer):
 
     def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=1,
-                 foreach: bool = True, storage_dtype: str = 'float32'):
+                 foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
+                 mars_gamma: float = 0.0025):
         defaults = dict(lr=lr, betas=betas, eps=eps, k=0, warmup_steps=warmup_steps, train_mode=True, weight_sum=0.0,
-                        lr_max=-1.0, weight_decay=weight_decay, storage_dtype=storage_dtype)
+                        lr_max=-1.0, weight_decay=weight_decay, storage_dtype=storage_dtype, mars=mars, caution=caution,
+                        mars_gamma=mars_gamma)
         super().__init__(params, defaults, foreach)
 
     def _step(self, group):

@@ -52,10 +54,14 @@ class ForeachLaProp(StatefulOptimizer):
             *[(p.data, p.grad, self.state_(p)['exp_avg_sq'], self.state_(p)['exp_avg']) #
               for p in active_p])
 
+        if group['mars']:
+            self.mars_correct_list(group, y, grad, group['mars_gamma'], group['betas'][0])
+
         lr = -warmup(group['lr'], k + 1, group['warmup_steps'])
         lr = torch.empty((), dtype=torch.float32, device=y[0].device).fill_(lr)
         step = torch.empty((), dtype=torch.int32, device=y[0].device).fill_(k + 1)
 
-        _compilable_step_(y, grad, exp_avg_sq, exp_avg, group['betas'][0], group['betas'][1], step, lr, eps, decay)
+        _compilable_step_(y, grad, exp_avg_sq, exp_avg, group['betas'][0], group['betas'][1], step, lr, eps, decay,
+                          group['caution'])
 
         group['k'] = k + 1
{heavyball-0.21.8 → heavyball-0.23.0}/heavyball/foreach_sfadamw.py

@@ -5,7 +5,7 @@ from heavyball.utils import get_ckp1, copy_stochastic_list_
 from .utils import warmup, ScheduleFree, exp_avg_sq_, beta_debias, promote, _compilable_schedule_free_
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
 def _compilable_step_(y, grad, exp_avg_sq, z, beta1, beta2, step, ckp1, eps, decay, lr):
     old_debiased2 = beta_debias(beta2, step)
 

@@ -21,13 +21,17 @@ def _compilable_step_(y, grad, exp_avg_sq, z, beta1, beta2, step, ckp1, eps, decay, lr):
 
     copy_stochastic_list_(exp_avg_sq, exp_avg_sq32)
 
+
 class ForeachSFAdamW(ScheduleFree):
     def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0, r=0.0,
-                 weight_lr_power=2.0, foreach: bool = True, storage_dtype: str = 'float32'):
+                 weight_lr_power=2.0, foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False,
+                 caution: bool = False, mars_gamma: float = 0.0025):
+
+        assert not caution, "Caution not implemented for SFAdamW"
 
         defaults = dict(lr=lr, betas=betas, eps=eps, r=r, k=0, warmup_steps=warmup_steps, train_mode=True,
                         weight_sum=0.0, lr_max=-1.0, weight_lr_power=weight_lr_power, weight_decay=weight_decay,
-                        foreach=foreach, storage_dtype=storage_dtype)
+                        foreach=foreach, storage_dtype=storage_dtype, mars=mars, caution=caution, mars_gamma=mars_gamma)
         super().__init__(params, defaults, foreach)
 
     def _step(self, group):

@@ -53,6 +57,9 @@ class ForeachSFAdamW(ScheduleFree):
         y, grad, exp_avg_sq, z = zip(*[(p.data, p.grad, self.state_(p)['exp_avg_sq'], self.state_(p)['z']) #
                                        for p in active_p])
 
+        if group['mars']:
+            self.mars_correct_list(group, y, grad, group['mars_gamma'], group['betas'][0])
+
         lr = warmup(group['lr'], k + 1, group['warmup_steps'])
         ckp1, group['weight_sum'] = get_ckp1(lr, group['weight_lr_power'], group['weight_sum'], group['r'], k + 1)
 
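
The `mars` path calls `self.mars_correct_list(group, y, grad, ...)` before the regular step; the list-level implementation sits in `heavyball/utils.py` and is not part of this excerpt. As a rough single-tensor sketch of the MARS-style correction it presumably applies (formula assumed from the MARS variance-reduction recipe, not taken from the package's code):

```python
import torch

def mars_correct_(grad: torch.Tensor, prev_grad: torch.Tensor,
                  mars_gamma: float = 0.0025, beta1: float = 0.9) -> torch.Tensor:
    """Illustrative only: add a scaled gradient-difference term, then clip to unit norm."""
    c = grad + mars_gamma * (beta1 / (1 - beta1)) * (grad - prev_grad)
    norm = c.norm()
    if norm > 1:                 # keep the corrected gradient bounded
        c = c / norm
    prev_grad.copy_(grad)        # stash the raw gradient for the next call
    return c
```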
{heavyball-0.21.8 → heavyball-0.23.0}/heavyball/foreach_soap.py

@@ -1,7 +1,7 @@
 import torch
 
 from .utils import init_preconditioner, update_preconditioner, project, beta_debias, exp_avg_sq_, update_param_, set_, \
-
+    StatefulOptimizer, exp_avg_
 
 
 class ForeachSOAP(StatefulOptimizer):

@@ -26,11 +26,13 @@ class ForeachSOAP(StatefulOptimizer):
                  weight_decay: float = 0.01, precondition_frequency: int = 2, max_precond_dim: int = 2048, #
                  merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
                  data_format: str = "channels_first", correct_bias: bool = True, warmup_steps: int = 1,
-                 split: bool = False, foreach: bool = True):
+                 split: bool = False, foreach: bool = True, mars: bool = False, caution: bool = False,
+                 mars_gamma: float = 0.0025):
         defaults = {"lr": lr, "betas": betas, "shampoo_beta": shampoo_beta, "eps": eps, "weight_decay": weight_decay,
                     "precondition_frequency": precondition_frequency, "max_precond_dim": max_precond_dim,
                     "merge_dims": merge_dims, "precondition_1d": precondition_1d, "normalize_grads": normalize_grads,
-                    "correct_bias": correct_bias, 'warmup_steps': warmup_steps, 'split': split}
+                    "correct_bias": correct_bias, 'warmup_steps': warmup_steps, 'split': split, 'mars': mars,
+                    'caution': caution, 'mars_gamma': mars_gamma}
         super().__init__(params, defaults, foreach)
         self._data_format = data_format
 

@@ -41,7 +43,7 @@ class ForeachSOAP(StatefulOptimizer):
         max_precond_dim = group['max_precond_dim']
         precondition_1d = group['precondition_1d']
 
-        for p, g in split_p_and_g_in_group(group):
+        for p, g in self.split_p_and_g_in_group(group, beta1=group['betas'][0]):
             state = self.state_(p)
             step = state['step'] = state.get("step", -1) + 1
 

@@ -71,6 +73,8 @@ class ForeachSOAP(StatefulOptimizer):
         step_tensor = torch.empty((), dtype=torch.int32, device=p_list[0].device).fill_(step)
         denom = exp_avg_(exp_avg, exp_avg_sq, grad, grad_projected, beta1, beta2, step_tensor)
 
+        step_size = -group["lr"] * min(step / group['warmup_steps'], 1)
+
         for p, g, ea, d in zip(p_list, grad, exp_avg, denom):
             state = self.state_(p)
             # Projecting the exponential moving average of gradients to the eigenbases of Shampoo's preconditioner

@@ -80,11 +84,9 @@ class ForeachSOAP(StatefulOptimizer):
             # Projecting back the preconditioned (by Adam) exponential moving average of gradients
             # to the original space
             # CANT DO /= HERE AS EXP_AVG MAY POINT TO THE BUFFER
-
+            precond = project(exp_avg_projected / d, state['Q'], True)
 
             update_preconditioner(g, state, max_precond_dim, precondition_1d, old_debiased2,
                                   step > 0 and step % group['precondition_frequency'] == 0)
 
-
-        step_size = -group["lr"] * min(step / group['warmup_steps'], 1)
-        update_param_(p_list, denom, step_size, group["weight_decay"])
+            update_param_([p], [precond], step_size, group["weight_decay"], caution=group['caution'], grad=[g])
{heavyball-0.21.8 → heavyball-0.23.0}/heavyball/p_adam.py

@@ -5,10 +5,10 @@ Source available at https://github.com/evanatyourservice/kron_torch/blob/97a2b5e
 """
 
 import torch
-
 from heavyball.utils import triu_to_line, line_to_triu, identity, stochastic_lerp_
+
 from .utils import update_param_, warmup, psgd_precond_grad, init_Q_exprs, PSGDBase, exp_avg_sq_, beta_debias, \
-
+    promote
 
 
 class ForeachPaLMPAdam(PSGDBase):

@@ -39,7 +39,8 @@ class ForeachPaLMPAdam(PSGDBase):
                  momentum_into_precond_update=True, warmup_steps: int = 1, betas=(None, None), beta: float = 0.9,
                  beta2_scale: float = 0.8, merge_dims: bool = False, split: bool = False, clip_fn: callable = None,
                  store_triu_as_line: bool = True, foreach: bool = True, q_dtype='float32',
-                 stochastic_schedule: bool = True,
+                 stochastic_schedule: bool = True, storage_dtype: str = 'float32', mars: bool = False,
+                 caution: bool = False, mars_gamma: float = 0.0025, #
                  # expert parameters
                  precond_init_scale=1.0, precond_lr=0.1):
         if not 0.0 <= lr:

@@ -57,7 +58,8 @@ class ForeachPaLMPAdam(PSGDBase):
                         momentum_into_precond_update=momentum_into_precond_update, precond_lr=precond_lr,
                         precond_init_scale=precond_init_scale, step=0, warmup_steps=warmup_steps, beta=beta,
                         beta2_scale=beta2_scale, merge_dims=merge_dims, split=split,
-                        store_triu_as_line=store_triu_as_line, q_dtype=q_dtype, storage_dtype=storage_dtype)
+                        store_triu_as_line=store_triu_as_line, q_dtype=q_dtype, storage_dtype=storage_dtype,
+                        mars=mars, caution=caution, mars_gamma=mars_gamma)
         super().__init__(params, defaults, foreach, stochastic_schedule, clip_fn, preconditioner_update_probability)
 
     def _step(self, group):

@@ -75,7 +77,7 @@ class ForeachPaLMPAdam(PSGDBase):
 
         vals = []
 
-        for p, g in split_p_and_g_in_group(group, should_promote=False):
+        for p, g in self.split_p_and_g_in_group(group, should_promote=False, beta1=group['beta']):
            state = self.state_(p)
 
            if 'Q' not in state:

@@ -107,13 +109,13 @@ class ForeachPaLMPAdam(PSGDBase):
         lr = -warmup(lr, group['step'], group['warmup_steps'])
 
         for p, Q, g, ea, eas in zip(p_list, Q_triu, grad_list, exp_avg, exp_avg_sq):
-
-
+            gc = g.clone() if group['caution'] else None
+            psgd_precond_grad(True, self.state_(p)["exprs"][-1], g, *Q)
+            ea = psgd_precond_grad(False, self.state_(p)["exprs"][-1], ea, *Q)
             exp_avg_sq_(eas, g, beta_debias(beta2, group['step']), 1e-8, out=g)
             torch.div(ea, g, out=g)
             """
            divide by g here, because g == denom (from exp_avg_sq_(out=g)), avoids denom allocation
            divide into g so we can deallocate ea, avoids one allocation (-> less memory than equivalent foreach)
             """
-            update_param_([p], self.clip_fn([g]), lr, weight_decay)
-
+            update_param_([p], self.clip_fn([g]), lr, weight_decay, caution=group['caution'], grad=gc)
{heavyball-0.21.8 → heavyball-0.23.0}/heavyball/palm_foreach_sfadamw.py

@@ -5,7 +5,7 @@ from .utils import warmup, ScheduleFree, exp_avg_sq_, beta_debias, get_ckp1, pro
     _compilable_schedule_free_, copy_stochastic_list_
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
 def _compilable_step_(y, grad, exp_avg_sq, z, beta1, beta2, step, ckp1, eps, decay, lr):
     old_debiased2 = beta_debias(beta2, step)
 

@@ -24,12 +24,17 @@ def _compilable_step_(y, grad, exp_avg_sq, z, beta1, beta2, step, ckp1, eps, decay, lr):
 
 class PaLMForeachSFAdamW(ScheduleFree):
     def __init__(self, params, lr=0.0025, beta=0.9, betas=(None, None), eps=1e-8, weight_decay=0, warmup_steps=0, r=0.0,
-                 weight_lr_power=2.0, beta2_scale: float = 0.8, foreach: bool = True, storage_dtype: str = 'float32'):
+                 weight_lr_power=2.0, beta2_scale: float = 0.8, foreach: bool = True, storage_dtype: str = 'float32',
+                 mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025):
         if betas[0] is not None:
             beta = betas[0]
+
+        assert not caution, "Caution not implemented for SFAdamW"
+
         defaults = dict(lr=lr, beta=beta, eps=eps, r=r, k=0, warmup_steps=warmup_steps, train_mode=True, weight_sum=0.0,
                         lr_max=-1.0, weight_lr_power=weight_lr_power, weight_decay=weight_decay,
-                        beta2_scale=beta2_scale, storage_dtype=storage_dtype)
+                        beta2_scale=beta2_scale, storage_dtype=storage_dtype, mars=mars, caution=caution,
+                        mars_gamma=mars_gamma)
         super().__init__(params, defaults, foreach)
 
     def _step(self, group):

@@ -58,6 +63,9 @@ class PaLMForeachSFAdamW(ScheduleFree):
         y, grad, exp_avg_sq, z = zip(*[(p.data, p.grad, self.state_(p)['exp_avg_sq'], self.state_(p)['z']) #
                                        for p in active_p])
 
+        if group['mars']:
+            self.mars_correct_list(group, y, grad, group['mars_gamma'], group['betas'][0])
+
         lr = warmup(group['lr'], k + 1, group['warmup_steps'])
         ckp1, group['weight_sum'] = get_ckp1(lr, group['weight_lr_power'], group['weight_sum'], group['r'], k + 1)
 
{heavyball-0.21.8 → heavyball-0.23.0}/heavyball/palm_foreach_soap.py

@@ -1,8 +1,7 @@
 import torch
 
 from .utils import init_preconditioner, update_preconditioner, project, beta_debias, exp_avg_sq_, update_param_, set_, \
-
-
+    StatefulOptimizer, exp_avg_
 
 
 class PaLMForeachSOAP(StatefulOptimizer):

@@ -33,14 +32,15 @@ class PaLMForeachSOAP(StatefulOptimizer):
                  max_precond_dim: int = 2048, #
                  merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
                  data_format: str = "channels_first", correct_bias: bool = True, warmup_steps: int = 1,
-                 beta2_scale: float = 0.8, split: bool = False, foreach: bool = True):
+                 beta2_scale: float = 0.8, split: bool = False, foreach: bool = True, mars: bool = False,
+                 caution: bool = False, mars_gamma: float = 0.0025):
         if betas[0] is not None:
             beta = betas[0]
         defaults = {"lr": lr, "beta": beta, "shampoo_beta": shampoo_beta, "eps": eps, "weight_decay": weight_decay,
                     "precondition_frequency": precondition_frequency, "max_precond_dim": max_precond_dim,
                     "merge_dims": merge_dims, "precondition_1d": precondition_1d, "normalize_grads": normalize_grads,
                     "correct_bias": correct_bias, 'warmup_steps': warmup_steps, 'beta2_scale': beta2_scale,
-                    'split': split}
+                    'split': split, 'mars': mars, 'caution': caution, 'mars_gamma': mars_gamma}
         super().__init__(params, defaults, foreach)
         self._data_format = data_format
 

@@ -51,7 +51,7 @@ class PaLMForeachSOAP(StatefulOptimizer):
         max_precond_dim = group['max_precond_dim']
         precondition_1d = group['precondition_1d']
 
-        for p, g in split_p_and_g_in_group(group):
+        for p, g in self.split_p_and_g_in_group(group, beta1=group['beta']):
             state = self.state_(p)
             step = state['step'] = state.get("step", -1) + 1
 

@@ -82,6 +82,7 @@ class PaLMForeachSOAP(StatefulOptimizer):
         beta2 = torch.empty((), dtype=torch.float32, device=p_list[0].device).fill_(beta2)
         step_tensor = torch.empty((), dtype=torch.int32, device=p_list[0].device).fill_(step)
         denom = exp_avg_(exp_avg, exp_avg_sq, grad, grad_projected, beta1, beta2, step_tensor)
+        step_size = -group["lr"] * min(step / group['warmup_steps'], 1)
 
         for p, g, ea, d in zip(p_list, grad, exp_avg, denom):
             state = self.state_(p)

@@ -92,11 +93,9 @@ class PaLMForeachSOAP(StatefulOptimizer):
             # Projecting back the preconditioned (by Adam) exponential moving average of gradients
             # to the original space
             # CANT DO /= HERE AS EXP_AVG MAY POINT TO THE BUFFER
-
+            precond = project(exp_avg_projected / d, state['Q'], True)
 
             update_preconditioner(g, state, max_precond_dim, precondition_1d, old_debiased2,
                                   step > 0 and step % group['precondition_frequency'] == 0)
 
-
-        step_size = -group["lr"] * min(step / group['warmup_steps'], 1)
-        update_param_(p_list, denom, step_size, group["weight_decay"])
+            update_param_([p], [precond], step_size, group["weight_decay"], caution=group['caution'], grad=[g])