heavyball 0.21.8__py3-none-any.whl → 0.22.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- heavyball/__init__.py +6 -5
- heavyball/cached_delayed_psgd_kron.py +6 -5
- heavyball/cached_psgd_kron.py +7 -5
- heavyball/delayed_psgd.py +12 -9
- heavyball/foreach_adamw.py +14 -7
- heavyball/foreach_adopt.py +11 -6
- heavyball/foreach_laprop.py +12 -6
- heavyball/foreach_sfadamw.py +10 -3
- heavyball/foreach_soap.py +10 -8
- heavyball/p_adam.py +9 -7
- heavyball/palm_foreach_sfadamw.py +11 -3
- heavyball/palm_foreach_soap.py +8 -9
- heavyball/precond_schedule_foreach_soap.py +10 -8
- heavyball/precond_schedule_palm_foreach_soap.py +9 -9
- heavyball/precond_schedule_sfpsoap.py +10 -5
- heavyball/psgd_kron.py +8 -5
- heavyball/pure_psgd.py +10 -6
- heavyball/schedule_free_palm_foreach_soap.py +13 -5
- heavyball/utils.py +112 -46
- {heavyball-0.21.8.dist-info → heavyball-0.22.0.dist-info}/METADATA +2 -2
- heavyball-0.22.0.dist-info/RECORD +24 -0
- heavyball-0.21.8.dist-info/RECORD +0 -24
- {heavyball-0.21.8.dist-info → heavyball-0.22.0.dist-info}/LICENSE +0 -0
- {heavyball-0.21.8.dist-info → heavyball-0.22.0.dist-info}/WHEEL +0 -0
- {heavyball-0.21.8.dist-info → heavyball-0.22.0.dist-info}/top_level.txt +0 -0
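
The change that repeats across every optimizer module below is a set of three new keyword arguments, `mars`, `caution`, and `mars_gamma`, plus the plumbing that forwards them through the param-group defaults into the shared helpers in `heavyball/utils.py`. As a minimal usage sketch, assuming only the constructor signatures visible in this diff (the model and loss here are placeholders):

    import torch
    import heavyball

    model = torch.nn.Linear(16, 1)  # placeholder module

    # New in 0.22.0: mars / caution / mars_gamma keyword arguments (defaults shown in the diff).
    opt = heavyball.ForeachAdamW(model.parameters(), lr=0.0025,
                                 mars=True,          # apply the MARS gradient correction
                                 caution=True,       # apply the cautious (sign-agreement) update mask
                                 mars_gamma=0.0025)  # strength of the MARS correction

    loss = model(torch.randn(4, 16)).square().mean()
    loss.backward()
    opt.step()
    opt.zero_grad()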
heavyball/__init__.py
CHANGED
@@ -1,3 +1,4 @@
+from .cached_delayed_psgd_kron import ForeachCachedDelayedPSGDKron
 from .cached_psgd_kron import ForeachCachedPSGDKron
 from .delayed_psgd import ForeachDelayedPSGD
 from .foreach_adamw import ForeachAdamW
@@ -14,7 +15,6 @@ from .precond_schedule_sfpsoap import PrecondScheduleSFPaLMSOAP
 from .psgd_kron import ForeachPSGDKron
 from .pure_psgd import ForeachPurePSGD
 from .schedule_free_palm_foreach_soap import SFPaLMForeachSOAP
-from .cached_delayed_psgd_kron import ForeachCachedDelayedPSGDKron
 
 PalmForEachSoap = PaLMForeachSOAP
 
@@ -39,7 +39,8 @@ CachedDelayedPSGDKron = ForeachCachedDelayedPSGDKron
 __all__ = ['PalmForEachSoap', 'PaLMForeachSFAdamW', 'PaLMForeachSOAP', 'SFPaLMForeachSOAP', 'PrecondScheduleSFPaLMSOAP',
 'ForeachSOAP', 'ForeachSFAdamW', 'ForeachLaProp', 'ForeachADOPT', 'PrecondScheduleForeachSOAP',
 'PrecondSchedulePaLMForeachSOAP', 'ForeachPSGDKron', 'ForeachAdamW', 'ForeachPurePSGD', 'ForeachPaLMPAdam',
-'ForeachDelayedPSGD', 'ForeachCachedPSGDKron', 'ForeachCachedDelayedPSGDKron',
-
-'
-'
+'ForeachDelayedPSGD', 'ForeachCachedPSGDKron', 'ForeachCachedDelayedPSGDKron',
+#
+'PaLMSOAP', 'PaLMSFAdamW', 'PaLMSFSoap', 'PaLMSFAdamW', 'PrecondScheduleSFPaLMSOAP', 'SOAP', 'SFAdamW',
+'LaProp', 'ADOPT', 'PSGDKron', 'AdamW', 'PurePSGD', 'PaLMPAdam', 'DelayedPSGD', 'CachedPSGDKron',
+'CachedDelayedPSGDKron', 'PrecondScheduleSOAP', 'PrecondSchedulePaLMSOAP']
heavyball/cached_delayed_psgd_kron.py
CHANGED
@@ -9,7 +9,7 @@ from typing import Optional
 import torch
 from heavyball.utils import min_dtype, precond_grad_cached_
 
-from .utils import update_param_, warmup, init_Q_exprs, trust_region_clip_, PSGDBase,
+from .utils import update_param_, warmup, init_Q_exprs, trust_region_clip_, PSGDBase, \
 line_to_triu, triu_to_line, einsum_base, promote, stochastic_lerp_, beta_debias, precond_grad_cached_
 
 
@@ -43,7 +43,8 @@ class ForeachCachedDelayedPSGDKron(PSGDBase):
 momentum_into_precond_update=True, warmup_steps: int = 1, merge_dims: bool = False,
 split: bool = False, clip_fn: Optional[callable] = None, store_triu_as_line: bool = True,
 foreach: bool = True, q_dtype='float32', stochastic_schedule: bool = True,
-storage_dtype: str = 'float32',
+storage_dtype: str = 'float32', mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025,
+#
 # expert parameters
 precond_init_scale=1.0, precond_lr=0.1):
 if not 0.0 <= lr:
@@ -61,7 +62,7 @@ class ForeachCachedDelayedPSGDKron(PSGDBase):
 momentum_into_precond_update=momentum_into_precond_update, precond_lr=precond_lr,
 precond_init_scale=precond_init_scale, step=0, warmup_steps=warmup_steps, merge_dims=merge_dims,
 split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype,
-storage_dtype=storage_dtype)
+storage_dtype=storage_dtype, caution=caution, mars_gamma=mars_gamma, mars=mars)
 super().__init__(params, defaults, foreach, stochastic_schedule, clip_fn, preconditioner_update_probability)
 
 def _step(self, group):
@@ -81,7 +82,7 @@ class ForeachCachedDelayedPSGDKron(PSGDBase):
 
 vals = []
 
-for p, g in split_p_and_g_in_group(group, should_promote=False):
+for p, g in self.split_p_and_g_in_group(group, should_promote=False, beta1=beta):
 state = self.state_(p)
 
 if 'Q' not in state:
@@ -120,7 +121,7 @@ class ForeachCachedDelayedPSGDKron(PSGDBase):
 q_orig = Q_list.pop(0)
 ea = exp_avg_list.pop(0)
 
-precond_grad_cached_(cached_q, ea, self.state_(p)['cache_expr'], p, lr, weight_decay, self.clip_fn)
+precond_grad_cached_(cached_q, ea, self.state_(p)['cache_expr'], p, lr, weight_decay, self.clip_fn, group['caution'], g)
 
 if should_update:
 q = line_to_triu(q_orig) if store_triu_as_line else q_orig
heavyball/cached_psgd_kron.py
CHANGED
@@ -8,7 +8,7 @@ from typing import Optional
 
 import torch
 
-from .utils import update_param_, warmup, init_Q_exprs, trust_region_clip_, PSGDBase,
+from .utils import update_param_, warmup, init_Q_exprs, trust_region_clip_, PSGDBase, \
 line_to_triu, triu_to_line, einsum_base, promote, stochastic_lerp_, beta_debias, precond_grad_cached_
 
 
@@ -40,7 +40,8 @@ class ForeachCachedPSGDKron(PSGDBase):
 momentum_into_precond_update=True, warmup_steps: int = 1, merge_dims: bool = False,
 split: bool = False, clip_fn: Optional[callable] = None, store_triu_as_line: bool = True,
 foreach: bool = True, q_dtype='float32', stochastic_schedule: bool = True,
-storage_dtype: str = 'float32',
+storage_dtype: str = 'float32', mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025,
+#
 # expert parameters
 precond_init_scale=1.0, precond_lr=0.1):
 if not 0.0 <= lr:
@@ -58,7 +59,7 @@ class ForeachCachedPSGDKron(PSGDBase):
 momentum_into_precond_update=momentum_into_precond_update, precond_lr=precond_lr,
 precond_init_scale=precond_init_scale, step=0, warmup_steps=warmup_steps, merge_dims=merge_dims,
 split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype,
-storage_dtype=storage_dtype)
+storage_dtype=storage_dtype, caution=caution, mars_gamma=mars_gamma, mars=mars)
 super().__init__(params, defaults, foreach, stochastic_schedule, clip_fn, preconditioner_update_probability)
 
 def _step(self, group):
@@ -78,7 +79,7 @@ class ForeachCachedPSGDKron(PSGDBase):
 
 vals = []
 
-for p, g in split_p_and_g_in_group(group, should_promote=False):
+for p, g in self.split_p_and_g_in_group(group, should_promote=False, beta1=beta):
 state = self.state_(p)
 
 if 'Q' not in state:
@@ -128,4 +129,5 @@ class ForeachCachedPSGDKron(PSGDBase):
 else:
 torch.mul(q_.conj(), q_, out=c_)
 
-precond_grad_cached_(cached_q, ea, self.state_(p)['cache_expr'], p, lr, weight_decay, self.clip_fn
+precond_grad_cached_(cached_q, ea, self.state_(p)['cache_expr'], p, lr, weight_decay, self.clip_fn,
+group['caution'], g)
heavyball/delayed_psgd.py
CHANGED
@@ -8,14 +8,13 @@ import torch
 from heavyball.utils import stochastic_lerp_, beta_debias
 
 from .utils import update_param_, warmup, psgd_precond_grad, init_Q_exprs, trust_region_clip_, PSGDBase, \
-
+triu_to_line, line_to_triu, promote
+
 
-# TODO: E1123 00:51:55.423000 159394 site-packages/torch/_guards.py:283] [5/0] Error while creating guard:
-# E1123 00:51:55.423000 159394 site-packages/torch/_guards.py:283] [5/0] Name: "G['psgd_precond_grad'].__defaults__[0]"
 @torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
-def _compilable_psgd_precond_grad_(q, exprs, ea, p, lr,
+def _compilable_psgd_precond_grad_(q, exprs, ea, p, lr, weight_deca, clip_fn, caution, grad):
 new = psgd_precond_grad(q, exprs, ea)
-update_param_([p], clip_fn([new]), lr, weight_decay)
+update_param_([p], clip_fn([new]), lr, weight_decay, caution=caution, grad=grad)
 
 
 class ForeachDelayedPSGD(PSGDBase):
@@ -46,7 +45,8 @@ class ForeachDelayedPSGD(PSGDBase):
 max_size_triangular=2048, min_ndim_triangular=2, memory_save_mode=None,
 momentum_into_precond_update=True, warmup_steps: int = 1, merge_dims: bool = False,
 split: bool = False, clip_fn: callable = None, store_triu_as_line: bool = True, foreach: bool = True,
-q_dtype='float32', stochastic_schedule: bool = True, storage_dtype: str = 'float32',
+q_dtype='float32', stochastic_schedule: bool = True, storage_dtype: str = 'float32',
+mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025, #
 # expert parameters
 precond_init_scale=1.0, precond_lr=0.1):
 if not 0.0 <= lr:
@@ -63,7 +63,9 @@ class ForeachDelayedPSGD(PSGDBase):
 min_ndim_triangular=min_ndim_triangular, memory_save_mode=memory_save_mode,
 momentum_into_precond_update=momentum_into_precond_update, precond_lr=precond_lr,
 precond_init_scale=precond_init_scale, step=0, warmup_steps=warmup_steps, merge_dims=merge_dims,
-split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype,
+split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype,
+storage_dtype=storage_dtype,
+caution=caution, mars_gamma=mars_gamma, mars=mars)
 super().__init__(params, defaults, foreach, stochastic_schedule, clip_fn, preconditioner_update_probability)
 
 def _step(self, group):
@@ -83,7 +85,7 @@ class ForeachDelayedPSGD(PSGDBase):
 
 vals = []
 
-for p, g in split_p_and_g_in_group(group, should_promote=False):
+for p, g in self.split_p_and_g_in_group(group, should_promote=False, beta1=beta):
 state = self.state_(p)
 
 if 'Q' not in state:
@@ -112,7 +114,8 @@ class ForeachDelayedPSGD(PSGDBase):
 q_orig = Q_list.pop(0)
 ea = exp_avg_list.pop(0)
 q = line_to_triu(q_orig) if store_triu_as_line else q_orig
-_compilable_psgd_precond_grad_(q, self.state_(p)["exprs"], ea, p, lr, weight_decay, self.clip_fn
+_compilable_psgd_precond_grad_(q, self.state_(p)["exprs"], ea, p, lr, weight_decay, self.clip_fn, group['caution'],
+g)
 if should_update:
 q32 = [promote(q_) for q_ in q]
 self.do_update(group, [p], [ea if momentum_into_precond_update else g], [q32], precond_lr, [q_orig],
heavyball/foreach_adamw.py
CHANGED
@@ -1,18 +1,19 @@
 import torch
 import torch.optim
-
 from heavyball.utils import copy_stochastic_list_
+
 from .utils import warmup, exp_avg_sq_, beta_debias, update_param_, StatefulOptimizer, promote
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=
-def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay):
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution):
 g32, exp_avg32, exp_avg_sq32 = [list(map(promote, x)) for x in [grad, exp_avg, exp_avg_sq]]
 
 torch._foreach_lerp_(exp_avg32, g32, 1 - beta_debias(beta1, step + 1))
 denom = list(exp_avg_sq_(exp_avg_sq32, g32, beta_debias(beta2, step + 1), eps))
 
-update_param_(y, exp_avg32, lr, decay, lambda p, e, l: p.addcdiv_(e, denom.pop(0), value=l)
+update_param_(y, exp_avg32, lr, decay, lambda p, e, l: p.addcdiv_(e, denom.pop(0), value=l), caution=caution,
+grad=g32)
 
 copy_stochastic_list_(exp_avg, exp_avg32)
 copy_stochastic_list_(exp_avg_sq, exp_avg_sq32)
@@ -20,9 +21,11 @@ def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps,
 
 class ForeachAdamW(StatefulOptimizer):
 def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
-foreach: bool = True, storage_dtype: str = 'float32'
+foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
+mars_gamma: float = 0.0025):
 defaults = dict(lr=lr, betas=betas, eps=eps, k=0, warmup_steps=warmup_steps, train_mode=True, weight_sum=0.0,
-lr_max=-1.0, weight_decay=weight_decay, storage_dtype=storage_dtype
+lr_max=-1.0, weight_decay=weight_decay, storage_dtype=storage_dtype, mars=mars, caution=caution,
+mars_gamma=mars_gamma)
 super().__init__(params, defaults, foreach)
 
 def _step(self, group):
@@ -48,9 +51,13 @@ class ForeachAdamW(StatefulOptimizer):
 y, grad, exp_avg_sq, exp_avg = zip(
 *[(p.data, p.grad, self.state_(p)['exp_avg_sq'], self.state_(p)['exp_avg']) for p in active_p])
 
+if group['mars']:
+self.mars_correct_list(group, y, grad, group['mars_gamma'], group['betas'][0])
+
 lr = -warmup(group['lr'], k + 1, group['warmup_steps'])
 lr = torch.empty((), dtype=torch.float32, device=y[0].device).fill_(lr)
 step = torch.empty((), dtype=torch.int32, device=y[0].device).fill_(k)
-_compilable_step_(y, grad, exp_avg_sq, exp_avg, group['betas'][0], group['betas'][1], step, lr, eps, decay
+_compilable_step_(y, grad, exp_avg_sq, exp_avg, group['betas'][0], group['betas'][1], step, lr, eps, decay,
+group['caution'])
 
 group['k'] = k + 1
heavyball/foreach_adopt.py
CHANGED
@@ -5,10 +5,10 @@ from heavyball.utils import copy_stochastic_list_
 from .utils import warmup, beta_debias, update_param_, StatefulOptimizer, promote
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=
-def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay):
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution):
 g32, exp_avg32, exp_avg_sq32 = [list(map(promote, x)) for x in [grad, exp_avg, exp_avg_sq]]
-update_param_(y, exp_avg, lr, decay)
+update_param_(y, exp_avg, lr, decay, caution=caution, grad=g32)
 
 beta1 = beta_debias(beta1, step)
 denom = torch._foreach_sqrt(exp_avg_sq32)
@@ -27,9 +27,11 @@ def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps,
 class ForeachADOPT(StatefulOptimizer):
 
 def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
-foreach: bool = True, storage_dtype: str = 'float32'
+foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
+mars_gamma: float = 0.0025):
 defaults = dict(lr=lr, betas=betas, eps=eps, k=0, warmup_steps=warmup_steps, train_mode=True, weight_sum=0.0,
-lr_max=-1.0, weight_decay=weight_decay, storage_dtype=storage_dtype
+lr_max=-1.0, weight_decay=weight_decay, storage_dtype=storage_dtype, mars=mars, caution=caution,
+mars_gamma=mars_gamma)
 super().__init__(params, defaults, foreach)
 
 def _step(self, group):
@@ -57,11 +59,14 @@ class ForeachADOPT(StatefulOptimizer):
 
 group['k'] = k + 1
 
+if group['mars']:
+self.mars_correct_list(group, y, grad, group['mars_gamma'], group['betas'][0])
+
 if k > 1:
 lr = -warmup(group['lr'], k - 1, group['warmup_steps'])
 lr = torch.empty((), dtype=torch.float32, device=y[0].device).fill_(lr)
 k = torch.empty((), dtype=torch.int32, device=y[0].device).fill_(k)
-_compilable_step_(y, grad, exp_avg_sq, exp_avg, group['betas'][0], group['betas'][1], k, lr, eps, decay)
+_compilable_step_(y, grad, exp_avg_sq, exp_avg, group['betas'][0], group['betas'][1], k, lr, eps, decay, group['caution'])
 return
 
 grad = [promote(g) for g in grad]
heavyball/foreach_laprop.py
CHANGED
@@ -4,8 +4,8 @@ import torch.optim
 from .utils import warmup, exp_avg_sq_, beta_debias, update_param_, StatefulOptimizer, promote, copy_stochastic_list_
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=
-def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay):
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution):
 g32, exp_avg32, exp_avg_sq32 = [list(map(promote, x)) for x in [grad, exp_avg, exp_avg_sq]]
 
 denom = exp_avg_sq_(exp_avg_sq32, g32, beta_debias(beta2, step), eps)
@@ -14,7 +14,7 @@ def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps,
 torch._foreach_mul_(exp_avg32, beta1)
 [ea32.addcdiv_(g, d, value=1 - beta1) for ea32, g, d in zip(exp_avg32, g32, denom)]
 
-update_param_(y, exp_avg32, lr, decay)
+update_param_(y, exp_avg32, lr, decay, caution=caution, grad=g32)
 
 copy_stochastic_list_(exp_avg, exp_avg32)
 copy_stochastic_list_(exp_avg_sq, exp_avg_sq32)
@@ -23,9 +23,11 @@ def _compilable_step_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps,
 class ForeachLaProp(StatefulOptimizer):
 
 def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=1,
-foreach: bool = True, storage_dtype: str = 'float32'
+foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
+mars_gamma: float = 0.0025):
 defaults = dict(lr=lr, betas=betas, eps=eps, k=0, warmup_steps=warmup_steps, train_mode=True, weight_sum=0.0,
-lr_max=-1.0, weight_decay=weight_decay, storage_dtype=storage_dtype
+lr_max=-1.0, weight_decay=weight_decay, storage_dtype=storage_dtype, mars=mars, caution=caution,
+mars_gamma=mars_gamma)
 super().__init__(params, defaults, foreach)
 
 def _step(self, group):
@@ -52,10 +54,14 @@ class ForeachLaProp(StatefulOptimizer):
 *[(p.data, p.grad, self.state_(p)['exp_avg_sq'], self.state_(p)['exp_avg']) #
 for p in active_p])
 
+if group['mars']:
+self.mars_correct_list(group, y, grad, group['mars_gamma'], group['betas'][0])
+
 lr = -warmup(group['lr'], k + 1, group['warmup_steps'])
 lr = torch.empty((), dtype=torch.float32, device=y[0].device).fill_(lr)
 step = torch.empty((), dtype=torch.int32, device=y[0].device).fill_(k + 1)
 
-_compilable_step_(y, grad, exp_avg_sq, exp_avg, group['betas'][0], group['betas'][1], step, lr, eps, decay
+_compilable_step_(y, grad, exp_avg_sq, exp_avg, group['betas'][0], group['betas'][1], step, lr, eps, decay,
+group['caution'])
 
 group['k'] = k + 1
heavyball/foreach_sfadamw.py
CHANGED
@@ -5,7 +5,7 @@ from heavyball.utils import get_ckp1, copy_stochastic_list_
 from .utils import warmup, ScheduleFree, exp_avg_sq_, beta_debias, promote, _compilable_schedule_free_
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
 def _compilable_step_(y, grad, exp_avg_sq, z, beta1, beta2, step, ckp1, eps, decay, lr):
 old_debiased2 = beta_debias(beta2, step)
 
@@ -21,13 +21,17 @@ def _compilable_step_(y, grad, exp_avg_sq, z, beta1, beta2, step, ckp1, eps, dec
 
 copy_stochastic_list_(exp_avg_sq, exp_avg_sq32)
 
+
 class ForeachSFAdamW(ScheduleFree):
 def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0, r=0.0,
-weight_lr_power=2.0, foreach: bool = True, storage_dtype: str = 'float32'
+weight_lr_power=2.0, foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False,
+caution: bool = False, mars_gamma: float = 0.0025):
+
+assert not caution, "Caution not implemented for SFAdamW"
 
 defaults = dict(lr=lr, betas=betas, eps=eps, r=r, k=0, warmup_steps=warmup_steps, train_mode=True,
 weight_sum=0.0, lr_max=-1.0, weight_lr_power=weight_lr_power, weight_decay=weight_decay,
-foreach=foreach, storage_dtype=storage_dtype)
+foreach=foreach, storage_dtype=storage_dtype, mars=mars, caution=caution, mars_gamma=mars_gamma)
 super().__init__(params, defaults, foreach)
 
 def _step(self, group):
@@ -53,6 +57,9 @@ class ForeachSFAdamW(ScheduleFree):
 y, grad, exp_avg_sq, z = zip(*[(p.data, p.grad, self.state_(p)['exp_avg_sq'], self.state_(p)['z']) #
 for p in active_p])
 
+if group['mars']:
+self.mars_correct_list(group, y, grad, group['mars_gamma'], group['betas'][0])
+
 lr = warmup(group['lr'], k + 1, group['warmup_steps'])
 ckp1, group['weight_sum'] = get_ckp1(lr, group['weight_lr_power'], group['weight_sum'], group['r'], k + 1)
 
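
The schedule-free optimizers accept the new keyword arguments for API consistency, but `caution=True` is rejected by the assertion added above, while `mars=True` is honored through `mars_correct_list`. A small sketch of the resulting behavior, using a hypothetical one-parameter list:

    import torch
    from heavyball import ForeachSFAdamW

    params = [torch.nn.Parameter(torch.zeros(4))]  # hypothetical parameter list

    opt = ForeachSFAdamW(params, mars=True)  # accepted; the MARS correction runs inside _step
    try:
        ForeachSFAdamW(params, caution=True)
    except AssertionError as err:
        print(err)  # "Caution not implemented for SFAdamW"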
heavyball/foreach_soap.py
CHANGED
@@ -1,7 +1,7 @@
 import torch
 
 from .utils import init_preconditioner, update_preconditioner, project, beta_debias, exp_avg_sq_, update_param_, set_, \
-
+StatefulOptimizer, exp_avg_
 
 
 class ForeachSOAP(StatefulOptimizer):
@@ -26,11 +26,13 @@ class ForeachSOAP(StatefulOptimizer):
 weight_decay: float = 0.01, precondition_frequency: int = 2, max_precond_dim: int = 2048, #
 merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
 data_format: str = "channels_first", correct_bias: bool = True, warmup_steps: int = 1,
-split: bool = False, foreach: bool = True
+split: bool = False, foreach: bool = True, mars: bool = False, caution: bool = False,
+mars_gamma: float = 0.0025):
 defaults = {"lr": lr, "betas": betas, "shampoo_beta": shampoo_beta, "eps": eps, "weight_decay": weight_decay,
 "precondition_frequency": precondition_frequency, "max_precond_dim": max_precond_dim,
 "merge_dims": merge_dims, "precondition_1d": precondition_1d, "normalize_grads": normalize_grads,
-"correct_bias": correct_bias, 'warmup_steps': warmup_steps, 'split': split
+"correct_bias": correct_bias, 'warmup_steps': warmup_steps, 'split': split, 'mars': mars,
+'caution': caution, 'mars_gamma': mars_gamma}
 super().__init__(params, defaults, foreach)
 self._data_format = data_format
 
@@ -41,7 +43,7 @@ class ForeachSOAP(StatefulOptimizer):
 max_precond_dim = group['max_precond_dim']
 precondition_1d = group['precondition_1d']
 
-for p, g in split_p_and_g_in_group(group):
+for p, g in self.split_p_and_g_in_group(group, beta1=group['betas'][0]):
 state = self.state_(p)
 step = state['step'] = state.get("step", -1) + 1
 
@@ -71,6 +73,8 @@ class ForeachSOAP(StatefulOptimizer):
 step_tensor = torch.empty((), dtype=torch.int32, device=p_list[0].device).fill_(step)
 denom = exp_avg_(exp_avg, exp_avg_sq, grad, grad_projected, beta1, beta2, step_tensor)
 
+step_size = -group["lr"] * min(step / group['warmup_steps'], 1)
+
 for p, g, ea, d in zip(p_list, grad, exp_avg, denom):
 state = self.state_(p)
 # Projecting the exponential moving average of gradients to the eigenbases of Shampoo's preconditioner
@@ -80,11 +84,9 @@ class ForeachSOAP(StatefulOptimizer):
 # Projecting back the preconditioned (by Adam) exponential moving average of gradients
 # to the original space
 # CANT DO /= HERE AS EXP_AVG MAY POINT TO THE BUFFER
-
+precond = project(exp_avg_projected / d, state['Q'], True)
 
 update_preconditioner(g, state, max_precond_dim, precondition_1d, old_debiased2,
 step > 0 and step % group['precondition_frequency'] == 0)
 
-
-step_size = -group["lr"] * min(step / group['warmup_steps'], 1)
-update_param_(p_list, denom, step_size, group["weight_decay"])
+update_param_([p], [precond], step_size, group["weight_decay"], caution=group['caution'], grad=[g])
heavyball/p_adam.py
CHANGED
@@ -5,10 +5,10 @@ Source available at https://github.com/evanatyourservice/kron_torch/blob/97a2b5e
 """
 
 import torch
-
 from heavyball.utils import triu_to_line, line_to_triu, identity, stochastic_lerp_
+
 from .utils import update_param_, warmup, psgd_precond_grad, init_Q_exprs, PSGDBase, exp_avg_sq_, beta_debias, \
-
+promote
 
 
 class ForeachPaLMPAdam(PSGDBase):
@@ -39,7 +39,8 @@ class ForeachPaLMPAdam(PSGDBase):
 momentum_into_precond_update=True, warmup_steps: int = 1, betas=(None, None), beta: float = 0.9,
 beta2_scale: float = 0.8, merge_dims: bool = False, split: bool = False, clip_fn: callable = None,
 store_triu_as_line: bool = True, foreach: bool = True, q_dtype='float32',
-stochastic_schedule: bool = True,
+stochastic_schedule: bool = True, storage_dtype: str = 'float32', mars: bool = False,
+caution: bool = False, mars_gamma: float = 0.0025, #
 # expert parameters
 precond_init_scale=1.0, precond_lr=0.1):
 if not 0.0 <= lr:
@@ -57,7 +58,8 @@ class ForeachPaLMPAdam(PSGDBase):
 momentum_into_precond_update=momentum_into_precond_update, precond_lr=precond_lr,
 precond_init_scale=precond_init_scale, step=0, warmup_steps=warmup_steps, beta=beta,
 beta2_scale=beta2_scale, merge_dims=merge_dims, split=split,
-store_triu_as_line=store_triu_as_line, q_dtype=q_dtype, storage_dtype=storage_dtype
+store_triu_as_line=store_triu_as_line, q_dtype=q_dtype, storage_dtype=storage_dtype,
+mars=mars, caution=caution, mars_gamma=mars_gamma)
 super().__init__(params, defaults, foreach, stochastic_schedule, clip_fn, preconditioner_update_probability)
 
 def _step(self, group):
@@ -75,7 +77,7 @@ class ForeachPaLMPAdam(PSGDBase):
 
 vals = []
 
-for p, g in split_p_and_g_in_group(group, should_promote=False):
+for p, g in self.split_p_and_g_in_group(group, should_promote=False, beta1=group['beta']):
 state = self.state_(p)
 
 if 'Q' not in state:
@@ -107,6 +109,7 @@ class ForeachPaLMPAdam(PSGDBase):
 lr = -warmup(lr, group['step'], group['warmup_steps'])
 
 for p, Q, g, ea, eas in zip(p_list, Q_triu, grad_list, exp_avg, exp_avg_sq):
+gc = g.clone() if group['caution'] else None
 psgd_precond_grad(Q, self.state_(p)["exprs"], g, inplace=True)
 ea = psgd_precond_grad(Q, self.state_(p)["exprs"], ea)
 exp_avg_sq_(eas, g, beta_debias(beta2, group['step']), 1e-8, out=g)
@@ -115,5 +118,4 @@ class ForeachPaLMPAdam(PSGDBase):
 divide by g here, because g == denom (from exp_avg_sq_(out=g)), avoids denom allocation
 divide into g so we can deallocate ea, avoids one allocation (-> less memory than equivalent foreach)
 """
-update_param_([p], self.clip_fn([g]), lr, weight_decay)
-
+update_param_([p], self.clip_fn([g]), lr, weight_decay, caution=group['caution'], grad=gc)
heavyball/palm_foreach_sfadamw.py
CHANGED
@@ -5,7 +5,7 @@ from .utils import warmup, ScheduleFree, exp_avg_sq_, beta_debias, get_ckp1, pro
 _compilable_schedule_free_, copy_stochastic_list_
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
 def _compilable_step_(y, grad, exp_avg_sq, z, beta1, beta2, step, ckp1, eps, decay, lr):
 old_debiased2 = beta_debias(beta2, step)
 
@@ -24,12 +24,17 @@ def _compilable_step_(y, grad, exp_avg_sq, z, beta1, beta2, step, ckp1, eps, dec
 
 class PaLMForeachSFAdamW(ScheduleFree):
 def __init__(self, params, lr=0.0025, beta=0.9, betas=(None, None), eps=1e-8, weight_decay=0, warmup_steps=0, r=0.0,
-weight_lr_power=2.0, beta2_scale: float = 0.8, foreach: bool = True, storage_dtype: str = 'float32'
+weight_lr_power=2.0, beta2_scale: float = 0.8, foreach: bool = True, storage_dtype: str = 'float32',
+mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025):
 if betas[0] is not None:
 beta = betas[0]
+
+assert not caution, "Caution not implemented for SFAdamW"
+
 defaults = dict(lr=lr, beta=beta, eps=eps, r=r, k=0, warmup_steps=warmup_steps, train_mode=True, weight_sum=0.0,
 lr_max=-1.0, weight_lr_power=weight_lr_power, weight_decay=weight_decay,
-beta2_scale=beta2_scale, storage_dtype=storage_dtype
+beta2_scale=beta2_scale, storage_dtype=storage_dtype, mars=mars, caution=caution,
+mars_gamma=mars_gamma)
 super().__init__(params, defaults, foreach)
 
 def _step(self, group):
@@ -58,6 +63,9 @@ class PaLMForeachSFAdamW(ScheduleFree):
 y, grad, exp_avg_sq, z = zip(*[(p.data, p.grad, self.state_(p)['exp_avg_sq'], self.state_(p)['z']) #
 for p in active_p])
 
+if group['mars']:
+self.mars_correct_list(group, y, grad, group['mars_gamma'], group['betas'][0])
+
 lr = warmup(group['lr'], k + 1, group['warmup_steps'])
 ckp1, group['weight_sum'] = get_ckp1(lr, group['weight_lr_power'], group['weight_sum'], group['r'], k + 1)
 
heavyball/palm_foreach_soap.py
CHANGED
@@ -1,8 +1,7 @@
 import torch
 
 from .utils import init_preconditioner, update_preconditioner, project, beta_debias, exp_avg_sq_, update_param_, set_, \
-
-
+StatefulOptimizer, exp_avg_
 
 
 class PaLMForeachSOAP(StatefulOptimizer):
@@ -33,14 +32,15 @@ class PaLMForeachSOAP(StatefulOptimizer):
 max_precond_dim: int = 2048, #
 merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
 data_format: str = "channels_first", correct_bias: bool = True, warmup_steps: int = 1,
-beta2_scale: float = 0.8, split: bool = False, foreach: bool = True
+beta2_scale: float = 0.8, split: bool = False, foreach: bool = True, mars: bool = False,
+caution: bool = False, mars_gamma: float = 0.0025):
 if betas[0] is not None:
 beta = betas[0]
 defaults = {"lr": lr, "beta": beta, "shampoo_beta": shampoo_beta, "eps": eps, "weight_decay": weight_decay,
 "precondition_frequency": precondition_frequency, "max_precond_dim": max_precond_dim,
 "merge_dims": merge_dims, "precondition_1d": precondition_1d, "normalize_grads": normalize_grads,
 "correct_bias": correct_bias, 'warmup_steps': warmup_steps, 'beta2_scale': beta2_scale,
-'split': split}
+'split': split, 'mars': mars, 'caution': caution, 'mars_gamma': mars_gamma}
 super().__init__(params, defaults, foreach)
 self._data_format = data_format
 
@@ -51,7 +51,7 @@ class PaLMForeachSOAP(StatefulOptimizer):
 max_precond_dim = group['max_precond_dim']
 precondition_1d = group['precondition_1d']
 
-for p, g in split_p_and_g_in_group(group):
+for p, g in self.split_p_and_g_in_group(group, beta1=group['beta']):
 state = self.state_(p)
 step = state['step'] = state.get("step", -1) + 1
 
@@ -82,6 +82,7 @@ class PaLMForeachSOAP(StatefulOptimizer):
 beta2 = torch.empty((), dtype=torch.float32, device=p_list[0].device).fill_(beta2)
 step_tensor = torch.empty((), dtype=torch.int32, device=p_list[0].device).fill_(step)
 denom = exp_avg_(exp_avg, exp_avg_sq, grad, grad_projected, beta1, beta2, step_tensor)
+step_size = -group["lr"] * min(step / group['warmup_steps'], 1)
 
 for p, g, ea, d in zip(p_list, grad, exp_avg, denom):
 state = self.state_(p)
@@ -92,11 +93,9 @@ class PaLMForeachSOAP(StatefulOptimizer):
 # Projecting back the preconditioned (by Adam) exponential moving average of gradients
 # to the original space
 # CANT DO /= HERE AS EXP_AVG MAY POINT TO THE BUFFER
-
+precond = project(exp_avg_projected / d, state['Q'], True)
 
 update_preconditioner(g, state, max_precond_dim, precondition_1d, old_debiased2,
 step > 0 and step % group['precondition_frequency'] == 0)
 
-
-step_size = -group["lr"] * min(step / group['warmup_steps'], 1)
-update_param_(p_list, denom, step_size, group["weight_decay"])
+update_param_([p], [precond], step_size, group["weight_decay"], caution=group['caution'], grad=[g])
heavyball/precond_schedule_foreach_soap.py
CHANGED
@@ -3,7 +3,7 @@ import random
 import torch
 
 from .utils import init_preconditioner, update_preconditioner, project, beta_debias, update_param_, \
-precond_schedule, set_,
+precond_schedule, set_, StatefulOptimizer, exp_avg_
 
 
 class PrecondScheduleForeachSOAP(StatefulOptimizer):
@@ -27,12 +27,13 @@ class PrecondScheduleForeachSOAP(StatefulOptimizer):
 weight_decay: float = 0.01, precondition_frequency: int = 2, max_precond_dim: int = 2048, #
 merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
 data_format: str = "channels_first", correct_bias: bool = True, warmup_steps: int = 1,
-precond_scheduler=(1 / 3, 9), split: bool = False, foreach: bool = True
+precond_scheduler=(1 / 3, 9), split: bool = False, foreach: bool = True, mars: bool = False,
+caution: bool = False, mars_gamma: float = 0.0025):
 defaults = {"lr": lr, "betas": betas, "shampoo_beta": shampoo_beta, "eps": eps, "weight_decay": weight_decay,
 "precondition_frequency": precondition_frequency, "max_precond_dim": max_precond_dim,
 "merge_dims": merge_dims, "precondition_1d": precondition_1d, "normalize_grads": normalize_grads,
 "correct_bias": correct_bias, 'warmup_steps': warmup_steps, 'precond_scheduler': precond_scheduler,
-'split': split}
+'split': split, 'mars': mars, 'caution': caution, 'mars_gamma': mars_gamma}
 super().__init__(params, defaults, foreach)
 self._data_format = data_format
 self.rng = random.Random(0x120983109)
@@ -44,7 +45,7 @@ class PrecondScheduleForeachSOAP(StatefulOptimizer):
 max_precond_dim = group['max_precond_dim']
 precondition_1d = group['precondition_1d']
 
-for p, g in split_p_and_g_in_group(group):
+for p, g in self.split_p_and_g_in_group(group, beta1=group['betas'][0]):
 state = self.state_(p)
 step = state['step'] = state.get("step", -1) + 1
 
@@ -75,6 +76,8 @@ class PrecondScheduleForeachSOAP(StatefulOptimizer):
 denom = exp_avg_(exp_avg, exp_avg_sq, grad, grad_projected, beta1, beta2, step_tensor)
 
 update_precond = precond_schedule(step, group['precond_scheduler'], self.rng)
+step_size = -group["lr"] * min(step / group['warmup_steps'], 1)
+
 for p, g, ea, d in zip(p_list, grad, exp_avg, denom):
 state = self.state_(p)
 # Projecting the exponential moving average of gradients to the eigenbases of Shampoo's preconditioner
@@ -84,10 +87,9 @@ class PrecondScheduleForeachSOAP(StatefulOptimizer):
 # Projecting back the preconditioned (by Adam) exponential moving average of gradients
 # to the original space
 # CANT DO /= HERE AS EXP_AVG MAY POINT TO THE BUFFER
-
+precond = project(exp_avg_projected / d, state['Q'], True)
 
 update_preconditioner(g, state, max_precond_dim, precondition_1d, old_debiased2, update_precond)
 
-
-
-update_param_(p_list, denom, step_size, group["weight_decay"])
+update_param_([p], [precond], step_size, group["weight_decay"], caution=group['caution'], grad=[g])
+
heavyball/precond_schedule_palm_foreach_soap.py
CHANGED
@@ -3,7 +3,7 @@ import random
 import torch
 
 from .utils import init_preconditioner, update_preconditioner, project, beta_debias, exp_avg_, update_param_, \
-precond_schedule, set_,
+precond_schedule, set_, StatefulOptimizer
 
 
 class PrecondSchedulePaLMForeachSOAP(StatefulOptimizer):
@@ -33,14 +33,15 @@ class PrecondSchedulePaLMForeachSOAP(StatefulOptimizer):
 merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
 data_format: str = "channels_first", correct_bias: bool = True, warmup_steps: int = 1,
 precond_scheduler=(1 / 3, 9), betas=(None, None), beta2_scale: float = 0.8, split: bool = False,
-foreach: bool = True):
+foreach: bool = True, mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025):
 if betas[0] is not None:
 beta = betas[0]
 defaults = {"lr": lr, "beta": beta, "shampoo_beta": shampoo_beta, "eps": eps, "weight_decay": weight_decay,
 "precondition_frequency": precondition_frequency, "max_precond_dim": max_precond_dim,
 "merge_dims": merge_dims, "precondition_1d": precondition_1d, "normalize_grads": normalize_grads,
 "correct_bias": correct_bias, 'warmup_steps': warmup_steps, 'precond_scheduler': precond_scheduler,
-'beta2_scale': beta2_scale, 'split': split
+'beta2_scale': beta2_scale, 'split': split, 'mars': mars, 'caution': caution,
+'mars_gamma': mars_gamma}
 super().__init__(params, defaults, foreach)
 self._data_format = data_format
 self.rng = random.Random(0x120983109)
@@ -52,7 +53,7 @@ class PrecondSchedulePaLMForeachSOAP(StatefulOptimizer):
 max_precond_dim = group['max_precond_dim']
 precondition_1d = group['precondition_1d']
 
-for p, g in split_p_and_g_in_group(group):
+for p, g in self.split_p_and_g_in_group(group, beta1=group['beta']):
 state = self.state_(p)
 step = state['step'] = state.get("step", -1) + 1
 
@@ -86,6 +87,8 @@ class PrecondSchedulePaLMForeachSOAP(StatefulOptimizer):
 denom = exp_avg_(exp_avg, exp_avg_sq, grad, grad_projected, beta1, beta2, step_tensor)
 
 update_precond = precond_schedule(step, group['precond_scheduler'], self.rng)
+step_size = -group["lr"] * min(step / group['warmup_steps'], 1)
+
 for p, g, ea, d in zip(p_list, grad, exp_avg, denom):
 state = self.state_(p)
 # Projecting the exponential moving average of gradients to the eigenbases of Shampoo's preconditioner
@@ -96,10 +99,7 @@ class PrecondSchedulePaLMForeachSOAP(StatefulOptimizer):
 # to the original space
 # CANT DO /= HERE AS EXP_AVG MAY POINT TO THE BUFFER
 exp_avg_projected = exp_avg_projected / d
-
+precond = project(exp_avg_projected, state['Q'], True)
 
 update_preconditioner(g, state, max_precond_dim, precondition_1d, old_debiased2, update_precond)
-
-# Why does this have to be rebiased here?
-step_size = -group["lr"] * min(step / group['warmup_steps'], 1)
-update_param_(p_list, denom, step_size, group["weight_decay"])
+update_param_([p], [precond], step_size, group["weight_decay"], caution=group['caution'], grad=[g])
heavyball/precond_schedule_sfpsoap.py
CHANGED
@@ -3,11 +3,11 @@ import random
 import torch
 
 from .utils import init_preconditioner, update_preconditioner, project, set_, adaptive_gradient_clipping_, exp_avg_sq_, \
-beta_debias, schedule_free_, warmup, ScheduleFree, precond_schedule,
+beta_debias, schedule_free_, warmup, ScheduleFree, precond_schedule, copy_stochastic_list_, \
 promote
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
 def _compilable_exp_avg_sq_(exp_avg_sq, grad_projected, old_debiased2, eps):
 eas32, gp32 = [list(map(promote, x)) for x in (exp_avg_sq, grad_projected)]
 denom = exp_avg_sq_(eas32, gp32, old_debiased2, eps)
@@ -52,15 +52,20 @@ class PrecondScheduleSFPaLMSOAP(ScheduleFree):
 merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
 data_format: str = "channels_first", correct_bias: bool = True, warmup_steps: int = 1, r=0.0,
 weight_lr_power=2.0, gradient_clip_val: float = 0.1, precond_scheduler=(1 / 3, 9), betas=(None, None),
-split: bool = False, foreach: bool = True
+split: bool = False, foreach: bool = True, mars: bool = False, caution: bool = False,
+mars_gamma: float = 0.0025):
 if betas[0] is not None:
 beta = betas[0]
+
+assert not caution, "Caution is not implemented in ScheduleFree optimizers"
+
 defaults = {"lr": lr, "beta": beta, "beta2_scale": beta2_scale, "eps": eps, "weight_decay": weight_decay,
 "precondition_frequency": precondition_frequency, "max_precond_dim": max_precond_dim,
 "merge_dims": merge_dims, "precondition_1d": precondition_1d, "normalize_grads": normalize_grads,
 "correct_bias": correct_bias, 'warmup_steps': warmup_steps, 'r': r,
 'weight_lr_power': weight_lr_power, 'train_mode': True, 'step': -1, 'weight_sum': 0,
-'gradient_clip_val': gradient_clip_val, 'precond_scheduler': precond_scheduler, 'split': split
+'gradient_clip_val': gradient_clip_val, 'precond_scheduler': precond_scheduler, 'split': split,
+'mars': mars, 'caution': caution, 'mars_gamma': mars_gamma}
 super().__init__(params, defaults, foreach)
 self._data_format = data_format
 self.rng = random.Random(0x120983109)
@@ -87,7 +92,7 @@ class PrecondScheduleSFPaLMSOAP(ScheduleFree):
 # adaptive gradient clipping
 adaptive_gradient_clipping_(p_list, grad, group["gradient_clip_val"], eps=group["eps"])
 
-for p, g in split_p_and_g_in_group(group):
+for p, g in self.split_p_and_g_in_group(group, beta1=group['beta']):
 state = self.state_(p)
 
 if "z" not in state:
heavyball/psgd_kron.py
CHANGED
@@ -9,7 +9,7 @@ from typing import Optional
 import torch
 
 from .utils import update_param_, warmup, psgd_precond_grad, init_Q_exprs, trust_region_clip_, PSGDBase, \
-
+line_to_triu, triu_to_line, promote, stochastic_lerp_, beta_debias
 
 
 class ForeachPSGDKron(PSGDBase):
@@ -40,7 +40,8 @@ class ForeachPSGDKron(PSGDBase):
 momentum_into_precond_update=True, warmup_steps: int = 1, merge_dims: bool = False,
 split: bool = False, clip_fn: Optional[callable] = None, store_triu_as_line: bool = True,
 foreach: bool = True, q_dtype='float32', stochastic_schedule: bool = True,
-storage_dtype: str = 'float32',
+storage_dtype: str = 'float32', mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025,
+#
 # expert parameters
 precond_init_scale=1.0, precond_lr=0.1):
 if not 0.0 <= lr:
@@ -57,7 +58,9 @@ class ForeachPSGDKron(PSGDBase):
 min_ndim_triangular=min_ndim_triangular, memory_save_mode=memory_save_mode,
 momentum_into_precond_update=momentum_into_precond_update, precond_lr=precond_lr,
 precond_init_scale=precond_init_scale, step=0, warmup_steps=warmup_steps, merge_dims=merge_dims,
-split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype,
+split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype,
+storage_dtype=storage_dtype,
+mars=mars, caution=caution, mars_gamma=mars_gamma)
 super().__init__(params, defaults, foreach, stochastic_schedule, clip_fn, preconditioner_update_probability)
 
 def _step(self, group):
@@ -77,7 +80,7 @@ class ForeachPSGDKron(PSGDBase):
 
 vals = []
 
-for p, g in split_p_and_g_in_group(group, should_promote=False):
+for p, g in self.split_p_and_g_in_group(group, should_promote=False, beta1=beta):
 state = self.state_(p)
 
 if 'Q' not in state:
@@ -114,4 +117,4 @@ class ForeachPSGDKron(PSGDBase):
 self.do_update(group, [p], [ea if momentum_into_precond_update else g], [q32], precond_lr, [q_orig],
 store_triu_as_line)
 g = psgd_precond_grad(q, self.state_(p)["exprs"], ea)
-update_param_([p], self.clip_fn([g]), lr, weight_decay)
+update_param_([p], self.clip_fn([g]), lr, weight_decay, caution=group['caution'], grad=[g])
heavyball/pure_psgd.py
CHANGED
@@ -5,9 +5,9 @@ Source available at https://github.com/evanatyourservice/kron_torch/blob/97a2b5e
 """
 
 import torch
-
 from heavyball.utils import identity
-
+
+from .utils import update_param_, warmup, psgd_precond_grad, init_Q_exprs, PSGDBase, \
 line_to_triu, triu_to_line, promote
 
 
@@ -38,7 +38,8 @@ class ForeachPurePSGD(PSGDBase):
 max_size_triangular=2048, min_ndim_triangular=2, memory_save_mode=None,
 momentum_into_precond_update=True, warmup_steps: int = 1, merge_dims: bool = False,
 split: bool = False, clip_fn: callable = None, store_triu_as_line: bool = True, foreach: bool = True,
-q_dtype='float32', stochastic_schedule: bool = True,
+q_dtype='float32', stochastic_schedule: bool = True, mars: bool = False, caution: bool = False,
+mars_gamma: float = 0.0025, #
 # expert parameters
 precond_init_scale=1.0, precond_lr=0.1):
 if not 0.0 <= lr:
@@ -49,11 +50,14 @@ class ForeachPurePSGD(PSGDBase):
 if clip_fn is None:
 clip_fn = identity
 
+assert not mars, "MARS is not supported in this optimizer"
+
 defaults = dict(lr=lr, weight_decay=weight_decay, max_size_triangular=max_size_triangular,
 min_ndim_triangular=min_ndim_triangular, memory_save_mode=memory_save_mode,
 momentum_into_precond_update=momentum_into_precond_update, precond_lr=precond_lr,
 precond_init_scale=precond_init_scale, step=0, warmup_steps=warmup_steps, merge_dims=merge_dims,
-split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype
+split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype, mars=mars, caution=caution,
+mars_gamma=mars_gamma)
 super().__init__(params, defaults, foreach, stochastic_schedule, clip_fn, preconditioner_update_probability)
 
 def _step(self, group):
@@ -70,7 +74,7 @@ class ForeachPurePSGD(PSGDBase):
 
 vals = []
 
-for p, g in split_p_and_g_in_group(group, should_promote=False):
+for p, g in self.split_p_and_g_in_group(group, should_promote=False, beta1=0.0):
 state = self.state_(p)
 
 if 'Q' not in state:
@@ -98,4 +102,4 @@ class ForeachPurePSGD(PSGDBase):
 q32 = [promote(q_) for q_ in q]
 self.do_update(group, [p], [g], [q32], precond_lr, [q_orig], store_triu_as_line)
 psgd_precond_grad(q, self.state_(p)["exprs"], g, inplace=True)
-update_param_([p], self.clip_fn([g]), lr, weight_decay)
+update_param_([p], self.clip_fn([g]), lr, weight_decay, caution=group['caution'], grad=[g])
heavyball/schedule_free_palm_foreach_soap.py
CHANGED
@@ -1,12 +1,13 @@
 import random
 
 import torch
+from heavyball.utils import mars_correction
 
 from .utils import init_preconditioner, update_preconditioner, project, set_, adaptive_gradient_clipping_, exp_avg_sq_, \
-beta_debias, schedule_free_, warmup, ScheduleFree,
+beta_debias, schedule_free_, warmup, ScheduleFree, copy_stochastic_list_, promote
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
 def _compilable_exp_avg_sq_(exp_avg_sq, grad_projected, old_debiased2, eps):
 eas32, gp32 = [list(map(promote, x)) for x in (exp_avg_sq, grad_projected)]
 denom = exp_avg_sq_(eas32, gp32, old_debiased2, eps)
@@ -44,15 +45,19 @@ class SFPaLMForeachSOAP(ScheduleFree):
 merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
 data_format: str = "channels_first", correct_bias: bool = True, warmup_steps: int = 1, r=0.0,
 weight_lr_power=2.0, gradient_clip_val: float = 0.1, betas=(None, None), split: bool = False,
-foreach: bool = True):
+foreach: bool = True, mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025):
 if betas[0] is not None:
 beta = betas[0]
+
+assert not caution, "Caution is not implemented in ScheduleFree optimizers"
+
 defaults = {"lr": lr, "beta": beta, "beta2_scale": beta2_scale, "eps": eps, "weight_decay": weight_decay,
 "precondition_frequency": precondition_frequency, "max_precond_dim": max_precond_dim,
 "merge_dims": merge_dims, "precondition_1d": precondition_1d, "normalize_grads": normalize_grads,
 "correct_bias": correct_bias, 'warmup_steps': warmup_steps, 'r': r,
 'weight_lr_power': weight_lr_power, 'train_mode': True, 'step': -1,
-'gradient_clip_val': gradient_clip_val, 'weight_sum': 0, 'split': split
+'gradient_clip_val': gradient_clip_val, 'weight_sum': 0, 'split': split, 'mars': mars,
+'caution': caution, 'mars_gamma': mars_gamma}
 super().__init__(params, defaults, foreach)
 self._data_format = data_format
 self.rng = random.Random(0x120983109)
@@ -61,6 +66,7 @@ class SFPaLMForeachSOAP(ScheduleFree):
 vals = []
 max_precond_dim = group['max_precond_dim']
 precondition_1d = group['precondition_1d']
+mars = group['mars']
 
 step = group['step'] = group.get("step", 0) + 1
 
@@ -79,12 +85,14 @@ class SFPaLMForeachSOAP(ScheduleFree):
 
 vals = []
 
-for p, g in split_p_and_g_in_group(group):
+for p, g in self.split_p_and_g_in_group(group, beta1=group['beta']):
 state = self.state_(p)
 
 if "z" not in state:
 state["z"] = torch.clone(p).float()
 state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
+if mars:
+state['mars_prev_grad'] = g.clone()
 init_preconditioner(g, state, max_precond_dim, precondition_1d)
 update_preconditioner(g, state, max_precond_dim, precondition_1d, 0, True)
 continue # first step is skipped so that we never use the current gradients in the projection.
heavyball/utils.py
CHANGED
@@ -142,18 +142,26 @@ def beta_debias(beta, step):
 
 
 @torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
-def
-if isinstance(state, torch.Tensor):
-state.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
-return torch.sqrt(state, out=out).clamp_(min=eps)
-
+def _compilable_exp_avg_sq_(state, grad, beta2, eps, out=None):
 torch._foreach_mul_(state, beta2)
 [s.addcmul_(g, g, value=1 - beta2) for s, g in zip(state, grad)]
 denom = torch._foreach_sqrt(state)
-
+[denom.clamp_(min=eps) for denom in denom]
+if out is not None:
+copy_stochastic_list_(out, denom)
+return out
+
 return denom
 
 
+def exp_avg_sq_(state, grad, beta2, eps, out=None):
+state, grad = list_guard(state), list_guard(grad)
+if not isinstance(beta2, torch.Tensor):
+beta2 = torch.empty((), dtype=torch.float32, device=state[0].device).fill_(beta2)
+if not isinstance(eps, torch.Tensor):
+eps = torch.empty((), dtype=torch.float32, device=state[0].device).fill_(eps)
+return _compilable_exp_avg_sq_(state, grad, beta2, eps, out)
+
 def adaptive_gradient_clipping_(parameters: List[torch.Tensor], gradients: List[torch.Tensor], clip_val: float,
 minimum: float = 1e-3, eps: float = 1e-8):
 if clip_val <= 0:
@@ -168,12 +176,19 @@ def adaptive_gradient_clipping_(parameters: List[torch.Tensor], gradients: List[
 torch._foreach_mul_(gradients, p_norm)
 
 
+def is_compiling():
+try:
+return torch.compiler.is_compiling()
+except AttributeError:
+return True
+
+
 def set_(dst: torch.Tensor, src: torch.Tensor):
-if not
+if not is_compiling() and src.data_ptr() == dst.data_ptr():
 return
 if src.shape != dst.shape:
 src = src.reshape_as(dst)
-if not
+if not is_compiling() and src.is_contiguous() and dst.is_contiguous() and src.dtype == dst.dtype:
 dst.set_(src)
 else:
 dst.copy_(src)
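
The first hunk above splits `exp_avg_sq_` into a compiled list-only kernel (`_compilable_exp_avg_sq_`) and a thin eager wrapper that normalizes its inputs: single tensors are wrapped into lists via `list_guard`, and Python floats for `beta2`/`eps` are promoted to scalar tensors before entering the compiled path. A sketch of the intended call pattern, with illustrative shapes only:

    import torch
    from heavyball.utils import exp_avg_sq_

    state = [torch.zeros(8), torch.zeros(3, 3)]    # second-moment buffers, updated in place
    grads = [torch.randn(8), torch.randn(3, 3)]

    # beta2 and eps may be plain floats; the wrapper converts them to scalar tensors.
    denom = exp_avg_sq_(state, grads, 0.99, 1e-8)  # sqrt of the EMA of grad**2, clamped at eps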
@@ -338,11 +353,18 @@ def _compilable_stochastic_lerp_(x: List[torch.Tensor], y: List[torch.Tensor], a
 
 
 def stochastic_lerp_(x: List[torch.Tensor], y: List[torch.Tensor], a: Union[float, int, torch.Tensor]):
+x, y = list_guard(x), list_guard(y)
 if not isinstance(a, torch.Tensor):
 a = torch.empty((), dtype=torch.float32, device=x[0].device).fill_(a)
 _compilable_stochastic_lerp_(x, y, a)
 
 
+def list_guard(x):
+if isinstance(x, (list, tuple)):
+return x
+return [x]
+
+
 @torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
 def _compilable_stochastic_add_(x: List[torch.Tensor], y: List[torch.Tensor], alpha: Union[float, int, torch.Tensor]):
 for x_, y_ in zip(x, y):
@@ -353,6 +375,7 @@ def _compilable_stochastic_add_(x: List[torch.Tensor], y: List[torch.Tensor], al
 
 
 def stochastic_add_(x: List[torch.Tensor], y: List[torch.Tensor], alpha: Union[float, int, torch.Tensor]):
+x, y = list_guard(x), list_guard(y)
 if not isinstance(alpha, torch.Tensor):
 alpha = torch.empty((), dtype=torch.float32, device=x[0].device).fill_(alpha)
 _compilable_stochastic_add_(x, y, alpha)
@@ -463,6 +486,43 @@ class StatefulOptimizer(torch.optim.Optimizer):
     def state_(self, arg: torch.Tensor):
         return self.state[self.key(arg)]

+    def mars_correct_list(self, group, p_list, g_list, mars_gamma, beta):
+        for p, g in zip(p_list, g_list):
+            state = self.state_(p)
+            if 'mars_old_grad' not in state:
+                state['mars_old_grad'] = torch.zeros_like(g)
+        old_gs = [self.state_(p)['mars_old_grad'] for p in p_list]
+        mars_correction(g_list, old_gs, mars_gamma, beta)
+
+    def split_p_and_g_in_group(self, group: dict, skip_none: bool = True, should_promote: bool = True,
+                               beta1: float = -1.0):
+        for p in group["params"]:
+            if skip_none and p.grad is None:
+                continue
+
+            if p.grad is None:
+                grad = None
+            else:
+                if should_promote:
+                    grad = promote(p.grad)
+                else:
+                    grad = p.grad
+                if beta1 >= 0 and group.get('mars', False):
+                    self.mars_correct_list(group, [p], [grad], group['mars_gamma'], beta1)
+
+            p.grad = None
+
+            p_views = merge_group(group, p)
+            if grad is not None:
+                grad = merge_group(group, grad)
+            if isinstance(p_views, torch.Tensor):
+                yield p_views, grad
+                continue
+            if grad is None:
+                yield from zip(p_views, [None] * len(p_views))
+                continue
+            yield from zip(p_views, grad)
+
     def state_size(self) -> int:
         total_bytes = 0

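`split_p_and_g_in_group` is now a method so it can reach the optimizer state, where `mars_correct_list` keeps one `mars_old_grad` buffer per parameter. A standalone plain-torch sketch of that bookkeeping (not heavyball's code; `mars_step`, the dict, and the `gamma`/`beta1` values are illustrative, and the actual correction lives in the `mars_correction` helper added further down):

import torch

state = {}   # stands in for the per-parameter optimizer state

def mars_step(name, grad, gamma=0.025, beta1=0.9):
    old = state.setdefault(name, torch.zeros_like(grad))   # the 'mars_old_grad' buffer
    corrected = grad + gamma * beta1 / (1 - beta1) * (grad - old)
    state[name] = grad.clone()   # becomes the "old" gradient for the next step
    return corrected

print(mars_step('w', torch.ones(3)))            # first step: the old gradient is all zeros
print(mars_step('w', torch.full((3,), 2.0)))    # extrapolates using the stored gradient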
@@ -472,7 +532,7 @@ class StatefulOptimizer(torch.optim.Optimizer):
                 total_bytes += x.numel() * x.element_size()

         for group in self.param_groups:
-            for p, _ in split_p_and_g_in_group(group, skip_none=False):
+            for p, _ in self.split_p_and_g_in_group(group, skip_none=False):
                 tree_map(_add, self.state_(p))
         return total_bytes

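`state_size()` now walks parameters through the method form of `split_p_and_g_in_group`. A usage sketch (heavyball 0.22.0 assumed; the model and hyperparameters are placeholders):

import torch
from heavyball import ForeachAdamW

net = torch.nn.Linear(4, 4)
opt = ForeachAdamW(net.parameters(), lr=1e-3)
net(torch.randn(2, 4)).sum().backward()
opt.step()
print(opt.state_size())   # total bytes held in the optimizer's state buffers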
@@ -625,7 +685,7 @@ def _compilable_copy_stochastic_(target: torch.Tensor, source: torch.Tensor):


 def copy_stochastic_(target: torch.Tensor, source: torch.Tensor):
-    if not
+    if not is_compiling() and target.data_ptr() == source.data_ptr():
         return
     if target.dtype != torch.bfloat16 or source.dtype not in (torch.float16, torch.float32, torch.float64):
         set_(target, source)
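`copy_stochastic_` (whose aliasing guard now uses the `is_compiling()` shim) writes higher-precision values into a bfloat16 buffer with stochastic rounding. A small sketch (heavyball 0.22.0 assumed; the constant is chosen to fall between two bfloat16 values):

import torch
from heavyball.utils import copy_stochastic_

src = torch.full((8,), 1.0 + 2.0 ** -10)          # not exactly representable in bfloat16
dst = torch.zeros(8, dtype=torch.bfloat16)
copy_stochastic_(dst, src)
print(dst)   # each entry rounds to one of the two neighbouring bfloat16 values at random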
@@ -633,14 +693,16 @@ def copy_stochastic_(target: torch.Tensor, source: torch.Tensor):


 @torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
-def _compilable_update_(p, u, decay, add_fn, lr):
+def _compilable_update_(p, u, decay, add_fn, lr, caution, g):
     u = [u_.view_as(p_) for u_, p_ in zip(u, p)]
-    p32, u32 = [list(map(promote, x)) for x in [p, u]]
+    p32, u32, g32 = [list(map(promote, x)) for x in [p, u, g]]

     if decay > 0:
         torch._foreach_mul_(p32, 1 - decay * lr)

-    for p32_, u32_ in zip(p32, u32):  # lr is data-dependent -> can't compile a foreach
+    for p32_, u32_, g32_ in zip(p32, u32, g32):  # lr is data-dependent -> can't compile a foreach
+        if caution:
+            _compilable_cautioning_(g32_, u32_)
         if add_fn is None:
             p32_.add_(u32_, alpha=lr)
         else:
@@ -650,9 +712,12 @@ def _compilable_update_(p, u, decay, add_fn, lr):


 def update_param_(param: List[torch.Tensor], update: List[torch.Tensor], lr: float, decay: float,
-                  add_fn: callable = None):
+                  add_fn: callable = None, caution: bool = False, grad: List[torch.Tensor] = None):
     lr_tensor = torch.empty((), dtype=torch.float32, device=param[0].device).fill_(lr)
-
+    param, update, grad = list_guard(param), list_guard(update), list_guard(grad)
+    if not caution:
+        grad = [None] * len(param)
+    _compilable_update_(param, update, decay, add_fn, lr_tensor, caution, grad)


 def precond_schedule(step, precond_scheduler, rng):
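`update_param_` now accepts `caution` and `grad`: with `caution=True`, update entries whose sign disagrees with the current gradient are zeroed and the survivors rescaled (see `_compilable_cautioning_` further down) before the step is applied. A small sketch (heavyball 0.22.0 assumed; the numbers are arbitrary):

import torch
from heavyball.utils import update_param_

param = [torch.zeros(4)]
update = [torch.tensor([1.0, -1.0, 1.0, -1.0])]
grad = [torch.tensor([1.0, 1.0, -1.0, -1.0])]

update_param_(param, update, 0.1, 0.0, caution=True, grad=grad)
print(param[0])   # only the entries where update and grad agree in sign have moved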
@@ -965,18 +1030,45 @@ class PSGDBase(StatefulOptimizer):
             psgd_balance_Q(q)


-
-
+# TODO: Figure out why this sometimes crashes
+# @torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+def _compilable_precond_grad_cached_(cached_q, ea, expr, param, lr, weight_decay, clip_fn, caution, grad):
     md = min_dtype(cached_q + [ea])
     new = torch.einsum(expr, *[c_.to(md) for c_ in cached_q], ea.to(md)).to(torch.float32)
-    update_param_([param], clip_fn([new]), lr, weight_decay)
+    update_param_([param], clip_fn([new]), lr, weight_decay, caution=caution, grad=grad)


 def precond_grad_cached_(cached_q: List[torch.Tensor], ea: torch.Tensor, expr: str, param: torch.Tensor, lr: float,
-                         weight_decay: float, clip_fn):
+                         weight_decay: float, clip_fn, caution, grad):
     if isinstance(lr, float):
         lr = torch.empty((), dtype=torch.float32, device=param.device).fill_(lr)
-    _compilable_precond_grad_cached_(cached_q, ea, expr, param, lr, weight_decay, clip_fn)
+    _compilable_precond_grad_cached_(cached_q, ea, expr, param, lr, weight_decay, clip_fn, caution, grad)
+
+
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+def _compilable_mars_correction_(g, old_g, a):
+    g_copy = [g_.clone() for g_ in g]
+    _compilable_stochastic_lerp_(g, old_g, a)
+    copy_stochastic_list_(old_g, g_copy)
+
+
+def mars_correction(g, old_g, beta1, gamma):
+    a = -gamma * beta1 / (1 - beta1)
+    g, old_g = list_guard(g), list_guard(old_g)
+    a = torch.empty((), dtype=torch.float32, device=g[0].device).fill_(a)
+    _compilable_mars_correction_(g, old_g, a)
+
+
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
+def _compilable_cautioning_(g, update):
+    mask = (g * update) > 0
+    update.masked_fill_(~mask, 0)
+    scale = mask.numel() / mask.sum().clamp(min=1)
+    update.mul_(scale)
+
+
+def caution(g, update):
+    _compilable_cautioning_(g, update)


 def precond_update_prob_schedule(max_prob=1.0, min_prob=0.03, decay=0.001, flat_start=250):
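The masking performed by `_compilable_cautioning_` above can be written out directly; this standalone plain-torch sketch (not heavyball's code; names and values are illustrative) shows its effect on a toy update:

import torch

def cautious(update, grad):
    mask = (grad * update) > 0                                  # keep entries whose signs agree
    update = update.masked_fill(~mask, 0)
    return update * (mask.numel() / mask.sum().clamp(min=1))    # rescale the survivors

update = torch.tensor([0.5, -0.5, 0.5, -0.5])
grad = torch.tensor([1.0, 1.0, -1.0, -1.0])
print(cautious(update, grad))   # tensor([1., 0., 0., -1.]): half survive, scaled by 2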
@@ -1013,29 +1105,3 @@ def merge_group(group, *tensors):
         append_or_extend(out, dim_merger(t, group['max_size_triangular'] if 'max_size_triangular' in group else group[
             'max_precond_dim'], group.get('split', False)))
     return out
-
-
-def split_p_and_g_in_group(group: dict, skip_none: bool = True, should_promote: bool = True):
-    for p in group["params"]:
-        if skip_none and p.grad is None:
-            continue
-
-        if p.grad is None:
-            grad = None
-        else:
-            if should_promote:
-                grad = promote(p.grad)
-            else:
-                grad = p.grad
-        p.grad = None
-
-        p_views = merge_group(group, p)
-        if grad is not None:
-            grad = merge_group(group, grad)
-        if isinstance(p_views, torch.Tensor):
-            yield p_views, grad
-            continue
-        if grad is None:
-            yield from zip(p_views, [None] * len(p_views))
-            continue
-        yield from zip(p_views, grad)
{heavyball-0.21.8.dist-info → heavyball-0.22.0.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: heavyball
-Version: 0.
+Version: 0.22.0
 Summary: Efficient optimizers
 Home-page: https://github.com/clashluke/heavyball
 Author: Lucas Nestler
@@ -32,7 +32,7 @@ A simple package of efficient optimizers
 The goal is not to thrive for completeness, full maintenance or abstraction, but instead to provide a simple
 largely static alternative to `torch.optim` with more and better optimizers.

-Currently (2024-11-
+Currently (2024-11-26, 0.22.0), the recommended stable optimizer is `PrecondSchedulePaLMSOAP` (see below). The
 recommended experimental optimizer is `DelayedPSGDKron` ([tuning guide](docs/psgd_efficiency.md)).

 ## Features
heavyball-0.22.0.dist-info/RECORD ADDED

@@ -0,0 +1,24 @@
+heavyball/__init__.py,sha256=icHYN-MGsmHkLUlHCMcZkOlwY7GT63_ayR_a5iPKmzM,2226
+heavyball/cached_delayed_psgd_kron.py,sha256=n3wIOhrop0Ls4MZ0kXpwGuImp1jzPs6VGdxIlPyoYdQ,6827
+heavyball/cached_psgd_kron.py,sha256=KCLsfvj9qh_2FNwRTdWM3zjnt2oGHfsf4Y341rPcceI,6778
+heavyball/delayed_psgd.py,sha256=CaG17zqorLsCSDeGEePOyb6n9ugv8W6gyRQqeQNq-e8,6272
+heavyball/foreach_adamw.py,sha256=uawSbGGUD2E1RtcwspP83yQNElERdGX-diqCI5e8FqE,2825
+heavyball/foreach_adopt.py,sha256=DFEaPswVzdHcbxC-mirsf_okM_HR6r34PDUTty5CrUE,3547
+heavyball/foreach_laprop.py,sha256=J4Vms0nAOMh3GQtAOPyrYOe5WtpzokVv25b9oDnwc2A,2833
+heavyball/foreach_sfadamw.py,sha256=HWbLekY5BloHDIgrN2J0a7IolZCt8Ah2xkLAU_-5oSc,3079
+heavyball/foreach_soap.py,sha256=7B_dP2Hm_xqwpBQiPYkv_c6eoRnU1dV2VZfvSoa4uJ8,4729
+heavyball/p_adam.py,sha256=F-id4qOkAaDTJaKTSNhSsonX-Js5IzIu1Bdj1S4qE2g,6306
+heavyball/palm_foreach_sfadamw.py,sha256=E8raxrBIkSmTEGFzwnfWxKwDJjBQE2vdsmyqfc8aL_A,3375
+heavyball/palm_foreach_soap.py,sha256=IknGm_CzrqDIFEoCkejxjoZ4sfIy6RSoInqlMUOYLB4,6156
+heavyball/precond_schedule_foreach_soap.py,sha256=bJ2ifPFa8zEP9GO8eBpqZzsmP7p_iQkkCkllNeEMHPU,4892
+heavyball/precond_schedule_palm_foreach_soap.py,sha256=4dT9f134-Faq2KuCMCHzMtrkMO-es5p_DYS1of5yF-s,6428
+heavyball/precond_schedule_sfpsoap.py,sha256=FOR-axwlkSN7IHZWYYUVFfjSFCLxc_NdiTlb-n5gmgs,7530
+heavyball/psgd_kron.py,sha256=achB23mQUT3F00IGhjjVf_8YW7VOTHR6YdoCDRyWxsI,6039
+heavyball/pure_psgd.py,sha256=dbYgkunFFA6EsO6fJEhaJRxTH0smi7qLX3Np9XTQ9E4,5079
+heavyball/schedule_free_palm_foreach_soap.py,sha256=0WT_gvTKymqLQzYT6ewDgCmpDq-HgMAewipw1QvyQYA,7267
+heavyball/utils.py,sha256=TVpyev0oL4a78px4cvtaGoGPJqfpfTKE-xBWkRCmzkw,39785
+heavyball-0.22.0.dist-info/LICENSE,sha256=CGdGJim64YifGmUVPaeyRsxkvyExtClswhRNIp8FY_U,1322
+heavyball-0.22.0.dist-info/METADATA,sha256=LqVR3tUgxpk21zsmKxfJAQCKLPzmaQz2PQiKvlvpQe8,11926
+heavyball-0.22.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+heavyball-0.22.0.dist-info/top_level.txt,sha256=SzCxSVg_qCUPA4kZObW3Zyo4v-d_mMOD-p7a-WXTl2E,10
+heavyball-0.22.0.dist-info/RECORD,,
heavyball-0.21.8.dist-info/RECORD DELETED

@@ -1,24 +0,0 @@
-heavyball/__init__.py,sha256=iqP428JWwwx-XDOZ0nUdbCkOLEyfoqVyWZLQLAcwxaw,2214
-heavyball/cached_delayed_psgd_kron.py,sha256=Nyxl-G-o6greKwDN-vLiw5W02GXO2LRvknc0OzvzFnE,6674
-heavyball/cached_psgd_kron.py,sha256=HzD6se0AYb-W5hpydUxcR9uqrpe_54PBwgL1VWX3DHU,6592
-heavyball/delayed_psgd.py,sha256=m4c-OvcLMrRxSAPYs2l6Up21uCyF2kvHvpcnfe3nzGs,6212
-heavyball/foreach_adamw.py,sha256=Rb5U80cgUcEqlEbUU250UTWdoqA7nyiqkV5w1U4bWX4,2445
-heavyball/foreach_adopt.py,sha256=ecdi1fKg9i087OGjtKWVbE_DD6Yf4pvpzv4ELCcusvQ,3211
-heavyball/foreach_laprop.py,sha256=vi6C_gfjXxw5uN0KHgzxI9itUI1dcgOf3ufoO_VVMp0,2471
-heavyball/foreach_sfadamw.py,sha256=rLZORmCIMu9G09FdDgMSiI6pNq34IVoxsPVWtmeDdbQ,2753
-heavyball/foreach_soap.py,sha256=4mWSMWYTdjgiXiboI5DwdigecruDtNGKylGAFAVhCRA,4562
-heavyball/p_adam.py,sha256=Xyxsavwtw-t0OyTHitYQXZSmF9UJlMDzDAURge-MbbQ,6047
-heavyball/palm_foreach_sfadamw.py,sha256=JbNrcoquBGGUI5XNMFouDjpNurVHUW9DbX1A3tSrtno,3025
-heavyball/palm_foreach_soap.py,sha256=GzAwM8kOt1X0QCmUZDTdHwPxbJwjH8ic43dyAK5BYCA,6015
-heavyball/precond_schedule_foreach_soap.py,sha256=HcObXLfSNN_lKNb4nmC6tkdHcqDIMNX6hILpHKScqLc,4744
-heavyball/precond_schedule_palm_foreach_soap.py,sha256=xZ7CJvIfdu2RNAZt2g1S7Xb0Jyy1hNC4MowOFU3nWkk,6283
-heavyball/precond_schedule_sfpsoap.py,sha256=PNneiOkrRyV1yIZn91lPmYofd1_OiLqJTDy75RLpXJk,7273
-heavyball/psgd_kron.py,sha256=RSLJi5FnSmYjvYBufNDClnYRm-eN_Kpa1Ar2tNP6-X0,5824
-heavyball/pure_psgd.py,sha256=LZK0qmvZkBF8g00evaVLtW-sIUJmdoxag1K7O26AqEo,4820
-heavyball/schedule_free_palm_foreach_soap.py,sha256=_AqrnChY6iDlQUkF2YUxS7eLjSWCIuvEUOHvMHVM1yY,6873
-heavyball/utils.py,sha256=xTDZEt2_DM57EYnJkRq7d7scTnro4eKPdMtEwPdLy-c,37218
-heavyball-0.21.8.dist-info/LICENSE,sha256=CGdGJim64YifGmUVPaeyRsxkvyExtClswhRNIp8FY_U,1322
-heavyball-0.21.8.dist-info/METADATA,sha256=nLyxHlENmhAGyU9GManYKKJJTykhsAMt7hkJNXPu_YY,11926
-heavyball-0.21.8.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-heavyball-0.21.8.dist-info/top_level.txt,sha256=SzCxSVg_qCUPA4kZObW3Zyo4v-d_mMOD-p7a-WXTl2E,10
-heavyball-0.21.8.dist-info/RECORD,,
{heavyball-0.21.8.dist-info → heavyball-0.22.0.dist-info}/LICENSE: File without changes
{heavyball-0.21.8.dist-info → heavyball-0.22.0.dist-info}/WHEEL: File without changes
{heavyball-0.21.8.dist-info → heavyball-0.22.0.dist-info}/top_level.txt: File without changes