heavyball 0.21.8__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- heavyball/__init__.py +6 -5
- heavyball/cached_delayed_psgd_kron.py +6 -5
- heavyball/cached_psgd_kron.py +7 -5
- heavyball/delayed_psgd.py +14 -11
- heavyball/foreach_adamw.py +14 -7
- heavyball/foreach_adopt.py +11 -6
- heavyball/foreach_laprop.py +12 -6
- heavyball/foreach_sfadamw.py +10 -3
- heavyball/foreach_soap.py +10 -8
- heavyball/p_adam.py +11 -9
- heavyball/palm_foreach_sfadamw.py +11 -3
- heavyball/palm_foreach_soap.py +8 -9
- heavyball/precond_schedule_foreach_soap.py +10 -8
- heavyball/precond_schedule_palm_foreach_soap.py +9 -9
- heavyball/precond_schedule_sfpsoap.py +10 -5
- heavyball/psgd_kron.py +9 -6
- heavyball/pure_psgd.py +11 -7
- heavyball/schedule_free_palm_foreach_soap.py +13 -5
- heavyball/utils.py +171 -106
- {heavyball-0.21.8.dist-info → heavyball-0.23.0.dist-info}/METADATA +2 -2
- heavyball-0.23.0.dist-info/RECORD +24 -0
- heavyball-0.21.8.dist-info/RECORD +0 -24
- {heavyball-0.21.8.dist-info → heavyball-0.23.0.dist-info}/LICENSE +0 -0
- {heavyball-0.21.8.dist-info → heavyball-0.23.0.dist-info}/WHEEL +0 -0
- {heavyball-0.21.8.dist-info → heavyball-0.23.0.dist-info}/top_level.txt +0 -0
heavyball/precond_schedule_palm_foreach_soap.py
CHANGED
@@ -3,7 +3,7 @@ import random
 import torch
 
 from .utils import init_preconditioner, update_preconditioner, project, beta_debias, exp_avg_, update_param_, \
-    precond_schedule, set_,
+    precond_schedule, set_, StatefulOptimizer
 
 
 class PrecondSchedulePaLMForeachSOAP(StatefulOptimizer):
@@ -33,14 +33,15 @@ class PrecondSchedulePaLMForeachSOAP(StatefulOptimizer):
                 merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
                 data_format: str = "channels_first", correct_bias: bool = True, warmup_steps: int = 1,
                 precond_scheduler=(1 / 3, 9), betas=(None, None), beta2_scale: float = 0.8, split: bool = False,
-                 foreach: bool = True):
+                 foreach: bool = True, mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025):
         if betas[0] is not None:
             beta = betas[0]
         defaults = {"lr": lr, "beta": beta, "shampoo_beta": shampoo_beta, "eps": eps, "weight_decay": weight_decay,
                     "precondition_frequency": precondition_frequency, "max_precond_dim": max_precond_dim,
                     "merge_dims": merge_dims, "precondition_1d": precondition_1d, "normalize_grads": normalize_grads,
                     "correct_bias": correct_bias, 'warmup_steps': warmup_steps, 'precond_scheduler': precond_scheduler,
-                    'beta2_scale': beta2_scale, 'split': split
+                    'beta2_scale': beta2_scale, 'split': split, 'mars': mars, 'caution': caution,
+                    'mars_gamma': mars_gamma}
         super().__init__(params, defaults, foreach)
         self._data_format = data_format
         self.rng = random.Random(0x120983109)
@@ -52,7 +53,7 @@ class PrecondSchedulePaLMForeachSOAP(StatefulOptimizer):
         max_precond_dim = group['max_precond_dim']
         precondition_1d = group['precondition_1d']
 
-        for p, g in split_p_and_g_in_group(group):
+        for p, g in self.split_p_and_g_in_group(group, beta1=group['beta']):
             state = self.state_(p)
             step = state['step'] = state.get("step", -1) + 1
 
@@ -86,6 +87,8 @@
         denom = exp_avg_(exp_avg, exp_avg_sq, grad, grad_projected, beta1, beta2, step_tensor)
 
         update_precond = precond_schedule(step, group['precond_scheduler'], self.rng)
+        step_size = -group["lr"] * min(step / group['warmup_steps'], 1)
+
         for p, g, ea, d in zip(p_list, grad, exp_avg, denom):
             state = self.state_(p)
             # Projecting the exponential moving average of gradients to the eigenbases of Shampoo's preconditioner
@@ -96,10 +99,7 @@
             # to the original space
             # CANT DO /= HERE AS EXP_AVG MAY POINT TO THE BUFFER
             exp_avg_projected = exp_avg_projected / d
-
+            precond = project(exp_avg_projected, state['Q'], True)
 
             update_preconditioner(g, state, max_precond_dim, precondition_1d, old_debiased2, update_precond)
-
-            # Why does this have to be rebiased here?
-            step_size = -group["lr"] * min(step / group['warmup_steps'], 1)
-            update_param_(p_list, denom, step_size, group["weight_decay"])
+            update_param_([p], [precond], step_size, group["weight_decay"], caution=group['caution'], grad=[g])
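Note on the `caution` flag threaded through the hunks above: `update_param_` now also receives the raw gradient so it can apply a cautious update. A minimal single-tensor sketch of that idea (mask out update components whose sign disagrees with the gradient, then rescale), assuming a plain PyTorch implementation; `cautious_update_` below is illustrative, not heavyball's `update_param_`:

    import torch

    def cautious_update_(param: torch.Tensor, update: torch.Tensor, grad: torch.Tensor, step_size: float):
        # Keep only the components of `update` that agree in sign with the current gradient.
        mask = (update * grad > 0).to(update.dtype)
        # Rescale so the mean step magnitude is preserved despite the masking.
        mask *= mask.numel() / mask.sum().clamp_(min=1)
        # `step_size` is already negative in the diff above (-lr scaled by the warmup factor).
        param.add_(update * mask, alpha=step_size)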
heavyball/precond_schedule_sfpsoap.py
CHANGED
@@ -3,11 +3,11 @@ import random
 import torch
 
 from .utils import init_preconditioner, update_preconditioner, project, set_, adaptive_gradient_clipping_, exp_avg_sq_, \
-    beta_debias, schedule_free_, warmup, ScheduleFree, precond_schedule,
+    beta_debias, schedule_free_, warmup, ScheduleFree, precond_schedule, copy_stochastic_list_, \
     promote
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
 def _compilable_exp_avg_sq_(exp_avg_sq, grad_projected, old_debiased2, eps):
     eas32, gp32 = [list(map(promote, x)) for x in (exp_avg_sq, grad_projected)]
     denom = exp_avg_sq_(eas32, gp32, old_debiased2, eps)
@@ -52,15 +52,20 @@ class PrecondScheduleSFPaLMSOAP(ScheduleFree):
                 merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
                 data_format: str = "channels_first", correct_bias: bool = True, warmup_steps: int = 1, r=0.0,
                 weight_lr_power=2.0, gradient_clip_val: float = 0.1, precond_scheduler=(1 / 3, 9), betas=(None, None),
-                 split: bool = False, foreach: bool = True
+                 split: bool = False, foreach: bool = True, mars: bool = False, caution: bool = False,
+                 mars_gamma: float = 0.0025):
         if betas[0] is not None:
             beta = betas[0]
+
+        assert not caution, "Caution is not implemented in ScheduleFree optimizers"
+
         defaults = {"lr": lr, "beta": beta, "beta2_scale": beta2_scale, "eps": eps, "weight_decay": weight_decay,
                     "precondition_frequency": precondition_frequency, "max_precond_dim": max_precond_dim,
                     "merge_dims": merge_dims, "precondition_1d": precondition_1d, "normalize_grads": normalize_grads,
                     "correct_bias": correct_bias, 'warmup_steps': warmup_steps, 'r': r,
                     'weight_lr_power': weight_lr_power, 'train_mode': True, 'step': -1, 'weight_sum': 0,
-                    'gradient_clip_val': gradient_clip_val, 'precond_scheduler': precond_scheduler, 'split': split
+                    'gradient_clip_val': gradient_clip_val, 'precond_scheduler': precond_scheduler, 'split': split,
+                    'mars': mars, 'caution': caution, 'mars_gamma': mars_gamma}
         super().__init__(params, defaults, foreach)
         self._data_format = data_format
         self.rng = random.Random(0x120983109)
@@ -87,7 +92,7 @@ class PrecondScheduleSFPaLMSOAP(ScheduleFree):
         # adaptive gradient clipping
         adaptive_gradient_clipping_(p_list, grad, group["gradient_clip_val"], eps=group["eps"])
 
-        for p, g in split_p_and_g_in_group(group):
+        for p, g in self.split_p_and_g_in_group(group, beta1=group['beta']):
             state = self.state_(p)
 
             if "z" not in state:
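The compiled helpers shared by these SOAP variants follow one pattern: promote low-precision state to float32, do the accumulation there, and write the result back to the original buffers (heavyball's `copy_stochastic_list_` does this with stochastic rounding). A simplified single-tensor sketch of that pattern, with a plain `copy_` standing in for the stochastic write-back; all names here are illustrative:

    import torch

    def promote(x: torch.Tensor) -> torch.Tensor:
        # Upcast half-precision state so the accumulation happens in float32.
        return x.float() if x.dtype in (torch.float16, torch.bfloat16) else x

    @torch.compile(fullgraph=True, dynamic=False)
    def exp_avg_sq_step(exp_avg_sq: torch.Tensor, grad: torch.Tensor, beta2: float, eps: float) -> torch.Tensor:
        eas32, g32 = promote(exp_avg_sq), promote(grad)
        eas32.mul_(beta2).addcmul_(g32, g32, value=1 - beta2)
        if eas32 is not exp_avg_sq:  # state lives in bf16/fp16: write the fp32 result back
            exp_avg_sq.copy_(eas32)  # heavyball uses stochastic rounding here instead
        return eas32.sqrt().add_(eps)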
heavyball/psgd_kron.py
CHANGED
@@ -9,7 +9,7 @@ from typing import Optional
 import torch
 
 from .utils import update_param_, warmup, psgd_precond_grad, init_Q_exprs, trust_region_clip_, PSGDBase, \
-
+    line_to_triu, triu_to_line, promote, stochastic_lerp_, beta_debias
 
 
 class ForeachPSGDKron(PSGDBase):
@@ -40,7 +40,8 @@ class ForeachPSGDKron(PSGDBase):
                 momentum_into_precond_update=True, warmup_steps: int = 1, merge_dims: bool = False,
                 split: bool = False, clip_fn: Optional[callable] = None, store_triu_as_line: bool = True,
                 foreach: bool = True, q_dtype='float32', stochastic_schedule: bool = True,
-                 storage_dtype: str = 'float32',
+                 storage_dtype: str = 'float32', mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025,
+                 #
                 # expert parameters
                 precond_init_scale=1.0, precond_lr=0.1):
         if not 0.0 <= lr:
@@ -57,7 +58,9 @@
                         min_ndim_triangular=min_ndim_triangular, memory_save_mode=memory_save_mode,
                         momentum_into_precond_update=momentum_into_precond_update, precond_lr=precond_lr,
                         precond_init_scale=precond_init_scale, step=0, warmup_steps=warmup_steps, merge_dims=merge_dims,
-                        split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype,
+                        split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype,
+                        storage_dtype=storage_dtype,
+                        mars=mars, caution=caution, mars_gamma=mars_gamma)
         super().__init__(params, defaults, foreach, stochastic_schedule, clip_fn, preconditioner_update_probability)
 
     def _step(self, group):
@@ -77,7 +80,7 @@
 
         vals = []
 
-        for p, g in split_p_and_g_in_group(group, should_promote=False):
+        for p, g in self.split_p_and_g_in_group(group, should_promote=False, beta1=beta):
             state = self.state_(p)
 
             if 'Q' not in state:
@@ -113,5 +116,5 @@
             q32 = [promote(q_) for q_ in q]
             self.do_update(group, [p], [ea if momentum_into_precond_update else g], [q32], precond_lr, [q_orig],
                            store_triu_as_line)
-            g = psgd_precond_grad(
-            update_param_([p], self.clip_fn([g]), lr, weight_decay)
+            g = psgd_precond_grad(False, self.state_(p)["exprs"][-1], ea, *q)
+            update_param_([p], self.clip_fn([g]), lr, weight_decay, caution=group['caution'], grad=[g])
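The new options are plain constructor keywords. A usage sketch, assuming `ForeachPSGDKron` is re-exported from the package root (as the `__init__.py` change suggests); the model and hyper-parameter values are placeholders:

    import torch
    from heavyball import ForeachPSGDKron

    model = torch.nn.Linear(128, 128)
    opt = ForeachPSGDKron(model.parameters(), lr=1e-3,
                          mars=True,          # enable the MARS gradient correction
                          caution=True,       # mask update components that oppose the gradient
                          mars_gamma=0.0025)  # default scale used throughout 0.23.0

    loss = model(torch.randn(4, 128)).square().mean()
    loss.backward()
    opt.step()
    opt.zero_grad()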
heavyball/pure_psgd.py
CHANGED
@@ -5,9 +5,9 @@ Source available at https://github.com/evanatyourservice/kron_torch/blob/97a2b5e
 """
 
 import torch
-
 from heavyball.utils import identity
-
+
+from .utils import update_param_, warmup, psgd_precond_grad, init_Q_exprs, PSGDBase, \
     line_to_triu, triu_to_line, promote
 
 
@@ -38,7 +38,8 @@ class ForeachPurePSGD(PSGDBase):
                 max_size_triangular=2048, min_ndim_triangular=2, memory_save_mode=None,
                 momentum_into_precond_update=True, warmup_steps: int = 1, merge_dims: bool = False,
                 split: bool = False, clip_fn: callable = None, store_triu_as_line: bool = True, foreach: bool = True,
-                 q_dtype='float32', stochastic_schedule: bool = True,
+                 q_dtype='float32', stochastic_schedule: bool = True, mars: bool = False, caution: bool = False,
+                 mars_gamma: float = 0.0025, #
                 # expert parameters
                 precond_init_scale=1.0, precond_lr=0.1):
         if not 0.0 <= lr:
@@ -49,11 +50,14 @@
         if clip_fn is None:
             clip_fn = identity
 
+        assert not mars, "MARS is not supported in this optimizer"
+
         defaults = dict(lr=lr, weight_decay=weight_decay, max_size_triangular=max_size_triangular,
                         min_ndim_triangular=min_ndim_triangular, memory_save_mode=memory_save_mode,
                         momentum_into_precond_update=momentum_into_precond_update, precond_lr=precond_lr,
                         precond_init_scale=precond_init_scale, step=0, warmup_steps=warmup_steps, merge_dims=merge_dims,
-                        split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype
+                        split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype, mars=mars, caution=caution,
+                        mars_gamma=mars_gamma)
         super().__init__(params, defaults, foreach, stochastic_schedule, clip_fn, preconditioner_update_probability)
 
     def _step(self, group):
@@ -70,7 +74,7 @@
 
         vals = []
 
-        for p, g in split_p_and_g_in_group(group, should_promote=False):
+        for p, g in self.split_p_and_g_in_group(group, should_promote=False, beta1=0.0):
             state = self.state_(p)
 
             if 'Q' not in state:
@@ -97,5 +101,5 @@
             if group:
                 q32 = [promote(q_) for q_ in q]
                 self.do_update(group, [p], [g], [q32], precond_lr, [q_orig], store_triu_as_line)
-            psgd_precond_grad(
-            update_param_([p], self.clip_fn([g]), lr, weight_decay)
+            psgd_precond_grad(True, self.state_(p)["exprs"][-1], g, *q)
+            update_param_([p], self.clip_fn([g]), lr, weight_decay, caution=group['caution'], grad=[g])
heavyball/schedule_free_palm_foreach_soap.py
CHANGED
@@ -1,12 +1,13 @@
 import random
 
 import torch
+from heavyball.utils import mars_correction
 
 from .utils import init_preconditioner, update_preconditioner, project, set_, adaptive_gradient_clipping_, exp_avg_sq_, \
-    beta_debias, schedule_free_, warmup, ScheduleFree,
+    beta_debias, schedule_free_, warmup, ScheduleFree, copy_stochastic_list_, promote
 
 
-@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=
+@torch.compile(mode='max-autotune-no-cudagraphs', fullgraph=True, dynamic=False)
 def _compilable_exp_avg_sq_(exp_avg_sq, grad_projected, old_debiased2, eps):
     eas32, gp32 = [list(map(promote, x)) for x in (exp_avg_sq, grad_projected)]
     denom = exp_avg_sq_(eas32, gp32, old_debiased2, eps)
@@ -44,15 +45,19 @@ class SFPaLMForeachSOAP(ScheduleFree):
                 merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
                 data_format: str = "channels_first", correct_bias: bool = True, warmup_steps: int = 1, r=0.0,
                 weight_lr_power=2.0, gradient_clip_val: float = 0.1, betas=(None, None), split: bool = False,
-                 foreach: bool = True):
+                 foreach: bool = True, mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025):
         if betas[0] is not None:
             beta = betas[0]
+
+        assert not caution, "Caution is not implemented in ScheduleFree optimizers"
+
         defaults = {"lr": lr, "beta": beta, "beta2_scale": beta2_scale, "eps": eps, "weight_decay": weight_decay,
                     "precondition_frequency": precondition_frequency, "max_precond_dim": max_precond_dim,
                     "merge_dims": merge_dims, "precondition_1d": precondition_1d, "normalize_grads": normalize_grads,
                     "correct_bias": correct_bias, 'warmup_steps': warmup_steps, 'r': r,
                     'weight_lr_power': weight_lr_power, 'train_mode': True, 'step': -1,
-                    'gradient_clip_val': gradient_clip_val, 'weight_sum': 0, 'split': split
+                    'gradient_clip_val': gradient_clip_val, 'weight_sum': 0, 'split': split, 'mars': mars,
+                    'caution': caution, 'mars_gamma': mars_gamma}
         super().__init__(params, defaults, foreach)
         self._data_format = data_format
         self.rng = random.Random(0x120983109)
@@ -61,6 +66,7 @@
         vals = []
         max_precond_dim = group['max_precond_dim']
         precondition_1d = group['precondition_1d']
+        mars = group['mars']
 
         step = group['step'] = group.get("step", 0) + 1
 
@@ -79,12 +85,14 @@
 
         vals = []
 
-        for p, g in split_p_and_g_in_group(group):
+        for p, g in self.split_p_and_g_in_group(group, beta1=group['beta']):
             state = self.state_(p)
 
             if "z" not in state:
                 state["z"] = torch.clone(p).float()
                 state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
+                if mars:
+                    state['mars_prev_grad'] = g.clone()
                 init_preconditioner(g, state, max_precond_dim, precondition_1d)
                 update_preconditioner(g, state, max_precond_dim, precondition_1d, 0, True)
                 continue  # first step is skipped so that we never use the current gradients in the projection.
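schedule_free_palm_foreach_soap.py now imports `mars_correction` and keeps a per-parameter `mars_prev_grad` buffer. MARS replaces the raw gradient with a variance-reduced one before the usual moment updates; a minimal sketch of the commonly used formula, with `mars_correct_` as an illustrative stand-in for heavyball's `mars_correction`:

    import torch

    def mars_correct_(grad: torch.Tensor, prev_grad: torch.Tensor, beta1: float,
                      gamma: float = 0.0025) -> torch.Tensor:
        # c_t = g_t + gamma * beta1 / (1 - beta1) * (g_t - g_{t-1}), clipped to unit norm.
        c = grad + gamma * (beta1 / (1 - beta1)) * (grad - prev_grad)
        norm = c.norm()
        if norm > 1:
            c = c / norm
        prev_grad.copy_(grad)  # remember g_t for the next step (stored above as state['mars_prev_grad'])
        return c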