heavyball 1.4.3__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- heavyball/__init__.py +36 -5
- heavyball/chainable.py +38 -23
- heavyball/utils.py +159 -101
- {heavyball-1.4.3.dist-info → heavyball-1.5.0.dist-info}/METADATA +1 -1
- heavyball-1.5.0.dist-info/RECORD +8 -0
- heavyball-1.4.3.dist-info/RECORD +0 -8
- {heavyball-1.4.3.dist-info → heavyball-1.5.0.dist-info}/LICENSE +0 -0
- {heavyball-1.4.3.dist-info → heavyball-1.5.0.dist-info}/WHEEL +0 -0
- {heavyball-1.4.3.dist-info → heavyball-1.5.0.dist-info}/top_level.txt +0 -0
heavyball/__init__.py
CHANGED

@@ -116,7 +116,7 @@ class ForeachSOAP(C.BaseOpt):
     def __init__(self, params, lr: float = 3e-3, betas=(0.9, 0.95), shampoo_beta: float = 0.95, eps: float = 1e-8,
                  weight_decay: float = 0.01, precondition_frequency: int = 2, max_precond_dim: int = 2048,  #
                  merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
-                 data_format: str = "channels_first", correct_bias: bool = True, warmup_steps: int =
+                 data_format: str = "channels_first", correct_bias: bool = True, warmup_steps: int = 0,
                  split: bool = False, foreach: bool = True, mars: bool = False, caution: bool = False,
                  mars_gamma: float = 0.0025, palm: bool = C.use_default, precond_scheduler=(1 / 3, 9),
                  beta2_scale: float = 0.8, use_precond_schedule: bool = C.use_default,
@@ -129,8 +129,10 @@ class ForeachSOAP(C.BaseOpt):

         if use_precond_schedule:
             del defaults['precondition_frequency']
+            self.precond_schedule = utils.get_soap_precond_schedule(defaults.pop("precond_scheduler"))
         else:
             del defaults['precond_scheduler']
+            self.precond_schedule = 1 / defaults.pop("precondition_frequency")
         super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm,  #
                          C.scale_by_soap)

@@ -149,6 +151,30 @@ class PrecondSchedulePaLMForeachSOAP(ForeachSOAP):
     palm: bool = True


+class OrthoLaProp(C.BaseOpt):
+    def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
+                 foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
+                 mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
+                 update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
+        defaults = locals()
+        defaults.pop("self")
+        params = defaults.pop("params")
+        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm,
+                         C.orthogonalize_grad_to_param, C.scale_by_laprop)
+
+
+class OrthoAdamW(C.BaseOpt):
+    def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
+                 foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
+                 mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
+                 update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
+        defaults = locals()
+        defaults.pop("self")
+        params = defaults.pop("params")
+        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm,
+                         C.orthogonalize_grad_to_param, C.scale_by_adam)
+
+
 class ForeachPSGDKron(C.BaseOpt):
     """
     Originally from Evan Walters and Omead Pooladzandi, 2024
@@ -162,7 +188,7 @@ class ForeachPSGDKron(C.BaseOpt):

     def __init__(self, params, lr=0.001, beta=0.9, weight_decay=0.0, preconditioner_update_probability=None,
                  max_size_triangular=2048, min_ndim_triangular=2, memory_save_mode=None,
-                 momentum_into_precond_update=True, warmup_steps: int =
+                 momentum_into_precond_update=True, warmup_steps: int = 0, merge_dims: bool = False,
                  split: bool = False, store_triu_as_line: bool = True, foreach: bool = True, q_dtype='float32',
                  stochastic_schedule: bool = True, storage_dtype: str = 'float32', mars: bool = False,
                  caution: bool = False, mars_gamma: float = 0.0025, delayed: Optional[bool] = C.use_default,
@@ -172,6 +198,8 @@ class ForeachPSGDKron(C.BaseOpt):
                  precond_init_scale=1.0, precond_lr=0.1):
         defaults = locals()
         defaults.pop("self")
+        self.precond_schedule = defaults.pop(
+            "preconditioner_update_probability") or utils.precond_update_prob_schedule()
         params = defaults.pop("params")

         delayed = C.default(delayed, self.delayed)
@@ -181,8 +209,7 @@ class ForeachPSGDKron(C.BaseOpt):

         super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, False,  #
                          *(C.exp_avg,) * exp_avg_input,  #
-                         functools.partial(C.scale_by_delayed_psgd if delayed else C.scale_by_psgd, cached=cached
-                                           prob=preconditioner_update_probability))
+                         functools.partial(C.scale_by_delayed_psgd if delayed else C.scale_by_psgd, cached=cached))


 class ForeachPurePSGD(ForeachPSGDKron):
@@ -202,6 +229,10 @@ class ForeachDelayedPSGD(ForeachPSGDKron):
     delayed: bool = True


+class ForeachCachedNewtonPSGD(ForeachCachedPSGDKron):
+    hessian_approx = True
+
+
 PalmForEachSoap = PaLMForeachSOAP
 PaLMSOAP = PaLMForeachSOAP
 PaLMSFAdamW = PaLMForeachSFAdamW
@@ -225,4 +256,4 @@ __all__ = ["Muon", "RMSprop", "PrecondSchedulePaLMSOAP", "PSGDKron", "PurePSGD",
            "PrecondScheduleSOAP", "PrecondSchedulePaLMSOAP", 'RMSprop', 'MuonLaProp',  #
            "ForeachAdamW", "ForeachSFAdamW", "ForeachLaProp", "ForeachADOPT", "ForeachSOAP", "ForeachPSGDKron",
            "ForeachPurePSGD", "ForeachDelayedPSGD", "ForeachCachedPSGDKron", "ForeachCachedDelayedPSGDKron",
-           "ForeachRMSprop", "ForeachMuon"]
+           "ForeachRMSprop", "ForeachMuon", 'ForeachCachedNewtonPSGD']
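For orientation, the `Ortho*` classes added above are ordinary `torch.optim.Optimizer` subclasses built from chainable transforms. A minimal usage sketch, assuming heavyball 1.5.0 (the toy model, data, and learning rate are illustrative only, not from the package):

```python
import torch
import heavyball  # assumes heavyball >= 1.5.0, where OrthoLaProp was added

model = torch.nn.Linear(16, 1)
opt = heavyball.OrthoLaProp(model.parameters(), lr=2.5e-3)

for _ in range(10):
    x, y = torch.randn(32, 16), torch.randn(32, 1)
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss.backward()
    opt.step()       # gradient is orthogonalized to the weights, then LaProp scaling is applied
    opt.zero_grad()
```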
heavyball/chainable.py
CHANGED

@@ -127,7 +127,7 @@ def zero_guard(*names):


 def copy_guard(index, *names):
-    return functools.partial(CopyGuard, index=index, names=names,)
+    return functools.partial(CopyGuard, index=index, names=names, )


 def general_guard(*names, init_fn, skip_first: bool = True):
@@ -188,6 +188,11 @@ def update_by_laprop(group, update, grad, param, exp_avg, exp_avg_sq):
     raise SkipUpdate


+@no_state
+def orthogonalize_grad_to_param(group, update, grad, param):
+    return utils.orthogonalize_grad_to_param(param, update, group['eps'])
+
+
 @copy_guard(2, "z")
 @no_state
 def update_by_schedule_free(group, update, grad, param, z):
@@ -312,17 +317,23 @@ def scale_by_soap(group, update, grad, param, exp_avg, exp_avg_sq, Q, GG, inner:

     for u, q, gg, eas in zip(update, Q, GG, exp_avg_sq):
         utils.update_preconditioner(u, q, gg, eas, group['max_precond_dim'], group['precondition_1d'],
-                                    utils.beta_debias(group['shampoo_beta'], group['step']),
+                                    utils.beta_debias(group['shampoo_beta'], group['step']),
+                                    group['is_preconditioning'])
     return precond


 def _update_psgd_precond(cached, Q_cache, group, param, grad, Q_mat, Q, exprs, prob: Optional[callable] = None):
     if prob is None:
         prob = utils.precond_update_prob_schedule()
-
+
+    if not group['is_preconditioning']:
         return Q_mat

-    utils.psgd_update_precond(Q_mat, exprs, grad, group['precond_lr'], Q,
+    utils.psgd_update_precond(Q_mat, exprs, getattr(param, 'hessian_vector', grad), group['precond_lr'], Q,
+                              group['store_triu_as_line'], getattr(param, 'vector', None))
+    if hasattr(param, 'vector'):
+        del param.vector
+        del param.hessian_vector

     if grad.dim() > 1 and precond_schedule(group, balance_probability, f"balance_prob_{id(Q)}"):
         if group['store_triu_as_line']:
@@ -330,7 +341,15 @@ def _update_psgd_precond(cached, Q_cache, group, param, grad, Q_mat, Q, exprs, p
         else:
             utils.psgd_balance_Q(Q)

-
+    if isinstance(prob, float):
+        float_prob = prob
+    else:
+        float_prob = prob(group.get(f'cumulative_prob_{id(Q)}_prob_step', 1))
+    group['is_cached'] = should_use_cache = cached and float_prob < 0.5
+
+    if should_use_cache:  # caching adds extra ops and is not worth the overhead when we precondition at every step
+        return _update_psgd_cache(cached, Q_cache, Q_mat)
+    return Q_mat


 def _update_psgd_cache(cached, Q_cache, q):
@@ -345,14 +364,14 @@ def _update_psgd_cache(cached, Q_cache, q):
     return Q_cache


-def _cached_psgd_precond_grad(
-    if
+def _cached_psgd_precond_grad(group, cache_expr, exprs, update, Q_mat, Q_cache):
+    if group.get('is_cached', False):
         return utils.precond_grad_cached_(cache_expr, update, *Q_cache)
     return utils.psgd_precond_grad(exprs[-1], update, *Q_mat)


-def _fused_cached_psgd_precond_grad(group, grad, param,
-    if
+def _fused_cached_psgd_precond_grad(group, grad, param, cache_expr, exprs, update, Q_mat, Q_cache):
+    if group.get('is_cached', False):
         utils.fused_precond_grad_cached_(cache_expr, update, param, group['lr'], grad, group['weight_decay'],
                                          group['caution'], *Q_cache)
     else:
@@ -368,7 +387,7 @@ def scale_by_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
     Q_mat = _update_psgd_precond(cached, Q_cache, group, param,
                                  update if group['momentum_into_precond_update'] else grad, Q_mat, Q, exprs, prob)
-    return _cached_psgd_precond_grad(
+    return _cached_psgd_precond_grad(group, cache_expr, exprs, update, Q_mat, Q_cache)


 @general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd, skip_first=False)
@@ -376,9 +395,9 @@ def scale_by_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str
 def scale_by_delayed_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str, cached: bool = False,
                           prob: Optional[callable] = None):
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
-    precond = _cached_psgd_precond_grad(
-    _update_psgd_precond(cached, Q_cache, group, param, update if group['momentum_into_precond_update'] else grad,
-
+    precond = _cached_psgd_precond_grad(group, cache_expr, exprs, update, Q_mat, Q_cache)
+    _ = _update_psgd_precond(cached, Q_cache, group, param, update if group['momentum_into_precond_update'] else grad,
+                             Q_mat, Q, exprs, prob)
     return precond


@@ -389,7 +408,7 @@ def update_by_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: st
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
     Q_mat = _update_psgd_precond(cached, Q_cache, group, param,
                                  update if group['momentum_into_precond_update'] else grad, Q_mat, Q, exprs, prob)
-    _fused_cached_psgd_precond_grad(group, update, param,
+    _fused_cached_psgd_precond_grad(group, update, param, cache_expr, exprs, update, Q_mat, Q_cache)
     raise SkipUpdate


@@ -398,9 +417,9 @@ def update_by_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: st
 def update_by_delayed_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str, cached: bool = False,
                            prob: Optional[callable] = None):
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
-    _fused_cached_psgd_precond_grad(group, update, param,
-    _update_psgd_precond(cached, Q_cache, group, param, update if group['momentum_into_precond_update'] else grad,
-
+    _fused_cached_psgd_precond_grad(group, update, param, cache_expr, exprs, update, Q_mat, Q_cache)
+    _ = _update_psgd_precond(cached, Q_cache, group, param, update if group['momentum_into_precond_update'] else grad,
+                             Q_mat, Q, exprs, prob)
    raise SkipUpdate


@@ -449,6 +468,7 @@ class ChainOpt(utils.StatefulOptimizer):
             group['base_lr'] = group['lr']

         vals = list(self.split_p_and_g_in_group(group, should_promote=self.promote, beta1=utils.get_beta1(group)))
+
         if not vals:
             return
         p, g = zip(*vals)
@@ -464,12 +484,7 @@ class ChainOpt(utils.StatefulOptimizer):
                 break

         group['step'] = state['step'] = step = step + 1
-
-        if group['warmup_steps'] and step < group['warmup_steps']:
-            group['prev_lr'] = group['lr'] = group['base_lr'] * step / group['warmup_steps']
-
-        else:
-            group['prev_lr'] = group['lr'] = group['base_lr']
+        group['prev_lr'] = group['lr'] = group['base_lr'] * step / max(step, group['warmup_steps'] + 1)

         if not group['foreach'] or len(p) == 1:
             for param, grad in zip(p, g):
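The warmup rewrite in `ChainOpt` replaces the old if/else with the single expression `base_lr * step / max(step, warmup_steps + 1)`: a linear ramp that reaches the full base learning rate at step `warmup_steps + 1` and stays there afterwards. A standalone check of that behaviour (illustrative helper, not package code):

```python
def warmup_lr(base_lr: float, step: int, warmup_steps: int) -> float:
    # Same expression as the new ChainOpt code path above.
    return base_lr * step / max(step, warmup_steps + 1)

# With warmup_steps=4, steps 1..5 ramp linearly to base_lr; later steps stay at base_lr.
print([round(warmup_lr(1.0, s, 4), 2) for s in range(1, 8)])  # [0.2, 0.4, 0.6, 0.8, 1.0, 1.0, 1.0]
```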
heavyball/utils.py
CHANGED

@@ -54,12 +54,6 @@ def decorator_knowngood(func: Callable):
 einsum_base = string.ascii_lowercase + string.ascii_uppercase


-def warmup(lr: float, step: int, warmup_steps: int):
-    if step >= warmup_steps:  # if instead of min to guard against 0 div
-        return lr
-    return lr * step / warmup_steps
-
-
 @decorator_knowngood
 def _compilable_schedule_free_(p: List[Tensor], z: List[Tensor], ckp1: Tensor, update: List[Tensor], lr: Tensor,
                                beta1: Tensor, decay: float, grad: List[Tensor], caution):
@@ -323,6 +317,10 @@ def nesterov_momentum(state, grad, beta):
     return grad


+def _compilable_grafting(magnitude, direction):
+    return direction * (magnitude.norm() / direction.norm().clamp(min=1e-6))
+
+
 # mode in ("newtonschulz", "qr", "svd")
 # scale_mode in ("none", "scale", "graft")
 @decorator_knowngood
@@ -341,12 +339,17 @@ def inplace_orthogonal_(x: Tensor, mode: str, out: Tensor, scale_mode: str):
     elif scale_mode == "scale":
         y *= max(1, x.size(0) / x.size(1)) ** 0.5
     elif scale_mode == "graft":
-        y
+        y = _compilable_grafting(x, y)
     else:
         raise NotImplementedError(f"Unknown scale_mode: {scale_mode}")
     set_(out, y)


+@decorator_knowngood
+def _compilable_scatter_set(target, source, index):
+    target[:] = source.contiguous()[index].reshape_as(target)
+
+
 def get_orthogonal_matrix_QR(GG, Q, exp_avg_sq):
     """
     Computes the eigenbases of the preconditioner using one round of power iteration
@@ -381,7 +384,7 @@ def get_orthogonal_matrix_QR(GG, Q, exp_avg_sq):

     indices = tuple(slice(None) if ind is None else ind.view(*(1,) * i, -1, *(1,) * (exp_avg_sq.dim() - i - 1))  #
                     for i, ind in enumerate(indices))
-
+    _compilable_scatter_set(exp_avg_sq, exp_avg_sq, indices)


 def get_orthogonal_matrix(mat):
@@ -482,7 +485,7 @@ def scalar_guard(*args):
     out = []
     for x in xs:
         if isinstance(x, float):
-            out.append(torch.empty((), dtype=
+            out.append(torch.empty((), dtype=promote(ref.dtype), device=ref.device).fill_(x))
         elif isinstance(x, int):
             out.append(torch.empty((), dtype=torch.int64, device=ref.device).fill_(x))
         else:
@@ -500,7 +503,7 @@ def _compilable_stochastic_add_(x: List[Tensor], y: List[Tensor], alpha: Union[f
         copy_stochastic_(x_, x32 + y32 * alpha)


-def stochastic_add_(x: List[Tensor], y: List[Tensor], alpha: Union[float, int, Tensor]):
+def stochastic_add_(x: List[Tensor], y: List[Tensor], alpha: Union[float, int, Tensor] = 1):
     x, y = list_guard(x, y)
     alpha = scalar_guard(alpha, x[0])
     _compilable_stochastic_add_(x, y, alpha)
@@ -591,25 +594,26 @@ def project(grad, Q, back: bool):
 class StatefulOptimizer(torch.optim.Optimizer):
     ema_decay: float = 0.001
     compile_step: bool = False
+    hessian_approx: bool = False
+    precond_schedule: Union[Callable, float, None] = None
+    stochastic_schedule: bool = False

     def __init__(self, params, defaults, foreach: bool = True, use_ema: bool = False):
         super().__init__(params, {**defaults, 'foreach': foreach})
-        self.fake_groups = {}
         self.use_ema = use_ema
         self.mapping = {}
+        self._inner_group = {'stochastic_schedule': self.stochastic_schedule}
+        self._precond_rng = random.Random(0x12312)
+        self._is_preconditioning = None

-
-
-        return [group]
-
-        for p in group['params']:
-            if p not in self.fake_groups:
-                self.fake_groups[p] = {**group, 'params': [p]}
+        if self.hessian_approx and self.compile_step:
+            raise ValueError("Hessian approximation can't be used with compile_step.")

-
+    def get_groups(self, group):
+        return [group]

     def state_(self, arg: Tensor):
-        return self.state[
+        return self.state[arg]

     def mars_correct_list(self, group, p_list, g_list, mars_gamma, beta):
         for p, g in zip(p_list, g_list):
@@ -622,36 +626,27 @@ class StatefulOptimizer(torch.optim.Optimizer):
     def split_p_and_g_in_group(self, group: dict, skip_none: bool = True, should_promote: bool = True,
                                beta1: float = -1.0):
         for p in group["params"]:
-            if p
-
-                continue
-            grad = None
+            if p in self.mapping:
+                p_views = self.mapping[p]
             else:
-
-                    grad = promote(p.grad)
-                else:
-                    grad = p.grad
-                if beta1 >= 0 and group.get('mars', False):
-                    self.mars_correct_list(group, [p], [grad], group['mars_gamma'], beta1)
-
-            p.grad = None
+                self.mapping[p] = p_views = merge_group(group, p)

-
-
-                continue
+            grad = getattr(p, 'grad', None)
+            p.grad = None

-            p_views = merge_group(group, p)
-            if grad is not None:
-                grad = merge_group(group, grad)
-            for i, pv in enumerate(p_views):
-                self.mapping[pv] = (p, i)
-            if isinstance(p_views, Tensor):
-                yield p_views, grad
-                continue
             if grad is None:
-
-
-
+                grad = [getattr(pv, 'grad', None) for pv in p_views]
+            else:
+                grad = merge_group(group, grad)
+
+            for pv, g in zip(p_views, grad):
+                if skip_none and g is None:
+                    continue
+                if should_promote:
+                    g = promote(g)
+                if beta1 >= 0 and group.get('mars', False):
+                    self.mars_correct_list(group, [pv], [g], group['mars_gamma'], beta1)
+                yield pv, g

     def state_size(self) -> int:
         total_bytes = 0
@@ -671,67 +666,89 @@ class StatefulOptimizer(torch.optim.Optimizer):

     def ema_update(self):
         with torch.no_grad():
-            for
-            for
-                active_p = [p for p in group['params']]
+            for group in self.param_groups:
+                active_p = [p for p in group['params']]

-
-
+                if not active_p:
+                    return

-
+                k = group['ema_step'] = group.get('ema_step', -1) + 1

-
-
-
+                for p in active_p:
+                    if 'param_ema' not in self.state_(p):
+                        self.state_(p)['param_ema'] = torch.zeros_like(p.data, memory_format=torch.preserve_format)

-
-
+                y, param_ema = zip(*[(p.data, self.state_(p)['param_ema']) for p in active_p])
+                torch._foreach_lerp_(param_ema, y, weight=beta_debias(1 - self.ema_decay, k + 1))

     def copy_emas_to_params(self):
         with torch.no_grad():
-            for
-            for
-                active_p = [p for p in group['params']]
+            for group in self.param_groups:
+                active_p = [p for p in group['params']]

-
-
+                if not active_p:
+                    return

-
-
-
-
-
+                for p in active_p:
+                    if 'param_ema' in self.state_(p):
+                        p_clone = p.data.clone()
+                        set_(p.data, self.state_(p)['param_ema'])
+                        set_(self.state_(p)['param_ema'], p_clone)

     def copy_params_to_emas(self):
         with torch.no_grad():
-            for
-            for
-                active_p = [p for p in group['params']]
+            for group in self.param_groups:
+                active_p = [p for p in group['params']]

-
-
+                if not active_p:
+                    return

-
-
-
-
-
+                for p in active_p:
+                    if 'param_ema' in self.state_(p):
+                        ema_clone = self.state_(p)['param_ema'].data.clone()
+                        set_(self.state_(p)['param_ema'], p.data)
+                        set_(p.data, ema_clone)

     def step(self, closure: Optional[Callable] = None):
+        if self.precond_schedule is None:
+            self._is_preconditioning = False
+        else:
+            self._is_preconditioning = psgd_should_update(self._inner_group, self.precond_schedule, self._precond_rng)
+        hessian_approx = self.hessian_approx and self._is_preconditioning
         if closure is None:
+            if hessian_approx:
+                raise ValueError("Hessian approximation requires a closure.")
             loss = None
         else:
             with torch.enable_grad():
                 loss = closure()
+            if hessian_approx:
+                grads = []
+                for group in self.param_groups:
+                    for p, g in self.split_p_and_g_in_group(group, skip_none=True, should_promote=False):
+                        grads.append(g)
+                        p.vector = torch.randn_like(p)
+                        p.orig = p.data.clone()
+                        stochastic_add_(p.data, p.vector, tiny_bf16)
+
+                with torch.enable_grad():
+                    closure()
+
+                for group in self.param_groups:
+                    for p, g in self.split_p_and_g_in_group(group, skip_none=True, should_promote=False):
+                        p.grad = grads.pop(0)
+                        stochastic_add_(g, p.grad, -1)
+                        p.hessian_vector = g
+                        p.data.copy_(p.orig)
+                        del p.orig

         # we assume that parameters are constant and that there are no excessive recompiles
         with torch.no_grad(), torch._dynamo.utils.disable_cache_limit():
-            for
-
-
-
-
-                self.ema_update(group)
+            for group in self.param_groups:
+                group['is_preconditioning'] = self._is_preconditioning
+                self._step(group)
+                if self.use_ema:
+                    self.ema_update(group)

         return loss

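The new `hessian_approx` path in `step` evaluates the closure twice: once at the current parameters and once after nudging them by a small random `vector`, then stores the gradient difference on each parameter as `hessian_vector` for the PSGD preconditioner update. A standalone sketch of that finite-difference Hessian-vector idea (illustrative only; the helper name is made up and `eps` stands in for the tiny perturbation used above):

```python
import torch

def hvp_finite_difference(loss_fn, theta: torch.Tensor, v: torch.Tensor, eps: float = 1e-3) -> torch.Tensor:
    """Approximate H @ v via (grad(theta + eps * v) - grad(theta)) / eps."""
    theta0 = theta.detach().clone().requires_grad_(True)
    g0 = torch.autograd.grad(loss_fn(theta0), theta0)[0]
    theta1 = (theta.detach() + eps * v).requires_grad_(True)
    g1 = torch.autograd.grad(loss_fn(theta1), theta1)[0]
    return (g1 - g0) / eps

A = torch.randn(5, 5)
A = A @ A.T  # symmetric, so the Hessian of 0.5 * x @ A @ x is exactly A
theta, v = torch.randn(5), torch.randn(5)
approx = hvp_finite_difference(lambda x: 0.5 * x @ A @ x, theta, v)
print(torch.allclose(approx, A @ v, rtol=1e-2, atol=1e-2))  # True up to float error
```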
@@ -943,6 +960,15 @@ def precond_schedule(step, precond_scheduler, rng):
     return update_precond


+def get_soap_precond_schedule(precond_scheduler):
+    rng = random.Random(0x12312)
+
+    def _inner(step):
+        return precond_schedule(step, precond_scheduler, rng)
+
+    return _inner
+
+
 def init_Q_exprs(t, scale, max_size, min_ndim_triangular, memory_save_mode, dtype=None):
     """For a scalar or tensor t, we initialize its preconditioner Q and
     reusable einsum expressions for updating Q and preconditioning gradient.
@@ -1030,20 +1056,24 @@ def psgd_balance_Q(Q_in):
     torch._foreach_mul_(Q_in, list(norms))


-def psgd_calc_A_and_conjB(exprA, G, Q):
+def psgd_calc_A_and_conjB(exprA, G, Q, V=None):
     eps = scalar_guard(math.sqrt(torch.finfo(torch.float32).eps), G)
     eps *= G.norm() / G.numel()
     G = G + torch.randn_like(G) * eps
     md = min_dtype(Q + [G])
     A = torch.einsum(exprA, *[q.to(md) for q in Q], G.to(md)).to(G.dtype)
     order = G.dim()
-
+    if V is None:
+        conjB = torch.randn(G.shape[1:] + G.shape[:1], dtype=promote(G.dtype), device=G.device)
+    else:
+        conjB = V.permute(*range(1, order), 0).to(promote(G.dtype))
     Q = [promote(q) for q in Q]
     for i, q in enumerate(Q):
         if q.dim() <= 1:
             conjB /= q
         else:
-            conjB = torch.linalg.solve_triangular(q, conjB.reshape(-1, q.size(0)), upper=True, left=False).reshape_as(
+            conjB = torch.linalg.solve_triangular(q, conjB.reshape(-1, q.size(0)), upper=True, left=False).reshape_as(
+                conjB)
         if i < order - 1:
             conjB = torch.transpose(conjB, i, order - 1)
     return A, conjB
@@ -1065,11 +1095,11 @@ def psgd_lb(A, max_abs):


 @decorator
-def psgd_update_precond(Q, exprs, G, precond_lr, oq, store_triu_as_line):
+def psgd_update_precond(Q, exprs, G, precond_lr, oq, store_triu_as_line, V):
     """Update Kronecker product preconditioner Q with pair (V, G)."""
     exprA, exprGs, _ = exprs

-    A, conjB = psgd_calc_A_and_conjB(exprA, G, Q)
+    A, conjB = psgd_calc_A_and_conjB(exprA, G, Q, V)

     for q, exprG, o in zip(Q, exprGs, oq):
         term1 = promote(torch.einsum(exprG, A, A))
@@ -1286,7 +1316,6 @@ def _compilable_fused_precond_grad_cached_(expr: str, ea: Tensor, param, lr, gra


 def fused_precond_grad_cached_(expr: str, ea: Tensor, param, lr, grad, decay, caution, *cached_q: Tensor):
-
     lr = scalar_guard(lr, param[0])
     _compilable_fused_precond_grad_cached_(expr, ea, param, lr, grad, decay, caution, *cached_q)

@@ -1325,6 +1354,29 @@ def mars_correction(g, old_g, beta1, gamma):
     _compilable_mars_correction_(g, old_g, a)


+@decorator_knowngood
+def _compilable_orthogonalization(weight: List[Tensor], grad: List[Tensor], eps: Tensor, graft: bool = True):
+    """
+    Implements OrthoGrad from "Grokking at the Edge of Numerical Stability" (https://arxiv.org/abs/2501.04697)
+    """
+
+    for w, g in zip(weight, grad):
+        proj = promote((w * g).sum()) / promote((w * w).sum()).add(eps)
+        out = promote(g) - proj * promote(w)  # promote in this funky way to keep traffic minimal
+
+        if graft:
+            out = _compilable_grafting(g, out)
+
+        copy_stochastic_(g, out)
+
+
+def orthogonalize_grad_to_param(weight, grad, eps, graft=True):
+    weight, grad = list_guard(weight, grad)
+    eps = scalar_guard(eps, weight[0])
+    _compilable_orthogonalization(weight, grad, eps, graft)
+    return grad
+
+
 @decorator_knowngood
 def _compilable_cautioning(g: Tensor, update: Tensor):
     mask = g.signbit() ^ update.signbit()  # "Mask if they point in different directions"
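`_compilable_orthogonalization` above projects the gradient onto the subspace orthogonal to the weight and then grafts the original gradient norm back on. The same math on plain tensors, as an illustrative sketch (the package version runs over lists of tensors in compiled form with stochastic rounding):

```python
import torch

def orthograd(w: torch.Tensor, g: torch.Tensor, eps: float = 1e-30) -> torch.Tensor:
    proj = (w * g).sum() / ((w * w).sum() + eps)
    out = g - proj * w                                     # component of g orthogonal to w
    return out * (g.norm() / out.norm().clamp(min=1e-6))   # graft: keep the original gradient magnitude

w, g = torch.randn(10), torch.randn(10)
g_orth = orthograd(w, g)
print(torch.dot(w, g_orth).abs() < 1e-4)       # ~0: the update no longer grows/shrinks ||w|| to first order
print(torch.isclose(g_orth.norm(), g.norm()))  # magnitude matches the raw gradient
```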
@@ -1338,25 +1390,20 @@ def caution(g, update):
     return _compilable_cautioning(g, update)


-def precond_update_prob_schedule(max_prob=1.0, min_prob=0.03, decay=0.
+def precond_update_prob_schedule(max_prob=1.0, min_prob=0.03, decay=0.999, flat_start=1000):
     """Anneal preconditioner update probability during beginning of training.

     PSGD benefits from more preconditioner updates at the beginning of training,
     but once the preconditioner is learned the update probability can drop low.

     This schedule is an exponential anneal with a flat start. Default settings keep
-    update probability at
-    `min_prob` by 4000 steps. Default settings work very well for most models and
+    update probability at `max_prob` for 1000 steps then exponentially anneal down to
+    `min_prob` by ~4000 steps. Default settings work very well for most models and
     training regimes.
     """

     def _schedule(n):
-
-            return max_prob
-
-        n -= flat_start
-        prob = max_prob * math.exp(-decay * (n - flat_start))
-        return max(min_prob, min(max_prob, prob))
+        return max(min_prob, max_prob * decay ** max(n - flat_start, 0))

     return _schedule

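The schedule body is now the closed form `max(min_prob, max_prob * decay ** max(n - flat_start, 0))`: constant at `max_prob` for the first `flat_start` steps, then geometric decay until it is clipped at `min_prob` (with the defaults, `0.999 ** 3500 ≈ 0.03`, so the floor is reached roughly 3500 steps after the flat start). An illustrative evaluation of the default schedule:

```python
def precond_prob(n, max_prob=1.0, min_prob=0.03, decay=0.999, flat_start=1000):
    # Same closed form as the new _schedule above.
    return max(min_prob, max_prob * decay ** max(n - flat_start, 0))

for n in (0, 1000, 2000, 3000, 4000, 5000):
    print(n, round(precond_prob(n), 3))
# 0 1.0 / 1000 1.0 / 2000 0.368 / 3000 0.135 / 4000 0.05 / 5000 0.03
```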
@@ -1375,12 +1422,18 @@ def merge_group(group, *tensors):


 def hook_optimizer_into_model(model, optimizer, *args, **kwargs):
-
+    optimizers = {}
+
+    def _step(p: Tensor):
+        o = optimizers[p]
         o.step()
         o.zero_grad()

     for p in model.parameters():
-        p
+        optimizers[p] = optimizer([p], *args, **kwargs)
+        p.register_post_accumulate_grad_hook(_step)
+
+    return optimizers


 def fused_hook(parameters, optimizer, *args, **kwargs):
@@ -1389,18 +1442,23 @@ def fused_hook(parameters, optimizer, *args, **kwargs):
     seen_params = set()

     o = optimizer(parameters, *args, **kwargs)
+    step_fn = o.step
+    o.step = functools.partial(warn_once,
+                               msg="You're trying to call `step` on a fused optimizer. This will not do anything.")

     def _step(p: Tensor):
         seen_params.add(p)

         if len(seen_params) < param_count:
-
+            step_fn()
         o.zero_grad()
         seen_params.clear()

     for p in parameters:
         p.register_post_accumulate_grad_hook(_step)

+    return o
+

 @decorator_knowngood
 def _compilable_caution_no_scale(g: Tensor, update: Tensor):
heavyball-1.5.0.dist-info/RECORD
ADDED

@@ -0,0 +1,8 @@
+heavyball/__init__.py,sha256=AL3oSbNB1HQ0cwEG6aPZGVMbpCXXCYOxREX7JwK4Byc,12773
+heavyball/chainable.py,sha256=4xIaufYcIMgrasSIm9ZHwqRXD2vvUbHsW0FJqGB68EM,24782
+heavyball/utils.py,sha256=NFvQcQemNOugH1vAi_UH3jnnttPSgVopmS1q6jbhxkQ,50289
+heavyball-1.5.0.dist-info/LICENSE,sha256=CGdGJim64YifGmUVPaeyRsxkvyExtClswhRNIp8FY_U,1322
+heavyball-1.5.0.dist-info/METADATA,sha256=fUOCJvDcBQ5280TCLhUCuIRwNVMvp3ysp4qrDuJCUeI,43584
+heavyball-1.5.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+heavyball-1.5.0.dist-info/top_level.txt,sha256=SzCxSVg_qCUPA4kZObW3Zyo4v-d_mMOD-p7a-WXTl2E,10
+heavyball-1.5.0.dist-info/RECORD,,
heavyball-1.4.3.dist-info/RECORD
DELETED

@@ -1,8 +0,0 @@
-heavyball/__init__.py,sha256=miRgcXlzLWTNzojeRF5hEcg-x_GqfMHjRzOaiR_zO3U,10981
-heavyball/chainable.py,sha256=-5ovRa7yD7V41_cgaBJtO5fBrnBemAILl4YKjQmeuns,24183
-heavyball/utils.py,sha256=x0rSU8lko7ACdI9GuTLC0wP6HwIZxwB8f8tukBOR0xA,48129
-heavyball-1.4.3.dist-info/LICENSE,sha256=CGdGJim64YifGmUVPaeyRsxkvyExtClswhRNIp8FY_U,1322
-heavyball-1.4.3.dist-info/METADATA,sha256=RM_pOme3dsQL-drKcKD6FJ0qE3SSh4JdPM-kC9vpbeU,43584
-heavyball-1.4.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-heavyball-1.4.3.dist-info/top_level.txt,sha256=SzCxSVg_qCUPA4kZObW3Zyo4v-d_mMOD-p7a-WXTl2E,10
-heavyball-1.4.3.dist-info/RECORD,,
{heavyball-1.4.3.dist-info → heavyball-1.5.0.dist-info}/LICENSE
File without changes

{heavyball-1.4.3.dist-info → heavyball-1.5.0.dist-info}/WHEEL
File without changes

{heavyball-1.4.3.dist-info → heavyball-1.5.0.dist-info}/top_level.txt
File without changes