heavyball 1.6.3__py3-none-any.whl → 1.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- heavyball/__init__.py +515 -100
- heavyball/chainable.py +487 -156
- heavyball/optimizations/__init__.py +38 -0
- heavyball/optimizations/integrator.py +169 -0
- heavyball/optimizations/optimizations.py +329 -0
- heavyball/utils.py +780 -241
- {heavyball-1.6.3.dist-info → heavyball-1.7.1.dist-info}/METADATA +3 -2
- heavyball-1.7.1.dist-info/RECORD +11 -0
- {heavyball-1.6.3.dist-info → heavyball-1.7.1.dist-info}/WHEEL +1 -1
- {heavyball-1.6.3.dist-info → heavyball-1.7.1.dist-info/licenses}/LICENSE +1 -1
- heavyball-1.6.3.dist-info/RECORD +0 -8
- {heavyball-1.6.3.dist-info → heavyball-1.7.1.dist-info}/top_level.txt +0 -0
heavyball/chainable.py
CHANGED
@@ -1,8 +1,10 @@
 import functools
+import math
 import random
-from typing import …
+from typing import List, Literal, Optional, Union

 import torch
+from torch import Tensor

 from . import utils

@@ -42,7 +44,7 @@ class FunctionTransform:
         raise NotImplementedError

     def get_fn(self):
-        if …
+        if utils.hasattr_none(self.fn, "get_fn"):
            return self.fn.get_fn()
        return self.fn

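Several call sites in 1.7.1 now go through utils.hasattr_none instead of plain hasattr. Judging from its uses in this diff (attributes such as param.vector that may be attached but set to None), a plausible stand-in looks like the sketch below; the real helper lives in heavyball/utils.py and may differ.

def hasattr_none(obj, name):
    # Hypothetical re-implementation: the attribute only counts if it exists and is not None.
    return getattr(obj, name, None) is not None


class Box:
    value = None


print(hasattr_none(Box(), "value"))    # False - present but None
print(hasattr_none(Box(), "missing"))  # False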
@@ -55,7 +57,7 @@ def _zero_guard(state, key, ref, dtype):


 def _storage_dtype(group):
-    dtype = group.get(…
+    dtype = group.get("storage_dtype", "float32")
     return getattr(torch, dtype)


@@ -65,8 +67,10 @@ class ZeroGuard(FunctionTransform):
         self.names = names

     def __call__(self, state, group, update, grad, param, *args, **kwargs):
-        vars = […
-        …
+        vars = [
+            [_zero_guard(state(p), self.val_name(name), p, _storage_dtype(group)) for p in param]  #
+            for name in self.names
+        ]
         return self.fn(state, group, update, grad, param, *args, *vars, **kwargs)


@@ -78,8 +82,10 @@ class CopyGuard(FunctionTransform):

     def __call__(self, state, group, update, grad, param, *args, **kwargs):
         val = [update, grad, param, *args][self.index]
-        vars = […
-        …
+        vars = [
+            [_guard_in_state(state(p), self.val_name(name), lambda: torch.clone(v)) for p, v in zip(param, val)]  #
+            for name in self.names
+        ]
         return self.fn(state, group, update, grad, param, *args, *vars, **kwargs)


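The rewritten ZeroGuard.__call__ (and the matching CopyGuard change above) builds one buffer list per guarded name, with one entry per parameter. A minimal standalone sketch of that pattern, using a plain dict in place of the optimizer state and hypothetical parameters:

import torch

params = [torch.randn(3), torch.randn(4, 2)]  # hypothetical parameter list
state = {id(p): {} for p in params}           # stand-in for the per-parameter state dict
names = ["exp_avg", "exp_avg_sq"]             # guarded buffer names


def zero_guard(st, key, ref):
    # Create a zero buffer shaped like `ref` the first time `key` is requested.
    if key not in st:
        st[key] = torch.zeros_like(ref)
    return st[key]


# One inner list per guarded name, each holding one buffer per parameter,
# mirroring the nested comprehension introduced in ZeroGuard.__call__.
vars = [[zero_guard(state[id(p)], name, p) for p in params] for name in names]
print([len(v) for v in vars])  # [2, 2]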
@@ -152,145 +158,243 @@ def exp_avg(group, update, grad, param, exp_avg):
     return utils.scale_by_exp_avg_(exp_avg, update, utils.beta_debias(utils.get_beta1(group), group["step"]))


-@zero_guard(…
+@zero_guard("exp_avg")
 @no_state
 def weight_decay_to_ema(group, update, grad, param, exp_avg):
-    utils.weight_decay_to_ema_(…
-    …
+    utils.weight_decay_to_ema_(
+        exp_avg,
+        update,
+        utils.beta_debias(group["ema_beta"], group["step"]),
+        group["weight_decay_to_ema"] * group["lr"],
+    )
     return update


-@zero_guard(…
+@zero_guard("exp_avg")
 @no_state
 def l1_weight_decay_to_ema(group, update, grad, param, exp_avg):
-    utils.l1_weight_decay_to_ema_(…
-    …
+    utils.l1_weight_decay_to_ema_(
+        exp_avg,
+        update,
+        utils.beta_debias(group["ema_beta"], group["step"]),
+        group["weight_decay_to_ema"] * group["lr"],
+    )
     return update


 @zero_guard("exp_avg_sq")
 @no_state
 def scale_by_exp_avg_sq(group, update, grad, param, exp_avg_sq):
-    return utils.scale_by_exp_avg_sq_(…
-    …
+    return utils.scale_by_exp_avg_sq_(
+        exp_avg_sq, update, utils.beta_debias(utils.get_beta2(group), group["step"]), group["eps"]
+    )


 @zero_guard("exp_avg", "exp_avg_sq")
 @no_state
 def scale_by_adam(group, update, grad, param, exp_avg, exp_avg_sq):
-    return utils.adam_(…
-    …
+    return utils.adam_(
+        exp_avg,
+        exp_avg_sq,
+        update,
+        utils.get_beta1(group),
+        utils.get_beta2(group),
+        group["step"],  #
+        group["eps"],
+    )


 @zero_guard("exp_avg", "exp_avg_sq")
 @no_state
 def update_by_adam(group, update, grad, param, exp_avg, exp_avg_sq):
-    utils.fused_adam_(…
-    …
+    utils.fused_adam_(
+        param,
+        exp_avg,
+        exp_avg_sq,
+        update,
+        grad,
+        utils.get_beta1(group),
+        utils.get_beta2(group),
+        group["step"],
+        group["lr"],
+        group["eps"],
+        group["weight_decay"],
+        group["caution"],
+    )
     raise SkipUpdate


 @zero_guard("exp_avg", "exp_avg_sq")
 @no_state
 def scale_by_laprop(group, update, grad, param, exp_avg, exp_avg_sq):
-    return utils.laprop_(exp_avg, exp_avg_sq, update, utils.get_beta1(group), utils.get_beta2(group), group[…
+    return utils.laprop_(exp_avg, exp_avg_sq, update, utils.get_beta1(group), utils.get_beta2(group), group["step"])


 @zero_guard("exp_avg", "exp_avg_sq")
 @no_state
 def update_by_laprop(group, update, grad, param, exp_avg, exp_avg_sq):
-    utils.fused_laprop_(…
-    …
+    utils.fused_laprop_(
+        param,
+        exp_avg,
+        exp_avg_sq,
+        update,
+        grad,
+        utils.get_beta1(group),
+        utils.get_beta2(group),
+        group["step"],
+        group["lr"],
+        group["weight_decay"],
+        group["caution"],
+    )
     raise SkipUpdate


 @no_state
 def orthogonalize_grad_to_param(group, update, grad, param):
-    return utils.orthogonalize_grad_to_param(param, update, group[…
+    return utils.orthogonalize_grad_to_param(param, update, group["eps"])


 @copy_guard(2, "z")
 @no_state
 def update_by_schedule_free(group, update, grad, param, z):
-    group[…
-    …
-    …
+    group["weight_sum"] = utils.schedule_free_(
+        group["lr"],
+        group["weight_lr_power"],
+        group.get("weight_sum", 0),
+        utils.get_beta1(group),
+        param,
+        z,
+        update,
+        grad,
+        group["caution"],
+        group["r"],
+        group["step"],
+        group["weight_decay"],
+    )
     raise SkipUpdate


 @zero_guard("exp_avg", "exp_avg_sq")
 @no_state
 def update_by_adopt(group, update, grad, param, exp_avg, exp_avg_sq):
-    if group[…
-        utils.scale_by_exp_avg_sq_(exp_avg_sq, update, 0, group[…
+    if group["step"] == 1:
+        utils.scale_by_exp_avg_sq_(exp_avg_sq, update, 0, group["eps"])
         raise SkipUpdate

-    if group[…
+    if group["step"] == 2:
         update = utils.promote(update)
         easq = utils.promote(exp_avg_sq)
-        [utils.set_(ea, u / easq_.sqrt().clamp_(min=group[…
-        utils.scale_by_exp_avg_sq_(…
-        …
+        [utils.set_(ea, u / easq_.sqrt().clamp_(min=group["eps"])) for ea, u, easq_ in zip(exp_avg, update, easq)]
+        utils.scale_by_exp_avg_sq_(
+            exp_avg_sq,
+            update,
+            utils.beta_debias(utils.get_beta2(group), group["step"]),
+            group["eps"],
+        )
         raise SkipUpdate

-    utils.fused_adopt_(…
-    …
+    utils.fused_adopt_(
+        param,
+        update,
+        grad,
+        exp_avg_sq,
+        exp_avg,
+        utils.get_beta1(group),
+        utils.get_beta2(group),
+        group["step"] - 2,
+        group["lr"],
+        group["eps"],
+        group["weight_decay"],
+        group["caution"],
+    )
     raise SkipUpdate


 @zero_guard("exp_avg", "exp_avg_sq")
 @no_state
 def scale_by_adopt(group, update, grad, param, exp_avg, exp_avg_sq):
-    if group[…
-        utils.scale_by_exp_avg_sq_(exp_avg_sq, update, 0, group[…
+    if group["step"] == 1:
+        utils.scale_by_exp_avg_sq_(exp_avg_sq, update, 0, group["eps"])
         raise SkipUpdate

-    if group[…
+    if group["step"] == 2:
         update = utils.promote(update)
         easq = utils.promote(exp_avg_sq)
-        [utils.set_(ea, u / easq_.sqrt().clamp_(min=group[…
-        utils.scale_by_exp_avg_sq_(…
-        …
+        [utils.set_(ea, u / easq_.sqrt().clamp_(min=group["eps"])) for ea, u, easq_ in zip(exp_avg, update, easq)]
+        utils.scale_by_exp_avg_sq_(
+            exp_avg_sq,
+            update,
+            utils.beta_debias(utils.get_beta2(group), group["step"]),
+            group["eps"],
+        )
         raise SkipUpdate

-    return utils.adopt(…
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-    state["…
+    return utils.adopt(
+        update,
+        exp_avg_sq,
+        exp_avg,
+        utils.get_beta1(group),
+        utils.get_beta2(group),
+        group["step"] - 2,
+    )
+
+
+def _init_soap(state, group, update, grad, param, inner: str = ""):
+    utils.init_preconditioner(grad, state, group["max_precond_dim"], group["precondition_1d"])
+
+
+def _init_psgd_kron(state, group, update, grad, param, cached: bool = False, prob: Optional[callable] = None):
+    Q, state["exprs"] = utils.init_Q_exprs(
+        grad,
+        group["precond_init_scale"],
+        group["precond_init_scale_scale"],
+        group["max_size_triangular"],
+        group["min_ndim_triangular"],
+        group["memory_save_mode"],
+        getattr(param, "hessian_vector", None),
+        getattr(param, "vector", None),
+        dtype=getattr(torch, group["q_dtype"]),
+    )
+    state["Q"] = utils.triu_to_line(Q) if group["store_triu_as_line"] else Q

     if not cached:
         return

-    state[…
+    state["Q_cache"] = [torch.empty_like(q) for q in Q]

-    expr = [f…
-    expr = …
-    grad_expr = …
-    out_expr = …
-    expr = f…
+    expr = [f"{c.upper()}{c}" if q_.ndim == 2 else c for c, q_ in zip(utils.einsum_base, Q)]
+    expr = ",".join(expr)
+    grad_expr = "".join(c for c, _ in zip(utils.einsum_base, grad.shape))
+    out_expr = "".join(c.upper() if c.upper() in expr else c for c in grad_expr)
+    expr = f"{expr},{grad_expr}->{out_expr}"

-    state[…
+    state["cache_expr"] = expr


-def …
-…
-…
-…
+def _init_psgd_lra(state, group, update, grad, param, cached: bool = False, prob: Optional[callable] = None):
+    state["U"], state["V"], state["d"] = utils.init_lra(
+        grad,
+        group["precond_init_scale"],
+        group["precond_init_scale_scale"],
+        group["rank"],
+        getattr(param, "hessian_vector", None),
+        getattr(param, "vector", None),
+        dtype=getattr(torch, group["q_dtype"]),
+    )
+    group["preconditioning_step"] = 0
+
+
+def precond_schedule(group, prob: Union[callable, float, None] = None, name: str = "cumulative_prob"):
+    step = group["step"]
+    if "precondition_frequency" in group:
+        return step > 0 and step % group["precondition_frequency"] == 0
     if isinstance(step, torch.Tensor):
         utils.warn_once("Preconditioner schedule is not supported with torch.Tensor step.")
         rng = random.Random(0x172381)
     else:
         rng = random.Random(0x172381 ^ step)
-    if …
-        return utils.precond_schedule(step, group[…
+    if "precond_scheduler" in group:
+        return utils.precond_schedule(step, group["precond_scheduler"], rng)
     if prob is not None:
         return utils.psgd_should_update(group, prob, rng, name=name)
     raise ValueError("No preconditioner update schedule specified.")
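The new precond_schedule helper decides whether to refresh a preconditioner this step: a fixed precondition_frequency wins, then a precond_scheduler, then a probabilistic rule. A simplified sketch of the first and last branches with the utils.* helpers replaced by plain Python (the real probabilistic path goes through utils.psgd_should_update and tracks a cumulative probability):

import random


def precond_schedule_sketch(group, prob=None):
    step = group["step"]
    # A fixed update frequency takes precedence, as in the new chainable.precond_schedule.
    if "precondition_frequency" in group:
        return step > 0 and step % group["precondition_frequency"] == 0
    # Otherwise fall back to a (simplified) probabilistic schedule seeded per step.
    rng = random.Random(0x172381 ^ step)
    if prob is not None:
        p = prob(step) if callable(prob) else prob
        return rng.random() < p
    raise ValueError("No preconditioner update schedule specified.")


print(precond_schedule_sketch({"step": 10, "precondition_frequency": 5}))  # True
print(precond_schedule_sketch({"step": 7}, prob=0.1))                      # stochastic decision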
@@ -313,17 +417,17 @@ def nesterov_momentum(group, updates, grads, params, momentum):
     return utils.nesterov_momentum(momentum, updates, utils.get_beta1(group))


-@zero_guard(…
+@zero_guard("momentum")
 @no_state
 def nesterov_ema(group, updates, grads, params, momentum):  # equivalent to Grokfast
     return utils.nesterov_ema(momentum, updates, utils.get_beta1(group))


 def _store_std(state, group, update, grad, param):
-    state[…
+    state["init_std"] = torch.std(grad, dim=0)


-@general_guard("init_std", init_fn=_store_std)
+@general_guard("init_std", init_fn=_store_std, skip_first=False)
 @no_state
 def mup_approx(group, updates, grads, params, init_std):
     _updates = [(u, i) for u, i in zip(updates, init_std) if u.ndim > 1]
@@ -332,31 +436,79 @@ def mup_approx(group, updates, grads, params, init_std):
     return updates


+def _init_delta(state, group, update, grad, param, log_space: bool):
+    val = group["initial_d"]
+    state["delta"] = torch.full((), math.log(val) if log_space else val, dtype=param.dtype, device=param.device)
+
+
+def _init_full_delta(state, group, update, grad, param, log_space: bool):
+    val = group["initial_d"]
+    state["delta"] = torch.full_like(param, math.log(val) if log_space else val)
+
+
+@zero_guard("state")
+@general_guard("delta", init_fn=functools.partial(_init_delta, log_space=False), skip_first=False)
+@no_state
+def scale_by_d_adaptation(group, update, grad, param, state, delta):
+    utils.d_adaptation(grad, update, state, delta)
+    return update
+
+
+@zero_guard("state")
+@general_guard("delta", init_fn=functools.partial(_init_delta, log_space=True), skip_first=False)
+@no_state
+def scale_by_lr_adaptation(group, update, grad, param, state, delta):
+    utils.lr_adaptation(grad, update, state, delta, group["lr_lr"])
+    return update
+
+
+@zero_guard("state")
+@general_guard("delta", init_fn=functools.partial(_init_full_delta, log_space=True), skip_first=False)
+@no_state
+def scale_by_pointwise_lr_adaptation(group, update, grad, param, state, delta):
+    utils.pointwise_lr_adaptation(grad, update, state, delta, group["lr_lr"])
+    return update
+
+
 @zero_guard("momentum")
 @no_state
 def heavyball_momentum(group, updates, grads, params, momentum):
     return utils.heavyball_momentum(momentum, updates, utils.get_beta1(group))


-_optim_fns = {…
+_optim_fns = {"adam": utils.adam_, "laprop": utils.laprop_}


 @zero_guard("exp_avg", "exp_avg_sq")
 @general_guard("Q", "GG", init_fn=_init_soap)
 @no_state
-def scale_by_soap(group, update, grad, param, exp_avg, exp_avg_sq, Q, GG, inner: str =…
+def scale_by_soap(group, update, grad, param, exp_avg, exp_avg_sq, Q, GG, inner: str = "adam"):
     update = utils.promote(update)  # Promote to highest precision if needed

     grad_projected = [utils.project(u, q, False) for u, q in zip(update, Q)]
     fn = _optim_fns[inner]
-    precond = fn(…
-    …
+    precond = fn(
+        exp_avg,
+        exp_avg_sq,
+        grad_projected,
+        utils.get_beta1(group),
+        utils.get_beta2(group),
+        group["step"] - 1,
+        group["eps"],
+    )
     precond = [utils.project(p, q, True) for p, q in zip(precond, Q)]

     for u, q, gg, ea in zip(update, Q, GG, exp_avg):
-        utils.update_preconditioner(…
-        …
-        …
+        utils.update_preconditioner(
+            u,
+            q,
+            gg,
+            ea,
+            group["max_precond_dim"],
+            group["precondition_1d"],
+            utils.beta_debias(group["shampoo_beta"], group["step"]),
+            group["is_preconditioning"],
+        )
     return precond


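The added _init_delta/_init_full_delta helpers store the initial step-size estimate for the new d-adaptation and LR-adaptation transforms either as a 0-dim scalar or as a per-element tensor, optionally in log space. A small sketch of that initialization, with a hypothetical initial_d value standing in for group["initial_d"]:

import math

import torch

param = torch.randn(4, 3)
initial_d = 1e-6  # hypothetical group["initial_d"]

# Scalar delta in log space, as in _init_delta(log_space=True).
delta_scalar = torch.full((), math.log(initial_d), dtype=param.dtype, device=param.device)

# Per-element delta, as in _init_full_delta(log_space=True).
delta_full = torch.full_like(param, math.log(initial_d))

print(delta_scalar.shape, delta_full.shape)  # torch.Size([]) torch.Size([4, 3])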
@@ -364,17 +516,28 @@ def _update_psgd_precond(cached, Q_cache, group, param, grad, Q_mat, Q, exprs, p
     if prob is None:
         prob = utils.precond_update_prob_schedule()

-    if not group[…
+    if not group["is_preconditioning"]:
         return Q_mat

-    utils.…
-    …
-    if hasattr(param, 'vector'):
+    if utils.hasattr_none(param, "vector"):
+        vector, hessian_vector = param.vector, param.hessian_vector
         del param.vector
         del param.hessian_vector
+    else:
+        vector, hessian_vector = utils.dampen_grad(grad)
+
+    utils.psgd_update_precond(
+        Q_mat,
+        exprs,
+        hessian_vector,
+        group["precond_lr"],
+        Q,
+        group["store_triu_as_line"],
+        vector,
+    )

     if grad.dim() > 1 and precond_schedule(group, balance_probability, f"balance_prob_{id(Q)}"):
-        if group[…
+        if group["store_triu_as_line"]:
             utils.psgd_balance_Q([q_ for _, q_ in Q])
         else:
             utils.psgd_balance_Q(Q)
@@ -382,8 +545,8 @@ def _update_psgd_precond(cached, Q_cache, group, param, grad, Q_mat, Q, exprs, p
     if isinstance(prob, float):
         float_prob = prob
     else:
-        float_prob = prob(group.get(f…
-    group[…
+        float_prob = prob(group.get(f"cumulative_prob_{id(Q)}_prob_step", 1))
+    group["is_cached"] = should_use_cache = cached and float_prob < 0.5

     if should_use_cache:  # caching adds extra ops and is not worth the overhead when we precondition at every step
         return _update_psgd_cache(cached, Q_cache, Q_mat)
@@ -403,51 +566,172 @@ def _update_psgd_cache(cached, Q_cache, q):


 def _cached_psgd_precond_grad(group, cache_expr, exprs, update, Q_mat, Q_cache, grad):
-    if group.get(…
-        out = utils.precond_grad_cached_(cache_expr, update, *Q_cache, caution=group[…
-    …
-    …
+    if group.get("is_cached", False):
+        out = utils.precond_grad_cached_(cache_expr, update, *Q_cache, caution=group["caution"], grad=grad)
+    else:
+        out = utils.psgd_precond_grad(exprs[-1], update, *Q_mat, caution=group["caution"], grad=grad)
+    group["caution"] = False  # we already cautioned here - shouldn't do it again
     return out


 def _fused_cached_psgd_precond_grad(group, grad, param, cache_expr, exprs, update, Q_mat, Q_cache):
-    if group.get(…
-        utils.fused_precond_grad_cached_(…
-        …
+    if group.get("is_cached", False):
+        utils.fused_precond_grad_cached_(
+            cache_expr,
+            update,
+            param,
+            group["lr"],
+            grad,
+            group["weight_decay"],
+            group["caution"],
+            *Q_cache,
+        )
     else:
-        utils.fused_psgd_precond_grad(…
-        …
+        utils.fused_psgd_precond_grad(
+            exprs[-1],
+            update,
+            param,
+            group["lr"],
+            grad,
+            group["weight_decay"],
+            group["caution"],
+            *Q_mat,
+        )
+
+
+def _update_lra(
+    group, U: List[Tensor], V: List[Tensor], d: List[Tensor], params: List[Tensor], grads: List[Tensor], delayed: bool
+):
+    if not group["is_preconditioning"]:
+        return utils.flatten(U, 1), utils.flatten(V, 1), utils.flatten(d)
+
+    if utils.hasattr_none(params[0], "hessian_vector"):
+        vector = utils.flatten([p.vector for p in params])
+        hessian_vector = utils.flatten([p.hessian_vector for p in params])
+        for p in params:
+            del p.vector
+            del p.hessian_vector
+    else:
+        vector, hessian_vector = utils.dampen_multiple(grads)
+    return utils.update_lra_precond_(U, V, d, vector, hessian_vector, group["eps"], group["precond_lr"], delayed)
+
+
+@general_guard("U", "V", "d", init_fn=_init_psgd_lra, skip_first=False)
+@no_state
+def scale_by_psgd_lra(group, update, grad, param, U, V, d):
+    u, v, d = _update_lra(group, U, V, d, param, update if group["momentum_into_precond_update"] else grad, False)
+    return utils.extract_from_flat_update(param, utils.lra_precond(u, v, d, utils.flatten(update)))
+
+
+@general_guard("U", "V", "d", init_fn=_init_psgd_lra, skip_first=False)
+@no_state
+def update_by_psgd_lra(group, update, grad, param, U, V, d):
+    u, v, d = _update_lra(group, U, V, d, param, update if group["momentum_into_precond_update"] else grad, False)
+    utils.apply_lra_update(param, update, u, v, d)
+    raise SkipUpdate
+
+
+@general_guard("U", "V", "d", init_fn=_init_psgd_lra, skip_first=False)
+@no_state
+def scale_by_delayed_psgd_lra(group, update, grad, param, U, V, d):
+    u, v, d = _update_lra(group, U, V, d, param, update if group["momentum_into_precond_update"] else grad, True)
+    return utils.extract_from_flat_update(param, utils.lra_precond(u, v, d, utils.flatten(update)))
+
+
+@general_guard("U", "V", "d", init_fn=_init_psgd_lra, skip_first=False)
+@no_state
+def update_by_delayed_psgd_lra(group, update, grad, param, U, V, d):
+    u, v, d = _update_lra(group, U, V, d, param, update if group["momentum_into_precond_update"] else grad, True)
+    utils.apply_lra_update(param, update, u, v, d)
+    raise SkipUpdate


-@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=…
+@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd_kron, skip_first=False)
 @no_state_no_foreach
-def scale_by_psgd(…
-…
+def scale_by_psgd(
+    group,
+    update,
+    grad,
+    param,
+    Q,
+    exprs,
+    Q_cache,
+    cache_expr: str,
+    cached: bool = False,
+    prob: Optional[callable] = None,
+):
     update = update.to(memory_format=torch.contiguous_format)
-    Q_mat = utils.line_to_triu(Q) if group[…
-    Q_mat = _update_psgd_precond(…
-    …
+    Q_mat = utils.line_to_triu(Q) if group["store_triu_as_line"] else Q
+    Q_mat = _update_psgd_precond(
+        cached,
+        Q_cache,
+        group,
+        param,
+        update if group["momentum_into_precond_update"] else grad,
+        Q_mat,
+        Q,
+        exprs,
+        prob,
+    )
     return _cached_psgd_precond_grad(group, cache_expr, exprs, update, Q_mat, Q_cache, grad)


-@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=…
+@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd_kron, skip_first=False)
 @no_state_no_foreach
-def scale_by_delayed_psgd(…
-…
-…
+def scale_by_delayed_psgd(
+    group,
+    update,
+    grad,
+    param,
+    Q,
+    exprs,
+    Q_cache,
+    cache_expr: str,
+    cached: bool = False,
+    prob: Optional[callable] = None,
+):
+    Q_mat = utils.line_to_triu(Q) if group["store_triu_as_line"] else Q
     precond = _cached_psgd_precond_grad(group, cache_expr, exprs, update, Q_mat, Q_cache, grad)
-    _ = _update_psgd_precond(…
-    …
+    _ = _update_psgd_precond(
+        cached,
+        Q_cache,
+        group,
+        param,
+        update if group["momentum_into_precond_update"] else grad,
+        Q_mat,
+        Q,
+        exprs,
+        prob,
+    )
     return precond


-@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=…
+@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd_kron, skip_first=False)
 @no_state_no_foreach
-def update_by_psgd(…
-…
-…
-…
-…
+def update_by_psgd(
+    group,
+    update,
+    grad,
+    param,
+    Q,
+    exprs,
+    Q_cache,
+    cache_expr: str,
+    cached: bool = False,
+    prob: Optional[callable] = None,
+):
+    Q_mat = utils.line_to_triu(Q) if group["store_triu_as_line"] else Q
+    Q_mat = _update_psgd_precond(
+        cached,
+        Q_cache,
+        group,
+        param,
+        update if group["momentum_into_precond_update"] else grad,
+        Q_mat,
+        Q,
+        exprs,
+        prob,
+    )
     _fused_cached_psgd_precond_grad(group, update, param, cache_expr, exprs, update, Q_mat, Q_cache)
     raise SkipUpdate

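_update_lra prefers an explicit Hessian-vector product that a higher-level routine has attached to the parameters and only falls back to a dampened-gradient probe otherwise. The sketch below shows that branch in isolation; the fallback form (grad plus a small multiple of a random probe) is an assumption about utils.dampen_multiple based on standard PSGD gradient whitening, not a copy of it:

import torch


def pick_hvp(params, grads, dampening=2**-13):
    # Prefer an attached (vector, hessian_vector) pair, as _update_lra does.
    if getattr(params[0], "hessian_vector", None) is not None:
        vector = [p.vector for p in params]
        hessian_vector = [p.hessian_vector for p in params]
    else:
        # Fallback: pair a random probe with a slightly perturbed gradient.
        vector = [torch.randn_like(g) for g in grads]
        hessian_vector = [g + dampening * v for g, v in zip(grads, vector)]
    return vector, hessian_vector


grads = [torch.randn(5), torch.randn(3, 2)]
params = [torch.randn(5), torch.randn(3, 2)]
v, hv = pick_hvp(params, grads)
print(len(v), len(hv))  # 2 2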
@@ -457,20 +741,39 @@ def sign(group, update, grad, param, graft: bool = True):
     return utils.sign_(update, graft)


-@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=…
+@general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd_kron, skip_first=False)
 @no_state_no_foreach
-def update_by_delayed_psgd(…
-…
-…
+def update_by_delayed_psgd(
+    group,
+    update,
+    grad,
+    param,
+    Q,
+    exprs,
+    Q_cache,
+    cache_expr: str,
+    cached: bool = False,
+    prob: Optional[callable] = None,
+):
+    Q_mat = utils.line_to_triu(Q) if group["store_triu_as_line"] else Q
     _fused_cached_psgd_precond_grad(group, update, param, cache_expr, exprs, update, Q_mat, Q_cache)
-    _ = _update_psgd_precond(…
-    …
+    _ = _update_psgd_precond(
+        cached,
+        Q_cache,
+        group,
+        param,
+        update if group["momentum_into_precond_update"] else grad,
+        Q_mat,
+        Q,
+        exprs,
+        prob,
+    )
     raise SkipUpdate


 def palm_beta2(state, group, update, grad, param):
-    beta2 = 1 - group[…
-    group[…
+    beta2 = 1 - group["step"] ** -group["beta2_scale"]
+    group["betas"] = (utils.get_beta1(group), beta2)
     return update


@@ -499,7 +802,7 @@ def chain(state: Union[callable, dict], group, grad, param, *fns):
     update = [torch.clone(g, memory_format=torch.preserve_format) for g in grad]
     update, skip_update = _inner_chain(state, group, update, grad, param, *fns)
     if not skip_update and update is not None:
-        utils.update_param_(param, update, group[…
+        utils.update_param_(param, update, group["lr"], group["weight_decay"], caution=group["caution"], grad=grad)


 def create_branch(branches: List[List[callable]], merge_fn: callable):
@@ -524,14 +827,16 @@ class ChainOpt(utils.StatefulOptimizer):
         self.fns = tuple(fns)

     def _step(self, group):
-        if …
-            group[…
-        if …
-            utils.warn_once(…
-            …
-            …
+        if "base_lr" not in group:
+            group["base_lr"] = group["lr"]
+        if "prev_lr" in group and group["prev_lr"] != group["lr"]:
+            utils.warn_once(
+                f"Learning rate changed between steps. This is an experimental feature and "
+                f"only supported with foreach=True (currently foreach={group['foreach']})."
+            )
+            group["base_lr"] = group["lr"]

-        caution = group[…
+        caution = group["caution"]

         vals = list(self.split_p_and_g_in_group(group, should_promote=self.promote, beta1=utils.get_beta1(group)))

@@ -541,26 +846,26 @@ class ChainOpt(utils.StatefulOptimizer):

         for param in p:
             state = self.state_(param)
-            if …
-                step = state[…
+            if "step" in state:
+                step = state["step"]
             elif self.compile_step:
                 step = utils.scalar_guard(0, param)
             else:
                 step = 0
             break

-        group[…
-        group[…
+        group["step"] = state["step"] = step = step + 1
+        group["prev_lr"] = group["lr"] = group["base_lr"] * step / max(step, group["warmup_steps"] + 1)

-        if not group[…
+        if not group["foreach"] or len(p) == 1:
             for param, grad in zip(p, g):
                 chain(self.state_, group, [grad], [param], *self.fns)
         else:
             chain(self.state_, group, g, p, *self.fns)

-        group[…
-        group[…
-        group[…
+        group["caution"] = caution
+        group["lr"] = group["prev_lr"]
+        group["step"] = None


 use_default = object()
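ChainOpt._step now derives the effective learning rate from a stored base_lr and a linear warmup: lr = base_lr * step / max(step, warmup_steps + 1). A quick check of that formula:

def warmup_lr(base_lr, step, warmup_steps):
    # Mirrors group["lr"] = group["base_lr"] * step / max(step, group["warmup_steps"] + 1)
    return base_lr * step / max(step, warmup_steps + 1)


print(warmup_lr(1e-3, 1, 100))    # ~9.9e-06, early in warmup
print(warmup_lr(1e-3, 50, 100))   # ~5.0e-04, roughly halfway
print(warmup_lr(1e-3, 500, 100))  # 1e-03, full base_lr once step exceeds warmup_steps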
@@ -571,7 +876,13 @@ def _get_clip_fn(name: str_or_fn, default_val: str_or_fn):
     name = default(name, default_val)
     if callable(name):
         return name
-    elif name not in (…
+    elif name not in (
+        "l2_clip_",
+        "rmsnorm_clip_",
+        "trust_region_clip_",
+        "a_law_compress",
+        "mu_law_compress",
+    ):
         raise ValueError(f"Clipping function {name} not found")
     return getattr(utils, name)

@@ -581,16 +892,24 @@ def default(a, b):


 # not supported: update_by_schedule_free, scale_by_soap, scale_by_exp_avg_sq
-_scale_to_update_map = {…
-…
-…
-…
-…
-…
-…
-…
-…
+_scale_to_update_map = {
+    scale_by_delayed_psgd.get_fn(): update_by_delayed_psgd,  #
+    scale_by_psgd.get_fn(): update_by_psgd,  #
+    scale_by_psgd_lra.get_fn(): update_by_psgd_lra,  #
+    scale_by_delayed_psgd_lra.get_fn(): update_by_delayed_psgd_lra,  #
+    scale_by_adam.get_fn(): update_by_adam,  #
+    scale_by_laprop.get_fn(): update_by_laprop,  #
+    scale_by_adopt.get_fn(): update_by_adopt,  #
+}
+_scale_to_update_map_inv = {
+    update_by_delayed_psgd.get_fn(): scale_by_delayed_psgd,  #
+    update_by_psgd.get_fn(): scale_by_psgd,  #
+    update_by_psgd_lra.get_fn(): scale_by_psgd_lra,  #
+    update_by_delayed_psgd_lra.get_fn(): scale_by_delayed_psgd_lra,  #
+    update_by_adam.get_fn(): scale_by_adam,  #
+    update_by_laprop.get_fn(): scale_by_laprop,  #
+    update_by_adopt.get_fn(): scale_by_adopt,  #
+}


 class BaseOpt(ChainOpt):
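_scale_to_update_map and its inverse let BaseOpt swap the terminal transform between a scale_by_* form and the fused update_by_* form, depending on auto_fuse and whether update_clipping is set (see the raise in the next hunk). A toy version of that lookup with placeholder functions; the real maps key on .get_fn() results and the surrounding logic in BaseOpt.__init__ is more involved:

# Placeholder transforms standing in for chainable.scale_by_adam / update_by_adam.
def scale_by_adam(*args): ...
def update_by_adam(*args): ...


_scale_to_update_map = {scale_by_adam: update_by_adam}
_scale_to_update_map_inv = {update_by_adam: scale_by_adam}


def resolve_last_fn(fn, update_clipping, auto_fuse):
    # Sketch of the fusion rule: without update clipping, fuse scale_by_* into the
    # update_by_* variant; with update clipping, fall back to the unfused scale_by_* form.
    if not update_clipping and auto_fuse and fn in _scale_to_update_map:
        return _scale_to_update_map[fn]
    if update_clipping and fn in _scale_to_update_map_inv:
        if not auto_fuse:
            raise ValueError("update_clipping is not compatible with update_by_* functions")
        return _scale_to_update_map_inv[fn]
    return fn


print(resolve_last_fn(scale_by_adam, update_clipping=False, auto_fuse=True) is update_by_adam)  # True
print(resolve_last_fn(update_by_adam, update_clipping=True, auto_fuse=True) is scale_by_adam)   # True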
@@ -622,8 +941,18 @@ class BaseOpt(ChainOpt):
     palm: bool = False
     auto_fuse: bool = True

-    def __init__(…
-    …
+    def __init__(
+        self,
+        params,
+        defaults,
+        foreach: bool,
+        gradient_clipping: str_or_fn,
+        update_clipping: str_or_fn,
+        palm: bool = use_default,
+        *fns,
+        compile_step: bool = use_default,
+        promote: bool = use_default,
+    ):
         if not fns:
             raise ValueError("No functions provided. If that's on purpose (SGD-like), use `identity`")

@@ -643,8 +972,10 @@ class BaseOpt(ChainOpt):
            fns = tuple(fns)[:-1] + (fn,)
        elif fn in _scale_to_update_map_inv:
            if not self.auto_fuse:
-                raise ValueError(…
-                …
+                raise ValueError(
+                    "update_clipping is currently not compatible with update_by_* functions. "
+                    "Manually select scale_by_* functions or set auto_fuse=True."
+                )
            fn = _scale_to_update_map_inv[fn]
            if args is not None:
                fn = functools.partial(fn, *args, **kwargs)
@@ -665,27 +996,27 @@ class ScheduleFree(BaseOpt):
 class ScheduleFree(BaseOpt):
     def eval(self):
         for group in self.param_groups:
-            group[…
+            group["train_mode"] = train_mode = not group.get("train_mode")
             beta1 = utils.get_beta1(group)
             if beta1 > 0 and not train_mode:
-                for p in group[…
+                for p in group["params"]:
                     state = self.state_(p)
-                    if …
+                    if "z" in state:
                         # Set p.data to x
-                        z = utils.promote(state[…
+                        z = utils.promote(state["z"])
                         p32 = utils.promote(p.data)
                         p32.lerp_(end=z, weight=1 - 1 / beta1)
                         utils.copy_stochastic_(p.data, p32)

     def train(self):
         for group in self.param_groups:
-            group[…
+            group["train_mode"] = train_mode = not group.get("train_mode")
             beta1 = utils.get_beta1(group)
             if beta1 > 0 and train_mode:
-                for p in group[…
+                for p in group["params"]:
                     state = self.state_(p)
-                    if …
-                        z = utils.promote(state[…
+                    if "z" in state:
+                        z = utils.promote(state["z"])
                         p32 = utils.promote(p.data)
                         p32.lerp_(end=z, weight=1 - beta1)
                         utils.copy_stochastic_(p.data, p32)