heavyball 2.0.0.dev0__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
heavyball/chainable.py CHANGED
@@ -2,6 +2,7 @@ import copy
 import functools
 import math
 import random
+from collections.abc import Iterable as _Iterable
 from typing import Iterable, List, Literal, Optional, Union
 
 import torch
@@ -85,6 +86,22 @@ class FunctionTransform:
         return f"{self.__class__.__name__}({self.fn}, transform_idx={self.transform_idx})"
 
 
+class Branch:
+    def __init__(self, branches: List[List[callable]], merge_fn: callable):
+        self.branches = branches
+        self.merge_fn = merge_fn
+
+    def __call__(self, state, group, update, grad, param):
+        outputs = []
+        for branch in self.branches:
+            branch_update = [torch.clone(u, memory_format=torch.preserve_format) for u in update]
+            branch_update, skip_update = _inner_chain(state, group, branch_update, grad, param, *branch)
+            if skip_update:
+                raise ValueError("Branches should not skip updates")
+            outputs.append(branch_update)
+        return self.merge_fn(outputs)
+
+
 def _zero_guard(state, key, ref, dtype):
     return _guard_in_state(state, key, lambda: torch.zeros_like(ref, dtype=dtype, memory_format=torch.preserve_format))
 
@@ -117,7 +134,7 @@ class PrecondGradAccumGuard(FunctionTransform):
             utils.stochastic_add_(state, new)
 
     def _reset(self, state):
-        if self.steps_taken == 0:
+        if self.steps_taken != 0:
             self.steps_taken = 0
             utils.zero_(state)
 
@@ -214,6 +231,23 @@ class NoStateNoForeach(FunctionTransform):
         return updates
 
 
+class SqueezeGrad(FunctionTransform):
+    def __call__(self, state, group, update, grad, param, *args, **kwargs):
+        original_shapes = [u.shape for u in update]
+        update = [u.squeeze() if u.numel() > 1 else u.view(-1) for u in update]
+        grad = [x.view_as(u) for x, u in zip(grad, update)]
+        param = [x.view_as(u) for x, u in zip(param, update)]
+        args = list(args)
+        for i, a in enumerate(args):
+            if isinstance(a, (list, tuple)) and isinstance(a[0], Tensor):
+                args[i] = [x.view_as(u) for x, u in zip(a, update)]
+        for k, a in kwargs.items():
+            if isinstance(a, (list, tuple)) and isinstance(a[0], Tensor):
+                kwargs[k] = [x.view_as(u) for x, u in zip(a, update)]
+        out = self.fn(state, group, update, grad, param, *args, **kwargs)
+        return [o.view(s) for o, s in zip(out, original_shapes)]
+
+
 def zero_guard(*names):
     return functools.partial(ZeroGuard, names=names)
 
@@ -247,11 +281,11 @@ def exp_avg(group, update, grad, param, exp_avg):
 @copy_guard(2, "init")
 @no_state
 def weight_decay_to_init(group, update, grad, param, init):
-    utils.weight_decay_to_init_(
-        param,
-        init,
-        group["weight_decay_to_ema"] * group["lr"],
-    )
+    utils.stochastic_lerp_(param, init, group["weight_decay_to_ema"] * group["lr"])
+    return update
+
+
+def identity(state, group, update, grad, param):
     return update
 
 
@@ -321,6 +355,26 @@ def update_by_adam(group, update, grad, param, exp_avg, exp_avg_sq):
     raise SkipUpdate from None
 
 
+@zero_guard("exp_avg", "exp_avg_sq")
+@no_state
+def update_by_adamc(group, update, grad, param, exp_avg, exp_avg_sq):
+    utils.fused_adam_(
+        param,
+        exp_avg,
+        exp_avg_sq,
+        update,
+        grad,
+        utils.get_beta1(group),
+        utils.get_beta2(group),
+        group["step"],
+        group["lr"],
+        group["eps"],
+        group["lr"] * group["weight_decay"] / group["max_lr"],
+        group["caution"],
+    )
+    raise SkipUpdate from None
+
+
 @zero_guard("exp_avg", "exp_avg_sq")
 @no_state
 def scale_by_laprop(group, update, grad, param, exp_avg, exp_avg_sq):
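
The new update_by_adamc above hands utils.fused_adam_ a decay term of group["lr"] * group["weight_decay"] / group["max_lr"], so the applied weight decay scales with the ratio of the current learning rate to max_lr. A small standalone illustration of that scaling; the concrete values are made up and max_lr is assumed to be the peak learning rate of the schedule:

    # Standalone illustration (not package code): how the decay term passed by
    # update_by_adamc behaves as the learning rate is annealed.
    max_lr, weight_decay = 1e-3, 0.1

    for lr in (1e-3, 5e-4, 1e-4):
        effective_decay = lr * weight_decay / max_lr
        print(f"lr={lr:.0e} -> effective decay {effective_decay:.3g}")
    # lr=1e-03 -> effective decay 0.1
    # lr=5e-04 -> effective decay 0.05
    # lr=1e-04 -> effective decay 0.01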
@@ -426,6 +480,43 @@ def update_by_adopt(group, update, grad, param, exp_avg, exp_avg_sq):
     raise SkipUpdate from None
 
 
+@zero_guard("exp_avg", "exp_avg_sq", "fisher_approx")
+@no_state_no_foreach
+def scale_by_suds(group, update, grad, param, exp_avg, exp_avg_sq, fisher_approx):
+    if group["step"] == 1:
+        utils.copy_stochastic_(fisher_approx, update / update.norm().clamp(min=1e-8))
+        raise SkipUpdate from None
+
+    precond_update, w = utils.eigvecs_product_rank1(update.flatten(), fisher_approx.flatten().to(update.dtype))
+    precond_update = utils.adam_(
+        exp_avg,
+        exp_avg_sq,
+        precond_update.view_as(exp_avg),
+        utils.get_beta1(group),
+        utils.get_beta2(group),
+        group["step"] - 1,
+    )[0]
+    precond_update, _ = utils.eigvecs_product_rank1(precond_update.flatten(), fisher_approx.flatten(), w)
+
+    new_approx = utils.oja_update(fisher_approx.flatten().to(update.dtype), update.flatten(), group["precond_lr"])
+    utils.copy_stochastic_(fisher_approx, new_approx)
+    return precond_update
+
+
+@zero_guard("exp_avg", "exp_avg_sq")
+@no_state
+def scale_by_unscaled_adam(group, update, grad, param, exp_avg, exp_avg_sq):
+    update = utils.unscaled_adam_(
+        exp_avg,
+        exp_avg_sq,
+        update,
+        utils.get_beta1(group),
+        utils.get_beta2(group),
+        group["step"],
+    )
+    return update
+
+
 @zero_guard("exp_avg", "exp_avg_sq")
 @no_state
 def scale_by_adopt(group, update, grad, param, exp_avg, exp_avg_sq):
@@ -473,7 +564,8 @@ def _init_psgd_kron(state, group, update, grad, param, cached: bool = False, pro
         dtype=getattr(torch, group["q_dtype"]),
     )
     state["Q"] = utils.triu_to_line(Q) if group["store_triu_as_line"] else Q
-    state["running_lower_bound"] = [torch.zeros((), device=q.device, dtype=q.dtype) for q in Q]
+    state["running_lower_bound"] = [torch.zeros((1,), device=q.device, dtype=torch.float64) for q in Q]
+    state["step"] = torch.zeros((), device=param.device, dtype=torch.int64)
     if group["adaptive"]:
         state["velocity"] = [torch.zeros((), device=q.device, dtype=q.dtype) for q in Q]
     if not cached:
@@ -623,7 +715,7 @@ def scale_by_soap(group, update, grad, param, exp_avg, exp_avg_sq, Q, GG, inner:
 
 
 def _update_psgd_precond(
-    cached, Q_cache, group, param, grad, Q, velocity, running_lower_bound, prob: Optional[callable] = None
+    cached, Q_cache, group, param, grad, Q, velocity, running_lower_bound, step, prob: Optional[callable] = None
 ) -> Optional[Tensor]:
     if prob is None:
         prob = utils.precond_update_prob_schedule()
@@ -640,13 +732,13 @@
     else:
         vector, hessian_vector = utils.dampen_grad(grad, group["dampening"])
 
-    precond = (utils.inverse_free_psgd_update_precond if vector is None else utils.psgd_update_precond)(
+    precond = utils.psgd_update_precond(
         hessian_vector,
         group["precond_lr"],
         Q,
         group["store_triu_as_line"],
         velocity,
-        utils.beta_debias(utils.get_beta2(group), group["step"]),
+        utils.get_beta2(group),
         group["ortho_method"],
         vector,
         running_lower_bound,
@@ -723,6 +815,7 @@ def _update_lra(
     )
 
 
+@SqueezeGrad
 @PrecondGradAccumGuard
 @general_guard("U", "V", "d", init_fn=_init_psgd_lra, skip_first=False)
 @no_state
@@ -731,6 +824,7 @@ def scale_by_psgd_lra(group, update, grad, param, update_to_precond, U, V, d):
     return utils.extract_from_flat_update(param, utils.lra_precond(u, v, d, utils.flatten(update)))
 
 
+@SqueezeGrad
 @PrecondGradAccumGuard
 @general_guard("U", "V", "d", init_fn=_init_psgd_lra, skip_first=False)
 @no_state
@@ -740,6 +834,7 @@ def update_by_psgd_lra(group, update, grad, param, update_to_precond, U, V, d):
     raise SkipUpdate from None
 
 
+@SqueezeGrad
 @PrecondGradAccumGuard
 @general_guard("U", "V", "d", init_fn=_init_psgd_lra, skip_first=False)
 @no_state
@@ -748,6 +843,7 @@ def scale_by_delayed_psgd_lra(group, update, grad, param, update_to_precond, U,
     return utils.extract_from_flat_update(param, utils.lra_precond(u, v, d, utils.flatten(update)))
 
 
+@SqueezeGrad
 @PrecondGradAccumGuard
 @general_guard("U", "V", "d", init_fn=_init_psgd_lra, skip_first=False)
 @no_state
@@ -757,8 +853,9 @@ def update_by_delayed_psgd_lra(group, update, grad, param, update_to_precond, U,
     raise SkipUpdate from None
 
 
+@SqueezeGrad
 @PrecondGradAccumGuard
-@general_guard("Q", "Q_cache", "velocity", "running_lower_bound", init_fn=_init_psgd_kron, skip_first=False)
+@general_guard("Q", "Q_cache", "velocity", "running_lower_bound", "step", init_fn=_init_psgd_kron, skip_first=False)
 @no_state_no_foreach
 def scale_by_psgd(
     group,
@@ -770,15 +867,17 @@ def scale_by_psgd(
     Q_cache,
     velocity: Optional[List[Tensor]],
     running_lower_bound: List[Tensor],
+    step: Tensor,
     cached: bool = False,
     prob: Optional[callable] = None,
 ):
-    _update_psgd_precond(cached, Q_cache, group, param, update_to_precond, Q, velocity, running_lower_bound, prob)
+    _update_psgd_precond(cached, Q_cache, group, param, update_to_precond, Q, velocity, running_lower_bound, step, prob)
     return _cached_psgd_precond_grad(group, update, Q, Q_cache, grad)
 
 
+@SqueezeGrad
 @PrecondGradAccumGuard
-@general_guard("Q", "Q_cache", "velocity", "running_lower_bound", init_fn=_init_psgd_kron, skip_first=False)
+@general_guard("Q", "Q_cache", "velocity", "running_lower_bound", "step", init_fn=_init_psgd_kron, skip_first=False)
 @no_state_no_foreach
 def scale_by_delayed_psgd(
     group,
@@ -790,6 +889,7 @@ def scale_by_delayed_psgd(
     Q_cache,
     velocity: Optional[List[Tensor]],
     running_lower_bound: List[Tensor],
+    step: Tensor,
     cached: bool = False,
     prob: Optional[callable] = None,
 ):
@@ -797,12 +897,15 @@
         precond = None
     else:
         precond = _cached_psgd_precond_grad(group, update, Q, Q_cache, grad)
-    new = _update_psgd_precond(cached, Q_cache, group, param, update_to_precond, Q, velocity, running_lower_bound, prob)
+    new = _update_psgd_precond(
+        cached, Q_cache, group, param, update_to_precond, Q, velocity, running_lower_bound, step, prob
+    )
    return new if precond is None else precond
 
 
+@SqueezeGrad
 @PrecondGradAccumGuard
-@general_guard("Q", "Q_cache", "velocity", "running_lower_bound", init_fn=_init_psgd_kron, skip_first=False)
+@general_guard("Q", "Q_cache", "velocity", "running_lower_bound", "step", init_fn=_init_psgd_kron, skip_first=False)
 @no_state_no_foreach
 def update_by_psgd(
     group,
@@ -814,10 +917,11 @@ def update_by_psgd(
     Q_cache,
     velocity: Optional[List[Tensor]],
     running_lower_bound: List[Tensor],
+    step: Tensor,
     cached: bool = False,
     prob: Optional[callable] = None,
 ):
-    _update_psgd_precond(cached, Q_cache, group, param, update_to_precond, Q, velocity, running_lower_bound, prob)
+    _update_psgd_precond(cached, Q_cache, group, param, update_to_precond, Q, velocity, running_lower_bound, step, prob)
     _fused_cached_psgd_precond_grad(group, update, param, update, Q, Q_cache)
     raise SkipUpdate from None
 
@@ -833,8 +937,9 @@ def global_clip(group, update, grad, param, clip_fn: Optional[callable] = None):
     return clip_fn(update)
 
 
+@SqueezeGrad
 @PrecondGradAccumGuard
-@general_guard("Q", "Q_cache", "velocity", "running_lower_bound", init_fn=_init_psgd_kron, skip_first=False)
+@general_guard("Q", "Q_cache", "velocity", "running_lower_bound", "step", init_fn=_init_psgd_kron, skip_first=False)
 @no_state_no_foreach
 def update_by_delayed_psgd(
     group,
@@ -846,11 +951,12 @@ def update_by_delayed_psgd(
     Q_cache,
     velocity: Optional[List[Tensor]],
     running_lower_bound: List[Tensor],
+    step: Tensor,
     cached: bool = False,
     prob: Optional[callable] = None,
 ):
     _fused_cached_psgd_precond_grad(group, update, param, update, Q, Q_cache)
-    _update_psgd_precond(cached, Q_cache, group, param, update_to_precond, Q, velocity, running_lower_bound, prob)
+    _update_psgd_precond(cached, Q_cache, group, param, update_to_precond, Q, velocity, running_lower_bound, step, prob)
     raise SkipUpdate from None
 
 
@@ -888,55 +994,50 @@ def chain(state: Union[callable, dict], group, grad, param, *fns):
     utils.update_param_(param, update, group["lr"], group["weight_decay"], caution=group["caution"], grad=grad)
 
 
-def create_branch(branches: List[List[callable]], merge_fn: callable):
-    def _branch(state, group, update, grad, param):
-        outputs = []
-        for branch in branches:
-            branch_update = [torch.clone(u, memory_format=torch.preserve_format) for u in update]
-            branch_update, skip_update = _inner_chain(state, group, branch_update, grad, param, *branch)
-            if skip_update:
-                raise ValueError("Branches should not skip updates")
-            outputs.append(branch_update)
-        return merge_fn(outputs)
+def set_indices(fns: Iterable[callable], retain: bool = True, offset: int = 0):
+    if retain and offset:
+        raise ValueError("offset cannot be retained")
+
+    def _walk(obj):
+        stack = [obj]
+        while stack:
+            cur = stack.pop()
+            if isinstance(cur, FunctionTransform):
+                yield cur
+                stack.append(cur.fn)
+            elif isinstance(cur, functools.partial):
+                stack.append(cur.func)
+            elif isinstance(cur, Branch):
+                for branch in cur.branches:
+                    stack.extend(branch)
+            elif isinstance(cur, _Iterable) and not isinstance(cur, (str, bytes, bytearray)):
+                stack.extend(cur)
 
-    return _branch
+    if retain:
+        offset = max((ft.transform_idx for ft in _walk(fns) if ft.transform_idx is not None), default=-1) + 1
 
+    new_fns = [copy.deepcopy(fn) for fn in fns]
+    for ft in _walk(new_fns):
+        if not retain or ft.transform_idx is None:
+            ft.transform_idx, offset = offset, offset + 1
 
-def set_indices(fns: Iterable[callable], retain: bool = True, offset: int = 0):
-    if retain:
-        if offset:
-            raise ValueError("offset cannot be retained")
-
-        offset = -1
-        for fn in fns:
-            while isinstance(fn, (FunctionTransform, functools.partial)):
-                if isinstance(fn, functools.partial):
-                    fn = fn.func
-                    continue
-                if fn.transform_idx is not None:
-                    offset = max(offset, fn.transform_idx)
-                fn = fn.fn
-        offset += 1  # if we found nothing, this will be 0. if we found something, we START at N+1
-
-    fns = [copy.deepcopy(fn) for fn in fns]
-    for fn in fns:
-        while isinstance(fn, (FunctionTransform, functools.partial)):
-            if isinstance(fn, functools.partial):
-                fn = fn.func
-                continue
-            if not retain or fn.transform_idx is None:
-                fn.transform_idx = offset
-                offset += 1
-            fn = fn.fn
-    return fns
+    return new_fns
 
 
 class ChainOpt(utils.StatefulOptimizer):
     promote: bool = False
+    global_defaults = {
+        "caution": False,
+        "lr": 1,
+        "warmup_steps": 0,
+        "weight_decay": 0,
+        "eps": 1e-8,
+    }
 
     def __init__(self, params, defaults, foreach: bool, *fns):
-        defaults = {k: v for k, v in defaults.items() if v is not use_default}
-        super().__init__(params, defaults, foreach)
+        base = self.global_defaults.copy()
+        base.update({k: v for k, v in defaults.items() if v is not use_default})
+        super().__init__(params, base, foreach)
         self.fns = fns
 
     @property
@@ -1055,7 +1156,7 @@ class BaseOpt(ChainOpt):
 
     update_clipping: str_or_fn = None
         The function to use for clipping the outgoing updates before applying them, after all other transformations.
-        This will turn off
+        This will turn off fused updates.
         This is syntactic sugar, equivalent to manually passing the function as the last element of the optimizer chain.
 
     """
@@ -1069,11 +1170,11 @@ class BaseOpt(ChainOpt):
         self,
         params,
         defaults,
-        foreach: bool,
-        gradient_clipping: str_or_fn,
-        update_clipping: str_or_fn,
+        foreach: bool = True,
+        gradient_clipping: str_or_fn = None,
+        update_clipping: str_or_fn = None,
         palm: bool = use_default,
-        *fns,
+        fns: Iterable[callable] = (),
         compile_step: bool = use_default,
         promote: bool = use_default,
     ):
@@ -1081,6 +1182,7 @@
            raise ValueError("No functions provided. If that's on purpose (SGD-like), use `identity`")
 
         args, kwargs = None, None
+        fns = tuple(fns)
         fn = fns[-1]
         if isinstance(fn, functools.partial):
             fn, args, kwargs = fn.func, fn.args, fn.keywords
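
The Branch class added in this release replaces the removed create_branch factory: each branch receives a clone of the incoming update, is run through its own sub-chain via _inner_chain, and merge_fn receives the list of per-branch results. A rough usage sketch, assuming heavyball.chainable exposes Branch alongside the existing exp_avg and scale_by_laprop transforms; the mean_merge helper and the particular sub-chains are illustrative, not part of the package:

    from heavyball import chainable as C

    def mean_merge(outputs):
        # `outputs` is a list of per-branch update lists; average them element-wise.
        return [sum(tensors) / len(tensors) for tensors in zip(*outputs)]

    # Run two transform chains on clones of the same update and average the results.
    branch = C.Branch(branches=[[C.exp_avg], [C.scale_by_laprop]], merge_fn=mean_merge)

    # Such a Branch could then sit in an optimizer chain, e.g. (hypothetical wiring):
    # opt = C.BaseOpt(model.parameters(), {"lr": 1e-3}, fns=[branch, C.update_by_adam])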
heavyball/helpers.py CHANGED
@@ -420,7 +420,7 @@ class FastINGO:
         population_size: Optional[int] = None,
         learning_rate: Optional[float] = None,
         last_n: int = 4096,
-        loco_step_size: float = 1,
+        loco_step_size: float = 0.1,
         device="cuda",
         batchnorm_decay: float = 0.99,
         score_decay: float = 0.99,
@@ -697,6 +697,10 @@ def init_nsgaii(study, seed, trials, search_space):
     return module.NSGAIIwITSampler(seed=seed)
 
 
+def init_random(study, seed, trials, search_space):
+    return optuna.samplers.RandomSampler(seed=seed)
+
+
 def init_ingo(study, seed, trials, search_space):
     return ImplicitNaturalGradientSampler(search_space=search_space, seed=seed)
 
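
The new init_random helper follows the same factory signature as the other sampler initializers in helpers.py (study, seed, trials, search_space) and simply returns a seeded optuna RandomSampler. A minimal sketch of how such a factory could be used, assuming only that it ignores the arguments it does not need; the objective below is purely illustrative:

    import optuna

    from heavyball.helpers import init_random

    # The factory only uses `seed`, so the other arguments can be placeholders here.
    sampler = init_random(study=None, seed=0, trials=None, search_space=None)
    study = optuna.create_study(sampler=sampler)
    study.optimize(lambda trial: trial.suggest_float("x", -1.0, 1.0) ** 2, n_trials=10)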