heavyball 2.0.0.tar.gz → 2.1.0.tar.gz
This diff compares publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.
- {heavyball-2.0.0 → heavyball-2.1.0}/PKG-INFO +6 -7
- {heavyball-2.0.0 → heavyball-2.1.0}/README.md +4 -4
- {heavyball-2.0.0 → heavyball-2.1.0}/heavyball/__init__.py +81 -4
- {heavyball-2.0.0 → heavyball-2.1.0}/heavyball/chainable.py +37 -0
- {heavyball-2.0.0 → heavyball-2.1.0}/heavyball/utils.py +143 -16
- {heavyball-2.0.0 → heavyball-2.1.0}/heavyball.egg-info/PKG-INFO +6 -7
- {heavyball-2.0.0 → heavyball-2.1.0}/heavyball.egg-info/requires.txt +1 -2
- {heavyball-2.0.0 → heavyball-2.1.0}/pyproject.toml +2 -2
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_bf16_params.py +1 -1
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_bf16_q.py +1 -1
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_bf16_storage.py +1 -1
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_caution.py +1 -1
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_channels_last.py +1 -1
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_clip.py +1 -1
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_closure.py +1 -1
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_ema.py +1 -1
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_foreach.py +1 -1
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_hook.py +1 -1
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_mars.py +1 -1
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_memory.py +1 -1
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_memory_leak.py +1 -1
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_merge.py +1 -1
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_nd_param.py +1 -1
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_no_grad.py +1 -1
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_save_restore.py +1 -1
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_stochastic_updates.py +1 -1
- {heavyball-2.0.0 → heavyball-2.1.0}/LICENSE +0 -0
- {heavyball-2.0.0 → heavyball-2.1.0}/heavyball/helpers.py +0 -0
- {heavyball-2.0.0 → heavyball-2.1.0}/heavyball.egg-info/SOURCES.txt +0 -0
- {heavyball-2.0.0 → heavyball-2.1.0}/heavyball.egg-info/dependency_links.txt +0 -0
- {heavyball-2.0.0 → heavyball-2.1.0}/heavyball.egg-info/top_level.txt +0 -0
- {heavyball-2.0.0 → heavyball-2.1.0}/setup.cfg +0 -0
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_migrate_cli.py +0 -0
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_psgd_precond_init_stability.py +0 -0
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_singular_values.py +0 -0
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_soap.py +0 -0
- {heavyball-2.0.0 → heavyball-2.1.0}/test/test_toy_training.py +0 -0
{heavyball-2.0.0 → heavyball-2.1.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: heavyball
-Version: 2.0.0
+Version: 2.1.0
 Summary: Efficient Optimizers
 Author-email: HeavyBall Authors <github.heavyball@nestler.sh>
 Project-URL: source, https://github.com/HomebrewML/HeavyBall
@@ -24,13 +24,12 @@ Requires-Dist: pytest; extra == "dev"
 Requires-Dist: ruff; extra == "dev"
 Requires-Dist: matplotlib; extra == "dev"
 Requires-Dist: seaborn; extra == "dev"
-Requires-Dist: hyperopt; extra == "dev"
 Requires-Dist: pandas; extra == "dev"
 Requires-Dist: typer; extra == "dev"
 Requires-Dist: optuna; extra == "dev"
 Requires-Dist: optunahub; extra == "dev"
-Requires-Dist: botorch; extra == "dev"
 Requires-Dist: hebo; extra == "dev"
+Requires-Dist: lightbench; extra == "dev"
 Dynamic: license-file

 # heavyball
@@ -54,7 +53,7 @@ _High-performance, extensible, chainable optimizers for PyTorch._
 - Schedule-Free optimizers with dynamic learning rate adaptation.
 - Advanced update rules: MARS correction, cautious updates, PaLM beta2 scheduling.
 - Chainable transforms for custom optimization recipes.
-- Comprehensive benchmark suite (`
+- Comprehensive benchmark suite packaged separately as LightBench (`../LightBench`).
 - Detailed documentation and example-driven tutorials.

 ## Quickstart
@@ -85,14 +84,14 @@ for data, target in dataloader:

 ## Benchmarks

-> Reproduce benchmarks with:
+> Reproduce benchmarks with LightBench (install it via `pip install -e ../LightBench` from the repo root):
 > ```bash
-> python3 -m
+> python3 -m lightbench.run_all_benchmarks --opt ForeachSOAP --opt LaProp --opt AdamW --opt Muon --opt ForeachCachedNewtonPSGD --opt RMSprop --opt OrthoLaProp --opt ForeachSFAdamW --opt ForeachADOPT --opt LaPropOrtho --opt CachedPSGDKron --opt SignLaProp --opt ForeachSOLP --opt PSGDLRA --opt NewtonPSGDLRA --opt NewtonHybrid2PSGDKron --opt NewtonHybrid2PSGDLRA --opt mars-NewtonHybrid2PSGDLRA --opt MSAMLaProp --opt mars-adaptive-NewtonHybrid2PSGDKron --opt mars-ortho-NewtonHybrid2PSGDKron --opt MuonLaProp --opt mars-unscaled-NewtonHybrid2PSGDKron --opt mars-NewtonHybrid2PSGDKron --opt cautious-AdamW --opt unscaled_cautious-AdamW --opt mars-AdamW --dtype float32 --steps 1000000 --trials 1000 --parallelism 256 --seeds 1 --difficulties trivial --difficulties easy --difficulties medium --difficulties hard --difficulties extreme --difficulties nightmare --timeout 2880
 > ```

 ## Migrating from HeavyBall 1.x

-- Read the detailed [2.0.0 migration notes](docs/
+- Read the detailed [2.0.0 migration notes](docs/heavyball2.md) for an end-to-end checklist, including guidance for restoring legacy behaviour when needed.
 - Use `scripts/migrate_optimizer_state.py` to rewrite pre-2.0 optimizer checkpoints:
 ```bash
 python scripts/migrate_optimizer_state.py path/to/checkpoint.pt heavyball.ForeachAdamW --state-key optimizer
{heavyball-2.0.0 → heavyball-2.1.0}/README.md

@@ -19,7 +19,7 @@ _High-performance, extensible, chainable optimizers for PyTorch._
 - Schedule-Free optimizers with dynamic learning rate adaptation.
 - Advanced update rules: MARS correction, cautious updates, PaLM beta2 scheduling.
 - Chainable transforms for custom optimization recipes.
-- Comprehensive benchmark suite (`
+- Comprehensive benchmark suite packaged separately as LightBench (`../LightBench`).
 - Detailed documentation and example-driven tutorials.

 ## Quickstart
@@ -50,14 +50,14 @@ for data, target in dataloader:

 ## Benchmarks

-> Reproduce benchmarks with:
+> Reproduce benchmarks with LightBench (install it via `pip install -e ../LightBench` from the repo root):
 > ```bash
-> python3 -m
+> python3 -m lightbench.run_all_benchmarks --opt ForeachSOAP --opt LaProp --opt AdamW --opt Muon --opt ForeachCachedNewtonPSGD --opt RMSprop --opt OrthoLaProp --opt ForeachSFAdamW --opt ForeachADOPT --opt LaPropOrtho --opt CachedPSGDKron --opt SignLaProp --opt ForeachSOLP --opt PSGDLRA --opt NewtonPSGDLRA --opt NewtonHybrid2PSGDKron --opt NewtonHybrid2PSGDLRA --opt mars-NewtonHybrid2PSGDLRA --opt MSAMLaProp --opt mars-adaptive-NewtonHybrid2PSGDKron --opt mars-ortho-NewtonHybrid2PSGDKron --opt MuonLaProp --opt mars-unscaled-NewtonHybrid2PSGDKron --opt mars-NewtonHybrid2PSGDKron --opt cautious-AdamW --opt unscaled_cautious-AdamW --opt mars-AdamW --dtype float32 --steps 1000000 --trials 1000 --parallelism 256 --seeds 1 --difficulties trivial --difficulties easy --difficulties medium --difficulties hard --difficulties extreme --difficulties nightmare --timeout 2880
 > ```

 ## Migrating from HeavyBall 1.x

-- Read the detailed [2.0.0 migration notes](docs/
+- Read the detailed [2.0.0 migration notes](docs/heavyball2.md) for an end-to-end checklist, including guidance for restoring legacy behaviour when needed.
 - Use `scripts/migrate_optimizer_state.py` to rewrite pre-2.0 optimizer checkpoints:
 ```bash
 python scripts/migrate_optimizer_state.py path/to/checkpoint.pt heavyball.ForeachAdamW --state-key optimizer
{heavyball-2.0.0 → heavyball-2.1.0}/heavyball/__init__.py

@@ -100,6 +100,71 @@ class ForeachAdamW(C.BaseOpt):
         super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, fns=(C.update_by_adam,))


+class UnscaledAdamW(C.BaseOpt):
+    def __init__(
+        self,
+        params,
+        lr=0.0025,
+        betas=(0.9, 0.99),
+        eps=1e-8,
+        weight_decay=0,
+        warmup_steps=0,
+        foreach: bool = True,
+        storage_dtype: str = "float32",
+        mars: bool = False,
+        caution: bool = False,
+        mars_gamma: float = 0.0025,
+        gradient_clipping: C.str_or_fn = C.use_default,
+        update_clipping: C.str_or_fn = C.use_default,
+        palm: bool = C.use_default,
+        beta2_scale: float = 0.8,
+        **kwargs,
+    ):
+        defaults = locals()
+        defaults.pop("self")
+        params = defaults.pop("params")
+        defaults.update(defaults.pop("kwargs"))
+
+        if kwargs:
+            utils.warn_once(f"Working with uncaptured keyword arguments: {kwargs}")
+
+        super().__init__(
+            params, defaults, foreach, gradient_clipping, update_clipping, palm, fns=(C.scale_by_unscaled_adam,)
+        )
+
+
+class SUDSAdamW(C.BaseOpt):
+    def __init__(
+        self,
+        params,
+        lr=0.0025,
+        betas=(0.9, 0.99),
+        eps=1e-8,
+        weight_decay=0,
+        warmup_steps=0,
+        foreach: bool = True,
+        storage_dtype: str = "float32",
+        mars: bool = False,
+        caution: bool = False,
+        mars_gamma: float = 0.0025,
+        gradient_clipping: C.str_or_fn = C.use_default,
+        update_clipping: C.str_or_fn = C.use_default,
+        palm: bool = C.use_default,
+        beta2_scale: float = 0.8,
+        precond_lr: float = 1e-2,
+        **kwargs,
+    ):
+        defaults = locals()
+        defaults.pop("self")
+        params = defaults.pop("params")
+        defaults.update(defaults.pop("kwargs"))
+
+        if kwargs:
+            utils.warn_once(f"Working with uncaptured keyword arguments: {kwargs}")
+
+        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, fns=(C.scale_by_suds,))
+
+
 class ForeachAdamC(C.BaseOpt):
     def __init__(
         self,
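The two classes above are new in 2.1.0 and route their updates through the new `C.scale_by_unscaled_adam` and `C.scale_by_suds` transforms added in `chainable.py` further down. A minimal usage sketch, not taken from the diff: the class names and constructor arguments come from the hunk above, while the model, dimensions, and hyperparameter values are illustrative, and the `step`/`zero_grad` calls assume the usual torch.optim interface that heavyball optimizers expose.

```python
import torch
from torch import nn

import heavyball

model = nn.Linear(16, 4)

# UnscaledAdamW: AdamW-style wrapper whose update goes through scale_by_unscaled_adam.
opt = heavyball.UnscaledAdamW(model.parameters(), lr=1e-3, betas=(0.9, 0.99), weight_decay=1e-2)

# SUDSAdamW additionally tracks a rank-1 Fisher approximation; precond_lr is its Oja step size.
# opt = heavyball.SUDSAdamW(model.parameters(), lr=1e-3, precond_lr=1e-2)

loss = model(torch.randn(8, 16)).square().mean()
loss.backward()
opt.step()
opt.zero_grad()
```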
@@ -320,6 +385,7 @@ class ForeachMuon(C.BaseOpt):
         palm: bool = C.use_default,
         beta2_scale: float = 0.8,
         nesterov: bool = True,
+        heavyball_momentum: bool = False,
         **kwargs,
     ):
         defaults = locals()
@@ -330,6 +396,16 @@ class ForeachMuon(C.BaseOpt):
         if kwargs:
             utils.warn_once(f"Working with uncaptured keyword arguments: {kwargs}")

+        if heavyball_momentum:
+            if nesterov:
+                ema = C.nesterov_momentum
+            else:
+                ema = C.heavyball_momentum
+        elif nesterov:
+            ema = C.nesterov_ema
+        else:
+            ema = C.exp_avg
+
         super().__init__(
             params,
             defaults,
@@ -337,7 +413,7 @@ class ForeachMuon(C.BaseOpt):
             gradient_clipping,
             update_clipping,
             palm,
-            fns=(
+            fns=(ema, C.orthogonalize_update),
         )

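For reference, the branch added above maps the two flags onto the chainable momentum transforms listed in the comments below; the mapping is read directly from the hunk, while the model and learning rate in the call are illustrative assumptions.

```python
import torch
from torch import nn

import heavyball

# heavyball_momentum=True,  nesterov=True  -> C.nesterov_momentum
# heavyball_momentum=True,  nesterov=False -> C.heavyball_momentum
# heavyball_momentum=False, nesterov=True  -> C.nesterov_ema
# heavyball_momentum=False, nesterov=False -> C.exp_avg
model = nn.Linear(32, 32)
opt = heavyball.ForeachMuon(model.parameters(), lr=1e-3, nesterov=True, heavyball_momentum=True)
```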
@@ -743,7 +819,9 @@ class ForeachPSGDKron(C.BaseOpt):
         update_clipping = C.default(update_clipping, utils.trust_region_clip_)
         inverse_free = C.default(inverse_free, self.quad)
         if inverse_free:
-            raise ValueError(
+            raise ValueError(
+                "inverse_free (i.e., PSGD-QUAD) is not supported at the moment. Consider using https://github.com/evanatyourservice/quad_torch"
+            )

         defaults = locals()
         defaults.pop("self")
@@ -796,7 +874,6 @@ class NewtonHybrid2PSGDKron(ForeachCachedNewtonPSGD):
     hvp_interval = 2


-
 class ForeachPSGDLRA(C.BaseOpt):
     """
     Originally from Evan Walters and Omead Pooladzandi, 2024
@@ -953,5 +1030,5 @@ __all__ = [
     "MSAMLaProp",
     "NewtonPSGDKron",
     "ForeachAdamC",
-    "SGD"
+    "SGD",
 ]
{heavyball-2.0.0 → heavyball-2.1.0}/heavyball/chainable.py

@@ -480,6 +480,43 @@ def update_by_adopt(group, update, grad, param, exp_avg, exp_avg_sq):
         raise SkipUpdate from None


+@zero_guard("exp_avg", "exp_avg_sq", "fisher_approx")
+@no_state_no_foreach
+def scale_by_suds(group, update, grad, param, exp_avg, exp_avg_sq, fisher_approx):
+    if group["step"] == 1:
+        utils.copy_stochastic_(fisher_approx, update / update.norm().clamp(min=1e-8))
+        raise SkipUpdate from None
+
+    precond_update, w = utils.eigvecs_product_rank1(update.flatten(), fisher_approx.flatten().to(update.dtype))
+    precond_update = utils.adam_(
+        exp_avg,
+        exp_avg_sq,
+        precond_update.view_as(exp_avg),
+        utils.get_beta1(group),
+        utils.get_beta2(group),
+        group["step"] - 1,
+    )[0]
+    precond_update, _ = utils.eigvecs_product_rank1(precond_update.flatten(), fisher_approx.flatten(), w)
+
+    new_approx = utils.oja_update(fisher_approx.flatten().to(update.dtype), update.flatten(), group["precond_lr"])
+    utils.copy_stochastic_(fisher_approx, new_approx)
+    return precond_update
+
+
+@zero_guard("exp_avg", "exp_avg_sq")
+@no_state
+def scale_by_unscaled_adam(group, update, grad, param, exp_avg, exp_avg_sq):
+    update = utils.unscaled_adam_(
+        exp_avg,
+        exp_avg_sq,
+        update,
+        utils.get_beta1(group),
+        utils.get_beta2(group),
+        group["step"],
+    )
+    return update
+
+
 @zero_guard("exp_avg", "exp_avg_sq")
 @no_state
 def scale_by_adopt(group, update, grad, param, exp_avg, exp_avg_sq):
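The `scale_by_suds` transform above (1) rotates the flattened update into the eigenbasis implied by a rank-1 Fisher approximation, (2) applies a debiased Adam step in that basis, (3) rotates back with the same Householder reflector, and (4) refreshes the tracked direction with one Oja step. A plain-tensor restatement of that flow, assuming the `utils` helpers added later in this diff; the dimension, betas, and step counter are illustrative, and this is not the library's exact call path (the real transform also uses stochastic rounding and the group's hyperparameters).

```python
import torch

from heavyball import utils

d = 8
update = torch.randn(d)
exp_avg = torch.zeros(d)
exp_avg_sq = torch.zeros(d)

# What the first step stores as the Fisher direction.
fisher_approx = update / update.norm().clamp(min=1e-8)

# Later steps: rotate, Adam in the rotated basis, rotate back, Oja refresh.
rotated, w = utils.eigvecs_product_rank1(update, fisher_approx)
stepped = utils.adam_(exp_avg, exp_avg_sq, rotated, 0.9, 0.99, 1)[0]
precond_update, _ = utils.eigvecs_product_rank1(stepped, fisher_approx, w)
fisher_approx = utils.oja_update(fisher_approx, update, lr=1e-2)
```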
{heavyball-2.0.0 → heavyball-2.1.0}/heavyball/utils.py

@@ -1,5 +1,6 @@
 import collections
 import contextlib
+import enum
 import functools
 import gc
 import inspect
@@ -19,11 +20,26 @@ from torch.backends import cudnn, opt_einsum
 from torch.nn import functional as F
 from torch.utils._pytree import tree_map

+
+class ZerothPowerMode(enum.Enum):
+    newtonschulz = "newtonschulz"
+    legacy_newtonschulz = "legacy_newtonschulz"
+    qr = "qr"
+    svd = "svd"
+    legacy_svd = "legacy_svd"
+
+
+class OrthoScaleMode(enum.Enum):
+    none = "none"
+    scale = "scale"
+    graft = "graft"
+
+
 compile_mode = "max-autotune-no-cudagraphs"
 dynamic = False
 compile_mode_recommended_to_none = None
 zeroth_power_mode = "newtonschulz"
-precise_zeroth_power_mode = "qr"
+precise_zeroth_power_mode = "qr"
 tiny_bf16 = torch.finfo(torch.bfloat16).tiny
 _cudnn_double_backward_pattern = re.compile(
     r"the derivative for .* is not implemented\. Double backwards .* To run double backwards"
@@ -373,6 +389,7 @@ def zeropower_via_newtonschulz5(G, steps=5, eps=1e-7):
         G.ndim >= 2
     )  # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng
     assert steps == 5
+    G = G.clone()
     X = G if G.dtype == torch.float64 else stochastic_round_(G)
     if G.size(-2) > G.size(-1):
         X = X.mT
@@ -387,9 +404,7 @@ def zeropower_via_newtonschulz5(G, steps=5, eps=1e-7):
         (2.8366, -3.0525, 1.2012),
     ]:
         A = X @ X.mT
-        B =
-            b * A + c * A @ A
-        )  # quintic computation strategy adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
+        B = b * A + c * A @ A  # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
         X = a * X + B @ X

     if G.size(-2) > G.size(-1):
@@ -397,6 +412,24 @@ def zeropower_via_newtonschulz5(G, steps=5, eps=1e-7):
     return X.to(G.dtype)


+@decorator_knowngood
+def legacy_zeropower_via_newtonschulz5(G, steps=5, eps=1e-7):
+    assert len(G.shape) == 2
+    a, b, c = (3.4445, -4.7750, 2.0315)
+    G = G.clone()
+    X = G if G.dtype == torch.float64 else stochastic_round_(G)
+    stochastic_multiply_(X, G.norm(dim=(-2, -1)) + eps)  # ensure top singular value <= 1
+    if G.size(0) > G.size(1):
+        X = X.T
+    for _ in range(steps):
+        A = X @ X.T
+        B = b * A + c * A @ A  # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
+        X = a * X + B @ X
+    if G.size(0) > G.size(1):
+        X = X.T
+    return X.to(G.dtype)
+
+
 @decorator_knowngood
 def _compilable_heavyball_momentum_(state, grad, beta):
     s32, g32 = [list(map(promote, x)) for x in (state, grad)]
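A quick, illustrative check of the Newton-Schulz orthogonalization above: for a tall matrix the result should be approximately semi-orthogonal (X^T X close to the identity), with a noticeable but bounded error because the quintic coefficients only push singular values into a band around 1. The import path matches this diff; the matrix shape and the printed diagnostic are assumptions.

```python
import torch

from heavyball.utils import zeropower_via_newtonschulz5

G = torch.randn(64, 32)
X = zeropower_via_newtonschulz5(G, steps=5)  # steps must be 5; the function asserts it

# X keeps G's shape; X^T X is only approximately the identity.
err = (X.mT @ X - torch.eye(32)).norm()
print(f"orthogonality error (Frobenius): {float(err):.3f}")
```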
@@ -449,21 +482,30 @@ def _compilable_grafting(magnitude, direction):


 @decorator_knowngood
-def _compilable_orthogonal_(x: Tensor, mode: str, out: Tensor | None, scale_mode: str):
-    if mode
+def _compilable_orthogonal_(x: Tensor, mode: str | ZerothPowerMode, out: Tensor | None, scale_mode: str):
+    if not isinstance(mode, ZerothPowerMode):
+        mode = ZerothPowerMode(mode)
+    if not isinstance(scale_mode, ZerothPowerMode):
+        scale_mode = OrthoScaleMode(scale_mode)
+    if mode == ZerothPowerMode.newtonschulz or x.shape[0] != x.shape[1]:
         y = zeropower_via_newtonschulz5(x, 5)
-    elif mode ==
+    elif mode == ZerothPowerMode.legacy_newtonschulz:
+        y = legacy_zeropower_via_newtonschulz5(x, 5)
+    elif mode == ZerothPowerMode.qr:
         y = torch.linalg.qr(promote(x)).Q
-    elif mode ==
-        u, _s,
-        y = u @
+    elif mode == ZerothPowerMode.svd:
+        u, _s, vt = torch.linalg.svd(promote(x))
+        y = u @ vt
+    elif mode == ZerothPowerMode.legacy_svd:
+        u, _s, vt = torch.linalg.svd(promote(x))
+        y = u @ vt.T
     else:
         raise NotImplementedError(f"Unknown zeroth_power_mode: {mode}")
-    if scale_mode ==
+    if scale_mode == OrthoScaleMode.none:
         pass
-    elif scale_mode ==
-        y *= max(1, x.size(
-    elif scale_mode ==
+    elif scale_mode == OrthoScaleMode.scale:
+        y *= max(1, x.size(-2) / x.size(-1)) ** 0.5
+    elif scale_mode == OrthoScaleMode.graft:
         y = _compilable_grafting(x, y)
     else:
         raise NotImplementedError(f"Unknown scale_mode: {scale_mode}")
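With the enums above, the mode arguments accept either the 2.0.x string spellings or an enum member, since `_compilable_orthogonal_` now coerces strings via `ZerothPowerMode(mode)`. A small illustrative sketch; it assumes the module-level `zeroth_power_mode` switch is still what the orthogonalization path reads, as in 2.0.x.

```python
from heavyball import utils

# String spellings keep working ...
utils.zeroth_power_mode = "legacy_newtonschulz"

# ... and the enum members document the valid choices.
utils.zeroth_power_mode = utils.ZerothPowerMode.qr.value
# ZerothPowerMode: newtonschulz, legacy_newtonschulz, qr, svd, legacy_svd
# OrthoScaleMode:  none, scale, graft
```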
@@ -1223,17 +1265,53 @@ def _compilable_adam_(


 def adam_(
+    exp_avg: List[Tensor] | Tensor,
+    exp_avg_sq: List[Tensor] | Tensor,
+    grad: List[Tensor] | Tensor,
+    beta1: float,
+    beta2: float,
+    step: int,
+    eps: float = 1e-8,
+) -> List[Tensor]:
+    exp_avg, exp_avg_sq, grad = map(list_guard, (exp_avg, exp_avg_sq, grad))
+    beta1, beta2, step, eps = scalar_guard(beta1, beta2, step, eps, exp_avg[0])
+    _compilable_adam_(exp_avg, exp_avg_sq, grad, beta1, beta2, step, eps)
+    return grad
+
+
+@decorator_knowngood
+def _compilable_unscaled_adam_(
     exp_avg: List[Tensor],
     exp_avg_sq: List[Tensor],
     grad: List[Tensor],
+    beta1: Tensor,
+    beta2: Tensor,
+    step: Tensor,
+    eps: Tensor,
+):
+    beta1 = beta_debias(beta1, step)
+    beta2 = beta_debias(beta2, step)
+
+    g32 = list(map(promote, grad))
+    denom = _compilable_exp_avg_sq_(exp_avg_sq, g32, beta2, eps, [None])
+    g32 = torch._foreach_div(g32, denom)
+    exp_avg32 = _lerp(exp_avg, g32, beta1)
+    u32 = torch._foreach_mul(exp_avg32, denom)
+    copy_stochastic_list_(grad, u32)
+
+
+def unscaled_adam_(
+    exp_avg: List[Tensor] | Tensor,
+    exp_avg_sq: List[Tensor] | Tensor,
+    grad: List[Tensor] | Tensor,
     beta1: float,
     beta2: float,
     step: int,
     eps: float = 1e-8,
-):
+) -> List[Tensor]:
     exp_avg, exp_avg_sq, grad = map(list_guard, (exp_avg, exp_avg_sq, grad))
     beta1, beta2, step, eps = scalar_guard(beta1, beta2, step, eps, exp_avg[0])
-
+    _compilable_unscaled_adam_(exp_avg, exp_avg_sq, grad, beta1, beta2, step, eps)
     return grad

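Compared with `_compilable_adam_`, the unscaled variant above normalizes the gradient by the debiased second-moment RMS, averages those normalized gradients, and then multiplies the average back by the same RMS instead of dividing by it. A single-tensor restatement for clarity; this is an assumption-level sketch (float32, no stochastic rounding, `beta_debias` taken to be the usual 1 - (1-beta)/(1-beta^step) form), not the compiled foreach implementation.

```python
import torch


def beta_debias(beta: float, step: int) -> float:
    # Assumed form of heavyball's beta_debias helper.
    return 1 - (1 - beta) / (1 - beta**step)


def unscaled_adam_step(exp_avg, exp_avg_sq, grad, beta1, beta2, step, eps=1e-8):
    b1, b2 = beta_debias(beta1, step), beta_debias(beta2, step)
    exp_avg_sq.lerp_(grad * grad, 1 - b2)     # second-moment EMA
    denom = exp_avg_sq.sqrt().clamp(min=eps)  # debiased RMS
    exp_avg.lerp_(grad / denom, 1 - b1)       # EMA of *normalized* gradients
    return exp_avg * denom                    # re-scale instead of dividing
```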
@@ -2285,6 +2363,55 @@ def cond(cond, true_fn, false_fn):
     return false_fn()


+@decorator_knowngood
+def _householder_vec_e1_to_v(v: Tensor, eps: float = 1e-12) -> Tensor:
+    """
+    Return w such that H = I - 2 w w^T is orthogonal and H e1 = v (v unit).
+    Applying from the right: G @ H = G - 2 (G @ w) w^T.
+    If v is (numerically) e1, returns w=0 and H=I.
+    """
+    v = v / v.norm().clamp(min=eps)
+    e1 = torch.zeros_like(v)
+    e1[0] = 1.0
+    w = e1 - v
+    return w / w.norm().clamp(min=eps)
+
+
+@decorator_knowngood
+def eigvecs_product_rank1(
+    G: Tensor, v: Tensor, w: Optional[Tensor] = None, eps: float = 1e-12
+) -> Tuple[Tensor, Tensor]:
+    """
+    Compute Y = G @ V where V is an eigenvector matrix for P = λ I + σ v v^T,
+    using the Householder reflector with first column v. Never materializes V.
+
+    Args:
+        G: shape (..., d) — gradient row(s) you want to rotate into eigenbasis.
+        v: shape (d,) — current unit direction (top eigenvector of P).
+        w: optional Householder vector w; pass to reuse across calls.
+
+    Returns:
+        (Y, w) where:
+            Y has shape (..., d) and equals G @ eigenvectors(P),
+            w is the Householder vector you can cache & reuse.
+    """
+    if w is None:
+        w = _householder_vec_e1_to_v(v, eps)
+    Y = G - 2.0 * compiled_einsum("...i,i,j->...j", G, w, w)
+    return Y, w
+
+
+@decorator_knowngood
+def oja_update(v: Tensor, g: Tensor, lr: float = 1e-2, eps: float = 1e-12) -> Tensor:
+    """
+    One Oja step to track the top eigendirection of the gradient covariance.
+    v <- v + lr * ((g^T v) g - (g^T v)^2 v); then renormalize.
+    """
+    gv = g @ v
+    v = v + lr * (gv * g - (gv * gv) * v)
+    return v / v.norm().clamp(min=eps)
+
+
 def cond_n(cond_val: Tensor, *fns):
     fns = list(fns)
     fn = fns.pop(0)
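A small numerical check of the helpers above, useful for seeing what `scale_by_suds` relies on: the Householder reflector's first column is `v`, and `eigvecs_product_rank1` applies that reflector without materializing it. The dimension, tolerance, and the use of the private `_householder_vec_e1_to_v` helper are illustrative assumptions.

```python
import torch

from heavyball import utils

d = 6
v = torch.randn(d)
v = v / v.norm()

w = utils._householder_vec_e1_to_v(v)
H = torch.eye(d) - 2.0 * torch.outer(w, w)      # reflector, materialized only for the check
print(torch.allclose(H[:, 0], v, atol=1e-4))    # H @ e1 == v, so column 0 of H is v

g = torch.randn(d)
rotated, _ = utils.eigvecs_product_rank1(g, v, w)
print(torch.allclose(rotated, g @ H, atol=1e-4))  # same product, computed implicitly

# One Oja step nudges the tracked direction toward the dominant gradient direction.
v_next = utils.oja_update(v, g, lr=1e-2)
```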
{heavyball-2.0.0 → heavyball-2.1.0}/heavyball.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: heavyball
-Version: 2.0.0
+Version: 2.1.0
 Summary: Efficient Optimizers
 Author-email: HeavyBall Authors <github.heavyball@nestler.sh>
 Project-URL: source, https://github.com/HomebrewML/HeavyBall
@@ -24,13 +24,12 @@ Requires-Dist: pytest; extra == "dev"
 Requires-Dist: ruff; extra == "dev"
 Requires-Dist: matplotlib; extra == "dev"
 Requires-Dist: seaborn; extra == "dev"
-Requires-Dist: hyperopt; extra == "dev"
 Requires-Dist: pandas; extra == "dev"
 Requires-Dist: typer; extra == "dev"
 Requires-Dist: optuna; extra == "dev"
 Requires-Dist: optunahub; extra == "dev"
-Requires-Dist: botorch; extra == "dev"
 Requires-Dist: hebo; extra == "dev"
+Requires-Dist: lightbench; extra == "dev"
 Dynamic: license-file

 # heavyball
@@ -54,7 +53,7 @@ _High-performance, extensible, chainable optimizers for PyTorch._
 - Schedule-Free optimizers with dynamic learning rate adaptation.
 - Advanced update rules: MARS correction, cautious updates, PaLM beta2 scheduling.
 - Chainable transforms for custom optimization recipes.
-- Comprehensive benchmark suite (`
+- Comprehensive benchmark suite packaged separately as LightBench (`../LightBench`).
 - Detailed documentation and example-driven tutorials.

 ## Quickstart
@@ -85,14 +84,14 @@ for data, target in dataloader:

 ## Benchmarks

-> Reproduce benchmarks with:
+> Reproduce benchmarks with LightBench (install it via `pip install -e ../LightBench` from the repo root):
 > ```bash
-> python3 -m
+> python3 -m lightbench.run_all_benchmarks --opt ForeachSOAP --opt LaProp --opt AdamW --opt Muon --opt ForeachCachedNewtonPSGD --opt RMSprop --opt OrthoLaProp --opt ForeachSFAdamW --opt ForeachADOPT --opt LaPropOrtho --opt CachedPSGDKron --opt SignLaProp --opt ForeachSOLP --opt PSGDLRA --opt NewtonPSGDLRA --opt NewtonHybrid2PSGDKron --opt NewtonHybrid2PSGDLRA --opt mars-NewtonHybrid2PSGDLRA --opt MSAMLaProp --opt mars-adaptive-NewtonHybrid2PSGDKron --opt mars-ortho-NewtonHybrid2PSGDKron --opt MuonLaProp --opt mars-unscaled-NewtonHybrid2PSGDKron --opt mars-NewtonHybrid2PSGDKron --opt cautious-AdamW --opt unscaled_cautious-AdamW --opt mars-AdamW --dtype float32 --steps 1000000 --trials 1000 --parallelism 256 --seeds 1 --difficulties trivial --difficulties easy --difficulties medium --difficulties hard --difficulties extreme --difficulties nightmare --timeout 2880
 > ```

 ## Migrating from HeavyBall 1.x

-- Read the detailed [2.0.0 migration notes](docs/
+- Read the detailed [2.0.0 migration notes](docs/heavyball2.md) for an end-to-end checklist, including guidance for restoring legacy behaviour when needed.
 - Use `scripts/migrate_optimizer_state.py` to rewrite pre-2.0 optimizer checkpoints:
 ```bash
 python scripts/migrate_optimizer_state.py path/to/checkpoint.pt heavyball.ForeachAdamW --state-key optimizer

{heavyball-2.0.0 → heavyball-2.1.0}/pyproject.toml

@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "heavyball"
 description = "Efficient Optimizers"
-version = "2.0.0"
+version = "2.1.0"
 authors = [{ name = "HeavyBall Authors", email = "github.heavyball@nestler.sh" }]
 classifiers = ["Intended Audience :: Developers",
     "Intended Audience :: Science/Research",
@@ -28,7 +28,7 @@ readme = "README.md"
 requires-python = ">=3.9"

 [project.optional-dependencies]
-dev = ["pre-commit", "pytest", "ruff", "matplotlib", "seaborn", "
+dev = ["pre-commit", "pytest", "ruff", "matplotlib", "seaborn", "pandas", "typer", "optuna", "optunahub", "hebo", "lightbench"]

 [project.urls]
 source = "https://github.com/HomebrewML/HeavyBall"
{heavyball-2.0.0 → heavyball-2.1.0}/test/ (one hunk per test module; see the file list above)

@@ -3,12 +3,12 @@ import os

 import pytest
 import torch
+from lightbench.utils import get_optim
 from torch import nn
 from torch._dynamo import config

 import heavyball
 import heavyball.utils
-from benchmark.utils import get_optim
 from heavyball.utils import clean, set_torch

 os.environ["TORCH_LOGS"] = "+recompiles"
@@ -1,11 +1,11 @@
 import pytest
 import torch
+from lightbench.utils import get_optim
 from torch import nn
 from torch._dynamo import config

 import heavyball
 import heavyball.utils
-from benchmark.utils import get_optim
 from heavyball.utils import clean, set_torch

 config.cache_size_limit = 128
@@ -1,11 +1,11 @@
 import pytest
 import torch
+from lightbench.utils import get_optim
 from torch import nn
 from torch._dynamo import config

 import heavyball
 import heavyball.utils
-from benchmark.utils import get_optim
 from heavyball.utils import clean, set_torch

 config.cache_size_limit = 128
@@ -4,12 +4,12 @@ os.environ["TORCH_LOGS"] = "+recompiles"

 import pytest
 import torch
+from lightbench.utils import get_optim
 from torch import nn
 from torch._dynamo import config

 import heavyball
 import heavyball.utils
-from benchmark.utils import get_optim
 from heavyball.utils import clean, set_torch

 config.cache_size_limit = 128
@@ -4,12 +4,12 @@ os.environ["TORCH_LOGS"] = "+recompiles"

 import pytest
 import torch
+from lightbench.utils import get_optim
 from torch import nn
 from torch._dynamo import config

 import heavyball
 import heavyball.utils
-from benchmark.utils import get_optim
 from heavyball.utils import clean, set_torch

 heavyball.utils.zeroth_power_mode = "newtonschulz"
@@ -6,11 +6,11 @@ import math

 import pytest
 import torch
+from lightbench.utils import get_optim
 from torch import linalg, nn
 from torch._dynamo import config

 import heavyball
-from benchmark.utils import get_optim
 from heavyball import utils
 from heavyball.utils import (
     _compilable_global_l2norm_clip_,
@@ -2,11 +2,11 @@ from typing import List

 import pytest
 import torch
+from lightbench.utils import get_optim
 from torch import nn

 import heavyball
 import heavyball.utils
-from benchmark.utils import get_optim
 from heavyball.utils import clean, set_torch


@@ -1,11 +1,11 @@
 import pytest
 import torch
+from lightbench.utils import get_optim
 from torch import nn
 from torch._dynamo import config

 import heavyball
 import heavyball.utils
-from benchmark.utils import get_optim
 from heavyball.utils import clean, set_torch

 config.cache_size_limit = 128
@@ -4,12 +4,12 @@ os.environ["TORCH_LOGS"] = "+recompiles"

 import pytest
 import torch
+from lightbench.utils import get_optim
 from torch import nn
 from torch._dynamo import config

 import heavyball
 import heavyball.utils
-from benchmark.utils import get_optim
 from heavyball.utils import clean, hook_optimizer_into_model, set_torch

 heavyball.utils.compile_mode = "default"
@@ -1,11 +1,11 @@
 import pytest
 import torch
+from lightbench.utils import get_optim
 from torch import nn
 from torch._dynamo import config

 import heavyball
 import heavyball.utils
-from benchmark.utils import get_optim
 from heavyball.utils import clean, set_torch

 config.cache_size_limit = 128
@@ -1,12 +1,12 @@
 import pytest
 import torch
 import tqdm
+from lightbench.utils import get_optim
 from torch import nn
 from torch.nn import functional as F

 import heavyball
 import heavyball.utils
-from benchmark.utils import get_optim
 from heavyball.utils import clean, set_torch


@@ -2,11 +2,11 @@ from typing import List

 import pytest
 import torch
+from lightbench.utils import get_optim
 from torch import nn

 import heavyball
 import heavyball.utils
-from benchmark.utils import get_optim
 from heavyball.utils import clean, set_torch


@@ -1,11 +1,11 @@
 import pytest
 import torch
+from lightbench.utils import get_optim
 from torch import nn
 from torch._dynamo import config

 import heavyball
 import heavyball.utils
-from benchmark.utils import get_optim
 from heavyball.utils import set_torch

 config.cache_size_limit = 2**20
@@ -2,11 +2,11 @@ from typing import List

 import pytest
 import torch
+from lightbench.utils import get_optim
 from torch import nn

 import heavyball
 import heavyball.utils
-from benchmark.utils import get_optim
 from heavyball.utils import clean, set_torch


@@ -5,13 +5,13 @@ os.environ["TORCH_LOGS"] = "+recompiles"

 import pytest
 import torch
+from lightbench.utils import get_optim
 from torch import nn
 from torch._dynamo import config
 from torch.utils._pytree import tree_map

 import heavyball
 import heavyball.utils
-from benchmark.utils import get_optim
 from heavyball.utils import set_torch

 config.cache_size_limit = 128