heavyball 2.0.0.dev0__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
heavyball/utils.py CHANGED
@@ -1,5 +1,6 @@
  import collections
  import contextlib
+ import enum
  import functools
  import gc
  import inspect
@@ -16,13 +17,29 @@ import torch
  from torch import Tensor
  from torch._dynamo.exc import TorchDynamoException
  from torch.backends import cudnn, opt_einsum
+ from torch.nn import functional as F
  from torch.utils._pytree import tree_map

+
+ class ZerothPowerMode(enum.Enum):
+     newtonschulz = "newtonschulz"
+     legacy_newtonschulz = "legacy_newtonschulz"
+     qr = "qr"
+     svd = "svd"
+     legacy_svd = "legacy_svd"
+
+
+ class OrthoScaleMode(enum.Enum):
+     none = "none"
+     scale = "scale"
+     graft = "graft"
+
+
  compile_mode = "max-autotune-no-cudagraphs"
  dynamic = False
  compile_mode_recommended_to_none = None
  zeroth_power_mode = "newtonschulz"
- precise_zeroth_power_mode = "qr"  # or svd
+ precise_zeroth_power_mode = "qr"
  tiny_bf16 = torch.finfo(torch.bfloat16).tiny
  _cudnn_double_backward_pattern = re.compile(
      r"the derivative for .* is not implemented\. Double backwards .* To run double backwards"
@@ -240,14 +257,14 @@ def eps_sqrt(item, eps):

  @decorator_knowngood
  def _compilable_exp_avg_sq_(
-     state: List[Tensor], grad: List[Tensor], beta2: Tensor, eps: Tensor, out: List[Optional[Tensor]]
+     state: List[Tensor], grad: List[Tensor], beta2: Tensor, eps: Tensor, out: None | List[None | Tensor]
  ):
      g32 = promote(grad)
      s32 = _lerp(state, torch._foreach_mul(g32, g32), beta2)

      denom = [eps_sqrt(d, eps) for d in s32]

-     if out[0] is None:
+     if out is None or out[0] is None:
          return denom

      copy_stochastic_list_(out, denom)
@@ -316,8 +333,8 @@ def adaptive_gradient_clipping_(
  def is_compiling():
      try:
          return torch.compiler.is_compiling()
-     except TorchDynamoException:
-         return True
+     except (TorchDynamoException, AttributeError):
+         return False


  def set_(dst: Tensor, src: Tensor):
@@ -366,12 +383,45 @@ def set_torch(benchmark_limit: int = 32, einsum_strategy: str = "auto-hq"):
366
383
  )
367
384
 
368
385
 
369
- @decorator
386
+ @decorator_knowngood
370
387
  def zeropower_via_newtonschulz5(G, steps=5, eps=1e-7):
388
+ assert (
389
+ G.ndim >= 2
390
+ ) # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng
391
+ assert steps == 5
392
+ G = G.clone()
393
+ X = G if G.dtype == torch.float64 else stochastic_round_(G)
394
+ if G.size(-2) > G.size(-1):
395
+ X = X.mT
396
+
397
+ # X = X / (X.norm(dim=(-2, -1), keepdim=True) + eps)
398
+ stochastic_divide_with_eps_(X, G.norm(dim=(-2, -1)), eps) # ensure top singular value <= 1
399
+ # Perform the NS iterations
400
+ for a, b, c in [
401
+ (4.0848, -6.8946, 2.9270),
402
+ (3.9505, -6.3029, 2.6377),
403
+ (3.7418, -5.5913, 2.3037),
404
+ (2.8769, -3.1427, 1.2046),
405
+ (2.8366, -3.0525, 1.2012),
406
+ ]:
407
+ A = X @ X.mT
408
+ B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
409
+ X = a * X + B @ X
410
+
411
+ if G.size(-2) > G.size(-1):
412
+ X = X.mT
413
+ return X.to(G.dtype)
414
+
415
+
416
+ @decorator_knowngood
417
+ def legacy_zeropower_via_newtonschulz5(G, steps=5, eps=1e-7):
371
418
  assert len(G.shape) == 2
372
419
  a, b, c = (3.4445, -4.7750, 2.0315)
373
- X = G.to(torch.bfloat16 if G.dtype != torch.float64 else G.dtype) # Preserve float64 if present
374
- X /= X.norm() + eps # ensure top singular value <= 1
420
+ G = G.clone()
421
+ X = G if G.dtype == torch.float64 else stochastic_round_(G)
422
+
423
+ # X = X / (X.norm(dim=(-2, -1), keepdim=True) + eps)
424
+ stochastic_divide_with_eps_(X, G.norm(dim=(-2, -1)), eps) # ensure top singular value <= 1
375
425
  if G.size(0) > G.size(1):
376
426
  X = X.T
377
427
  for _ in range(steps):
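For reference, the reworked `zeropower_via_newtonschulz5` runs a fixed five-step quintic Newton-Schulz iteration with a per-step coefficient schedule. A hedged fp32 sketch of the same iteration, without heavyball's `stochastic_round_`/`stochastic_divide_with_eps_` helpers, looks like this (the schedule is copied from the hunk above; Muon-style coefficients trade exact orthogonality for speed):

```python
import torch


def newton_schulz_5(G: torch.Tensor, eps: float = 1e-7) -> torch.Tensor:
    # fp32 reference of the quintic Newton-Schulz orthogonalization used by Muon-style optimizers.
    X = G.float().clone()
    if X.size(-2) > X.size(-1):
        X = X.mT
    X = X / (X.norm(dim=(-2, -1), keepdim=True) + eps)  # top singular value <= 1
    for a, b, c in [
        (4.0848, -6.8946, 2.9270),
        (3.9505, -6.3029, 2.6377),
        (3.7418, -5.5913, 2.3037),
        (2.8769, -3.1427, 1.2046),
        (2.8366, -3.0525, 1.2012),
    ]:
        A = X @ X.mT
        X = a * X + (b * A + c * A @ A) @ X
    if G.size(-2) > G.size(-1):
        X = X.mT
    return X


G = torch.randn(64, 32)
Q = newton_schulz_5(G)
# Columns are only roughly orthonormal: the coefficients approximately equalize singular values.
print((Q.T @ Q - torch.eye(32)).norm())
```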
@@ -435,21 +485,30 @@ def _compilable_grafting(magnitude, direction):


  @decorator_knowngood
- def _compilable_orthogonal_(x: Tensor, mode: str, out: Tensor | None, scale_mode: str):
-     if mode == "newtonschulz" or x.shape[0] != x.shape[1]:
+ def _compilable_orthogonal_(x: Tensor, mode: str | ZerothPowerMode, out: Tensor | None, scale_mode: str | OrthoScaleMode):
+     if not isinstance(mode, ZerothPowerMode):
+         mode = ZerothPowerMode(mode)
+     if not isinstance(scale_mode, OrthoScaleMode):
+         scale_mode = OrthoScaleMode(scale_mode)
+     if mode == ZerothPowerMode.newtonschulz or x.shape[0] != x.shape[1]:
          y = zeropower_via_newtonschulz5(x, 5)
-     elif mode == "qr":
+     elif mode == ZerothPowerMode.legacy_newtonschulz:
+         y = legacy_zeropower_via_newtonschulz5(x, 5)
+     elif mode == ZerothPowerMode.qr:
          y = torch.linalg.qr(promote(x)).Q
-     elif mode == "svd":
-         u, _s, v = torch.linalg.svd(promote(x))
-         y = u @ v.T
+     elif mode == ZerothPowerMode.svd:
+         u, _s, vt = torch.linalg.svd(promote(x))
+         y = u @ vt
+     elif mode == ZerothPowerMode.legacy_svd:
+         u, _s, vt = torch.linalg.svd(promote(x))
+         y = u @ vt.T
      else:
          raise NotImplementedError(f"Unknown zeroth_power_mode: {mode}")
-     if scale_mode == "none":
+     if scale_mode == OrthoScaleMode.none:
          pass
-     elif scale_mode == "scale":
-         y *= max(1, x.size(0) / x.size(1)) ** 0.5
-     elif scale_mode == "graft":
+     elif scale_mode == OrthoScaleMode.scale:
+         y *= max(1, x.size(-2) / x.size(-1)) ** 0.5
+     elif scale_mode == OrthoScaleMode.graft:
          y = _compilable_grafting(x, y)
      else:
          raise NotImplementedError(f"Unknown scale_mode: {scale_mode}")
@@ -556,10 +615,16 @@ def get_orthogonal_matrix(mat, max_eps: float = 1e-3, min_eps: float = 1e-30):
      except torch.OutOfMemoryError:
          if m.device.type == "cpu":
              raise
-         else:
+         if torch.cuda.is_available():
+             torch.cuda.synchronize(m.device)
+         clean()
+         m = m.cpu()
+     except RuntimeError as e:
+         if torch.cuda.is_available() and ("CUDA" in str(e) or "illegal memory access" in str(e)):
+             torch.cuda.synchronize(m.device)
+             clean()
              m = m.cpu()
-     except RuntimeError:  # failed to compute eigenvalues
-         if m.dtype != torch.double:
+         elif m.dtype != torch.double:
              m = m.double()
          elif eps < max_eps:
              eps = eps ** (2 / 3)
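The new handlers synchronize, free cached memory, and retry the decomposition on CPU after CUDA out-of-memory or illegal-access failures. A hedged standalone sketch of that retry pattern, using `gc.collect()`/`torch.cuda.empty_cache()` in place of heavyball's `clean()` helper:

```python
import gc

import torch


def eigh_with_cpu_fallback(m: torch.Tensor):
    # One-shot retry: on a CUDA OOM or other CUDA runtime error, synchronize,
    # release cached blocks, and redo the decomposition on CPU.
    try:
        return torch.linalg.eigh(m)
    except (torch.OutOfMemoryError, RuntimeError):
        if m.device.type == "cpu":
            raise
        if torch.cuda.is_available():
            torch.cuda.synchronize(m.device)
            torch.cuda.empty_cache()
        gc.collect()
        return torch.linalg.eigh(m.cpu())
```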
@@ -658,7 +723,7 @@ def _compilable_stochastic_add_(x: List[Tensor], y: List[Tensor], alpha: Union[f
          copy_stochastic_(x_, x32 + y32 * alpha)


- def stochastic_add_(x: List[Tensor], y: List[Tensor], alpha: Union[float, int, Tensor] = 1):
+ def stochastic_add_(x: List[Tensor] | Tensor, y: List[Tensor] | Tensor, alpha: Union[float, int, Tensor] = 1):
      x, y = broadcastable_list_guard(x, y)
      alpha = scalar_guard(alpha, x[0])
      _compilable_stochastic_add_(x, y, alpha)
@@ -672,7 +737,9 @@ def _compilable_stochastic_add_divide_(x: List[Tensor], y: List[Tensor], alpha:
          copy_stochastic_(x_, (x32 + y32 * alpha) / divisor)


- def stochastic_add_divide_(x: List[Tensor], y: List[Tensor], alpha: Union[float, int, Tensor] = 1, divisor: float = 1):
+ def stochastic_add_divide_(
+     x: List[Tensor] | Tensor, y: List[Tensor] | Tensor, alpha: Union[float, int, Tensor] = 1, divisor: float = 1
+ ):
      x, y = broadcastable_list_guard(x, y)
      alpha, divisor = scalar_guard(alpha, divisor, x[0])
      _compilable_stochastic_add_divide_(x, y, alpha, divisor)
@@ -686,11 +753,25 @@ def _compilable_stochastic_multiply_(x: List[Tensor], y: List[Tensor]):
          copy_stochastic_(x_, x32 * y32)


- def stochastic_multiply_(x: List[Tensor], y: List[Tensor]):
+ def stochastic_multiply_(x: List[Tensor] | Tensor, y: List[Tensor] | Tensor):
      x, y = broadcastable_list_guard(x, y)
      _compilable_stochastic_multiply_(x, y)


+ @decorator_knowngood
+ def _compilable_stochastic_divide_with_eps_(x: List[Tensor], y: List[Tensor], eps: Tensor):
+     for x_, y_ in zip(x, y):
+         x32 = promote(x_)
+         y32 = promote(y_)
+         copy_stochastic_(x_, x32 / (y32 + eps))
+
+
+ def stochastic_divide_with_eps_(x: List[Tensor] | Tensor, y: List[Tensor] | Tensor, eps: float):
+     x, y = broadcastable_list_guard(x, y)
+     eps = scalar_guard(eps, y[0])
+     _compilable_stochastic_divide_with_eps_(x, y, eps)
+
+
  @decorator
  def update_ggt(grad, GG, max_precond_dim, precondition_1d, beta):
      """
@@ -832,6 +913,10 @@ class ExactHVPFailed(ValueError):
  use_default = object()


+ def _tensor_key(x: Tensor):
+     return x.data_ptr(), x.numel(), x.dtype, x.device
+
+
  class StatefulOptimizer(torch.optim.Optimizer):
      """
      finite_differences saves memory, but needs more compute. (Alternative is true HVP)
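`mapping_inverse` is now keyed by `_tensor_key` tuples instead of tensor objects. Tensors hash by identity, so a view that is re-created (as `split_p_and_g_in_group` now does on every call) would never hit an object-keyed dict; keying by `(data_ptr, numel, dtype, device)` makes equivalent views collide on purpose. A small demonstration, assuming nothing beyond plain PyTorch:

```python
import torch

base = torch.zeros(4, 4)
view_a = base.view(16)
view_b = base.view(16)  # same storage, different Python object

d = {view_a: "state"}
print(view_b in d)  # False: dict lookup hashes tensors by object identity


def tensor_key(t: torch.Tensor):
    # mirrors heavyball's _tensor_key
    return t.data_ptr(), t.numel(), t.dtype, t.device


d2 = {tensor_key(view_a): "state"}
print(tensor_key(view_b) in d2)  # True: re-created views map to the same entry
```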
@@ -874,7 +959,6 @@ class StatefulOptimizer(torch.optim.Optimizer):

          self.register_state_dict_post_hook(StatefulOptimizer._store_stats)
          self.register_load_state_dict_pre_hook(StatefulOptimizer._load_stats)
-         self._init_mapping()

      def _store_stats(self, state_dict: dict[str, any]):
          state_dict["heavyball"] = {
@@ -905,7 +989,9 @@ class StatefulOptimizer(torch.optim.Optimizer):
      def state_(self, arg: Tensor, fail: bool = True):
          if not fail and arg not in self.mapping:
              return {}
-         state_param, index = self.mapping_inverse[arg]
+         if _tensor_key(arg) not in self.mapping_inverse:
+             self._init_mapping()
+         state_param, index = self.mapping_inverse[_tensor_key(arg)]
          if state_param not in self.state:
              self.state[state_param] = collections.defaultdict(dict)
          return self.state[state_param][index]
@@ -928,7 +1014,7 @@ class StatefulOptimizer(torch.optim.Optimizer):
              if p not in self.mapping:
                  self.mapping[p] = p_views = merge_group(group, p)
                  for i, pv in enumerate(p_views):
-                     self.mapping_inverse[pv] = (p, i)
+                     self.mapping_inverse[_tensor_key(pv)] = (p, i)

      def split_p_and_g_in_group(
          self,
@@ -949,12 +1035,9 @@ class StatefulOptimizer(torch.optim.Optimizer):
                  yield p, grad
                  continue

-             if p in self.mapping:
-                 p_views = self.mapping[p]
-             else:
-                 self.mapping[p] = p_views = merge_group(group, p)
-                 for i, pv in enumerate(p_views):
-                     self.mapping_inverse[pv] = (p, i)
+             self.mapping[p] = p_views = merge_group(group, p)
+             for i, pv in enumerate(p_views):
+                 self.mapping_inverse[_tensor_key(pv)] = (p, i)

              vector = getattr(p, "vector", None)
              hessian_vector = getattr(p, "hessian_vector", None)
@@ -1199,17 +1282,53 @@ def _compilable_adam_(


  def adam_(
+     exp_avg: List[Tensor] | Tensor,
+     exp_avg_sq: List[Tensor] | Tensor,
+     grad: List[Tensor] | Tensor,
+     beta1: float,
+     beta2: float,
+     step: int,
+     eps: float = 1e-8,
+ ) -> List[Tensor]:
+     exp_avg, exp_avg_sq, grad = map(list_guard, (exp_avg, exp_avg_sq, grad))
+     beta1, beta2, step, eps = scalar_guard(beta1, beta2, step, eps, exp_avg[0])
+     _compilable_adam_(exp_avg, exp_avg_sq, grad, beta1, beta2, step, eps)
+     return grad
+
+
+ @decorator_knowngood
+ def _compilable_unscaled_adam_(
      exp_avg: List[Tensor],
      exp_avg_sq: List[Tensor],
      grad: List[Tensor],
+     beta1: Tensor,
+     beta2: Tensor,
+     step: Tensor,
+     eps: Tensor,
+ ):
+     beta1 = beta_debias(beta1, step)
+     beta2 = beta_debias(beta2, step)
+
+     g32 = list(map(promote, grad))
+     denom = _compilable_exp_avg_sq_(exp_avg_sq, g32, beta2, eps, [None])
+     g32 = torch._foreach_div(g32, denom)
+     exp_avg32 = _lerp(exp_avg, g32, beta1)
+     u32 = torch._foreach_mul(exp_avg32, denom)
+     copy_stochastic_list_(grad, u32)
+
+
+ def unscaled_adam_(
+     exp_avg: List[Tensor] | Tensor,
+     exp_avg_sq: List[Tensor] | Tensor,
+     grad: List[Tensor] | Tensor,
      beta1: float,
      beta2: float,
      step: int,
      eps: float = 1e-8,
- ):
+ ) -> List[Tensor]:
      exp_avg, exp_avg_sq, grad = map(list_guard, (exp_avg, exp_avg_sq, grad))
      beta1, beta2, step, eps = scalar_guard(beta1, beta2, step, eps, exp_avg[0])
-     _compilable_adam_(exp_avg, exp_avg_sq, grad, beta1, beta2, step, eps)
+     _compilable_unscaled_adam_(exp_avg, exp_avg_sq, grad, beta1, beta2, step, eps)
      return grad


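`unscaled_adam_` whitens the gradient with the second-moment denominator before the momentum average and multiplies the denominator back afterwards. The following fp32 single-tensor sketch is my reading of that sequence, with an assumed textbook form for `beta_debias` and without heavyball's foreach/stochastic-rounding kernels:

```python
import torch


def unscaled_adam_step(exp_avg, exp_avg_sq, grad, beta1, beta2, step, eps=1e-8):
    # fp32 sketch of _compilable_unscaled_adam_ for a single tensor:
    # debias betas, update the second moment, whiten the gradient, take the
    # momentum average in whitened space, then re-apply the denominator.
    beta1 = 1 - (1 - beta1) / (1 - beta1**step)  # assumed beta_debias form
    beta2 = 1 - (1 - beta2) / (1 - beta2**step)
    exp_avg_sq.lerp_(grad * grad, 1 - beta2)
    denom = exp_avg_sq.sqrt().clamp_min(eps)
    exp_avg.lerp_(grad / denom, 1 - beta1)
    return exp_avg * denom


m, v = torch.zeros(3), torch.zeros(3)
print(unscaled_adam_step(m, v, torch.randn(3), 0.9, 0.99, step=1))
```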
@@ -1253,7 +1372,7 @@ def fused_adam_(
      caution: bool,
  ):
      y, exp_avg, exp_avg_sq, grad = list_guard(y, exp_avg, exp_avg_sq, grad)
-     beta1, beta2, step, lr = scalar_guard(beta1, beta2, step, lr, y[0])
+     beta1, beta2, step, lr, decay = scalar_guard(beta1, beta2, step, lr, decay, y[0])
      _fused_compilable_adam_(y, exp_avg, exp_avg_sq, update, grad, beta1, beta2, step, decay, lr, eps, caution)


@@ -1332,7 +1451,7 @@ def fused_laprop_(
      eps: float = 1e-8,
  ):
      exp_avg, exp_avg_sq, grad, y = list_guard(exp_avg, exp_avg_sq, grad, y)
-     beta1, beta2, step, lr, eps = scalar_guard(beta1, beta2, step, lr, eps, exp_avg[0])
+     beta1, beta2, step, lr, eps, decay = scalar_guard(beta1, beta2, step, lr, eps, decay, exp_avg[0])
      _fused_compilable_laprop_(y, exp_avg, exp_avg_sq, update, grad, beta1, beta2, step, lr, decay, caution, eps)


@@ -1351,7 +1470,7 @@ def _fused_compilable_adopt_(y, update, grad, exp_avg_sq, exp_avg, beta1, beta2,

  def fused_adopt_(y, update, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution):
      exp_avg, exp_avg_sq, grad, y = list_guard(exp_avg, exp_avg_sq, grad, y)
-     beta1, beta2, step, lr = scalar_guard(beta1, beta2, step, lr, exp_avg[0])
+     beta1, beta2, step, lr, decay = scalar_guard(beta1, beta2, step, lr, decay, exp_avg[0])
      _fused_compilable_adopt_(y, update, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution)


@@ -1381,11 +1500,15 @@ def stochastic_round_list_(ref: List[Tensor], source: List[Tensor]):


  @decorator_knowngood
- def stochastic_round_(ref: Tensor, source: Tensor):
-     if source.dtype == torch.bfloat16 or ref.dtype == source.dtype:
-         return source
-     if ref.dtype != torch.bfloat16:
-         return source.to(ref.dtype)
+ def stochastic_round_(ref: Tensor, source: Tensor | None = None):
+     if source is not None:
+         if source.dtype == torch.bfloat16 or ref.dtype == source.dtype:
+             return source
+         if ref.dtype != torch.bfloat16:
+             return source.to(ref.dtype)
+     else:
+         source = ref
+     source = source.float()
      result = torch.randint_like(source, dtype=torch.int32, low=0, high=(1 << 16))
      result.add_(source.view(dtype=torch.int32))
      result.bitwise_and_(-65536)  # -65536 = FFFF0000 as a signed int32
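`stochastic_round_` can now be called with a single tensor, in which case it stochastically rounds it to bfloat16 using the int32 bit trick visible above. A standalone sketch of the same trick, plus a quick check that the rounding is unbiased on average:

```python
import torch


def stochastic_round_to_bf16(x: torch.Tensor) -> torch.Tensor:
    # Add random bits below the bf16 mantissa cutoff, then truncate: this rounds
    # fp32 -> bf16 up or down with probability proportional to the remainder.
    x = x.float()
    noise = torch.randint_like(x, dtype=torch.int32, low=0, high=1 << 16)
    bits = x.view(torch.int32) + noise
    bits &= -65536  # 0xFFFF0000: keep sign, exponent, and the top 7 mantissa bits
    return bits.view(torch.float32).bfloat16()


x = torch.full((100_000,), 1.0 + 1e-4)
r = stochastic_round_to_bf16(x)
print(r.float().unique())        # the two neighbouring bf16 values around 1.0001
print(r.float().mean().item())   # close to 1.0001 on average (unbiased)
```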
@@ -1908,7 +2031,9 @@ def ndim_tuple(Q: list[Tensor]) -> tuple:
1908
2031
  return tuple(q.ndim for q in Q)
1909
2032
 
1910
2033
 
1911
- def psgd_calc_A_and_conjB(G, Q, conjB): # conjB ("V", "vector") == randn during hvp/whitening
2034
+ def psgd_calc_A_and_conjB(G: Tensor, Q, conjB: Tensor | None): # conjB ("V", "vector") == randn during hvp/whitening
2035
+ if conjB is None:
2036
+ conjB = torch.randn_like(G)
1912
2037
  exprA = cached_precond_grad_expr(ndim_tuple(Q), G.ndim) # calcA expr and cached precond expr are the same
1913
2038
  A = casted_einsum(exprA, *Q, G)
1914
2039
  solve = torch.compiler.disable(torch.linalg.solve_triangular)
@@ -1940,24 +2065,35 @@ def max_singular_value_exact(A, use_lobpcg: bool = False):
              eigval, _ = torch.compiler.disable(torch.lobpcg)(A, k=1, largest=True)
              return eigval[0].sqrt()
          else:
-             return torch.linalg.svd(A, driver="gesvdj")[1].max()  # == linalg.matrix_norm(A, ord=2)
-     except torch.linalg.LinAlgError:
-         return torch.zeros((), device=A.device, dtype=A.dtype)
+             return torch.linalg.svd(promote(A), driver="gesvdj")[1].max().to(A.dtype)  # == linalg.matrix_norm(A, ord=2)
+     except (torch.linalg.LinAlgError, RuntimeError):
+         return max_singular_value_power_iter(promote(A), iterations=2)


  @decorator_knowngood
- def max_singular_value_power_iter(A: Tensor, max_abs: Optional[Tensor] = None, iterations: int = 5):
+ def max_singular_value_power_iter(A_outer: Tensor, max_abs: Optional[Tensor] = None, iterations: int = 5):
      """
      Rayleigh quotient of row with the largest norm + optional power iterations
      """
-     x_norm, max_idx = A.norm(dim=1).max(dim=0)
-     x = A.index_select(0, max_idx).flatten().contiguous()
-     A = A / x_norm
-     x = x / x_norm
-     for _ in range(iterations):
-         x = A.T.mv(A.mv(x))  # A @ A.T @ x, but explicitly telling torch.compile not to compute the full matrix
-         x = x / x.norm()
-     return (x @ A.T.mv(A.mv(x))).sqrt() * x_norm
+     x_norm, max_idx = A_outer.norm(dim=1).max(dim=0)
+     x_norm = promote(x_norm)
+
+     def _inner():
+         A = A_outer
+         x = A.index_select(0, max_idx).flatten().contiguous()
+         A = stochastic_round_(A / x_norm)
+         x = x / x_norm
+
+         def _mv(x):
+             return promote(A.T.mv(A.mv(stochastic_round_(x))))
+
+         for _ in range(iterations):
+             # A @ A.T @ x, but explicitly telling torch.compile not to compute the full matrix
+             x = F.normalize(_mv(x), dim=0)
+         out = (x @ _mv(x)).to(x_norm.dtype).sqrt() * x_norm
+         return out.squeeze().clone()
+
+     return cond(x_norm > 0, _inner, lambda: x_norm.squeeze().clone())


  @decorator_knowngood
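The reworked `max_singular_value_power_iter` seeds the iteration with the largest-norm row and evaluates a Rayleigh quotient of AᵀA. A hedged fp32 sketch of that estimator, checked against `torch.linalg.matrix_norm`:

```python
import torch


def spectral_norm_power_iter(A: torch.Tensor, iterations: int = 16) -> torch.Tensor:
    # Start from the row with the largest norm (a decent first guess for the top
    # right-singular direction), then iterate x <- A^T A x and take a Rayleigh quotient.
    x = A[A.norm(dim=1).argmax()].clone()
    x = x / x.norm()
    for _ in range(iterations):
        x = A.T @ (A @ x)
        x = x / x.norm()
    return (x @ (A.T @ (A @ x))).sqrt()


A = torch.randn(256, 128)
print(spectral_norm_power_iter(A).item())
print(torch.linalg.matrix_norm(A, ord=2).item())  # reference value
```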
@@ -1974,33 +2110,81 @@ def max_singular_value_cholesky(A: Tensor, max_abs: Optional[Tensor] = None):
      return sketch_norm * max_abs


+ def _max_singular_value_ndim(A: Tensor, max_svd: int = 0, use_cholesky: bool = False, power_iter: int = 16) -> Tensor:
+     if A.ndim <= 2:
+         return max_singular_value(A, max_svd, use_cholesky, power_iter)
+
+     base = einsum_base[: A.ndim]
+     A16 = stochastic_round_(A)
+     squares = [compiled_einsum(f"{base},{base.replace(b, b.upper())}->{b}{b.upper()}", A16, A16) for b in base]
+     svds = [max_singular_value(promote(s), max_svd, use_cholesky, power_iter) for s in squares]
+     svds = torch.stack(svds)
+     return svds.max().sqrt().to(A.dtype)  # sqrt because we took the SVD of a squared matrix
+
+
  @decorator_knowngood
- def max_singular_value(
-     A: Tensor, max_abs: Optional[Tensor], max_svd: int = 32, use_cholesky: bool = False, power_iter: int = 0
- ) -> Tensor:
+ def max_singular_value(A: Tensor, max_svd: int = 0, use_cholesky: bool = False, power_iter: int = 16) -> Tensor:
+     if A.ndim < 2:
+         return A.abs().max()
+     if A.ndim > 2:
+         raise ValueError("max_singular_value: dimension of A must be less than or equal to 2")
      if min(A.shape) <= max_svd:
          return max_singular_value_exact(A)  # SVD needs ~25% more runtime for size=32, but 0% error instead of 5%
      if use_cholesky or power_iter < 0:
-         return max_singular_value_cholesky(A, max_abs)
+         return max_singular_value_cholesky(A)
      return max_singular_value_power_iter(A, None, iterations=power_iter)


  @decorator_knowngood
- def _psgd_default_preconditioner_grad(
-     terms: List[Tuple[Tensor, Tensor]],
-     Q: List[Tensor],
- ) -> List[Tensor]:
-     out = []
-     for q, (x, y) in zip(Q, terms):
-         x = promote(x)
-         y = promote(y)
-         update = x - y
-         if q.ndim < 2:
-             update = q * update
-         else:
-             update = (q @ update).triu()
-         out.append(update)
-     return out
+ def clamped_max_singular_value(
+     A: Tensor, min: float, max_svd: int = 0, use_cholesky: bool = False, power_iter: int = 16
+ ) -> Tensor:
+     norm = A.norm()  # L2 norm is an upper bound for the spectral norm. If the upper bound is below the minimum, the real value will be too.
+     out = cond(norm > min, lambda: max_singular_value(A, max_svd, use_cholesky, power_iter), lambda: norm.clone())
+     return out.clamp(min=min)
+
+
+ @decorator_knowngood
+ def min_singular_value(
+     A: Tensor,
+     power_iter: int = 5,
+     safety: float = 1.05,
+     max_svd: int = 32,
+ ):
+     if A.ndim < 2:
+         return A.abs().min()
+
+     n = A.size(0)
+     if n <= max_svd:
+         try:
+             eigs = torch.linalg.eigvalsh(promote(A))
+             return eigs.min().to(A.dtype)
+         except torch.linalg.LinAlgError:
+             pass
+
+     lambda_max_hat = max_singular_value(A, power_iter=power_iter)
+     lambda_upper = lambda_max_hat * safety
+
+     row_norms = A.norm(dim=1)
+     norm, idx = row_norms.min(dim=0)
+     v = cond(norm > 0, lambda: A.index_select(0, idx).flatten(), lambda: torch.rand_like(A[0]))
+
+     v = v / promote(v.norm())
+     for _ in range(power_iter):
+         v = lambda_upper * v - promote(A.mv(stochastic_round_(v)))
+         v = v / promote(v.norm())
+     mu_hat = v @ (lambda_upper * v - promote(A.mv(stochastic_round_(v))))
+
+     lambda_min_hat = lambda_upper - mu_hat
+
+     def _approx():
+         mu = A.trace() / n
+         sigma_square = A.square().sum() / n - mu**2
+         return mu - (sigma_square / (n - 1)).sqrt()
+
+     return cond(
+         (~torch.isfinite(lambda_min_hat)) | (lambda_min_hat <= 0), _approx, lambda: lambda_min_hat.clone()
+     ).squeeze()


  @decorator_knowngood
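`min_singular_value` estimates the smallest eigenvalue of a symmetric PSD matrix by power iteration on the shifted matrix λ_upper·I − A, whose dominant eigenvalue is λ_upper − λ_min(A). A standalone check of that idea against `torch.linalg.eigvalsh` (convergence can be slow when the small eigenvalues are clustered):

```python
import torch


def min_eig_via_shift(A: torch.Tensor, power_iter: int = 50, safety: float = 1.05) -> torch.Tensor:
    # A is assumed symmetric PSD. The largest eigenvalue of (lambda_upper*I - A) is
    # lambda_upper - lambda_min(A), so a plain power iteration on the shifted matrix
    # recovers the smallest eigenvalue of A.
    lambda_upper = torch.linalg.matrix_norm(A, ord=2) * safety
    v = torch.randn(A.size(0), dtype=A.dtype)
    v = v / v.norm()
    for _ in range(power_iter):
        v = lambda_upper * v - A @ v
        v = v / v.norm()
    mu = v @ (lambda_upper * v - A @ v)
    return lambda_upper - mu


M = torch.randn(64, 32, dtype=torch.float64)
A = M.T @ M  # symmetric PSD
print(min_eig_via_shift(A).item())
print(torch.linalg.eigvalsh(A).min().item())  # reference
```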
@@ -2027,6 +2211,107 @@ def calcG_expr(q_dim, g_dim):
      return exprs


+ def eye_like(x: Tensor):
+     if x.ndim < 2:
+         return torch.ones_like(x)
+     assert x.ndim == 2
+     assert x.size(0) == x.size(1)
+     return torch.eye(x.size(0), device=x.device, dtype=x.dtype)
+
+
+ @decorator_knowngood
+ def _gg_inverse_via_vjp(G: Tensor, Q: List[Tensor]):
+     """
+     Idea:
+         G should become a zeroth power, so all Qs together should approximate G's inverse.
+         Assuming G is 2-dimensional, we'd have two preconditioning Q's: L, R.
+         Optimize LGR being a zeroth power using `MSE((LGR)(LGR).T, I) + MSE((LGR).T(LGR), I)`,
+         then backprop to L/R jointly.
+         This function computes the gradients for L/R, with an outer optimizer layer handling the rest.
+
+     `psgd_precond_grad` computes LGR for the general (n-dimensional) case.
+     `exprG` contains the einsum expressions to compute (LGR)(LGR).T (and (LGR).T(LGR)) for the general n-dim case.
+     Args:
+         G: Gradient that should be orthogonalized
+         Q: List of preconditioner tensors.
+
+     Returns:
+         - List of gradients with respect to Q (d_Q).
+         - The preconditioned gradient P, cast back to G's dtype.
+     """
+     exprGs = calcG_expr(ndim_tuple(Q), G.ndim)
+
+     G16 = stochastic_round_(G)
+     Q16 = [stochastic_round_(q) for q in Q]
+     P = psgd_precond_grad(G16, Q16)  # Q₀GQ₁
+
+     d_P = torch.zeros_like(G)
+     base = einsum_base[: G.ndim]
+     for i, exprG in enumerate(exprGs):
+         pp = compiled_einsum(exprG, P, P)
+         error = pp - eye_like(pp)
+         dim = einsum_base[i]
+         if pp.ndim == 2:
+             new = dim.upper()
+             prec = f"{new}{dim}"
+         else:
+             new = dim
+             prec = dim
+         d_P += torch.einsum(f"{base},{prec}->{base.replace(dim, new)}", P, error)
+
+     d_P = stochastic_round_(d_P)  # accumulate in fp32 and round at the end
+     grads = []
+     for i, exprG in enumerate(exprGs):
+         new_q = Q16[:]
+         new_q[i] = eye_like(new_q[i])
+         pq = psgd_precond_grad(G16, new_q)
+         grad = compiled_einsum(exprG, pq, d_P)
+         if grad.ndim == 2:
+             grad = (grad + grad.T) / 2
+         grads.append(grad)
+
+     return grads, P.to(G.dtype)
+
+
+ def _inverse_initial_guess(gg):
+     n = gg.shape[0]
+
+     sigma_max = promote(gg.norm())
+
+     trace_gg = promote(torch.trace(gg))
+     sigma_min_approx = trace_gg / (n * sigma_max)
+
+     return sigma_max, sigma_min_approx
+
+
+ @decorator_knowngood
+ def _chebychef_coeff(degree: int, device, eps: float = 1e-8):
+     k = torch.arange(degree, dtype=torch.float64, device=device)
+     rotation = (2 * k + 1) * math.pi / (2 * degree)
+     f = (rotation.cos() + 1 + eps) ** -0.5
+     rotation = (rotation.view(-1, 1) * k[1:].view(1, -1)).cos()
+     coeff0 = f.sum() / degree
+     coeffs = f @ rotation * 2 / degree
+     return coeff0.float(), coeffs.float()
+
+
+ @decorator_knowngood
+ def _psgd_default_preconditioner_grad(
+     terms: List[Tuple[Tensor, Tensor]],
+     Q: List[Tensor],
+ ) -> List[Tensor]:
+     out = []
+     for q, (x, y) in zip(Q, terms):
+         x = promote(x)
+         y = promote(y)
+         update = x - y
+         if q.ndim < 2:
+             update = q * update
+         else:
+             update = (q @ update).triu()
+         out.append(update)
+     return out
+
+
  @decorator
  def psgd_update_precond(
      G: Tensor,
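`_chebychef_coeff` evaluates the standard Chebyshev-interpolation formula at cosine nodes for f(cos θ) = (cos θ + 1 + ε)^(−1/2). The sketch below recomputes the same coefficients and evaluates the truncated series against the target function; agreement is good away from the near-singularity at x ≈ −1:

```python
import math

import torch


def chebyshev_coeff(degree: int, eps: float = 1e-8):
    # Same node/coefficient formula as the diff: sample f at Chebyshev nodes and
    # project onto cos(k * theta).
    k = torch.arange(degree, dtype=torch.float64)
    theta = (2 * k + 1) * math.pi / (2 * degree)
    f = (theta.cos() + 1 + eps) ** -0.5
    c0 = f.sum() / degree
    ck = (f @ (theta.view(-1, 1) * k[1:].view(1, -1)).cos()) * 2 / degree
    return c0, ck


def eval_series(x: torch.Tensor, c0, ck):
    # T_k(x) = cos(k * arccos(x)) for x in [-1, 1]
    k = torch.arange(1, ck.numel() + 1, dtype=x.dtype)
    return c0 + (ck * (k * x.arccos().unsqueeze(-1)).cos()).sum(-1)


c0, ck = chebyshev_coeff(16)
x = torch.linspace(-0.5, 0.99, 5, dtype=torch.float64)
print(eval_series(x, c0, ck))
print((x + 1 + 1e-8) ** -0.5)  # target function
```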
@@ -2056,6 +2341,102 @@ def psgd_update_precond(
      return None


+ @decorator_knowngood
+ def bf16_matmul(x: Tensor, y: Tensor):
+     return (promote(x) @ promote(y)).to(x.dtype)
+
+
+ def if_iscompiling(fn):
+     base = getattr(torch, fn.__name__, None)
+
+     def _fn(*args):
+         if torch.compiler.is_compiling() and hasattr(torch, fn.__name__):
+             return base(*args)
+         return fn(*args)
+
+     return _fn
+
+
+ @if_iscompiling
+ def while_loop(cond, body, state):
+     """
+     Dispatches to torch.while_loop if we're compiling. Otherwise, falls back to a naive + slow baseline.
+     Useful for debugging.
+     """
+     while cond(*state).item():
+         state = body(*state)
+     return state
+
+
+ @if_iscompiling
+ def cond(cond, true_fn, false_fn):
+     """
+     Dispatches to torch.cond if we're compiling. Otherwise, falls back to a naive + slow baseline.
+     Useful for debugging.
+     """
+
+     if cond.item():
+         return true_fn()
+     return false_fn()
+
+
+ @decorator_knowngood
+ def _householder_vec_e1_to_v(v: Tensor, eps: float = 1e-12) -> Tensor:
+     """
+     Return w such that H = I - 2 w w^T is orthogonal and H e1 = v (v unit).
+     Applying from the right: G @ H = G - 2 (G @ w) w^T.
+     If v is (numerically) e1, returns w=0 and H=I.
+     """
+     v = v / v.norm().clamp(min=eps)
+     e1 = torch.zeros_like(v)
+     e1[0] = 1.0
+     w = e1 - v
+     return w / w.norm().clamp(min=eps)
+
+
+ @decorator_knowngood
+ def eigvecs_product_rank1(
+     G: Tensor, v: Tensor, w: Optional[Tensor] = None, eps: float = 1e-12
+ ) -> Tuple[Tensor, Tensor]:
+     """
+     Compute Y = G @ V where V is an eigenvector matrix for P = λ I + σ v v^T,
+     using the Householder reflector with first column v. Never materializes V.
+
+     Args:
+         G: shape (..., d) — gradient row(s) you want to rotate into the eigenbasis.
+         v: shape (d,) — current unit direction (top eigenvector of P).
+         w: optional Householder vector w; pass to reuse across calls.
+
+     Returns:
+         (Y, w) where:
+             Y has shape (..., d) and equals G @ eigenvectors(P),
+             w is the Householder vector you can cache & reuse.
+     """
+     if w is None:
+         w = _householder_vec_e1_to_v(v, eps)
+     Y = G - 2.0 * compiled_einsum("...i,i,j->...j", G, w, w)
+     return Y, w
+
+
+ @decorator_knowngood
+ def oja_update(v: Tensor, g: Tensor, lr: float = 1e-2, eps: float = 1e-12) -> Tensor:
+     """
+     One Oja step to track the top eigendirection of the gradient covariance.
+     v <- v + lr * ((g^T v) g - (g^T v)^2 v); then renormalize.
+     """
+     gv = g @ v
+     v = v + lr * (gv * g - (gv * gv) * v)
+     return v / v.norm().clamp(min=eps)
+
+
+ def cond_n(cond_val: Tensor, *fns):
+     fns = list(fns)
+     fn = fns.pop(0)
+     if not fns:
+         return fn
+     return cond(cond_val == 0, fn, lambda: cond_n(cond_val - 1, *fns))
+
+
  @decorator_knowngood
  def _psgd_precond_update_(
      matmuled: List[Optional[Tensor]],
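`_householder_vec_e1_to_v` returns the unit vector w of the reflector H = I − 2wwᵀ that maps e₁ to v, which lets `eigvecs_product_rank1` rotate gradients into the eigenbasis of λI + σvvᵀ without materializing the eigenvector matrix. A standalone numerical check:

```python
import torch

d = 8
v = torch.randn(d, dtype=torch.float64)
v = v / v.norm()

e1 = torch.zeros(d, dtype=torch.float64)
e1[0] = 1.0
w = e1 - v
w = w / w.norm()

H = torch.eye(d, dtype=torch.float64) - 2.0 * torch.outer(w, w)
print(torch.allclose(H @ e1, v))                                    # H maps e1 to v
print(torch.allclose(H @ H.T, torch.eye(d, dtype=torch.float64)))   # H is orthogonal

# Applying from the right without forming H, as eigvecs_product_rank1 does:
G = torch.randn(3, d, dtype=torch.float64)
Y = G - 2.0 * torch.einsum("bi,i,j->bj", G, w, w)
print(torch.allclose(Y, G @ H))
```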
@@ -2074,7 +2455,7 @@ def _psgd_precond_update_(
          if update.ndim < 2:
              lb = update.norm(float("inf"))
          else:
-             lb = max_singular_value(update, None, power_iter=power_iter)
+             lb = max_singular_value(update, power_iter=power_iter)
          update = promote(update)
          if store_triu_as_line:
              update = triu_to_line([update])[0][1]
@@ -2146,70 +2527,83 @@ def inverse_free_psgd_update_precond(


  @decorator_knowngood
- def _compilable_l2_clip_(x, clip_at):
-     ref = x
-     x = list(map(promote, x))
-     norm = torch._foreach_norm(x)
-     torch._foreach_maximum_(norm, clip_at)
-     out = torch._foreach_div(x, norm)
-     return stochastic_round_list_(ref, out)
+ def _clip(x, norm, clip_at, eps=1e-8):
+     x32 = promote(x)
+     # (x / y.clamp(min=eps)).clamp(max=1) == x / y.clamp(min=max(x, eps))
+     norm = clip_at / norm.clamp(min=max(clip_at, eps))
+     x32 = x32 * norm
+     copy_stochastic_(x, x32)
+
+
+ @decorator_knowngood
+ def _compilable_l2_clip_(xs, clip_at, eps=1e-8):
+     for x in xs:
+         _clip(x, promote(x).norm(), clip_at, eps)


  def l2_normalization_(x, clip_at: float = 1e-8):
      x = list_guard(x)
-     return _compilable_l2_clip_(x, clip_at)
+     _compilable_l2_clip_(x, clip_at)
+     return x


  def l2_clip_(x, clip_at: float = 1.0):
      x = list_guard(x)
-     return _compilable_l2_clip_(x, clip_at)
+     _compilable_l2_clip_(x, clip_at)
+     return x


  @decorator_knowngood
- def _compilable_rmsnorm_clip_(x, clip_at):
-     x = list(map(promote, x))
-     norm = torch._foreach_norm(x)
-     norm = [n.div_(x_.numel() ** 0.5) for n, x_ in zip(norm, x)]
-     torch._foreach_maximum_(norm, clip_at)
-     return torch._foreach_div(x, norm)
+ def _compilable_rmsnorm_clip_(xs, clip_at, eps=1e-8):
+     for x in xs:
+         _clip(x, promote(x).square().mean().sqrt(), clip_at, eps)


  def rmsnorm_clip_(x, clip_at: float = 1.0):
      x = list_guard(x)
-     return _compilable_rmsnorm_clip_(x, clip_at)
-
-
- @decorator_knowngood
- def _compilable_global_rmsnorm_clip_(x, clip_at):
-     x = list(map(promote, x))
-     norm = sum([x.square().sum() for x in x]) / sum([x.numel() for x in x])
-     norm = norm**0.5
-     norm = norm.clamp(min=clip_at)
-     return torch._foreach_div(x, norm)
+     _compilable_rmsnorm_clip_(x, clip_at)
+     return x


  @decorator_knowngood
- def _compilable_global_l2norm_clip_(x, clip_at):
-     x = list(map(promote, x))
-     norm = sum([x.square().sum() for x in x])
-     norm = norm**0.5
-     norm = norm.clamp(min=clip_at)
-     return torch._foreach_div(x, norm)
+ def _compilable_global_rmsnorm_clip_(x, clip_at, eps=1e-8):
+     norm = 0
+     numel = sum([i.numel() for i in x])
+     for i in x:
+         norm += promote(i).square().sum()
+     norm = (norm / numel) ** 0.5
+     scalar = clip_at / norm.clamp(min=max(clip_at, eps))
+     stochastic_multiply_(x, scalar)


  def global_rmsnorm_clip(x, clip_at: float = 1.0):
      x = list_guard(x)
-     return _compilable_global_rmsnorm_clip_(x, clip_at)
+     clip_at = scalar_guard(clip_at, x[0])
+     _compilable_global_rmsnorm_clip_(x, clip_at)
+     return x
+
+
+ @decorator_knowngood
+ def _compilable_global_l2norm_clip_(x, clip_at, eps=1e-8):
+     norm = 0
+     for i in x:
+         norm += promote(i).square().sum()
+     norm = norm**0.5
+     scalar = clip_at / norm.clamp(min=max(clip_at, eps))
+     stochastic_multiply_(x, scalar)


  def global_l2norm_clip(x, clip_at: float = 1.0):
      x = list_guard(x)
-     return _compilable_global_rmsnorm_clip_(x, clip_at)
+     clip_at = scalar_guard(clip_at, x[0])
+     _compilable_global_l2norm_clip_(x, clip_at)
+     return x


  def rmsnorm_normalize_(x, clip_at: float = 1e-6):
      x = list_guard(x)
-     return _compilable_rmsnorm_clip_(x, clip_at)
+     _compilable_rmsnorm_clip_(x, clip_at)
+     return x


  @decorator_knowngood
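All clip helpers now multiply by `clip_at / max(norm, clip_at, eps)` instead of dividing by a clamped norm, so updates below the threshold pass through unchanged. A hedged standalone sketch of the global L2 variant (names here are mine, not heavyball's):

```python
import torch


def global_l2_clip(tensors, clip_at: float = 1.0, eps: float = 1e-8):
    # Scale every tensor by clip_at / max(global_norm, clip_at, eps):
    # a no-op when the combined norm is already below clip_at.
    norm = torch.sqrt(sum(t.float().square().sum() for t in tensors))
    scale = clip_at / norm.clamp(min=max(clip_at, eps))
    for t in tensors:
        t.mul_(scale)
    return tensors


grads = [torch.randn(10, 10) * 5, torch.randn(100)]
global_l2_clip(grads)
print(torch.sqrt(sum(g.square().sum() for g in grads)))  # <= 1.0 (up to rounding)
```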
@@ -2284,17 +2678,6 @@ def weight_decay_to_ema_(p, ema, ema_decay, weight_decay):
      _compilable_weight_decay_to_ema_(p, ema, ema_decay, weight_decay)


- @decorator_knowngood
- def _compilable_weight_decay_to_init_(p, init, weight_decay):
-     _lerp(p, promote(init), 1 - weight_decay)
-
-
- def weight_decay_to_init_(p, init, weight_decay):
-     p, init = list_guard(p, init)
-     weight_decay = scalar_guard(weight_decay, p[0])
-     _compilable_weight_decay_to_ema_(p, init, weight_decay)
-
-
  @decorator_knowngood
  def _compilable_l1_weight_decay_to_ema_(p, ema, ema_decay, weight_decay):
      ema32 = _lerp(ema, p, ema_decay)
@@ -2416,7 +2799,7 @@ def precond_grad_cached_(
      md = min_dtype(list(cached_q) + [ea])
      args = [q.to(md) for q in cached_q]
      args = args + [ea.to(md)]
-     expr = cached_precond_grad_expr(ndim_tuple(cached_q), grad.ndim)
+     expr = cached_precond_grad_expr(ndim_tuple(cached_q), ea.ndim)
      new = compiled_einsum(expr, *args)
      if cast:
          return new.to(ea.dtype)
@@ -2433,7 +2816,7 @@ def _compilable_fused_precond_grad_cached_(ea: Tensor, param, lr, grad, decay, c


  def fused_precond_grad_cached_(ea: Tensor, param, lr, grad, decay, caution, cached_q: List[Tensor]):
-     lr = scalar_guard(lr, param[0])
+     lr, decay = scalar_guard(lr, decay, param[0])
      _compilable_fused_precond_grad_cached_(ea, param, lr, grad, decay, caution, cached_q)


@@ -2502,7 +2885,7 @@ def fused_psgd_precond_grad(
      store_triu_as_line: bool = False,
      symmetric_output: bool = False,
  ):
-     lr = scalar_guard(lr, param[0])
+     lr, decay = scalar_guard(lr, decay, param[0])
      _compilable_fused_psgd_precond_grad(
          ea, param, lr, grad, decay, caution, preconds, store_triu_as_line, symmetric_output
      )