torchzero 0.3.15__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +2 -2
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +43 -33
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +7 -4
- torchzero/core/chain.py +20 -23
- torchzero/core/functional.py +90 -24
- torchzero/core/modular.py +48 -52
- torchzero/core/module.py +130 -50
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +55 -24
- torchzero/core/transform.py +261 -367
- torchzero/linalg/__init__.py +10 -0
- torchzero/linalg/eigh.py +34 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +95 -0
- torchzero/{utils/linalg → linalg}/qr.py +4 -2
- torchzero/{utils/linalg → linalg}/solve.py +76 -88
- torchzero/linalg/svd.py +20 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/adaptive/__init__.py +1 -1
- torchzero/modules/adaptive/adagrad.py +163 -213
- torchzero/modules/adaptive/adahessian.py +74 -103
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +49 -30
- torchzero/modules/adaptive/adaptive_heavyball.py +11 -6
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/lion.py +5 -10
- torchzero/modules/adaptive/lmadagrad.py +87 -32
- torchzero/modules/adaptive/mars.py +5 -5
- torchzero/modules/adaptive/matrix_momentum.py +47 -51
- torchzero/modules/adaptive/msam.py +70 -52
- torchzero/modules/adaptive/muon.py +59 -124
- torchzero/modules/adaptive/natural_gradient.py +33 -28
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +123 -129
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +15 -18
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +26 -37
- torchzero/modules/experimental/__init__.py +2 -6
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/higher_order_newton.py +14 -40
- torchzero/modules/experimental/newton_solver.py +22 -53
- torchzero/modules/experimental/newtonnewton.py +15 -12
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +3 -3
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/functional.py +1 -1
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +20 -17
- torchzero/modules/least_squares/gn.py +90 -42
- torchzero/modules/line_search/backtracking.py +2 -2
- torchzero/modules/line_search/line_search.py +32 -32
- torchzero/modules/line_search/strong_wolfe.py +2 -2
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +10 -78
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +120 -122
- torchzero/modules/misc/multistep.py +50 -48
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +30 -28
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +34 -28
- torchzero/modules/momentum/momentum.py +11 -11
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +19 -19
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +43 -43
- torchzero/modules/quasi_newton/damping.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +7 -7
- torchzero/modules/quasi_newton/lsr1.py +7 -7
- torchzero/modules/quasi_newton/quasi_newton.py +10 -10
- torchzero/modules/quasi_newton/sg2.py +19 -19
- torchzero/modules/restarts/restars.py +26 -24
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/ifn.py +31 -62
- torchzero/modules/second_order/inm.py +49 -53
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +57 -90
- torchzero/modules/second_order/newton_cg.py +102 -154
- torchzero/modules/second_order/nystrom.py +157 -177
- torchzero/modules/second_order/rsn.py +106 -96
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +11 -10
- torchzero/modules/step_size/adaptive.py +23 -23
- torchzero/modules/step_size/lr.py +15 -15
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +2 -2
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +1 -1
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +21 -18
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +12 -13
- torchzero/modules/wrappers/optim_wrapper.py +10 -10
- torchzero/modules/zeroth_order/cd.py +9 -6
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -4
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +6 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +93 -69
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.0.dist-info}/METADATA +1 -1
- torchzero-0.4.0.dist-info/RECORD +191 -0
- tests/test_vars.py +0 -185
- torchzero/core/var.py +0 -376
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.15.dist-info/RECORD +0 -175
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.0.dist-info}/WHEEL +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.0.dist-info}/top_level.txt +0 -0
torchzero/linalg/__init__.py
ADDED
@@ -0,0 +1,10 @@
+from . import linear_operator
+
+from .matrix_power import (
+    matrix_power_eigh,
+    matrix_power_svd,
+)
+from .orthogonalize import zeropower_via_eigh, zeropower_via_newtonschulz5, zeropower_via_svd, orthogonalize
+from .qr import qr_householder
+from .solve import cg, nystrom_sketch_and_solve, nystrom_pcg
+from .eigh import nystrom_approximation
torchzero/linalg/eigh.py
ADDED
@@ -0,0 +1,34 @@
+from collections.abc import Callable
+import torch
+from .linalg_utils import mm
+
+
+
+# https://arxiv.org/pdf/2110.02820
+def nystrom_approximation(
+    A_mv: Callable[[torch.Tensor], torch.Tensor] | None,
+    A_mm: Callable[[torch.Tensor], torch.Tensor] | None,
+    ndim: int,
+    rank: int,
+    device,
+    dtype = torch.float32,
+    generator = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Computes a Nyström approximation to positive semi-definite A, factored as Q L Q^T (truncated eigenvalue decomposition);
+    returns ``(L, Q)``.
+
+    If A is ``(m, m)``, then Q is ``(m, rank)``; L is a ``(rank, )`` vector - the diagonal of ``(rank, rank)``."""
+    # basis
+    O = torch.randn((ndim, rank), device=device, dtype=dtype, generator=generator) # Gaussian test matrix
+    O, _ = torch.linalg.qr(O) # Thin QR decomposition # pylint:disable=not-callable
+
+    # Y = AΩ
+    AO = mm(A_mv=A_mv, A_mm=A_mm, X=O)
+
+    v = torch.finfo(dtype).eps * torch.linalg.matrix_norm(AO, ord='fro') # Compute shift # pylint:disable=not-callable
+    Yv = AO + v*O # Shift for stability
+    C = torch.linalg.cholesky_ex(O.mT @ Yv)[0] # pylint:disable=not-callable
+    B = torch.linalg.solve_triangular(C, Yv.mT, upper=False, unitriangular=False).mT # pylint:disable=not-callable
+    Q, S, _ = torch.linalg.svd(B, full_matrices=False) # pylint:disable=not-callable
+    L = (S.pow(2) - v).clip(min=0) # Remove shift, compute eigs
+    return L, Q
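A minimal usage sketch of the new Nyström helper (not from the package docs), assuming the re-export shown in torchzero/linalg/__init__.py above; the low-rank test matrix and the check are illustrative only:

import torch
from torchzero.linalg import nystrom_approximation  # assumes the 0.4.0 layout shown above

n, true_rank = 200, 10
B = torch.randn(n, true_rank)
A = B @ B.T                                   # PSD matrix of rank 10

# pass the operator as a matrix-matrix closure; A_mv is then unused
L, Q = nystrom_approximation(A_mv=None, A_mm=lambda X: A @ X,
                             ndim=n, rank=20, device="cpu")

A_hat = Q @ torch.diag(L) @ Q.mH              # Q L Q^T reconstruction
print(torch.linalg.matrix_norm(A - A_hat) / torch.linalg.matrix_norm(A))  # small, since rank >= true rank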
torchzero/linalg/linalg_utils.py
ADDED
@@ -0,0 +1,14 @@
+from collections.abc import Callable
+import torch
+
+def mm(
+    A_mv: Callable[[torch.Tensor], torch.Tensor] | None,
+    A_mm: Callable[[torch.Tensor], torch.Tensor] | None,
+    X
+):
+    """matrix-matrix product when either mv or mm is given"""
+    if A_mm is not None: return A_mm(X)
+    assert A_mv is not None
+    return torch.stack([A_mv(col) for col in X.unbind(-1)], -1) # rank matvecs
+
+
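A small usage sketch (assuming the module path added in this diff): mm() materializes A @ X one column at a time when only a matrix-vector closure is available.

import torch
from torchzero.linalg.linalg_utils import mm  # path as shown in the diff above

A = torch.randn(6, 6)
X = torch.randn(6, 3)

out = mm(A_mv=lambda v: A @ v, A_mm=None, X=X)  # 3 matvecs
print(torch.allclose(out, A @ X))               # True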
torchzero/{utils/linalg → linalg}/linear_operator.py
@@ -1,4 +1,6 @@
-"""
+"""This is mainly used for trust regions. In some cases certain operations are relaxed, e.g. eigenvalue shift instead of
+adding diagonal when it isn't tractable, to make it work with Levenberg-Marquardt.
+"""
 import math
 from abc import ABC, abstractmethod
 from functools import partial
@@ -7,7 +9,8 @@ from typing import cast, final
 
 import torch
 
-from ..torch_tools import tofloat, tonumpy, totensor
+from ..utils.torch_tools import tofloat, tonumpy, totensor
+from .solve import nystrom_sketch_and_solve
 
 if find_spec('scipy') is not None:
     from scipy.sparse.linalg import LinearOperator as _ScipyLinearOperator
@@ -15,7 +18,6 @@ else:
     _ScipyLinearOperator = None
 
 class LinearOperator(ABC):
-    """this is used for trust region"""
     device: torch.types.Device
     dtype: torch.dtype | None
 
@@ -25,12 +27,18 @@ class LinearOperator(ABC):
     def rmatvec(self, x: torch.Tensor) -> torch.Tensor:
         raise NotImplementedError(f"{self.__class__.__name__} doesn't implement rmatvec")
 
-    def matmat(self,
-        raise NotImplementedError(f"{self.__class__.__name__} doesn't implement
+    def matmat(self, X: torch.Tensor) -> "LinearOperator":
+        raise NotImplementedError(f"{self.__class__.__name__} doesn't implement matmat")
+
+    def rmatmat(self, X: torch.Tensor) -> "LinearOperator":
+        raise NotImplementedError(f"{self.__class__.__name__} doesn't implement rmatmat")
 
     def solve(self, b: torch.Tensor) -> torch.Tensor:
         raise NotImplementedError(f"{self.__class__.__name__} doesn't implement solve")
 
+    def solve_plus_diag(self, b: torch.Tensor, diag: int | float | torch.Tensor) -> torch.Tensor:
+        return self.add_diagonal(diag).solve(b)
+
     def solve_bounded(self, b: torch.Tensor, bound:float, ord:float=2) -> torch.Tensor:
         """solve with a norm bound on x"""
         raise NotImplementedError(f"{self.__class__.__name__} doesn't implement solve_bounded")
@@ -129,8 +137,8 @@ class Dense(LinearOperator):
     def matvec(self, x): return self.A.mv(x)
     def rmatvec(self, x): return self.A.mH.mv(x)
 
-    def matmat(self,
-    def rmatmat(self,
+    def matmat(self, X): return Dense(self.A.mm(X))
+    def rmatmat(self, X): return Dense(self.A.mH.mm(X))
 
     def solve(self, b): return _solve(self.A, b)
 
@@ -146,6 +154,12 @@ class Dense(LinearOperator):
     def is_dense(self): return True
     def transpose(self): return Dense(self.A.mH)
 
+class SPD(Dense):
+    def solve(self, b: torch.Tensor):
+        L, info = torch.linalg.cholesky_ex(self.A) # pylint:disable=not-callable
+        return torch.cholesky_solve(b.unsqueeze(-1), L).squeeze(-1)
+
+
 class DenseInverse(LinearOperator):
     """Represents inverse of a dense matrix A."""
     def __init__(self, A_inv: torch.Tensor):
@@ -156,8 +170,8 @@ class DenseInverse(LinearOperator):
     def matvec(self, x): return _solve(self.A_inv, x) # pylint:disable=not-callable
     def rmatvec(self, x): return _solve(self.A_inv.mH, x) # pylint:disable=not-callable
 
-    def matmat(self,
-    def rmatmat(self,
+    def matmat(self, X): return Dense(_solve(self.A_inv, X)) # pylint:disable=not-callable
+    def rmatmat(self, X): return Dense(_solve(self.A_inv.mH, X)) # pylint:disable=not-callable
 
     def solve(self, b): return self.A_inv.mv(b)
 
@@ -190,8 +204,8 @@ class Diagonal(LinearOperator):
     def matvec(self, x): return self.A * x
     def rmatvec(self, x): return self.A * x
 
-    def matmat(self,
-    def rmatmat(self,
+    def matmat(self, X): return Dense(X * self.A.unsqueeze(-1))
+    def rmatmat(self, X): return Dense(X * self.A.unsqueeze(-1))
 
     def solve(self, b): return b/self.A
 
@@ -221,8 +235,8 @@ class ScaledIdentity(LinearOperator):
     def matvec(self, x): return x * self.s
     def rmatvec(self, x): return x * self.s
 
-    def matmat(self,
-    def rmatmat(self,
+    def matmat(self, X): return Dense(X * self.s)
+    def rmatmat(self, X): return Dense(X * self.s)
 
     def solve(self, b): return b / self.s
     def solve_bounded(self, b, bound, ord = 2):
@@ -263,6 +277,7 @@ class ScaledIdentity(LinearOperator):
     def is_dense(self): return False
     def transpose(self): return ScaledIdentity(self.s, shape=self.shape, device=self.device, dtype=self.dtype)
 
+
 class AtA(LinearOperator):
     def __init__(self, A: torch.Tensor):
         self.A = A
@@ -270,8 +285,8 @@ class AtA(LinearOperator):
     def matvec(self, x): return self.A.mH.mv(self.A.mv(x))
     def rmatvec(self, x): return self.matvec(x)
 
-    def matmat(self,
-    def rmatmat(self,
+    def matmat(self, X): return Dense(torch.linalg.multi_dot([self.A.mH, self.A, X])) # pylint:disable=not-callable
+    def rmatmat(self, X): return Dense(torch.linalg.multi_dot([self.A.mH, self.A, X])) # pylint:disable=not-callable
 
     def is_dense(self): return False
     def to_tensor(self): return self.A.mH @ self.A
@@ -283,51 +298,41 @@ class AtA(LinearOperator):
         return Dense(self.to_tensor() + torch.diag_embed(x))
 
     def solve(self, b):
-
+        *_, n, m = self.A.shape
+        if n >= m: return Dense(self.to_tensor()).solve(b)
 
-
-
+        A = self.A
+        C = A @ A.mH # (n, n), SPD
+        L, info = torch.linalg.cholesky_ex(C) # pylint:disable=not-callable
+        z = torch.cholesky_solve((A @ b).unsqueeze(-1), L).squeeze(-1)
+        return A.mH @ z
 
-    def
-
-
-
-        n = self.A.size(1)
-        return (n,n)
+    def solve_plus_diag(self, b, diag):
+        *_, n, m = self.A.shape
+        if (n >= m) or (isinstance(diag, torch.Tensor) and diag.numel() > 1):
+            return Dense(self.to_tensor()).solve_plus_diag(b, diag)
 
-
-
-        self.A = A
-        self.device = self.A.device; self.dtype = self.A.dtype
-
-    def matvec(self, x): return self.A.mv(self.A.mH.mv(x))
-    def rmatvec(self, x): return self.matvec(x)
-
-    def matmat(self, x): return Dense(torch.linalg.multi_dot([self.A, self.A.mH, x])) # pylint:disable=not-callable
-    def rmatmat(self, x): return Dense(torch.linalg.multi_dot([self.A, self.A.mH, x])) # pylint:disable=not-callable
-
-    def is_dense(self): return False
-    def to_tensor(self): return self.A @ self.A.mH
-    def transpose(self): return AAT(self.A)
-
-    def add_diagonal(self, x):
-        if isinstance(x, torch.Tensor) and x.numel() <= 1: x = x.item()
-        if isinstance(x, (int,float)): x = torch.full((self.shape[0],), fill_value=x, device=self.A.device, dtype=self.A.dtype)
-        return Dense(self.to_tensor() + torch.diag_embed(x))
+        A = self.A
+        I = torch.eye(A.size(-2), device=A.device, dtype=A.dtype)
 
-
-
+        C = (A @ A.mH).add_(I.mul_(diag)) # (n, n), SPD
+        L, info = torch.linalg.cholesky_ex(C + I.mul_(diag)) # pylint:disable=not-callable
+        z = torch.cholesky_solve((A @ b).unsqueeze(-1), L).squeeze(-1)
+        return (1 / diag) * (b - A.mH @ z)
 
     def inv(self):
         return Dense(self.to_tensor()).inv()
 
     def diagonal(self):
-        return self.A.pow(2).sum(
+        return self.A.pow(2).sum(1)
 
     def size(self):
         n = self.A.size(1)
         return (n,n)
 
+class AAt(AtA):
+    def __init__(self, A: torch.Tensor):
+        super().__init__(A.mH)
 
 class Sketched(LinearOperator):
     """A projected by sketching matrix S, representing the operator S @ A_proj @ S.T.
@@ -339,7 +344,6 @@ class Sketched(LinearOperator):
         self.A_proj = A_proj
         self.device = self.A_proj.device; self.dtype = self.A_proj.dtype
 
-
     def matvec(self, x):
         x_proj = self.S.T @ x
         Ax_proj = self.A_proj @ x_proj
@@ -351,8 +355,8 @@ class Sketched(LinearOperator):
         return self.S @ ATx_proj
 
 
-    def matmat(self,
-    def rmatmat(self,
+    def matmat(self, X): return Dense(torch.linalg.multi_dot([self.S, self.A_proj, self.S.T, X])) # pylint:disable=not-callable
+    def rmatmat(self, X): return Dense(torch.linalg.multi_dot([self.S, self.A_proj.mH, self.S.T, X])) # pylint:disable=not-callable
 
 
     def is_dense(self): return False
@@ -375,3 +379,49 @@ class Sketched(LinearOperator):
         n = self.S.size(0)
         return (n,n)
 
+
+class Eigendecomposition(LinearOperator):
+    """A represented as Q L Q^H. If A is (n,n), then Q is (n, rank); L is a vector - the diagonal of (rank, rank)"""
+    def __init__(self, L: torch.Tensor, Q: torch.Tensor, use_nystrom: bool = True):
+        self.L = L
+        self.Q = Q
+        self.use_nystrom = use_nystrom
+        self.device = self.L.device; self.dtype = self.L.dtype
+
+    def matvec(self, x):
+        return self.Q @ ((self.Q.mH @ x) * self.L)
+
+    def rmatvec(self, x):
+        return self.matvec(x)
+
+    def matmat(self, X):
+        return Dense(self.Q @ (self.L[:, None] * (self.Q.mH @ X)))
+
+    def rmatmat(self, X):
+        return self.matmat(X)
+
+    def is_dense(self): return False
+    def to_tensor(self): return self.Q @ self.L.diag_embed() @ self.Q.mH
+    def transpose(self): return Eigendecomposition(L=self.L, Q=self.Q)
+
+    def add_diagonal(self, x):
+        """this doesn't correspond to adding diagonal to A, however it still works for LM etc."""
+        if isinstance(x, torch.Tensor) and x.numel() > 1:
+            raise RuntimeError("Eigendecomposition linear operator doesn't support add_diagonal with a vector diag")
+
+        return Eigendecomposition(L=self.L + x, Q = self.Q)
+
+    def solve(self, b):
+        return self.Q @ ((self.Q.mH @ b) / self.L)
+
+    def solve_plus_diag(self, b, diag):
+        if isinstance(diag, torch.Tensor) and diag.numel() > 1: return super().solve_plus_diag(b, diag)
+        if not self.use_nystrom: return super().solve_plus_diag(b, diag)
+        return nystrom_sketch_and_solve(L=self.L, Q=self.Q, b=b, reg=float(diag))
+
+    def inv(self):
+        return Eigendecomposition(L=1 / self.L, Q = self.Q)
+
+    def size(self):
+        n = self.Q.size(0)
+        return (n,n)
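The rewritten AtA.solve_plus_diag appears to rely on the Woodbury identity (A^T A + λI)^{-1} b = (1/λ)(b - A^T (A A^T + λI)^{-1} A b) when A is wide (n < m), so only an (n, n) system has to be factorized. A standalone check of that identity in plain torch, independent of the package classes:

import torch

torch.manual_seed(0)
n, m, lam = 5, 12, 0.3
A = torch.randn(n, m, dtype=torch.float64)
b = torch.randn(m, dtype=torch.float64)

# direct solve in the large (m, m) space
x_direct = torch.linalg.solve(A.T @ A + lam * torch.eye(m, dtype=torch.float64), b)

# Woodbury route: only an (n, n) system is factorized
C = A @ A.T + lam * torch.eye(n, dtype=torch.float64)
z = torch.linalg.solve(C, A @ b)
x_woodbury = (b - A.T @ z) / lam

print(torch.allclose(x_direct, x_woodbury))  # True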
torchzero/linalg/matrix_power.py
ADDED
@@ -0,0 +1,28 @@
+from typing import Literal
+import warnings
+from collections.abc import Callable
+
+import torch
+from . import torch_linalg
+def matrix_power_eigh(A: torch.Tensor, power:float, abs:bool=False):
+    """this is faster than SVD but only for positive semi-definite symmetric matrices
+    (covariance matrices are always SPD)"""
+
+    L, Q = torch_linalg.eigh(A, retry_float64=True) # pylint:disable=not-callable
+    if abs: L.abs_()
+    if power % 2 != 0: L.clip_(min = torch.finfo(A.dtype).tiny * 2)
+    return (Q * L.pow_(power).unsqueeze(-2)) @ Q.mH
+
+
+def matrix_power_svd(A: torch.Tensor, power: float) -> torch.Tensor:
+    """for any symmetric matrix"""
+    U, S, Vh = torch_linalg.svd(A, full_matrices=False, retry_float64=True) # pylint:disable=not-callable
+    if power % 2 != 0: S.clip_(min = torch.finfo(A.dtype).tiny * 2)
+    return (U * S.pow_(power).unsqueeze(-2)) @ Vh
+
+MatrixPowerMethod = Literal["eigh", "eigh_abs", "svd"]
+def matrix_power(A: torch.Tensor, power: float, method: MatrixPowerMethod = "eigh_abs") -> torch.Tensor:
+    if method == "eigh": return matrix_power_eigh(A, power)
+    if method == "eigh_abs": return matrix_power_eigh(A, power, abs=True)
+    if method == "svd": return matrix_power_svd(A, power)
+    raise ValueError(method)
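A plain-torch sketch of the eigendecomposition-based fractional power these helpers implement (a re-implementation for illustration only, not the package function, which additionally handles the abs/clipping details above):

import torch

torch.manual_seed(0)
B = torch.randn(8, 8, dtype=torch.float64)
A = B @ B.T + 1e-3 * torch.eye(8, dtype=torch.float64)  # SPD test matrix

def eigh_matrix_power(A, power):
    # same idea as matrix_power_eigh above, without the clipping logic
    L, Q = torch.linalg.eigh(A)
    return (Q * L.pow(power).unsqueeze(-2)) @ Q.mH        # Q diag(L^power) Q^H

A_inv_sqrt = eigh_matrix_power(A, -0.5)
print(torch.allclose(A_inv_sqrt @ A_inv_sqrt, torch.linalg.inv(A)))  # True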
torchzero/linalg/orthogonalize.py
ADDED
@@ -0,0 +1,95 @@
+from typing import Literal
+import torch
+
+from ..utils.compile import allow_compile
+from . import torch_linalg
+
+# zeropower_via_newtonschulz5 from:
+# https://github.com/KellerJordan/Muon/blob/master/muon.py
+# and
+# https://github.com/HomebrewML/HeavyBall/blob/main/heavyball/utils.py#L452
+_NS_COEFFS = (
+    (4.0848, -6.8946, 2.9270),
+    (3.9505, -6.3029, 2.6377),
+    (3.7418, -5.5913, 2.3037),
+    (2.8769, -3.1427, 1.2046),
+    (2.8366, -3.0525, 1.2012)
+)
+
+@allow_compile
+def zeropower_via_newtonschulz5(G: torch.Tensor, coeffs=_NS_COEFFS) -> torch.Tensor:
+    """
+    Applies to the last 2 dims - so usually reverse_dims should be applied to G before and after.
+
+    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
+    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
+    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
+    zero even beyond the point where the iteration no longer converges all the way to one everywhere
+    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
+    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
+    performance at all relative to UV^T, where USV^T = G is the SVD.
+    """
+    assert G.ndim >= 2 # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng
+
+    X = G.bfloat16()
+    if G.size(-2) > G.size(-1):
+        X = X.mT
+
+    # Ensure spectral norm is at most 1
+    X = X / (X.norm(dim=(-2, -1), keepdim=True).clip(min=torch.finfo(X.dtype).tiny * 2))
+
+    # Perform the NS iterations
+    for a,b,c in coeffs:
+        A = X @ X.mT
+        B = b * A + c * A @ A # quintic computation strategy adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
+        X = a * X + B @ X
+
+    if G.size(-2) > G.size(-1):
+        X = X.mT
+
+    return X.to(G.dtype)
+
+# code from https://github.com/MarkTuddenham/Orthogonal-Optimisers.
+# Tuddenham, M., Prügel-Bennett, A., & Hare, J. (2022).
+# Orthogonalising gradients to speed up neural network optimisation. arXiv preprint arXiv:2202.07052.
+def zeropower_via_svd(A: torch.Tensor) -> torch.Tensor:
+    """
+    Applies to the first 2 dims and isn't batched - the rest of the dimensions are flattened.
+    """
+    try:
+        U, S, Vt = torch_linalg.svd(A, full_matrices=False, retry_float64=True) # pylint:disable=not-callable
+    except torch.linalg.LinAlgError:
+        U, S, Vt = torch.svd_lowrank(A, q=1, M=1e-4 * A.mean() * torch.rand_like(A))
+
+    return U @ Vt
+
+def zeropower_via_eigh(A: torch.Tensor) -> torch.Tensor:
+    """
+    Only for SPD matrices - need to check whether the call sites actually pass SPD inputs, because then this is better than SVD.
+    """
+    L, Q = torch_linalg.eigh(A, retry_float64=True)
+    return Q @ Q.mH
+
+
+def orthogonalize_via_qr(A: torch.Tensor):
+    *_, m, n = A.shape
+    T = False
+    if m < n:
+        T = True
+        m,n = n,m
+        A = A.mH
+
+    Q = torch_linalg.qr(A, mode='reduced', retry_float64=True).Q
+
+    if T:
+        Q = Q.mH
+
+    return Q
+
+OrthogonalizeMethod = Literal["newtonschulz", "svd", "qr"]
+def orthogonalize(A: torch.Tensor, method: OrthogonalizeMethod = "newtonschulz") -> torch.Tensor:
+    if method == "newtonschulz": return zeropower_via_newtonschulz5(A)
+    if method == "svd": return zeropower_via_svd(A)
+    if method == "qr": return orthogonalize_via_qr(A)
+    if method == "eigh": return zeropower_via_eigh(A)
+    raise ValueError(method)
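A usage sketch (assuming the re-exports in torchzero/linalg/__init__.py above). Per the docstring, Newton-Schulz gives an approximate orthogonalization whose singular values land roughly in (0.5, 1.5), while the SVD path returns exactly U @ V^T:

import torch
from torchzero.linalg import orthogonalize  # assumes the re-export shown above

G = torch.randn(256, 128)
O_ns = orthogonalize(G, method="newtonschulz")
O_svd = orthogonalize(G, method="svd")

print(torch.linalg.svdvals(O_ns).aminmax())   # roughly within (0.5, 1.5)
print(torch.linalg.svdvals(O_svd).aminmax())  # both approximately 1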
torchzero/{utils/linalg → linalg}/qr.py
@@ -1,8 +1,9 @@
 from typing import Literal
 import torch
-from ..compile import
+from ..utils.compile import allow_compile
 
 # reference - https://www.cs.cornell.edu/~bindel/class/cs6210-f09/lec18.pdf
+@allow_compile
 def _get_w_tau(R: torch.Tensor, i: int, eps: float):
     R_ii = R[...,i,i]
     R_below = R[...,i:,i]
@@ -17,6 +18,7 @@ def _get_w_tau(R: torch.Tensor, i: int, eps: float):
     tau = torch.where(degenerate, 1, tau)
     return w, tau
 
+@allow_compile
 def _qr_householder_complete(A:torch.Tensor):
     *b,m,n = A.shape
     k = min(m,n)
@@ -33,6 +35,7 @@ def _qr_householder_complete(A:torch.Tensor):
 
     return Q, R
 
+@allow_compile
 def _qr_householder_reduced(A:torch.Tensor):
     *b,m,n = A.shape
     k = min(m,n)
@@ -64,7 +67,6 @@ def _qr_householder_reduced(A:torch.Tensor):
 
     return Q, R
 
-# @enable_compilation
 def qr_householder(A:torch.Tensor, mode: Literal['complete', 'reduced'] = 'reduced'):
     """an attempt at making QR decomposition for very tall and thin matrices that doesn't freeze, but it is around n_cols times slower than torch.linalg.qr, but compilation makes it faster, but it has to recompile when processing different shapes"""
     if mode == 'reduced': return _qr_householder_reduced(A)