torchzero 0.3.15__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. tests/test_identical.py +22 -22
  2. tests/test_module_autograd.py +586 -0
  3. tests/test_objective.py +188 -0
  4. tests/test_opts.py +225 -214
  5. tests/test_tensorlist.py +0 -8
  6. tests/test_utils_optimizer.py +0 -1
  7. torchzero/__init__.py +2 -2
  8. torchzero/core/__init__.py +7 -4
  9. torchzero/core/chain.py +20 -23
  10. torchzero/core/functional.py +90 -24
  11. torchzero/core/modular.py +53 -57
  12. torchzero/core/module.py +132 -52
  13. torchzero/core/objective.py +948 -0
  14. torchzero/core/reformulation.py +55 -24
  15. torchzero/core/transform.py +261 -367
  16. torchzero/linalg/__init__.py +11 -0
  17. torchzero/linalg/eigh.py +253 -0
  18. torchzero/linalg/linalg_utils.py +14 -0
  19. torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
  20. torchzero/linalg/matrix_power.py +28 -0
  21. torchzero/linalg/orthogonalize.py +93 -0
  22. torchzero/{utils/linalg → linalg}/qr.py +16 -2
  23. torchzero/{utils/linalg → linalg}/solve.py +74 -88
  24. torchzero/linalg/svd.py +47 -0
  25. torchzero/linalg/torch_linalg.py +168 -0
  26. torchzero/modules/__init__.py +4 -3
  27. torchzero/modules/adaptive/__init__.py +11 -3
  28. torchzero/modules/adaptive/adagrad.py +167 -217
  29. torchzero/modules/adaptive/adahessian.py +76 -105
  30. torchzero/modules/adaptive/adam.py +53 -76
  31. torchzero/modules/adaptive/adan.py +50 -31
  32. torchzero/modules/adaptive/adaptive_heavyball.py +12 -7
  33. torchzero/modules/adaptive/aegd.py +12 -12
  34. torchzero/modules/adaptive/esgd.py +98 -119
  35. torchzero/modules/adaptive/ggt.py +186 -0
  36. torchzero/modules/adaptive/lion.py +7 -11
  37. torchzero/modules/adaptive/lre_optimizers.py +299 -0
  38. torchzero/modules/adaptive/mars.py +7 -7
  39. torchzero/modules/adaptive/matrix_momentum.py +48 -52
  40. torchzero/modules/adaptive/msam.py +71 -53
  41. torchzero/modules/adaptive/muon.py +67 -129
  42. torchzero/modules/adaptive/natural_gradient.py +63 -41
  43. torchzero/modules/adaptive/orthograd.py +11 -15
  44. torchzero/modules/adaptive/psgd/__init__.py +5 -0
  45. torchzero/modules/adaptive/psgd/_psgd_utils.py +37 -0
  46. torchzero/modules/adaptive/psgd/psgd.py +1390 -0
  47. torchzero/modules/adaptive/psgd/psgd_dense_newton.py +174 -0
  48. torchzero/modules/adaptive/psgd/psgd_kron_newton.py +203 -0
  49. torchzero/modules/adaptive/psgd/psgd_kron_whiten.py +185 -0
  50. torchzero/modules/adaptive/psgd/psgd_lra_newton.py +118 -0
  51. torchzero/modules/adaptive/psgd/psgd_lra_whiten.py +116 -0
  52. torchzero/modules/adaptive/rmsprop.py +83 -75
  53. torchzero/modules/adaptive/rprop.py +48 -47
  54. torchzero/modules/adaptive/sam.py +55 -45
  55. torchzero/modules/adaptive/shampoo.py +149 -130
  56. torchzero/modules/adaptive/soap.py +207 -143
  57. torchzero/modules/adaptive/sophia_h.py +106 -130
  58. torchzero/modules/clipping/clipping.py +22 -25
  59. torchzero/modules/clipping/ema_clipping.py +31 -25
  60. torchzero/modules/clipping/growth_clipping.py +14 -17
  61. torchzero/modules/conjugate_gradient/cg.py +27 -38
  62. torchzero/modules/experimental/__init__.py +7 -6
  63. torchzero/modules/experimental/adanystrom.py +258 -0
  64. torchzero/modules/experimental/common_directions_whiten.py +142 -0
  65. torchzero/modules/experimental/coordinate_momentum.py +36 -0
  66. torchzero/modules/experimental/cubic_adam.py +160 -0
  67. torchzero/modules/experimental/curveball.py +25 -41
  68. torchzero/modules/experimental/eigen_sr1.py +182 -0
  69. torchzero/modules/experimental/eigengrad.py +207 -0
  70. torchzero/modules/experimental/gradmin.py +2 -2
  71. torchzero/modules/experimental/higher_order_newton.py +14 -40
  72. torchzero/modules/experimental/l_infinity.py +1 -1
  73. torchzero/modules/experimental/matrix_nag.py +122 -0
  74. torchzero/modules/experimental/newton_solver.py +23 -54
  75. torchzero/modules/experimental/newtonnewton.py +45 -48
  76. torchzero/modules/experimental/reduce_outward_lr.py +7 -7
  77. torchzero/modules/experimental/scipy_newton_cg.py +21 -24
  78. torchzero/modules/experimental/spsa1.py +3 -3
  79. torchzero/modules/experimental/structural_projections.py +1 -4
  80. torchzero/modules/grad_approximation/fdm.py +2 -2
  81. torchzero/modules/grad_approximation/forward_gradient.py +7 -7
  82. torchzero/modules/grad_approximation/grad_approximator.py +23 -16
  83. torchzero/modules/grad_approximation/rfdm.py +24 -21
  84. torchzero/modules/least_squares/gn.py +121 -50
  85. torchzero/modules/line_search/backtracking.py +4 -4
  86. torchzero/modules/line_search/line_search.py +33 -33
  87. torchzero/modules/line_search/strong_wolfe.py +4 -4
  88. torchzero/modules/misc/debug.py +12 -12
  89. torchzero/modules/misc/escape.py +10 -10
  90. torchzero/modules/misc/gradient_accumulation.py +11 -79
  91. torchzero/modules/misc/homotopy.py +16 -8
  92. torchzero/modules/misc/misc.py +121 -123
  93. torchzero/modules/misc/multistep.py +52 -53
  94. torchzero/modules/misc/regularization.py +49 -44
  95. torchzero/modules/misc/split.py +31 -29
  96. torchzero/modules/misc/switch.py +37 -32
  97. torchzero/modules/momentum/averaging.py +14 -14
  98. torchzero/modules/momentum/cautious.py +37 -31
  99. torchzero/modules/momentum/momentum.py +12 -12
  100. torchzero/modules/ops/__init__.py +4 -4
  101. torchzero/modules/ops/accumulate.py +21 -21
  102. torchzero/modules/ops/binary.py +67 -66
  103. torchzero/modules/ops/higher_level.py +20 -20
  104. torchzero/modules/ops/multi.py +44 -41
  105. torchzero/modules/ops/reduce.py +26 -23
  106. torchzero/modules/ops/unary.py +53 -53
  107. torchzero/modules/ops/utility.py +47 -46
  108. torchzero/modules/{functional.py → opt_utils.py} +1 -1
  109. torchzero/modules/projections/galore.py +1 -1
  110. torchzero/modules/projections/projection.py +46 -43
  111. torchzero/modules/quasi_newton/__init__.py +1 -1
  112. torchzero/modules/quasi_newton/damping.py +2 -2
  113. torchzero/modules/quasi_newton/diagonal_quasi_newton.py +1 -1
  114. torchzero/modules/quasi_newton/lbfgs.py +10 -10
  115. torchzero/modules/quasi_newton/lsr1.py +10 -10
  116. torchzero/modules/quasi_newton/quasi_newton.py +54 -39
  117. torchzero/modules/quasi_newton/sg2.py +69 -205
  118. torchzero/modules/restarts/restars.py +39 -37
  119. torchzero/modules/second_order/__init__.py +2 -2
  120. torchzero/modules/second_order/ifn.py +31 -62
  121. torchzero/modules/second_order/inm.py +57 -53
  122. torchzero/modules/second_order/multipoint.py +40 -80
  123. torchzero/modules/second_order/newton.py +165 -196
  124. torchzero/modules/second_order/newton_cg.py +105 -157
  125. torchzero/modules/second_order/nystrom.py +216 -185
  126. torchzero/modules/second_order/rsn.py +132 -125
  127. torchzero/modules/smoothing/laplacian.py +13 -12
  128. torchzero/modules/smoothing/sampling.py +10 -10
  129. torchzero/modules/step_size/adaptive.py +24 -24
  130. torchzero/modules/step_size/lr.py +17 -17
  131. torchzero/modules/termination/termination.py +32 -30
  132. torchzero/modules/trust_region/cubic_regularization.py +3 -3
  133. torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
  134. torchzero/modules/trust_region/trust_cg.py +2 -2
  135. torchzero/modules/trust_region/trust_region.py +27 -22
  136. torchzero/modules/variance_reduction/svrg.py +23 -21
  137. torchzero/modules/weight_decay/__init__.py +2 -1
  138. torchzero/modules/weight_decay/reinit.py +83 -0
  139. torchzero/modules/weight_decay/weight_decay.py +17 -18
  140. torchzero/modules/wrappers/optim_wrapper.py +14 -14
  141. torchzero/modules/zeroth_order/cd.py +10 -7
  142. torchzero/optim/mbs.py +291 -0
  143. torchzero/optim/root.py +3 -3
  144. torchzero/optim/utility/split.py +2 -1
  145. torchzero/optim/wrappers/directsearch.py +27 -63
  146. torchzero/optim/wrappers/fcmaes.py +14 -35
  147. torchzero/optim/wrappers/mads.py +11 -31
  148. torchzero/optim/wrappers/moors.py +66 -0
  149. torchzero/optim/wrappers/nevergrad.py +4 -13
  150. torchzero/optim/wrappers/nlopt.py +31 -25
  151. torchzero/optim/wrappers/optuna.py +8 -13
  152. torchzero/optim/wrappers/pybobyqa.py +124 -0
  153. torchzero/optim/wrappers/scipy/__init__.py +7 -0
  154. torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
  155. torchzero/optim/wrappers/scipy/brute.py +48 -0
  156. torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
  157. torchzero/optim/wrappers/scipy/direct.py +69 -0
  158. torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
  159. torchzero/optim/wrappers/scipy/experimental.py +141 -0
  160. torchzero/optim/wrappers/scipy/minimize.py +151 -0
  161. torchzero/optim/wrappers/scipy/sgho.py +111 -0
  162. torchzero/optim/wrappers/wrapper.py +121 -0
  163. torchzero/utils/__init__.py +7 -25
  164. torchzero/utils/benchmarks/__init__.py +0 -0
  165. torchzero/utils/benchmarks/logistic.py +122 -0
  166. torchzero/utils/compile.py +2 -2
  167. torchzero/utils/derivatives.py +97 -73
  168. torchzero/utils/optimizer.py +4 -77
  169. torchzero/utils/python_tools.py +31 -0
  170. torchzero/utils/tensorlist.py +11 -5
  171. torchzero/utils/thoad_tools.py +68 -0
  172. {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/METADATA +1 -1
  173. torchzero-0.4.1.dist-info/RECORD +209 -0
  174. tests/test_vars.py +0 -185
  175. torchzero/core/var.py +0 -376
  176. torchzero/modules/adaptive/lmadagrad.py +0 -186
  177. torchzero/modules/experimental/momentum.py +0 -160
  178. torchzero/optim/wrappers/scipy.py +0 -572
  179. torchzero/utils/linalg/__init__.py +0 -12
  180. torchzero/utils/linalg/matrix_funcs.py +0 -87
  181. torchzero/utils/linalg/orthogonalize.py +0 -12
  182. torchzero/utils/linalg/svd.py +0 -20
  183. torchzero/utils/ops.py +0 -10
  184. torchzero-0.3.15.dist-info/RECORD +0 -175
  185. /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
  186. {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/WHEEL +0 -0
  187. {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/top_level.txt +0 -0
torchzero/linalg/__init__.py
@@ -0,0 +1,11 @@
+ from . import linear_operator
+
+ from .matrix_power import (
+     matrix_power_eigh,
+     matrix_power_svd,
+     MatrixPowerMethod,
+ )
+ from .orthogonalize import zeropower_via_eigh, zeropower_via_newtonschulz5, zeropower_via_svd, orthogonalize, OrthogonalizeMethod
+ from .qr import qr_householder
+ from .solve import cg, nystrom_sketch_and_solve, nystrom_pcg
+ from .eigh import nystrom_approximation, regularize_eigh
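The new torchzero.linalg package consolidates the helpers that previously lived under torchzero/utils/linalg (removed further down in this diff). A minimal usage sketch, assuming the 0.4.1 wheel is installed; it only uses names from the re-export list above, but the call pattern itself is illustrative rather than taken from the package documentation:

    import torch
    from torchzero.linalg import nystrom_approximation, regularize_eigh

    # rank-10 Nyström approximation of a PSD matrix, given only a matmat callable
    A = torch.randn(100, 100)
    A = A @ A.T
    L, Q = nystrom_approximation(A_mv=None, A_mm=lambda X: A @ X,
                                 ndim=100, rank=10, device="cpu")

    # drop relatively tiny eigenvalues and add a small damping term before reuse
    L, Q = regularize_eigh(L, Q, tol=1e-6, damping=1e-8)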
torchzero/linalg/eigh.py
@@ -0,0 +1,253 @@
+ from collections.abc import Callable
+
+ import torch
+
+ from . import torch_linalg
+ from .linalg_utils import mm
+ from .orthogonalize import OrthogonalizeMethod, orthogonalize
+ from .svd import tall_reduced_svd_via_eigh
+
+
+ # https://arxiv.org/pdf/2110.02820
+ def nystrom_approximation(
+     A_mv: Callable[[torch.Tensor], torch.Tensor] | None,
+     A_mm: Callable[[torch.Tensor], torch.Tensor] | None,
+     ndim: int,
+     rank: int,
+     device,
+     orthogonalize_method: OrthogonalizeMethod = 'qr',
+     eigv_tol: float = 0,
+     dtype = torch.float32,
+     generator = None,
+ ) -> tuple[torch.Tensor, torch.Tensor]:
+     """Computes the Nyström approximation to a positive-semidefinite A, factored as Q L Q^T (a truncated eigendecomposition),
+     and returns ``(L, Q)``.
+
+     If A is ``(m, m)``, then Q is ``(m, rank)`` and L is a ``(rank,)`` vector - the diagonal of the ``(rank, rank)`` factor."""
+     # basis
+     O = torch.randn((ndim, rank), device=device, dtype=dtype, generator=generator) # Gaussian test matrix
+     O = orthogonalize(O, method=orthogonalize_method) # Thin QR decomposition # pylint:disable=not-callable
+
+     # Y = AΩ
+     AO = mm(A_mv=A_mv, A_mm=A_mm, X=O)
+
+     v = torch.finfo(dtype).eps * torch.linalg.matrix_norm(AO, ord='fro') # Compute shift # pylint:disable=not-callable
+     Yv = AO + v*O # Shift for stability
+     C = torch.linalg.cholesky_ex(O.mT @ Yv)[0] # pylint:disable=not-callable
+     B = torch.linalg.solve_triangular(C, Yv.mT, upper=False, unitriangular=False).mT # pylint:disable=not-callable
+
+     # Q, S, _ = torch_linalg.svd(B, full_matrices=False) # pylint:disable=not-callable
+     # B is (ndim, rank) so we can use eigendecomp of (rank, rank)
+     Q, S = tall_reduced_svd_via_eigh(B, tol=eigv_tol, retry_float64=True)
+
+     L = S.pow(2) - v
+     return L, Q
+
+
+ def regularize_eigh(
+     L: torch.Tensor,
+     Q: torch.Tensor,
+     truncate: int | None = None,
+     tol: float | None = None,
+     damping: float = 0,
+     rdamping: float = 0,
+ ) -> tuple[torch.Tensor, torch.Tensor] | tuple[None, None]:
+     """Applies regularization to an eigendecomposition. Returns ``(L, Q)``.
+
+     Args:
+         L (torch.Tensor): eigenvalues, shape ``(rank,)``.
+         Q (torch.Tensor): eigenvectors, shape ``(n, rank)``.
+         truncate (int | None, optional):
+             keeps the top ``truncate`` eigenvalues. Defaults to None.
+         tol (float | None, optional):
+             all eigenvalues smaller than the largest eigenvalue times ``tol`` are removed. Defaults to None.
+         damping (float | None, optional): scalar added to the eigenvalues. Defaults to 0.
+         rdamping (float | None, optional): scalar multiplied by the largest eigenvalue and added to the eigenvalues. Defaults to 0.
+     """
+     # remove non-finite eigenvalues
+     finite = L.isfinite()
+     if finite.any():
+         L = L[finite]
+         Q = Q[:, finite]
+     else:
+         return None, None
+
+     # largest finite eigenvalue
+     L_max = L[-1] # L is sorted in ascending order
+
+     # remove small eigenvalues relative to largest
+     if tol is not None:
+         indices = L > tol * L_max
+         L = L[indices]
+         Q = Q[:, indices]
+
+     # truncate to rank (L is ordered in ascending order)
+     if truncate is not None:
+         L = L[-truncate:]
+         Q = Q[:, -truncate:]
+
+     # damping
+     d = damping + rdamping * L_max
+     if d != 0:
+         L += d
+
+     return L, Q
+
+ def eigh_plus_uuT(
+     L: torch.Tensor,
+     Q: torch.Tensor,
+     u: torch.Tensor,
+     alpha: float = 1,
+     tol: float | None = None,
+     retry_float64: bool = False,
+ ) -> tuple[torch.Tensor, torch.Tensor]:
+     """
+     Computes the eigendecomposition of Q L Q^T + alpha * (u u^T), where Q is ``(m, rank)``, L is ``(rank,)`` and u is ``(m,)``.
+     """
+     if tol is None: tol = torch.finfo(Q.dtype).eps
+     z = Q.T @ u # (rank,)
+
+     # component of u orthogonal to the column space of Q
+     res = u - Q @ z # (m,)
+     beta = torch.linalg.vector_norm(res) # pylint:disable=not-callable
+
+     if beta < tol:
+         # u is already in the column space of Q
+         B = L.diag_embed().add_(z.outer(z), alpha=alpha) # (rank, rank)
+         L_prime, S = torch_linalg.eigh(B, retry_float64=retry_float64)
+         Q_prime = Q @ S
+         return L_prime, Q_prime
+
+     # normalize the orthogonal component to get a new orthonormal vector
+     v = res / beta # (m, )
+
+     # project and compute new eigendecomposition
+     D_diag = torch.cat([L, torch.tensor([0.0], device=Q.device, dtype=Q.dtype)])
+     w = torch.cat([z, beta.unsqueeze(0)]) # Shape: (rank+1,)
+     B = D_diag.diag_embed().add_(w.outer(w), alpha=alpha)
+
+     L_prime, S = torch_linalg.eigh(B, retry_float64=retry_float64)
+
+     # unproject and sort
+     basis = torch.cat([Q, v.unsqueeze(-1)], dim=1) # (m, rank+1)
+     Q_prime = basis @ S # (m, rank+1)
+
+     idx = torch.argsort(L_prime)
+     L_prime = L_prime[idx]
+     Q_prime = Q_prime[:, idx]
+
+     return L_prime, Q_prime
+
+ def eigh_plus_UUT(
+     L: torch.Tensor,
+     Q: torch.Tensor,
+     U: torch.Tensor,
+     alpha: float = 1,
+     tol = None,
+     retry_float64: bool = False,
+ ):
+     """
+     Computes the eigendecomposition of Q L Q^T + alpha * (U U^T), where Q is ``(m, rank)``, L is ``(rank,)``,
+     and U is ``(m, k)`` where k is the rank of the correction.
+     """
+     if U.size(1) == 1:
+         return eigh_plus_uuT(L, Q, U[:,0], alpha=alpha, tol=tol, retry_float64=retry_float64)
+
+     if tol is None: tol = torch.finfo(Q.dtype).eps
+     m, r = Q.shape
+
+     Z = Q.T @ U # (r, k)
+     U_res = U - Q @ Z # (m, k)
+
+     # find cols of U not in col space of Q
+     res_norms = torch.linalg.vector_norm(U_res, dim=0) # pylint:disable=not-callable
+     new_indices = torch.where(res_norms > tol)[0]
+     k_prime = len(new_indices)
+
+     if k_prime == 0:
+         # all cols are in Q
+         B = Q
+         C = Z # (r x k)
+         r_new = r
+     else:
+         # orthonormalize directions that aren't in Q
+         U_new = U_res[:, new_indices]
+         Q_u, _ = torch_linalg.qr(U_new, mode='reduced', retry_float64=retry_float64)
+         B = torch.hstack([Q, Q_u])
+         C = torch.vstack([Z, Q_u.T @ U])
+         r_new = r + k_prime
+
+
+     # project and compute new eigendecomposition
+     A_proj = torch.zeros((r_new, r_new), device=Q.device, dtype=Q.dtype)
+     A_proj[:r, :r] = L.diag_embed()
+     A_proj.addmm_(C, C.T, alpha=alpha)
+
+     L_prime, S = torch_linalg.eigh(A_proj, retry_float64=retry_float64)
+
+     # unproject and sort
+     Q_prime = B @ S
+     idx = torch.argsort(L_prime)
+     L_prime = L_prime[idx]
+     Q_prime = Q_prime[:, idx]
+
+     return L_prime, Q_prime
+
+
+ def eigh_plus_UVT_symmetrize(
+     Q: torch.Tensor,
+     L: torch.Tensor,
+     U: torch.Tensor,
+     V: torch.Tensor,
+     alpha: float,
+     retry_float64: bool = False,
+
+ ):
+     """
+     Q is ``(m, rank)``; L is ``(rank,)``; U and V are the low-rank correction such that U V^T is ``(m, m)``.
+
+     This computes the eigendecomposition of A, where
+
+     ``M = Q diag(L) Q^T + alpha * (U V^T)``;
+
+     ``A = (M + M^T) / 2``
+     """
+     m, rank = Q.shape
+     _, k = V.shape
+
+     # project U and V out of the Q subspace via Gram-Schmidt
+     Q_T_U = Q.T @ U
+     U_perp = U - Q @ Q_T_U
+
+     Q_T_V = Q.T @ V
+     V_perp = V - Q @ Q_T_V
+
+     R = torch.hstack([U_perp, V_perp])
+     Q_perp, _ = torch_linalg.qr(R, retry_float64=retry_float64)
+
+     Q_B = torch.hstack([Q, Q_perp])
+     r_B = Q_B.shape[1]
+
+     # project, symmetrize and compute new eigendecomposition
+     A_proj = torch.zeros((r_B, r_B), device=Q.device, dtype=Q.dtype)
+     A_proj[:rank, :rank] = L.diag_embed()
+
+     Q_perp_T_U = Q_perp.T @ U
+     Q_B_T_U = torch.vstack([Q_T_U, Q_perp_T_U])
+
+     Q_perp_T_V = Q_perp.T @ V
+     Q_B_T_V = torch.vstack([Q_T_V, Q_perp_T_V])
+
+     update_proj = Q_B_T_U @ Q_B_T_V.T + Q_B_T_V @ Q_B_T_U.T
+     A_proj.add_(update_proj, alpha=alpha/2)
+
+     L_prime, S = torch_linalg.eigh(A_proj, retry_float64=retry_float64)
+
+     # unproject and sort
+     Q_prime = Q_B @ S
+
+     idx = torch.argsort(L_prime)
+     L_prime = L_prime[idx]
+     Q_prime = Q_prime[:, idx]
+
+     return L_prime, Q_prime
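The eigh_plus_uuT / eigh_plus_UUT / eigh_plus_UVT_symmetrize helpers above update a factored Q L Q^T approximation under low-rank corrections without forming the dense m x m matrix: the correction is split into its component inside span(Q) plus an orthonormalized residual, and a small (rank + k) eigenproblem is solved in that extended basis. A rough sanity check for the rank-one case, assuming the module path added in this diff and an installed 0.4.1 wheel:

    import torch
    from torchzero.linalg.eigh import eigh_plus_uuT  # module path as added in this diff

    m, r = 50, 5
    Q, _ = torch.linalg.qr(torch.randn(m, r, dtype=torch.float64))  # orthonormal (m, r) basis
    L = torch.linspace(0.1, 1.0, r, dtype=torch.float64)            # ascending eigenvalues
    u = torch.randn(m, dtype=torch.float64)

    L_new, Q_new = eigh_plus_uuT(L, Q, u, alpha=1.0)

    # dense reconstruction of the rank-(r+1) matrix the update represents
    A = Q @ torch.diag(L) @ Q.T + torch.outer(u, u)
    print(torch.dist(Q_new @ torch.diag(L_new) @ Q_new.T, A))       # ~0 up to float error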
torchzero/linalg/linalg_utils.py
@@ -0,0 +1,14 @@
+ from collections.abc import Callable
+ import torch
+
+ def mm(
+     A_mv: Callable[[torch.Tensor], torch.Tensor] | None,
+     A_mm: Callable[[torch.Tensor], torch.Tensor] | None,
+     X
+ ):
+     """matrix-matrix product when either a matvec (A_mv) or a matmat (A_mm) callable is given"""
+     if A_mm is not None: return A_mm(X)
+     assert A_mv is not None
+     return torch.stack([A_mv(col) for col in X.unbind(-1)], -1) # rank matvecs
+
+
torchzero/{utils/linalg → linalg}/linear_operator.py
@@ -1,4 +1,6 @@
- """simplified version of https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.linalg.LinearOperator.html. This is used for trust regions."""
+ """This is mainly used for trust regions. In some cases certain operations are relaxed, e.g. an eigenvalue shift instead of
+ adding a diagonal when the latter isn't tractable, to make it work with Levenberg-Marquardt.
+ """
  import math
  from abc import ABC, abstractmethod
  from functools import partial
@@ -7,7 +9,8 @@ from typing import cast, final

  import torch

- from ..torch_tools import tofloat, tonumpy, totensor
+ from ..utils.torch_tools import tofloat, tonumpy, totensor
+ from .solve import nystrom_sketch_and_solve

  if find_spec('scipy') is not None:
      from scipy.sparse.linalg import LinearOperator as _ScipyLinearOperator
@@ -15,7 +18,6 @@ else:
      _ScipyLinearOperator = None

  class LinearOperator(ABC):
-     """this is used for trust region"""
      device: torch.types.Device
      dtype: torch.dtype | None

@@ -25,12 +27,18 @@ class LinearOperator(ABC):
      def rmatvec(self, x: torch.Tensor) -> torch.Tensor:
          raise NotImplementedError(f"{self.__class__.__name__} doesn't implement rmatvec")

-     def matmat(self, x: torch.Tensor) -> "LinearOperator":
-         raise NotImplementedError(f"{self.__class__.__name__} doesn't implement matmul")
+     def matmat(self, X: torch.Tensor) -> "LinearOperator":
+         raise NotImplementedError(f"{self.__class__.__name__} doesn't implement matmat")
+
+     def rmatmat(self, X: torch.Tensor) -> "LinearOperator":
+         raise NotImplementedError(f"{self.__class__.__name__} doesn't implement rmatmat")

      def solve(self, b: torch.Tensor) -> torch.Tensor:
          raise NotImplementedError(f"{self.__class__.__name__} doesn't implement solve")

+     def solve_plus_diag(self, b: torch.Tensor, diag: int | float | torch.Tensor) -> torch.Tensor:
+         return self.add_diagonal(diag).solve(b)
+
      def solve_bounded(self, b: torch.Tensor, bound:float, ord:float=2) -> torch.Tensor:
          """solve with a norm bound on x"""
          raise NotImplementedError(f"{self.__class__.__name__} doesn't implement solve_bounded")
@@ -129,8 +137,8 @@ class Dense(LinearOperator):
      def matvec(self, x): return self.A.mv(x)
      def rmatvec(self, x): return self.A.mH.mv(x)

-     def matmat(self, x): return Dense(self.A.mm(x))
-     def rmatmat(self, x): return Dense(self.A.mH.mm(x))
+     def matmat(self, X): return Dense(self.A.mm(X))
+     def rmatmat(self, X): return Dense(self.A.mH.mm(X))

      def solve(self, b): return _solve(self.A, b)

@@ -146,6 +154,12 @@
      def is_dense(self): return True
      def transpose(self): return Dense(self.A.mH)

+ class SPD(Dense):
+     def solve(self, b: torch.Tensor):
+         L, info = torch.linalg.cholesky_ex(self.A) # pylint:disable=not-callable
+         return torch.cholesky_solve(b.unsqueeze(-1), L).squeeze(-1)
+
+
  class DenseInverse(LinearOperator):
      """Represents inverse of a dense matrix A."""
      def __init__(self, A_inv: torch.Tensor):
@@ -156,8 +170,8 @@ class DenseInverse(LinearOperator):
      def matvec(self, x): return _solve(self.A_inv, x) # pylint:disable=not-callable
      def rmatvec(self, x): return _solve(self.A_inv.mH, x) # pylint:disable=not-callable

-     def matmat(self, x): return Dense(_solve(self.A_inv, x)) # pylint:disable=not-callable
-     def rmatmat(self, x): return Dense(_solve(self.A_inv.mH, x)) # pylint:disable=not-callable
+     def matmat(self, X): return Dense(_solve(self.A_inv, X)) # pylint:disable=not-callable
+     def rmatmat(self, X): return Dense(_solve(self.A_inv.mH, X)) # pylint:disable=not-callable

      def solve(self, b): return self.A_inv.mv(b)

@@ -190,8 +204,8 @@ class Diagonal(LinearOperator):
      def matvec(self, x): return self.A * x
      def rmatvec(self, x): return self.A * x

-     def matmat(self, x): return Dense(x * self.A.unsqueeze(-1))
-     def rmatmat(self, x): return Dense(x * self.A.unsqueeze(-1))
+     def matmat(self, X): return Dense(X * self.A.unsqueeze(-1))
+     def rmatmat(self, X): return Dense(X * self.A.unsqueeze(-1))

      def solve(self, b): return b/self.A

@@ -221,8 +235,8 @@ class ScaledIdentity(LinearOperator):
      def matvec(self, x): return x * self.s
      def rmatvec(self, x): return x * self.s

-     def matmat(self, x): return Dense(x * self.s)
-     def rmatmat(self, x): return Dense(x * self.s)
+     def matmat(self, X): return Dense(X * self.s)
+     def rmatmat(self, X): return Dense(X * self.s)

      def solve(self, b): return b / self.s
      def solve_bounded(self, b, bound, ord = 2):
@@ -263,6 +277,7 @@ class ScaledIdentity(LinearOperator):
      def is_dense(self): return False
      def transpose(self): return ScaledIdentity(self.s, shape=self.shape, device=self.device, dtype=self.dtype)

+
  class AtA(LinearOperator):
      def __init__(self, A: torch.Tensor):
          self.A = A
@@ -270,8 +285,8 @@ class AtA(LinearOperator):
      def matvec(self, x): return self.A.mH.mv(self.A.mv(x))
      def rmatvec(self, x): return self.matvec(x)

-     def matmat(self, x): return Dense(torch.linalg.multi_dot([self.A.mH, self.A, x])) # pylint:disable=not-callable
-     def rmatmat(self, x): return Dense(torch.linalg.multi_dot([self.A.mH, self.A, x])) # pylint:disable=not-callable
+     def matmat(self, X): return Dense(torch.linalg.multi_dot([self.A.mH, self.A, X])) # pylint:disable=not-callable
+     def rmatmat(self, X): return Dense(torch.linalg.multi_dot([self.A.mH, self.A, X])) # pylint:disable=not-callable

      def is_dense(self): return False
      def to_tensor(self): return self.A.mH @ self.A
@@ -283,51 +298,41 @@
          return Dense(self.to_tensor() + torch.diag_embed(x))

      def solve(self, b):
-         return Dense(self.to_tensor()).solve(b)
+         *_, n, m = self.A.shape
+         if n >= m: return Dense(self.to_tensor()).solve(b)

-     def inv(self):
-         return Dense(self.to_tensor()).inv()
+         A = self.A
+         C = A @ A.mH # (n, n), SPD
+         L, info = torch.linalg.cholesky_ex(C) # pylint:disable=not-callable
+         z = torch.cholesky_solve((A @ b).unsqueeze(-1), L).squeeze(-1)
+         return A.mH @ z

-     def diagonal(self):
-         return self.A.pow(2).sum(1)
-
-     def size(self):
-         n = self.A.size(1)
-         return (n,n)
+     def solve_plus_diag(self, b, diag):
+         *_, n, m = self.A.shape
+         if (n >= m) or (isinstance(diag, torch.Tensor) and diag.numel() > 1):
+             return Dense(self.to_tensor()).solve_plus_diag(b, diag)

- class AAT(LinearOperator):
-     def __init__(self, A: torch.Tensor):
-         self.A = A
-         self.device = self.A.device; self.dtype = self.A.dtype
-
-     def matvec(self, x): return self.A.mv(self.A.mH.mv(x))
-     def rmatvec(self, x): return self.matvec(x)
-
-     def matmat(self, x): return Dense(torch.linalg.multi_dot([self.A, self.A.mH, x])) # pylint:disable=not-callable
-     def rmatmat(self, x): return Dense(torch.linalg.multi_dot([self.A, self.A.mH, x])) # pylint:disable=not-callable
-
-     def is_dense(self): return False
-     def to_tensor(self): return self.A @ self.A.mH
-     def transpose(self): return AAT(self.A)
-
-     def add_diagonal(self, x):
-         if isinstance(x, torch.Tensor) and x.numel() <= 1: x = x.item()
-         if isinstance(x, (int,float)): x = torch.full((self.shape[0],), fill_value=x, device=self.A.device, dtype=self.A.dtype)
-         return Dense(self.to_tensor() + torch.diag_embed(x))
+         A = self.A
+         I = torch.eye(A.size(-2), device=A.device, dtype=A.dtype)

-     def solve(self, b):
-         return Dense(self.to_tensor()).solve(b)
+         C = (A @ A.mH).add_(I.mul_(diag)) # (n, n), SPD
+         L, info = torch.linalg.cholesky_ex(C + I.mul_(diag)) # pylint:disable=not-callable
+         z = torch.cholesky_solve((A @ b).unsqueeze(-1), L).squeeze(-1)
+         return (1 / diag) * (b - A.mH @ z)

      def inv(self):
          return Dense(self.to_tensor()).inv()

      def diagonal(self):
-         return self.A.pow(2).sum(0)
+         return self.A.pow(2).sum(1)

      def size(self):
          n = self.A.size(1)
          return (n,n)

+ class AAt(AtA):
+     def __init__(self, A: torch.Tensor):
+         super().__init__(A.mH)

  class Sketched(LinearOperator):
      """A projected by sketching matrix S, representing the operator S @ A_proj @ S.T.
@@ -339,7 +344,6 @@
          self.A_proj = A_proj
          self.device = self.A_proj.device; self.dtype = self.A_proj.dtype

-
      def matvec(self, x):
          x_proj = self.S.T @ x
          Ax_proj = self.A_proj @ x_proj
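When A is a wide (n, m) matrix with n < m, the rewritten AtA methods two hunks above work through the small n x n Gram matrix A A^T instead of the m x m A^T A; for the damped solve this rests on the identity (A^T A + λI)^{-1} b = (1/λ)(b - A^T (A A^T + λI)^{-1} A b). A standalone numerical check of that identity in plain torch (it deliberately does not call the operator class, so it assumes nothing about the class API):

    import torch

    torch.manual_seed(0)
    n, m, lam = 5, 20, 0.3                       # wide A: n < m
    A = torch.randn(n, m, dtype=torch.float64)
    b = torch.randn(m, dtype=torch.float64)

    # direct solve with the large m x m matrix
    x_direct = torch.linalg.solve(A.T @ A + lam * torch.eye(m, dtype=torch.float64), b)

    # equivalent solve through the small n x n Gram matrix
    z = torch.linalg.solve(A @ A.T + lam * torch.eye(n, dtype=torch.float64), A @ b)
    x_gram = (b - A.T @ z) / lam

    print(torch.allclose(x_direct, x_gram))      # True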
@@ -351,8 +355,8 @@
          return self.S @ ATx_proj


-     def matmat(self, x): return Dense(torch.linalg.multi_dot([self.S, self.A_proj, self.S.T, x])) # pylint:disable=not-callable
-     def rmatmat(self, x): return Dense(torch.linalg.multi_dot([self.S, self.A_proj.mH, self.S.T, x])) # pylint:disable=not-callable
+     def matmat(self, X): return Dense(torch.linalg.multi_dot([self.S, self.A_proj, self.S.T, X])) # pylint:disable=not-callable
+     def rmatmat(self, X): return Dense(torch.linalg.multi_dot([self.S, self.A_proj.mH, self.S.T, X])) # pylint:disable=not-callable


      def is_dense(self): return False
@@ -375,3 +379,49 @@
          n = self.S.size(0)
          return (n,n)

+
+ class Eigendecomposition(LinearOperator):
+     """A represented as Q L Q^H. If A is (n,n), then Q is (n, rank); L is a vector - the diagonal of (rank, rank)"""
+     def __init__(self, L: torch.Tensor, Q: torch.Tensor, use_nystrom: bool = True):
+         self.L = L
+         self.Q = Q
+         self.use_nystrom = use_nystrom
+         self.device = self.L.device; self.dtype = self.L.dtype
+
+     def matvec(self, x):
+         return self.Q @ ((self.Q.mH @ x) * self.L)
+
+     def rmatvec(self, x):
+         return self.matvec(x)
+
+     def matmat(self, X):
+         return Dense(self.Q @ (self.L[:, None] * (self.Q.mH @ X)))
+
+     def rmatmat(self, X):
+         return self.matmat(X)
+
+     def is_dense(self): return False
+     def to_tensor(self): return self.Q @ self.L.diag_embed() @ self.Q.mH
+     def transpose(self): return Eigendecomposition(L=self.L, Q=self.Q)
+
+     def add_diagonal(self, x):
+         """this doesn't correspond to adding a diagonal to A, however it still works for LM etc."""
+         if isinstance(x, torch.Tensor) and x.numel() > 1:
+             raise RuntimeError("Eigendecomposition linear operator doesn't support add_diagonal with a vector diag")
+
+         return Eigendecomposition(L=self.L + x, Q = self.Q)
+
+     def solve(self, b):
+         return self.Q @ ((self.Q.mH @ b) / self.L)
+
+     def solve_plus_diag(self, b, diag):
+         if isinstance(diag, torch.Tensor) and diag.numel() > 1: return super().solve_plus_diag(b, diag)
+         if not self.use_nystrom: return super().solve_plus_diag(b, diag)
+         return nystrom_sketch_and_solve(L=self.L, Q=self.Q, b=b, reg=float(diag))
+
+     def inv(self):
+         return Eigendecomposition(L=1 / self.L, Q = self.Q)
+
+     def size(self):
+         n = self.Q.size(0)
+         return (n,n)
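The new Eigendecomposition operator stores A implicitly as Q L Q^H, so matvec, solve and add_diagonal all cost O(n · rank) rather than O(n²). A minimal sketch of how it behaves, constructed directly from the module path shown in this diff (whether it is re-exported anywhere else is not shown here):

    import torch
    from torchzero.linalg.linear_operator import Eigendecomposition  # path per this diff

    n, rank = 30, 4
    Q, _ = torch.linalg.qr(torch.randn(n, rank, dtype=torch.float64))
    L = torch.tensor([0.5, 1.0, 2.0, 4.0], dtype=torch.float64)

    op = Eigendecomposition(L=L, Q=Q)
    x = torch.randn(n, dtype=torch.float64)

    Ax = op.matvec(x)                     # Q ((Q^H x) * L), never forms the n x n matrix
    A = op.to_tensor()                    # dense Q diag(L) Q^H, for comparison only
    print(torch.allclose(Ax, A @ x))      # True

    # add_diagonal shifts the eigenvalues rather than the true diagonal of A,
    # which is the relaxation the module docstring mentions for Levenberg-Marquardt
    damped = op.add_diagonal(1e-2)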
torchzero/linalg/matrix_power.py
@@ -0,0 +1,28 @@
+ from typing import Literal
+ import warnings
+ from collections.abc import Callable
+
+ import torch
+ from . import torch_linalg
+ def matrix_power_eigh(A: torch.Tensor, power:float, abs:bool=False):
+     """this is faster than SVD but only for positive semi-definite symmetric matrices
+     (covariance matrices are always SPD)"""
+
+     L, Q = torch_linalg.eigh(A, retry_float64=True) # pylint:disable=not-callable
+     if abs: L.abs_()
+     if power % 2 != 0: L.clip_(min = torch.finfo(A.dtype).tiny * 2)
+     return (Q * L.pow_(power).unsqueeze(-2)) @ Q.mH
+
+
+ def matrix_power_svd(A: torch.Tensor, power: float) -> torch.Tensor:
+     """for any symmetric matrix"""
+     U, S, Vh = torch_linalg.svd(A, full_matrices=False, retry_float64=True) # pylint:disable=not-callable
+     if power % 2 != 0: S.clip_(min = torch.finfo(A.dtype).tiny * 2)
+     return (U * S.pow_(power).unsqueeze(-2)) @ Vh
+
+ MatrixPowerMethod = Literal["eigh", "eigh_abs", "svd"]
+ def matrix_power(A: torch.Tensor, power: float, method: MatrixPowerMethod = "eigh_abs") -> torch.Tensor:
+     if method == "eigh": return matrix_power_eigh(A, power)
+     if method == "eigh_abs": return matrix_power_eigh(A, power, abs=True)
+     if method == "svd": return matrix_power_svd(A, power)
+     raise ValueError(method)
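matrix_power_eigh raises a symmetric positive semi-definite matrix to an arbitrary real power through its eigendecomposition, which is the operation preconditioners such as Shampoo/SOAP (also touched in this release) need for inverse roots. A small self-contained sketch of the same idea written against plain torch.linalg, since torch_linalg above is an internal wrapper with a float64 retry:

    import torch

    def psd_matrix_power(A: torch.Tensor, power: float) -> torch.Tensor:
        # eigendecompose, guard tiny/negative eigenvalues, then rebuild Q diag(L^p) Q^H
        L, Q = torch.linalg.eigh(A)
        L = L.clamp(min=torch.finfo(A.dtype).tiny * 2)
        return (Q * L.pow(power).unsqueeze(-2)) @ Q.mH

    G = torch.randn(6, 6, dtype=torch.float64)
    C = G @ G.T + 0.5 * torch.eye(6, dtype=torch.float64)        # SPD test matrix
    C_root = psd_matrix_power(C, -0.25)                           # inverse fourth root
    M = C_root @ C_root @ C_root @ C_root @ C
    print(torch.allclose(M, torch.eye(6, dtype=torch.float64)))   # True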
torchzero/linalg/orthogonalize.py
@@ -0,0 +1,93 @@
+ from typing import Literal
+
+ import torch
+
+ from ..utils.compile import allow_compile
+ from . import torch_linalg
+
+ # zeropower_via_newtonschulz5 from:
+ # https://github.com/KellerJordan/Muon/blob/master/muon.py
+ # and
+ # https://github.com/HomebrewML/HeavyBall/blob/main/heavyball/utils.py#L452
+ _NS_COEFFS = (
+     (4.0848, -6.8946, 2.9270),
+     (3.9505, -6.3029, 2.6377),
+     (3.7418, -5.5913, 2.3037),
+     (2.8769, -3.1427, 1.2046),
+     (2.8366, -3.0525, 1.2012)
+ )
+
+ @allow_compile
+ def zeropower_via_newtonschulz5(G: torch.Tensor, coeffs=_NS_COEFFS) -> torch.Tensor:
+     """
+     Applies to the last 2 dims - so usually reverse_dims should be applied to G before and after.
+
+     Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
+     quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
+     of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
+     zero even beyond the point where the iteration no longer converges all the way to one everywhere
+     on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
+     where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
+     performance at all relative to UV^T, where USV^T = G is the SVD.
+     """
+     assert G.ndim >= 2 # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng
+
+     X = G.bfloat16()
+     if G.size(-2) > G.size(-1):
+         X = X.mT
+
+     # Ensure spectral norm is at most 1
+     X = X / (X.norm(dim=(-2, -1), keepdim=True).clip(min=torch.finfo(X.dtype).tiny * 2))
+
+     # Perform the NS iterations
+     for a,b,c in coeffs:
+         A = X @ X.mT
+         B = b * A + c * A @ A # quintic computation strategy adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
+         X = a * X + B @ X
+
+     if G.size(-2) > G.size(-1):
+         X = X.mT
+
+     return X.to(G.dtype)
+
+ def zeropower_via_svd(A: torch.Tensor) -> torch.Tensor:
+     """
+     Applies to the first 2 dims and isn't batched - the rest of the dimensions are flattened.
+     """
+     try:
+         U, S, Vt = torch_linalg.svd(A, full_matrices=False, retry_float64=True) # pylint:disable=not-callable
+     except torch.linalg.LinAlgError:
+         U, S, Vt = torch.svd_lowrank(A, q=1, M=1e-4 * A.mean() * torch.rand_like(A))
+
+     return U @ Vt
+
+ def zeropower_via_eigh(A: torch.Tensor) -> torch.Tensor:
+     """
+     Only valid for SPD matrices - need to check that it is only applied to SPD inputs, since for those it is better than SVD.
+     """
+     L, Q = torch_linalg.eigh(A, retry_float64=True)
+     return Q @ Q.mH
+
+
+ def orthogonalize_via_qr(A: torch.Tensor):
+     *_, m, n = A.shape
+     T = False
+     if m < n:
+         T = True
+         m,n = n,m
+         A = A.mH
+
+     Q = torch_linalg.qr(A, mode='reduced', retry_float64=True).Q
+
+     if T:
+         Q = Q.mH
+
+     return Q
+
+ OrthogonalizeMethod = Literal["newtonschulz", "svd", "qr"]
+ def orthogonalize(A: torch.Tensor, method: OrthogonalizeMethod) -> torch.Tensor:
+     if method == "newtonschulz": return zeropower_via_newtonschulz5(A)
+     if method == "svd": return zeropower_via_svd(A)
+     if method == "qr": return orthogonalize_via_qr(A)
+     if method == "eigh": return zeropower_via_eigh(A)
+     raise ValueError(method)
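orthogonalize dispatches between the Muon-style Newton-Schulz iteration, SVD and QR. A short comparison sketch, assuming the function is importable from the new torchzero.linalg namespace as re-exported in the __init__.py shown earlier; note that the Newton-Schulz path runs in bfloat16 and, as its docstring says, is only approximately orthogonal by design:

    import torch
    from torchzero.linalg import orthogonalize  # re-exported in torchzero/linalg/__init__.py

    G = torch.randn(256, 128)

    for method in ("svd", "qr", "newtonschulz"):
        Q = orthogonalize(G, method=method)
        # deviation of Q^T Q from the identity; "newtonschulz" is approximate by design
        err = torch.linalg.matrix_norm(Q.T @ Q - torch.eye(128))
        print(method, float(err))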