torchzero 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +22 -22
- tests/test_opts.py +199 -198
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +1 -1
- torchzero/core/functional.py +1 -1
- torchzero/core/modular.py +5 -5
- torchzero/core/module.py +2 -2
- torchzero/core/objective.py +10 -10
- torchzero/core/transform.py +1 -1
- torchzero/linalg/__init__.py +3 -2
- torchzero/linalg/eigh.py +223 -4
- torchzero/linalg/orthogonalize.py +2 -4
- torchzero/linalg/qr.py +12 -0
- torchzero/linalg/solve.py +1 -3
- torchzero/linalg/svd.py +47 -20
- torchzero/modules/__init__.py +4 -3
- torchzero/modules/adaptive/__init__.py +11 -3
- torchzero/modules/adaptive/adagrad.py +10 -10
- torchzero/modules/adaptive/adahessian.py +2 -2
- torchzero/modules/adaptive/adam.py +1 -1
- torchzero/modules/adaptive/adan.py +1 -1
- torchzero/modules/adaptive/adaptive_heavyball.py +1 -1
- torchzero/modules/adaptive/esgd.py +2 -2
- torchzero/modules/adaptive/ggt.py +186 -0
- torchzero/modules/adaptive/lion.py +2 -1
- torchzero/modules/adaptive/lre_optimizers.py +299 -0
- torchzero/modules/adaptive/mars.py +2 -2
- torchzero/modules/adaptive/matrix_momentum.py +1 -1
- torchzero/modules/adaptive/msam.py +4 -4
- torchzero/modules/adaptive/muon.py +9 -6
- torchzero/modules/adaptive/natural_gradient.py +32 -15
- torchzero/modules/adaptive/psgd/__init__.py +5 -0
- torchzero/modules/adaptive/psgd/_psgd_utils.py +37 -0
- torchzero/modules/adaptive/psgd/psgd.py +1390 -0
- torchzero/modules/adaptive/psgd/psgd_dense_newton.py +174 -0
- torchzero/modules/adaptive/psgd/psgd_kron_newton.py +203 -0
- torchzero/modules/adaptive/psgd/psgd_kron_whiten.py +185 -0
- torchzero/modules/adaptive/psgd/psgd_lra_newton.py +118 -0
- torchzero/modules/adaptive/psgd/psgd_lra_whiten.py +116 -0
- torchzero/modules/adaptive/rprop.py +2 -2
- torchzero/modules/adaptive/sam.py +4 -4
- torchzero/modules/adaptive/shampoo.py +28 -3
- torchzero/modules/adaptive/soap.py +3 -3
- torchzero/modules/adaptive/sophia_h.py +2 -2
- torchzero/modules/clipping/clipping.py +7 -7
- torchzero/modules/conjugate_gradient/cg.py +2 -2
- torchzero/modules/experimental/__init__.py +5 -0
- torchzero/modules/experimental/adanystrom.py +258 -0
- torchzero/modules/experimental/common_directions_whiten.py +142 -0
- torchzero/modules/experimental/cubic_adam.py +160 -0
- torchzero/modules/experimental/eigen_sr1.py +182 -0
- torchzero/modules/experimental/eigengrad.py +207 -0
- torchzero/modules/experimental/l_infinity.py +1 -1
- torchzero/modules/experimental/matrix_nag.py +122 -0
- torchzero/modules/experimental/newton_solver.py +2 -2
- torchzero/modules/experimental/newtonnewton.py +34 -40
- torchzero/modules/grad_approximation/fdm.py +2 -2
- torchzero/modules/grad_approximation/rfdm.py +4 -4
- torchzero/modules/least_squares/gn.py +68 -45
- torchzero/modules/line_search/backtracking.py +2 -2
- torchzero/modules/line_search/line_search.py +1 -1
- torchzero/modules/line_search/strong_wolfe.py +2 -2
- torchzero/modules/misc/escape.py +1 -1
- torchzero/modules/misc/gradient_accumulation.py +1 -1
- torchzero/modules/misc/misc.py +1 -1
- torchzero/modules/misc/multistep.py +4 -7
- torchzero/modules/misc/regularization.py +2 -2
- torchzero/modules/misc/split.py +1 -1
- torchzero/modules/misc/switch.py +2 -2
- torchzero/modules/momentum/cautious.py +3 -3
- torchzero/modules/momentum/momentum.py +1 -1
- torchzero/modules/ops/higher_level.py +1 -1
- torchzero/modules/ops/multi.py +1 -1
- torchzero/modules/projections/projection.py +5 -2
- torchzero/modules/quasi_newton/__init__.py +1 -1
- torchzero/modules/quasi_newton/damping.py +1 -1
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +3 -3
- torchzero/modules/quasi_newton/lsr1.py +3 -3
- torchzero/modules/quasi_newton/quasi_newton.py +44 -29
- torchzero/modules/quasi_newton/sg2.py +69 -205
- torchzero/modules/restarts/restars.py +17 -17
- torchzero/modules/second_order/inm.py +33 -25
- torchzero/modules/second_order/newton.py +132 -130
- torchzero/modules/second_order/newton_cg.py +3 -3
- torchzero/modules/second_order/nystrom.py +83 -32
- torchzero/modules/second_order/rsn.py +41 -44
- torchzero/modules/smoothing/laplacian.py +1 -1
- torchzero/modules/smoothing/sampling.py +2 -3
- torchzero/modules/step_size/adaptive.py +6 -6
- torchzero/modules/step_size/lr.py +2 -2
- torchzero/modules/trust_region/cubic_regularization.py +1 -1
- torchzero/modules/trust_region/levenberg_marquardt.py +2 -2
- torchzero/modules/trust_region/trust_cg.py +1 -1
- torchzero/modules/variance_reduction/svrg.py +4 -5
- torchzero/modules/weight_decay/reinit.py +2 -2
- torchzero/modules/weight_decay/weight_decay.py +5 -5
- torchzero/modules/wrappers/optim_wrapper.py +4 -4
- torchzero/modules/zeroth_order/cd.py +1 -1
- torchzero/optim/mbs.py +291 -0
- torchzero/optim/wrappers/nevergrad.py +0 -9
- torchzero/optim/wrappers/optuna.py +2 -0
- torchzero/utils/benchmarks/__init__.py +0 -0
- torchzero/utils/benchmarks/logistic.py +122 -0
- torchzero/utils/derivatives.py +4 -4
- {torchzero-0.4.0.dist-info → torchzero-0.4.1.dist-info}/METADATA +1 -1
- torchzero-0.4.1.dist-info/RECORD +209 -0
- torchzero/modules/adaptive/lmadagrad.py +0 -241
- torchzero-0.4.0.dist-info/RECORD +0 -191
- /torchzero/modules/{functional.py → opt_utils.py} +0 -0
- {torchzero-0.4.0.dist-info → torchzero-0.4.1.dist-info}/WHEEL +0 -0
- {torchzero-0.4.0.dist-info → torchzero-0.4.1.dist-info}/top_level.txt +0 -0
torchzero/__init__.py
CHANGED
torchzero/core/__init__.py
CHANGED
@@ -3,6 +3,6 @@ from .module import Chainable, Module
 from .objective import DerivativesMethod, HessianMethod, HVPMethod, Objective

 # order is important to avoid circular imports
-from .modular import Modular
+from .modular import Optimizer
 from .functional import apply, step, step_tensors, update
 from .chain import Chain, maybe_chain
torchzero/core/functional.py
CHANGED
@@ -96,7 +96,7 @@ def step_tensors(
     objective.updates = list(tensors)

     # step with modules
-    # this won't update parameters in-place because objective.Modular is None
+    # this won't update parameters in-place because objective.Optimizer is None
     objective = _chain_step(objective, modules)

     # return updates
torchzero/core/modular.py
CHANGED
@@ -15,7 +15,7 @@ from .objective import Objective
 class _EvalCounterClosure:
     """keeps track of how many times closure has been evaluated, and sets closure return"""
     __slots__ = ("modular", "closure")
-    def __init__(self, modular: "Modular", closure):
+    def __init__(self, modular: "Optimizer", closure):
         self.modular = modular
         self.closure = closure

@@ -46,9 +46,9 @@ def flatten_modules(*modules: Chainable) -> list[Module]:
     return flat


-# have to inherit from
+# have to inherit from Optimizer to support lr schedulers
 # although Accelerate doesn't work due to converting param_groups to a dict
-class Modular(torch.optim.Optimizer):
+class Optimizer(torch.optim.Optimizer):
     """Chains multiple modules into an optimizer.

     Args:
@@ -62,7 +62,7 @@ class Modular(torch.optim.Optimizer):
     param_groups: list[ChainMap[str, Any]] # pyright:ignore[reportIncompatibleVariableOverride]

     def __init__(self, params: Params | torch.nn.Module, *modules: Module):
-        if len(modules) == 0: raise RuntimeError("Empty list of modules passed to `Modular`")
+        if len(modules) == 0: raise RuntimeError("Empty list of modules passed to `Optimizer`")
         self.model: torch.nn.Module | None = None
         """The model whose parameters are being optimized, if a model instance was passed to `__init__`."""
         if isinstance(params, torch.nn.Module):
@@ -229,5 +229,5 @@ class Modular(torch.optim.Optimizer):
         return self._closure_return

     def __repr__(self):
-        return f'Modular({", ".join(str(m) for m in self.modules)})'
+        return f'Optimizer({", ".join(str(m) for m in self.modules)})'

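The rename above is the main user-facing change in torchzero 0.4.1: the `Modular` class from 0.4.0 is now called `Optimizer` and still subclasses `torch.optim.Optimizer`. A minimal usage sketch follows, assuming the class is re-exported at the top level as `torchzero.Optimizer`, that the `tz.m` module namespace with `Adam` and `LR` modules works as in 0.4.0, and that the standard backward()/step() flow applies; these are assumptions based on this diff, not on the 0.4.1 docs.

    # hypothetical example of the renamed entry point; adjust imports to the actual re-exports
    import torch
    import torchzero as tz

    model = torch.nn.Linear(10, 1)
    opt = tz.Optimizer(model, tz.m.Adam(), tz.m.LR(1e-2))  # was tz.Modular(...) in 0.4.0

    x, y = torch.randn(16, 10), torch.randn(16, 1)
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss.backward()
    opt.step()       # still a torch.optim.Optimizer subclass, so lr schedulers keep working
    opt.zero_grad()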
torchzero/core/module.py
CHANGED
@@ -35,7 +35,7 @@ class Module(ABC):

         # settings are stored like state in per-tensor defaultdict, with per-parameter overrides possible
         # 0 - this module specific per-parameter setting overrides set via `set_param_groups` - highest priority
-        # 1 - global per-parameter setting overrides in param_groups passed to Modular - medium priority
+        # 1 - global per-parameter setting overrides in param_groups passed to Optimizer - medium priority
         # 2 - `defaults` - lowest priority
         self.settings: defaultdict[torch.Tensor, ChainMap[str, Any]] = defaultdict(lambda: ChainMap({}, {}, self.defaults))
         """per-parameter settings."""
@@ -273,7 +273,7 @@ class Module(ABC):
         return state_dict

     def _load_state_dict(self, state_dict: dict[str, Any], id_to_tensor: dict[int, torch.Tensor]):
-        """loads state_dict, ``id_to_tensor`` is passed by ``Modular``"""
+        """loads state_dict, ``id_to_tensor`` is passed by ``Optimizer``"""
         # load state
         state = state_dict['state']
         self.state.clear()
torchzero/core/objective.py
CHANGED
@@ -20,7 +20,7 @@ from ..utils.derivatives import (
 from ..utils.thoad_tools import thoad_derivatives, thoad_single_tensor, lazy_thoad

 if TYPE_CHECKING:
-    from .modular import Modular
+    from .modular import Optimizer
     from .module import Module

 def _closure_backward(closure, params, backward, retain_graph, create_graph):
@@ -135,13 +135,13 @@ class Objective:
         model (torch.nn.Module | None, optional):
             ``torch.nn.Module`` object, needed for a few modules that require access to the model. Defaults to None.
         current_step (int, optional):
-            number of times ``Modular.step()`` has been called, starting at 0. Defaults to 0.
+            number of times ``Optimizer.step()`` has been called, starting at 0. Defaults to 0.
         parent (Objective | None, optional):
             parent ``Objective`` object. When ``self.get_grad()`` is called, it will also set ``parent.grad``.
             Same with ``self.get_loss()``. This is useful when ``self.params`` are different from ``parent.params``,
             e.g. when projecting. Defaults to None.
-        modular (Modular | None, optional):
-            Top-level ``Modular`` optimizer. Defaults to None.
+        modular (Optimizer | None, optional):
+            Top-level ``Optimizer`` optimizer. Defaults to None.
         storage (dict | None, optional):
             additional kwargs passed to ``step`` to control some module-specific behavior. Defaults to None.

@@ -154,7 +154,7 @@ class Objective:
         model: torch.nn.Module | None = None,
         current_step: int = 0,
         parent: "Objective | None" = None,
-        modular: "Modular | None" = None,
+        modular: "Optimizer | None" = None,
         storage: dict | None = None,
     ):
         self.params: list[torch.Tensor] = list(params)
@@ -175,8 +175,8 @@ class Objective:
         Same with ``self.get_loss()``. This is useful when ``self.params`` are different from ``parent.params``,
         e.g. when projecting."""

-        self.modular: "Modular | None" = modular
-        """Top-level ``Modular`` optimizer, ``None`` if it wasn't specified."""
+        self.modular: "Optimizer | None" = modular
+        """Top-level ``Optimizer`` optimizer, ``None`` if it wasn't specified."""

         self.updates: list[torch.Tensor] | None = None
         """
@@ -222,7 +222,7 @@ class Objective:
         # """Storage for any other data, such as hessian estimates, etc."""

         self.attrs: dict = {}
-        """attributes, ``Modular.attrs`` is updated with this after each step.
+        """attributes, ``Optimizer.attrs`` is updated with this after each step.
         This attribute should always be modified in-place"""

         if storage is None: storage = {}
@@ -231,7 +231,7 @@ class Objective:
         This attribute should always be modified in-place"""

         self.should_terminate: bool | None = None
-        """termination criteria, ``Modular.should_terminate`` is set to this after each step if not ``None``"""
+        """termination criteria, ``Optimizer.should_terminate`` is set to this after each step if not ``None``"""

         self.temp: Any = cast(Any, None)
         """temporary storage, ``Module.update`` can set this and ``Module.apply`` access via ``objective.poptemp()``.
@@ -756,7 +756,7 @@ class Objective:
         if g_list is not None and self.grads is None:
             self.grads = list(g_list)

-        return f, g_list, H
+        return f, g_list, H.detach()

     @torch.no_grad
     def derivatives(self, order: int, at_x0: bool, method:DerivativesMethod="batched_autograd"):
torchzero/core/transform.py
CHANGED
@@ -233,7 +233,7 @@ class TensorTransform(Transform):
         if self._uses_grad: grads = objective.get_grads()
         else: grads = None # better explicitly set to None rather than objective.grads because it shouldn't be used

-        if self._uses_loss: loss = objective.get_loss(backward=
+        if self._uses_loss: loss = objective.get_loss(backward=True)
         else: loss = None

         return grads, loss
torchzero/linalg/__init__.py
CHANGED
@@ -3,8 +3,9 @@ from . import linear_operator
 from .matrix_power import (
     matrix_power_eigh,
     matrix_power_svd,
+    MatrixPowerMethod,
 )
-from .orthogonalize import zeropower_via_eigh, zeropower_via_newtonschulz5, zeropower_via_svd, orthogonalize
+from .orthogonalize import zeropower_via_eigh, zeropower_via_newtonschulz5, zeropower_via_svd, orthogonalize,OrthogonalizeMethod
 from .qr import qr_householder
 from .solve import cg, nystrom_sketch_and_solve, nystrom_pcg
-from .eigh import nystrom_approximation
+from .eigh import nystrom_approximation, regularize_eigh
torchzero/linalg/eigh.py
CHANGED
@@ -1,7 +1,11 @@
 from collections.abc import Callable
+
 import torch
-from .linalg_utils import mm

+from . import torch_linalg
+from .linalg_utils import mm
+from .orthogonalize import OrthogonalizeMethod, orthogonalize
+from .svd import tall_reduced_svd_via_eigh


 # https://arxiv.org/pdf/2110.02820
@@ -11,6 +15,8 @@ def nystrom_approximation(
     ndim: int,
     rank: int,
     device,
+    orthogonalize_method: OrthogonalizeMethod = 'qr',
+    eigv_tol: float = 0,
     dtype = torch.float32,
     generator = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
@@ -20,7 +26,7 @@ def nystrom_approximation(
     A is ``(m,m)``, then Q is ``(m, rank)``; L is a ``(rank, )`` vector - diagonal of ``(rank, rank)``"""
     # basis
     O = torch.randn((ndim, rank), device=device, dtype=dtype, generator=generator) # Gaussian test matrix
-    O
+    O = orthogonalize(O, method=orthogonalize_method) # Thin QR decomposition # pylint:disable=not-callable

     # Y = AΩ
     AO = mm(A_mv=A_mv, A_mm=A_mm, X=O)
@@ -29,6 +35,219 @@ def nystrom_approximation(
     Yv = AO + v*O # Shift for stability
     C = torch.linalg.cholesky_ex(O.mT @ Yv)[0] # pylint:disable=not-callable
     B = torch.linalg.solve_triangular(C, Yv.mT, upper=False, unitriangular=False).mT # pylint:disable=not-callable
+
+    # Q, S, _ = torch_linalg.svd(B, full_matrices=False) # pylint:disable=not-callable
+    # B is (ndim, rank) so we can use eigendecomp of (rank, rank)
+    Q, S = tall_reduced_svd_via_eigh(B, tol=eigv_tol, retry_float64=True)
+
+    L = S.pow(2) - v
+    return L, Q
+
+
+def regularize_eigh(
+    L: torch.Tensor,
+    Q: torch.Tensor,
+    truncate: int | None = None,
+    tol: float | None = None,
+    damping: float = 0,
+    rdamping: float = 0,
+) -> tuple[torch.Tensor, torch.Tensor] | tuple[None, None]:
+    """Applies regularization to eigendecomposition. Returns ``(L, Q)``.
+
+    Args:
+        L (torch.Tensor): eigenvalues, shape ``(rank,)``.
+        Q (torch.Tensor): eigenvectors, shape ``(n, rank)``.
+        truncate (int | None, optional):
+            keeps top ``truncate`` eigenvalues. Defaults to None.
+        tol (float | None, optional):
+            all eigenvalues smaller than largest eigenvalue times ``tol`` are removed. Defaults to None.
+        damping (float | None, optional): scalar added to eigenvalues. Defaults to 0.
+        rdamping (float | None, optional): scalar multiplied by largest eigenvalue and added to eigenvalues. Defaults to 0.
+    """
+    # remove non-finite eigenvalues
+    finite = L.isfinite()
+    if finite.any():
+        L = L[finite]
+        Q = Q[:, finite]
+    else:
+        return None, None
+
+    # largest finite!!! eigval
+    L_max = L[-1] # L is sorted in ascending order
+
+    # remove small eigenvalues relative to largest
+    if tol is not None:
+        indices = L > tol * L_max
+        L = L[indices]
+        Q = Q[:, indices]
+
+    # truncate to rank (L is ordered in ascending order)
+    if truncate is not None:
+        L = L[-truncate:]
+        Q = Q[:, -truncate:]
+
+    # damping
+    d = damping + rdamping * L_max
+    if d != 0:
+        L += d
+
     return L, Q
+
+def eigh_plus_uuT(
+    L: torch.Tensor,
+    Q: torch.Tensor,
+    u: torch.Tensor,
+    alpha: float = 1,
+    tol: float | None = None,
+    retry_float64: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    compute eigendecomposition of Q L Q^T + alpha * (u u^T) where Q is ``(m, rank)`` and L is ``(rank, )`` and u is ``(m, )``
+    """
+    if tol is None: tol = torch.finfo(Q.dtype).eps
+    z = Q.T @ u # (rank,)
+
+    # component of u orthogonal to the column space of Q
+    res = u - Q @ z # (m,)
+    beta = torch.linalg.vector_norm(res) # pylint:disable=not-callable
+
+    if beta < tol:
+        # u is already in the column space of Q
+        B = L.diag_embed().add_(z.outer(z), alpha=alpha) # (rank, rank)
+        L_prime, S = torch_linalg.eigh(B, retry_float64=retry_float64)
+        Q_prime = Q @ S
+        return L_prime, Q_prime
+
+    # normalize the orthogonal component to get a new orthonormal vector
+    v = res / beta # (m, )
+
+    # project and compute new eigendecomposition
+    D_diag = torch.cat([L, torch.tensor([0.0], device=Q.device, dtype=Q.dtype)])
+    w = torch.cat([z, beta.unsqueeze(0)]) # Shape: (rank+1,)
+    B = D_diag.diag_embed().add_(w.outer(w), alpha=alpha)
+
+    L_prime, S = torch_linalg.eigh(B, retry_float64=retry_float64)
+
+    # unproject and sort
+    basis = torch.cat([Q, v.unsqueeze(-1)], dim=1) # (m, rank+1)
+    Q_prime = basis @ S # (m, rank+1)
+
+    idx = torch.argsort(L_prime)
+    L_prime = L_prime[idx]
+    Q_prime = Q_prime[:, idx]
+
+    return L_prime, Q_prime
+
+def eigh_plus_UUT(
+    L: torch.Tensor,
+    Q: torch.Tensor,
+    U: torch.Tensor,
+    alpha: float = 1,
+    tol = None,
+    retry_float64: bool = False,
+):
+    """
+    compute eigendecomposition of Q L Q^T + alpha * (U U^T), where Q is ``(m, rank)`` and L is ``(rank, )``,
+    U is ``(m, k)`` where k is rank of correction
+    """
+    if U.size(1) == 1:
+        return eigh_plus_uuT(L, Q, U[:,0], alpha=alpha, tol=tol, retry_float64=retry_float64)
+
+    if tol is None: tol = torch.finfo(Q.dtype).eps
+    m, r = Q.shape
+
+    Z = Q.T @ U # (r, k)
+    U_res = U - Q @ Z # (m, k)
+
+    # find cols of U not in col space of Q
+    res_norms = torch.linalg.vector_norm(U_res, dim=0) # pylint:disable=not-callable
+    new_indices = torch.where(res_norms > tol)[0]
+    k_prime = len(new_indices)
+
+    if k_prime == 0:
+        # all cols are in Q
+        B = Q
+        C = Z # (r x k)
+        r_new = r
+    else:
+        # orthonormalize directions that aren't in Q
+        U_new = U_res[:, new_indices]
+        Q_u, _ = torch_linalg.qr(U_new, mode='reduced', retry_float64=retry_float64)
+        B = torch.hstack([Q, Q_u])
+        C = torch.vstack([Z, Q_u.T @ U])
+        r_new = r + k_prime


+    # project and compute new eigendecomposition
+    A_proj = torch.zeros((r_new, r_new), device=Q.device, dtype=Q.dtype)
+    A_proj[:r, :r] = L.diag_embed()
+    A_proj.addmm_(C, C.T, alpha=alpha)
+
+    L_prime, S = torch_linalg.eigh(A_proj, retry_float64=retry_float64)
+
+    # unproject and sort
+    Q_prime = B @ S
+    idx = torch.argsort(L_prime)
+    L_prime = L_prime[idx]
+    Q_prime = Q_prime[:, idx]
+
+    return L_prime, Q_prime
+
+
+def eigh_plus_UVT_symmetrize(
+    Q: torch.Tensor,
+    L: torch.Tensor,
+    U: torch.Tensor,
+    V: torch.Tensor,
+    alpha: float,
+    retry_float64: bool = False,
+
+):
+    """
+    Q is ``(m, rank)``; L is ``(rank, )``; U and V are the low rank correction such that U V^T is ``(m, m)``.
+
+    This computes eigendecomposition of A, where
+
+    ``M = Q diag(L) Q^T + alpha * (U V^T)``;
+
+    ``A = (M + M^T) / 2``
+    """
+    m, rank = Q.shape
+    _, k = V.shape
+
+    # project U and V out of the Q subspace via Gram-schmidt
+    Q_T_U = Q.T @ U
+    U_perp = U - Q @ Q_T_U
+
+    Q_T_V = Q.T @ V
+    V_perp = V - Q @ Q_T_V
+
+    R = torch.hstack([U_perp, V_perp])
+    Q_perp, _ = torch_linalg.qr(R, retry_float64=retry_float64)
+
+    Q_B = torch.hstack([Q, Q_perp])
+    r_B = Q_B.shape[1]
+
+    # project, symmetrize and compute new eigendecomposition
+    A_proj = torch.zeros((r_B, r_B), device=Q.device, dtype=Q.dtype)
+    A_proj[:rank, :rank] = L.diag_embed()
+
+    Q_perp_T_U = Q_perp.T @ U
+    Q_B_T_U = torch.vstack([Q_T_U, Q_perp_T_U])
+
+    Q_perp_T_V = Q_perp.T @ V
+    Q_B_T_V = torch.vstack([Q_T_V, Q_perp_T_V])
+
+    update_proj = Q_B_T_U @ Q_B_T_V.T + Q_B_T_V @ Q_B_T_U.T
+    A_proj.add_(update_proj, alpha=alpha/2)
+
+    L_prime, S = torch_linalg.eigh(A_proj, retry_float64=retry_float64)
+
+    # unproject and sort
+    Q_prime = Q_B @ S
+
+    idx = torch.argsort(L_prime)
+    L_prime = L_prime[idx]
+    Q_prime = Q_prime[:, idx]
+
+    return L_prime, Q_prime
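The new helpers above operate on an existing eigendecomposition ``(L, Q)`` with eigenvalues in ascending order, which is what ``torch.linalg.eigh`` returns. A small sanity-check sketch follows; the import path, the 6x6 test matrix, and the thresholds are illustrative assumptions, not taken from the package docs.

    # hypothetical check of eigh_plus_uuT and regularize_eigh against dense reconstructions
    import torch
    from torchzero.linalg.eigh import eigh_plus_uuT, regularize_eigh  # path assumed from this diff

    torch.manual_seed(0)
    A = torch.randn(6, 6, dtype=torch.float64)
    A = A @ A.T                      # symmetric PSD test matrix
    L, Q = torch.linalg.eigh(A)      # ascending eigenvalues, full-rank Q

    # rank-one update: eigendecomposition of A + 0.5 * u u^T without rebuilding the dense matrix
    u = torch.randn(6, dtype=torch.float64)
    L2, Q2 = eigh_plus_uuT(L, Q, u, alpha=0.5)
    print(torch.allclose(Q2 @ torch.diag(L2) @ Q2.T, A + 0.5 * torch.outer(u, u), atol=1e-8))

    # keep the top 3 eigenpairs, drop tiny ones, and add a small diagonal shift
    L_r, Q_r = regularize_eigh(L, Q, truncate=3, tol=1e-12, damping=1e-8)
    if L_r is not None:              # (None, None) means no finite eigenvalues survived
        A_low_rank = Q_r @ torch.diag(L_r) @ Q_r.T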
torchzero/linalg/orthogonalize.py
CHANGED
@@ -1,4 +1,5 @@
 from typing import Literal
+
 import torch

 from ..utils.compile import allow_compile
@@ -49,9 +50,6 @@ def zeropower_via_newtonschulz5(G: torch.Tensor, coeffs=_NS_COEFFS) -> torch.Ten

     return X.to(G.dtype)

-# code from https://github.com/MarkTuddenham/Orthogonal-Optimisers.
-# Tuddenham, M., Prügel-Bennett, A., & Hare, J. (2022).
-# Orthogonalising gradients to speed up neural network optimisation. arXiv preprint arXiv:2202.07052.
 def zeropower_via_svd(A: torch.Tensor) -> torch.Tensor:
     """
     Applies to first 2 dims and isn't batched - rest of dimensions are flattened.
@@ -87,7 +85,7 @@ def orthogonalize_via_qr(A: torch.Tensor):
     return Q

 OrthogonalizeMethod = Literal["newtonschulz", "svd", "qr"]
-def orthogonalize(A: torch.Tensor, method: OrthogonalizeMethod
+def orthogonalize(A: torch.Tensor, method: OrthogonalizeMethod) -> torch.Tensor:
     if method == "newtonschulz": return zeropower_via_newtonschulz5(A)
     if method == "svd": return zeropower_via_svd(A)
     if method == "qr": return orthogonalize_via_qr(A)
torchzero/linalg/qr.py
CHANGED
@@ -2,6 +2,18 @@ from typing import Literal
 import torch
 from ..utils.compile import allow_compile

+
+# super slow
+# def cholesky_qr(A):
+#     """QR of (m, n) A via cholesky of (n, n) matrix"""
+#     AtA = A.T @ A
+
+#     L, _ = torch.linalg.cholesky_ex(AtA) # pylint:disable=not-callable
+#     R = L.T
+
+#     Q = torch.linalg.solve_triangular(R.T, A.T, upper=False).T # pylint:disable=not-callable
+#     return Q, R
+
 # reference - https://www.cs.cornell.edu/~bindel/class/cs6210-f09/lec18.pdf
 @allow_compile
 def _get_w_tau(R: torch.Tensor, i: int, eps: float):
torchzero/linalg/solve.py
CHANGED
@@ -25,15 +25,13 @@ def _make_A_mv_reg(A_mv: Callable, reg):

 def _identity(x): return x

-# TODO this is used in NystromSketchAndSolve
-# I need to add alternative to it where it just shifts eigenvalues by reg and uses their reciprocal
 def nystrom_sketch_and_solve(
     L: torch.Tensor,
     Q: torch.Tensor,
     b: torch.Tensor,
     reg: float = 1e-3,
 ) -> torch.Tensor:
-    """Solves (Q diag(L) Q.T + reg*I)x = b
+    """Solves ``(Q diag(L) Q.T + reg*I)x = b``. Becomes super unstable with reg smaller than like 1e-5.

     Args:
         L (torch.Tensor): eigenvalues, like from ``nystrom_approximation``
torchzero/linalg/svd.py
CHANGED
@@ -1,20 +1,47 @@
+import torch
+
+from . import torch_linalg
+
+
+def tall_reduced_svd_via_eigh(A: torch.Tensor, tol: float = 0, retry_float64:bool=False):
+    """
+    Given a tall matrix A of size (m, n), computes U and S from the reduced SVD(A)
+    using the eigendecomposition of (n, n) matrix which is faster than direct SVD when m >= n.
+
+    This truncates small singular values that would causes nans,
+    so the returned U and S can have reduced dimension ``k <= n``.
+
+    Returns U of size ``(m, k)`` and S of size ``(k, )``.
+
+    Args:
+        A (torch.Tensor): A tall matrix of size (m, n) with m >= n.
+        tol (float): Tolerance for truncating small singular values. Singular values
+            less than ``tol * max_singular_value`` will be discarded.


+    """
+    # if m < n, A.T A will be low rank and we can't use eigh
+    m, n = A.size()
+    if m < n:
+        U, S, V = torch_linalg.svd(A, full_matrices=False, retry_float64=retry_float64)
+        return U, S
+
+    M = A.mH @ A # n,n
+
+    try:
+        L, Q = torch_linalg.eigh(M, retry_float64=retry_float64)
+    except torch.linalg.LinAlgError:
+        U, S, V = torch_linalg.svd(A, full_matrices=False, retry_float64=retry_float64)
+        return U, S
+
+    L = torch.flip(L, dims=[-1])
+    Q = torch.flip(Q, dims=[-1])
+
+    indices = L > tol * L[0] # L[0] is the max eigenvalue
+    L = L[indices]
+    Q = Q[:, indices]
+
+    S = L.sqrt()
+    U = (A @ Q) / S
+
+    return U, S
torchzero/modules/__init__.py
CHANGED
@@ -1,4 +1,6 @@
 from . import experimental
+from .adaptive import *
+from .adaptive import lre_optimizers as lre
 from .clipping import *
 from .conjugate_gradient import *
 from .grad_approximation import *
@@ -7,9 +9,9 @@ from .line_search import *
 from .misc import *
 from .momentum import *
 from .ops import *
-from .adaptive import *
 from .projections import *
 from .quasi_newton import *
+from .restarts import *
 from .second_order import *
 from .smoothing import *
 from .step_size import *
@@ -18,5 +20,4 @@ from .trust_region import *
 from .variance_reduction import *
 from .weight_decay import *
 from .wrappers import *
-from .
-from .zeroth_order import *
+from .zeroth_order import *
torchzero/modules/adaptive/__init__.py
CHANGED
@@ -1,4 +1,5 @@
-from .
+from . import lre_optimizers
+from .adagrad import Adagrad, AdagradNorm, FullMatrixAdagrad

 # from .curveball import CurveBall
 # from .spectral import SpectralPreconditioner
@@ -8,14 +9,21 @@ from .adan import Adan
 from .adaptive_heavyball import AdaptiveHeavyBall
 from .aegd import AEGD
 from .esgd import ESGD
-from .lmadagrad import LMAdagrad
 from .lion import Lion
+from .ggt import GGT
 from .mars import MARSCorrection
 from .matrix_momentum import MatrixMomentum
-from .msam import
+from .msam import MSAM, MSAMMomentum
 from .muon import DualNormCorrection, MuonAdjustLR, Orthogonalize, orthogonalize_grads_
 from .natural_gradient import NaturalGradient
 from .orthograd import OrthoGrad, orthograd_
+from .psgd import (
+    PSGDDenseNewton,
+    PSGDKronNewton,
+    PSGDKronWhiten,
+    PSGDLRANewton,
+    PSGDLRAWhiten,
+)
 from .rmsprop import RMSprop
 from .rprop import (
     BacktrackOnSignChange,