torchzero 0.3.15__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. tests/test_identical.py +22 -22
  2. tests/test_module_autograd.py +586 -0
  3. tests/test_objective.py +188 -0
  4. tests/test_opts.py +225 -214
  5. tests/test_tensorlist.py +0 -8
  6. tests/test_utils_optimizer.py +0 -1
  7. torchzero/__init__.py +2 -2
  8. torchzero/core/__init__.py +7 -4
  9. torchzero/core/chain.py +20 -23
  10. torchzero/core/functional.py +90 -24
  11. torchzero/core/modular.py +53 -57
  12. torchzero/core/module.py +132 -52
  13. torchzero/core/objective.py +948 -0
  14. torchzero/core/reformulation.py +55 -24
  15. torchzero/core/transform.py +261 -367
  16. torchzero/linalg/__init__.py +11 -0
  17. torchzero/linalg/eigh.py +253 -0
  18. torchzero/linalg/linalg_utils.py +14 -0
  19. torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
  20. torchzero/linalg/matrix_power.py +28 -0
  21. torchzero/linalg/orthogonalize.py +93 -0
  22. torchzero/{utils/linalg → linalg}/qr.py +16 -2
  23. torchzero/{utils/linalg → linalg}/solve.py +74 -88
  24. torchzero/linalg/svd.py +47 -0
  25. torchzero/linalg/torch_linalg.py +168 -0
  26. torchzero/modules/__init__.py +4 -3
  27. torchzero/modules/adaptive/__init__.py +11 -3
  28. torchzero/modules/adaptive/adagrad.py +167 -217
  29. torchzero/modules/adaptive/adahessian.py +76 -105
  30. torchzero/modules/adaptive/adam.py +53 -76
  31. torchzero/modules/adaptive/adan.py +50 -31
  32. torchzero/modules/adaptive/adaptive_heavyball.py +12 -7
  33. torchzero/modules/adaptive/aegd.py +12 -12
  34. torchzero/modules/adaptive/esgd.py +98 -119
  35. torchzero/modules/adaptive/ggt.py +186 -0
  36. torchzero/modules/adaptive/lion.py +7 -11
  37. torchzero/modules/adaptive/lre_optimizers.py +299 -0
  38. torchzero/modules/adaptive/mars.py +7 -7
  39. torchzero/modules/adaptive/matrix_momentum.py +48 -52
  40. torchzero/modules/adaptive/msam.py +71 -53
  41. torchzero/modules/adaptive/muon.py +67 -129
  42. torchzero/modules/adaptive/natural_gradient.py +63 -41
  43. torchzero/modules/adaptive/orthograd.py +11 -15
  44. torchzero/modules/adaptive/psgd/__init__.py +5 -0
  45. torchzero/modules/adaptive/psgd/_psgd_utils.py +37 -0
  46. torchzero/modules/adaptive/psgd/psgd.py +1390 -0
  47. torchzero/modules/adaptive/psgd/psgd_dense_newton.py +174 -0
  48. torchzero/modules/adaptive/psgd/psgd_kron_newton.py +203 -0
  49. torchzero/modules/adaptive/psgd/psgd_kron_whiten.py +185 -0
  50. torchzero/modules/adaptive/psgd/psgd_lra_newton.py +118 -0
  51. torchzero/modules/adaptive/psgd/psgd_lra_whiten.py +116 -0
  52. torchzero/modules/adaptive/rmsprop.py +83 -75
  53. torchzero/modules/adaptive/rprop.py +48 -47
  54. torchzero/modules/adaptive/sam.py +55 -45
  55. torchzero/modules/adaptive/shampoo.py +149 -130
  56. torchzero/modules/adaptive/soap.py +207 -143
  57. torchzero/modules/adaptive/sophia_h.py +106 -130
  58. torchzero/modules/clipping/clipping.py +22 -25
  59. torchzero/modules/clipping/ema_clipping.py +31 -25
  60. torchzero/modules/clipping/growth_clipping.py +14 -17
  61. torchzero/modules/conjugate_gradient/cg.py +27 -38
  62. torchzero/modules/experimental/__init__.py +7 -6
  63. torchzero/modules/experimental/adanystrom.py +258 -0
  64. torchzero/modules/experimental/common_directions_whiten.py +142 -0
  65. torchzero/modules/experimental/coordinate_momentum.py +36 -0
  66. torchzero/modules/experimental/cubic_adam.py +160 -0
  67. torchzero/modules/experimental/curveball.py +25 -41
  68. torchzero/modules/experimental/eigen_sr1.py +182 -0
  69. torchzero/modules/experimental/eigengrad.py +207 -0
  70. torchzero/modules/experimental/gradmin.py +2 -2
  71. torchzero/modules/experimental/higher_order_newton.py +14 -40
  72. torchzero/modules/experimental/l_infinity.py +1 -1
  73. torchzero/modules/experimental/matrix_nag.py +122 -0
  74. torchzero/modules/experimental/newton_solver.py +23 -54
  75. torchzero/modules/experimental/newtonnewton.py +45 -48
  76. torchzero/modules/experimental/reduce_outward_lr.py +7 -7
  77. torchzero/modules/experimental/scipy_newton_cg.py +21 -24
  78. torchzero/modules/experimental/spsa1.py +3 -3
  79. torchzero/modules/experimental/structural_projections.py +1 -4
  80. torchzero/modules/grad_approximation/fdm.py +2 -2
  81. torchzero/modules/grad_approximation/forward_gradient.py +7 -7
  82. torchzero/modules/grad_approximation/grad_approximator.py +23 -16
  83. torchzero/modules/grad_approximation/rfdm.py +24 -21
  84. torchzero/modules/least_squares/gn.py +121 -50
  85. torchzero/modules/line_search/backtracking.py +4 -4
  86. torchzero/modules/line_search/line_search.py +33 -33
  87. torchzero/modules/line_search/strong_wolfe.py +4 -4
  88. torchzero/modules/misc/debug.py +12 -12
  89. torchzero/modules/misc/escape.py +10 -10
  90. torchzero/modules/misc/gradient_accumulation.py +11 -79
  91. torchzero/modules/misc/homotopy.py +16 -8
  92. torchzero/modules/misc/misc.py +121 -123
  93. torchzero/modules/misc/multistep.py +52 -53
  94. torchzero/modules/misc/regularization.py +49 -44
  95. torchzero/modules/misc/split.py +31 -29
  96. torchzero/modules/misc/switch.py +37 -32
  97. torchzero/modules/momentum/averaging.py +14 -14
  98. torchzero/modules/momentum/cautious.py +37 -31
  99. torchzero/modules/momentum/momentum.py +12 -12
  100. torchzero/modules/ops/__init__.py +4 -4
  101. torchzero/modules/ops/accumulate.py +21 -21
  102. torchzero/modules/ops/binary.py +67 -66
  103. torchzero/modules/ops/higher_level.py +20 -20
  104. torchzero/modules/ops/multi.py +44 -41
  105. torchzero/modules/ops/reduce.py +26 -23
  106. torchzero/modules/ops/unary.py +53 -53
  107. torchzero/modules/ops/utility.py +47 -46
  108. torchzero/modules/{functional.py → opt_utils.py} +1 -1
  109. torchzero/modules/projections/galore.py +1 -1
  110. torchzero/modules/projections/projection.py +46 -43
  111. torchzero/modules/quasi_newton/__init__.py +1 -1
  112. torchzero/modules/quasi_newton/damping.py +2 -2
  113. torchzero/modules/quasi_newton/diagonal_quasi_newton.py +1 -1
  114. torchzero/modules/quasi_newton/lbfgs.py +10 -10
  115. torchzero/modules/quasi_newton/lsr1.py +10 -10
  116. torchzero/modules/quasi_newton/quasi_newton.py +54 -39
  117. torchzero/modules/quasi_newton/sg2.py +69 -205
  118. torchzero/modules/restarts/restars.py +39 -37
  119. torchzero/modules/second_order/__init__.py +2 -2
  120. torchzero/modules/second_order/ifn.py +31 -62
  121. torchzero/modules/second_order/inm.py +57 -53
  122. torchzero/modules/second_order/multipoint.py +40 -80
  123. torchzero/modules/second_order/newton.py +165 -196
  124. torchzero/modules/second_order/newton_cg.py +105 -157
  125. torchzero/modules/second_order/nystrom.py +216 -185
  126. torchzero/modules/second_order/rsn.py +132 -125
  127. torchzero/modules/smoothing/laplacian.py +13 -12
  128. torchzero/modules/smoothing/sampling.py +10 -10
  129. torchzero/modules/step_size/adaptive.py +24 -24
  130. torchzero/modules/step_size/lr.py +17 -17
  131. torchzero/modules/termination/termination.py +32 -30
  132. torchzero/modules/trust_region/cubic_regularization.py +3 -3
  133. torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
  134. torchzero/modules/trust_region/trust_cg.py +2 -2
  135. torchzero/modules/trust_region/trust_region.py +27 -22
  136. torchzero/modules/variance_reduction/svrg.py +23 -21
  137. torchzero/modules/weight_decay/__init__.py +2 -1
  138. torchzero/modules/weight_decay/reinit.py +83 -0
  139. torchzero/modules/weight_decay/weight_decay.py +17 -18
  140. torchzero/modules/wrappers/optim_wrapper.py +14 -14
  141. torchzero/modules/zeroth_order/cd.py +10 -7
  142. torchzero/optim/mbs.py +291 -0
  143. torchzero/optim/root.py +3 -3
  144. torchzero/optim/utility/split.py +2 -1
  145. torchzero/optim/wrappers/directsearch.py +27 -63
  146. torchzero/optim/wrappers/fcmaes.py +14 -35
  147. torchzero/optim/wrappers/mads.py +11 -31
  148. torchzero/optim/wrappers/moors.py +66 -0
  149. torchzero/optim/wrappers/nevergrad.py +4 -13
  150. torchzero/optim/wrappers/nlopt.py +31 -25
  151. torchzero/optim/wrappers/optuna.py +8 -13
  152. torchzero/optim/wrappers/pybobyqa.py +124 -0
  153. torchzero/optim/wrappers/scipy/__init__.py +7 -0
  154. torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
  155. torchzero/optim/wrappers/scipy/brute.py +48 -0
  156. torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
  157. torchzero/optim/wrappers/scipy/direct.py +69 -0
  158. torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
  159. torchzero/optim/wrappers/scipy/experimental.py +141 -0
  160. torchzero/optim/wrappers/scipy/minimize.py +151 -0
  161. torchzero/optim/wrappers/scipy/sgho.py +111 -0
  162. torchzero/optim/wrappers/wrapper.py +121 -0
  163. torchzero/utils/__init__.py +7 -25
  164. torchzero/utils/benchmarks/__init__.py +0 -0
  165. torchzero/utils/benchmarks/logistic.py +122 -0
  166. torchzero/utils/compile.py +2 -2
  167. torchzero/utils/derivatives.py +97 -73
  168. torchzero/utils/optimizer.py +4 -77
  169. torchzero/utils/python_tools.py +31 -0
  170. torchzero/utils/tensorlist.py +11 -5
  171. torchzero/utils/thoad_tools.py +68 -0
  172. {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/METADATA +1 -1
  173. torchzero-0.4.1.dist-info/RECORD +209 -0
  174. tests/test_vars.py +0 -185
  175. torchzero/core/var.py +0 -376
  176. torchzero/modules/adaptive/lmadagrad.py +0 -186
  177. torchzero/modules/experimental/momentum.py +0 -160
  178. torchzero/optim/wrappers/scipy.py +0 -572
  179. torchzero/utils/linalg/__init__.py +0 -12
  180. torchzero/utils/linalg/matrix_funcs.py +0 -87
  181. torchzero/utils/linalg/orthogonalize.py +0 -12
  182. torchzero/utils/linalg/svd.py +0 -20
  183. torchzero/utils/ops.py +0 -10
  184. torchzero-0.3.15.dist-info/RECORD +0 -175
  185. /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
  186. {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/WHEEL +0 -0
  187. {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/top_level.txt +0 -0
torchzero/{utils/linalg → linalg}/qr.py
@@ -1,8 +1,21 @@
  from typing import Literal
  import torch
- from ..compile import enable_compilation
+ from ..utils.compile import allow_compile
+
+
+ # super slow
+ # def cholesky_qr(A):
+ #     """QR of (m, n) A via cholesky of (n, n) matrix"""
+ #     AtA = A.T @ A
+
+ #     L, _ = torch.linalg.cholesky_ex(AtA) # pylint:disable=not-callable
+ #     R = L.T
+
+ #     Q = torch.linalg.solve_triangular(R.T, A.T, upper=False).T # pylint:disable=not-callable
+ #     return Q, R

  # reference - https://www.cs.cornell.edu/~bindel/class/cs6210-f09/lec18.pdf
+ @allow_compile
  def _get_w_tau(R: torch.Tensor, i: int, eps: float):
      R_ii = R[...,i,i]
      R_below = R[...,i:,i]
@@ -17,6 +30,7 @@ def _get_w_tau(R: torch.Tensor, i: int, eps: float):
      tau = torch.where(degenerate, 1, tau)
      return w, tau

+ @allow_compile
  def _qr_householder_complete(A:torch.Tensor):
      *b,m,n = A.shape
      k = min(m,n)
@@ -33,6 +47,7 @@ def _qr_householder_complete(A:torch.Tensor):

      return Q, R

+ @allow_compile
  def _qr_householder_reduced(A:torch.Tensor):
      *b,m,n = A.shape
      k = min(m,n)
@@ -64,7 +79,6 @@ def _qr_householder_reduced(A:torch.Tensor):

      return Q, R

- # @enable_compilation
  def qr_householder(A:torch.Tensor, mode: Literal['complete', 'reduced'] = 'reduced'):
      """an attempt at a QR decomposition for very tall and thin matrices that doesn't freeze; it is around n_cols times slower than torch.linalg.qr, though compilation makes it faster at the cost of recompiling for each new shape"""
      if mode == 'reduced': return _qr_householder_reduced(A)
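
For context, a minimal usage sketch of the compiled Householder QR above (illustrative, not part of the diff; the import path assumes the new torchzero.linalg layout shown in the file list):

    import torch
    from torchzero.linalg.qr import qr_householder

    A = torch.randn(1000, 8)                  # very tall and thin
    Q, R = qr_householder(A, mode='reduced')  # Q: (1000, 8), R: (8, 8)

    # Q has orthonormal columns and Q @ R reconstructs A
    assert torch.allclose(Q @ R, A, atol=1e-4)
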
torchzero/{utils/linalg → linalg}/solve.py
@@ -1,3 +1,4 @@
+ # pylint: disable = non-ascii-name
  # pyright: reportArgumentType=false
  import math
  from collections import deque
@@ -5,8 +6,8 @@ from collections.abc import Callable
  from typing import Any, NamedTuple, overload

  import torch
-
- from .. import (
+ from .linalg_utils import mm
+ from ..utils import (
      TensorList,
      generic_eq,
      generic_finfo_tiny,
@@ -15,88 +16,71 @@ from .. import (
      generic_zeros_like,
  )

-
- def _make_A_mm_reg(A_mm: Callable, reg):
-     def A_mm_reg(x): # A_mm with regularization
-         Ax = A_mm(x)
+ def _make_A_mv_reg(A_mv: Callable, reg):
+     def A_mv_reg(x): # A_mv with regularization
+         Ax = A_mv(x)
          if not generic_eq(reg, 0): Ax += x*reg
          return Ax
-     return A_mm_reg
+     return A_mv_reg

  def _identity(x): return x

-
- # https://arxiv.org/pdf/2110.02820
- def nystrom_approximation(
-     A_mm: Callable[[torch.Tensor], torch.Tensor],
-     ndim: int,
-     rank: int,
-     device,
-     dtype = torch.float32,
-     generator = None,
- ) -> tuple[torch.Tensor, torch.Tensor]:
-     omega = torch.randn((ndim, rank), device=device, dtype=dtype, generator=generator) # Gaussian test matrix
-     omega, _ = torch.linalg.qr(omega) # Thin QR decomposition # pylint:disable=not-callable
-
-     # Y = AΩ
-     Y = torch.stack([A_mm(col) for col in omega.unbind(-1)], -1) # rank matvecs
-     v = torch.finfo(dtype).eps * torch.linalg.matrix_norm(Y, ord='fro') # Compute shift # pylint:disable=not-callable
-     Yv = Y + v*omega # Shift for stability
-     C = torch.linalg.cholesky_ex(omega.mT @ Yv)[0] # pylint:disable=not-callable
-     B = torch.linalg.solve_triangular(C, Yv.mT, upper=False, unitriangular=False).mT # pylint:disable=not-callable
-     U, S, _ = torch.linalg.svd(B, full_matrices=False) # pylint:disable=not-callable
-     lambd = (S.pow(2) - v).clip(min=0) #Remove shift, compute eigs
-     return U, lambd
-
  def nystrom_sketch_and_solve(
-     A_mm: Callable[[torch.Tensor], torch.Tensor],
+     L: torch.Tensor,
+     Q: torch.Tensor,
      b: torch.Tensor,
-     rank: int,
      reg: float = 1e-3,
-     generator=None,
  ) -> torch.Tensor:
-     U, lambd = nystrom_approximation(
-         A_mm=A_mm,
-         ndim=b.size(-1),
-         rank=rank,
-         device=b.device,
-         dtype=b.dtype,
-         generator=generator,
-     )
+     """Solves ``(Q diag(L) Q.T + reg*I)x = b``. Becomes very unstable with reg smaller than about 1e-5.
+
+     Args:
+         L (torch.Tensor): eigenvalues, like from ``nystrom_approximation``
+         Q (torch.Tensor): eigenvectors, like from ``nystrom_approximation``
+         b (torch.Tensor): right hand side
+         reg (float, optional): regularization. Defaults to 1e-3.
+     """
+
      b = b.unsqueeze(-1)
-     lambd += reg
+     L += reg
      # x = (A + μI)⁻¹ b
-     # (A + μI)⁻¹ = U(Λ + μI)⁻¹Uᵀ + (1/μ)(b - UUᵀ)
-     # x = U(Λ + μI)⁻¹Uᵀb + (1/μ)(b - UUᵀb)
-     Uᵀb = U.T @ b
-     term1 = U @ ((1/lambd).unsqueeze(-1) * Uᵀb)
-     term2 = (1.0 / reg) * (b - U @ Uᵀb)
+     # (A + μI)⁻¹ = Q(L + μI)⁻¹Qᵀ + (1/μ)(I - QQᵀ)
+     # x = Q(L + μI)⁻¹Qᵀb + (1/μ)(b - QQᵀb)
+     Qᵀb = Q.T @ b
+     term1 = Q @ ((1/L).unsqueeze(-1) * Qᵀb)
+     term2 = (1.0 / reg) * (b - Q @ Qᵀb)
      return (term1 + term2).squeeze(-1)

  def nystrom_pcg(
-     A_mm: Callable[[torch.Tensor], torch.Tensor],
+     L: torch.Tensor,
+     Q: torch.Tensor,
+     A_mv: Callable[[torch.Tensor], torch.Tensor],
      b: torch.Tensor,
-     sketch_size: int,
      reg: float = 1e-6,
      x0_: torch.Tensor | None = None,
-     tol: float | None = 1e-4,
+     tol: float | None = 1e-8,
      maxiter: int | None = None,
-     generator=None,
  ) -> torch.Tensor:
-     U, lambd = nystrom_approximation(
-         A_mm=A_mm,
-         ndim=b.size(-1),
-         rank=sketch_size,
-         device=b.device,
-         dtype=b.dtype,
-         generator=generator,
-     )
-     lambd += reg
+     """Conjugate gradient preconditioned by a Nystrom approximation.
+
+     The preconditioner can be computed with a single matrix-matrix multiplication with A,
+     so this is a good choice when matrix-matrix products are efficient (e.g. batched hessian-vector products in pytorch).
+
+     Args:
+         L (torch.Tensor): eigenvalues of approximation of A, like from ``nystrom_approximation``
+         Q (torch.Tensor): eigenvectors of approximation of A, like from ``nystrom_approximation``
+         A_mv (Callable[[torch.Tensor], torch.Tensor]): mat-vec func with hessian
+         b (torch.Tensor): right hand side
+         reg (float, optional): regularization. Defaults to 1e-6.
+         x0_ (torch.Tensor | None, optional): initial guess (modified in-place). Defaults to None.
+         tol (float | None, optional): tolerance for convergence. Defaults to 1e-8.
+         maxiter (int | None, optional): maximum number of iterations. Defaults to None.
+     """
+     L += reg
      eps = torch.finfo(b.dtype).tiny * 2
      if tol is None: tol = eps

-     def A_mm_reg(x): # A_mm with regularization
-         Ax = A_mm(x)
+     def A_mv_reg(x): # A_mv with regularization
+         Ax = A_mv(x)
          if reg != 0: Ax += x*reg
          return Ax

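To make the API change above concrete: ``nystrom_sketch_and_solve`` no longer builds the sketch internally, it takes precomputed eigenpairs ``(L, Q)``. A minimal sketch (illustrative, not part of the diff), using an exact ``eigh`` as a stand-in for ``nystrom_approximation`` and assuming the module lives at ``torchzero.linalg.solve``:

    import torch
    from torchzero.linalg.solve import nystrom_sketch_and_solve

    n = 100
    M = torch.randn(n, n)
    A = M @ M.T + torch.eye(n)    # SPD test matrix
    b = torch.randn(n)

    L, Q = torch.linalg.eigh(A)   # stand-in for nystrom_approximation
    x = nystrom_sketch_and_solve(L, Q, b, reg=1e-3)  # note: modifies L in-place (L += reg)

    # x approximately solves (A + reg*I) x = b
    print(torch.linalg.vector_norm(A @ x + 1e-3 * x - b))
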
@@ -104,10 +88,10 @@ def nystrom_pcg(
      if x0_ is None: x0_ = torch.zeros_like(b)

      x = x0_
-     residual = b - A_mm_reg(x)
+     residual = b - A_mv_reg(x)
      # z0 = P⁻¹ r0
-     term1 = lambd[...,-1] * U * (1/lambd.unsqueeze(-2)) @ U.mT
-     term2 = torch.eye(U.size(-2), device=U.device,dtype=U.dtype) - U@U.mT
+     term1 = L[...,-1] * Q * (1/L.unsqueeze(-2)) @ Q.mT
+     term2 = torch.eye(Q.size(-2), device=Q.device,dtype=Q.dtype) - Q@Q.mT
      P_inv = term1 + term2
      z = P_inv @ residual
      p = z.clone() # search direction
@@ -116,7 +100,7 @@
      if init_norm < tol: return x
      k = 0
      while True:
-         Ap = A_mm_reg(p)
+         Ap = A_mv_reg(p)
          rz = residual.dot(z)
          step_size = rz / p.dot(Ap)
          x += step_size * p
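
Likewise for ``nystrom_pcg``, the caller now supplies the low-rank factors plus a matvec closure. A sketch (illustrative, not part of the diff), with a truncated exact eigendecomposition standing in for the randomized Nystrom sketch:

    import torch
    from torchzero.linalg.solve import nystrom_pcg

    n, rank = 500, 10
    M = torch.randn(n, n)
    A = M @ M.T / n + torch.eye(n)
    b = torch.randn(n)

    L_full, Q_full = torch.linalg.eigh(A)      # eigenvalues in ascending order
    L, Q = L_full[-rank:], Q_full[:, -rank:]   # keep the top-`rank` eigenpairs

    x = nystrom_pcg(L, Q, A_mv=lambda v: A @ v, b=b, reg=1e-6, maxiter=n)
    print(torch.linalg.vector_norm(A @ x + 1e-6 * x - b))  # small residual
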
@@ -138,7 +122,7 @@ def _safe_clip(x: torch.Tensor):
      if x.abs() < eps: return x.new_full(x.size(), eps).copysign(x)
      return x

- def _trust_tau(x,d,trust_radius):
+ def _trust_tau(x, d, trust_radius):
      xx = x.dot(x)
      xd = x.dot(d)
      dd = _safe_clip(d.dot(d))
@@ -150,10 +134,10 @@


  class CG:
-     """Conjugate gradient method.
+     """Conjugate gradient method, optionally with a norm constraint.

      Args:
-         A_mm (Callable[[torch.Tensor], torch.Tensor] | torch.Tensor): Callable that returns matvec ``Ax``.
+         A_mv (Callable[[torch.Tensor], torch.Tensor] | torch.Tensor): Callable that returns matvec ``Ax``.
          b (torch.Tensor): right hand side
          x0 (torch.Tensor | None, optional): initial guess, defaults to zeros. Defaults to None.
          tol (float | None, optional): tolerance for convergence. Defaults to 1e-8.
@@ -174,10 +158,10 @@ class CG:
      """
      def __init__(
          self,
-         A_mm: Callable,
+         A_mv: Callable,
          b: torch.Tensor | TensorList,
          x0: torch.Tensor | TensorList | None = None,
-         tol: float | None = 1e-4,
+         tol: float | None = 1e-8,
          maxiter: int | None = None,
          reg: float = 0,
          trust_radius: float | None = None,
@@ -187,7 +171,7 @@ class CG:
          P_mm: Callable | None = None,
      ):
          # --------------------------------- set attrs -------------------------------- #
-         self.A_mm = _make_A_mm_reg(A_mm, reg)
+         self.A_mv = _make_A_mv_reg(A_mv, reg)
          self.b = b
          if tol is None: tol = generic_finfo_tiny(b) * 2
          self.tol = tol
@@ -214,7 +198,7 @@ class CG:
              self.r = b
          else:
              self.x = x0
-             self.r = b - A_mm(self.x)
+             self.r = b - A_mv(self.x)

          self.z = self.P_mm(self.r)
          self.d = self.z
@@ -229,7 +213,7 @@ class CG:
          if self.iter >= self.maxiter:
              return x, True

-         Ad = self.A_mm(d)
+         Ad = self.A_mv(d)
          dAd = d.dot(Ad)

          # check negative curvature
@@ -289,7 +273,8 @@ class CG:
          return sol

  def find_within_trust_radius(history, trust_radius: float):
-     """find first ``x`` in history that exceeds trust radius, if no such ``x`` exists, returns ``None``"""
+     """find the first ``x`` in history that exceeds the trust radius and return the solution within it;
+     if no such ``x`` exists, returns ``None``"""
      for x, x_norm, d in reversed(tuple(history)):
          if x_norm <= trust_radius:
              return _trust_tau(x, d, trust_radius)
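
Background on the trust-radius helpers above (an illustrative derivation, not part of the diff): judging by its use in ``find_within_trust_radius``, ``_trust_tau`` returns the boundary point along the current direction, i.e. the step ``x + tau*d`` with the ``tau >= 0`` satisfying ``||x + tau*d|| = trust_radius``, the positive root of a quadratic:

    import torch

    # ||x + tau*d||^2 = Delta^2
    # <=> (d.d)*tau^2 + 2*(x.d)*tau + (x.x - Delta^2) = 0
    x = torch.tensor([1.0, 0.0])
    d = torch.tensor([1.0, 1.0])
    Delta = 2.0

    xx, xd, dd = x.dot(x), x.dot(d), d.dot(d)
    tau = (-xd + torch.sqrt(xd**2 + dd * (Delta**2 - xx))) / dd
    print(torch.linalg.vector_norm(x + tau * d))  # equals Delta
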
@@ -306,7 +291,7 @@ class _TensorListSolution(NamedTuple):

  @overload
  def cg(
-     A_mm: Callable[[torch.Tensor], torch.Tensor],
+     A_mv: Callable[[torch.Tensor], torch.Tensor],
      b: torch.Tensor,
      x0: torch.Tensor | None = None,
      tol: float | None = 1e-8,
@@ -320,7 +305,7 @@ def cg(
  ) -> _TensorSolution: ...
  @overload
  def cg(
-     A_mm: Callable[[TensorList], TensorList],
+     A_mv: Callable[[TensorList], TensorList],
      b: TensorList,
      x0: TensorList | None = None,
      tol: float | None = 1e-8,
@@ -333,7 +318,7 @@ def cg(
      P_mm: Callable[[TensorList], TensorList] | None = None
  ) -> _TensorListSolution: ...
  def cg(
-     A_mm: Callable,
+     A_mv: Callable,
      b: torch.Tensor | TensorList,
      x0: torch.Tensor | TensorList | None = None,
      tol: float | None = 1e-8,
@@ -346,7 +331,7 @@ def cg(
      P_mm: Callable | None = None
  ):
      solver = CG(
-         A_mm=A_mm,
+         A_mv=A_mv,
          b=b,
          x0=x0,
          tol=tol,
@@ -370,10 +355,10 @@ def cg(
  # Liu, Yang, and Fred Roosta. "MINRES: From negative curvature detection to monotonicity properties." SIAM Journal on Optimization 32.4 (2022): 2636-2661.
  @overload
  def minres(
-     A_mm: Callable[[torch.Tensor], torch.Tensor] | torch.Tensor,
+     A_mv: Callable[[torch.Tensor], torch.Tensor] | torch.Tensor,
      b: torch.Tensor,
      x0: torch.Tensor | None = None,
-     tol: float | None = 1e-4,
+     tol: float | None = 1e-8,
      maxiter: int | None = None,
      reg: float = 0,
      npc_terminate: bool=True,
@@ -381,26 +366,27 @@ def minres(
  ) -> torch.Tensor: ...
  @overload
  def minres(
-     A_mm: Callable[[TensorList], TensorList],
+     A_mv: Callable[[TensorList], TensorList],
      b: TensorList,
      x0: TensorList | None = None,
-     tol: float | None = 1e-4,
+     tol: float | None = 1e-8,
      maxiter: int | None = None,
      reg: float | list[float] | tuple[float] = 0,
      npc_terminate: bool=True,
      trust_radius: float | None = None,
  ) -> TensorList: ...
  def minres(
-     A_mm,
+     A_mv,
      b,
      x0: torch.Tensor | TensorList | None = None,
-     tol: float | None = 1e-4,
+     tol: float | None = 1e-8,
      maxiter: int | None = None,
      reg: float | list[float] | tuple[float] = 0,
      npc_terminate: bool=True,
      trust_radius: float | None = None, # trust region is experimental
  ):
-     A_mm_reg = _make_A_mm_reg(A_mm, reg)
+     """MINRES (experimental)"""
+     A_mv_reg = _make_A_mv_reg(A_mv, reg)
      eps = math.sqrt(generic_finfo_tiny(b) * 2)
      if tol is None: tol = eps
@@ -409,7 +395,7 @@ def minres(
          R = b
          x0 = generic_zeros_like(b)
      else:
-         R = b - A_mm_reg(x0)
+         R = b - A_mv_reg(x0)

      X: Any = x0
      beta = b_norm = generic_vector_norm(b)
@@ -429,7 +415,7 @@

      for _ in range(maxiter):

-         P = A_mm_reg(V)
+         P = A_mv_reg(V)
          alpha = V.dot(P)
          P -= beta*V_prev
          P -= alpha*V
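
A minimal sketch (illustrative, not part of the diff) of the renamed solver entry points; ``A_mv`` emphasizes that the callable is a matrix-vector product:

    import torch
    from torchzero.linalg.solve import cg, minres

    n = 50
    M = torch.randn(n, n)
    A = M @ M.T + torch.eye(n)   # SPD, so plain CG applies
    b = torch.randn(n)

    sol = cg(A_mv=lambda v: A @ v, b=b, tol=1e-8)    # returns a solution NamedTuple
    x = minres(A_mv=lambda v: A @ v, b=b, tol=1e-8)  # returns the solution directly
    print(torch.linalg.vector_norm(A @ x - b))
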
torchzero/linalg/svd.py
@@ -0,0 +1,47 @@
+ import torch
+
+ from . import torch_linalg
+
+
+ def tall_reduced_svd_via_eigh(A: torch.Tensor, tol: float = 0, retry_float64:bool=False):
+     """
+     Given a tall matrix A of size (m, n), computes U and S from the reduced SVD(A)
+     using the eigendecomposition of an (n, n) matrix, which is faster than direct SVD when m >= n.
+
+     This truncates small singular values that would cause NaNs,
+     so the returned U and S can have reduced dimension ``k <= n``.
+
+     Returns U of size ``(m, k)`` and S of size ``(k, )``.
+
+     Args:
+         A (torch.Tensor): A tall matrix of size (m, n) with m >= n.
+         tol (float): Tolerance for truncating small singular values. Singular values
+             less than ``tol * max_singular_value`` will be discarded.
+     """
+     # if m < n, A.T A will be low rank and we can't use eigh
+     m, n = A.size()
+     if m < n:
+         U, S, V = torch_linalg.svd(A, full_matrices=False, retry_float64=retry_float64)
+         return U, S
+
+     M = A.mH @ A # n,n
+
+     try:
+         L, Q = torch_linalg.eigh(M, retry_float64=retry_float64)
+     except torch.linalg.LinAlgError:
+         U, S, V = torch_linalg.svd(A, full_matrices=False, retry_float64=retry_float64)
+         return U, S
+
+     L = torch.flip(L, dims=[-1])
+     Q = torch.flip(Q, dims=[-1])
+
+     indices = L > tol * L[0] # L[0] is the max eigenvalue
+     L = L[indices]
+     Q = Q[:, indices]
+
+     S = L.sqrt()
+     U = (A @ Q) / S
+
+     return U, S
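
A quick check (illustrative, not part of the diff) that the eigh-based path agrees with direct SVD on a well-conditioned tall matrix; U is compared via its projector because singular vectors are only determined up to sign:

    import torch
    from torchzero.linalg.svd import tall_reduced_svd_via_eigh

    A = torch.randn(200, 10, dtype=torch.float64)
    U, S = tall_reduced_svd_via_eigh(A)

    U_ref, S_ref, Vh_ref = torch.linalg.svd(A, full_matrices=False)
    print(torch.allclose(S, S_ref, atol=1e-8))                    # same singular values
    print(torch.allclose(U @ U.mT, U_ref @ U_ref.mT, atol=1e-8))  # same column space
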
torchzero/linalg/torch_linalg.py
@@ -0,0 +1,168 @@
+ """torch linalg with correct typing and retries in float64"""
+ from typing import NamedTuple
+
+ import torch
+
+
+ def cholesky(A: torch.Tensor, *, upper=False, retry_float64:bool=False) -> torch.Tensor:
+     """A - SPD, returns lower triangular L such that ``A = L @ L.mH``; L can also be passed to ``torch.cholesky_solve``"""
+     try:
+         return torch.linalg.cholesky(A, upper=upper) # pylint:disable=not-callable
+
+     except torch.linalg.LinAlgError as e:
+         if not retry_float64: raise e
+         dtype = A.dtype
+         if dtype == torch.float64: raise e
+         return cholesky(A.to(torch.float64), upper=upper, retry_float64=False).to(dtype)
+
+
+ class _QRTuple(NamedTuple):
+     Q: torch.Tensor
+     R: torch.Tensor
+
+ def qr(A: torch.Tensor, mode='reduced', retry_float64:bool=False) -> _QRTuple:
+     """A - any matrix ``(*, m, n)`` (for some reason it sometimes takes ages on some matrices)
+
+     ### Returns (if mode = "reduced"):
+
+     Q: ``(*, m, k)`` - orthogonal
+
+     R: ``(*, k, n)`` - upper triangular
+
+     where ``k = min(m,n)``
+     """
+     try:
+         return torch.linalg.qr(A, mode=mode) # pylint:disable=not-callable
+
+     except torch.linalg.LinAlgError as e:
+         if not retry_float64: raise e
+         dtype = A.dtype
+         if dtype == torch.float64: raise e
+         Q, R = qr(A.to(torch.float64), mode=mode, retry_float64=False)
+         return _QRTuple(Q=Q.to(dtype), R=R.to(dtype))
+
+ def eigh(A: torch.Tensor, UPLO="L", retry_float64:bool=False) -> tuple[torch.Tensor, torch.Tensor]:
+     """A - symmetric, returns ``(L, Q)`` with ``A = Q @ torch.diag(L) @ Q.mH``; this is faster than SVD"""
+     try:
+         return torch.linalg.eigh(A, UPLO=UPLO) # pylint:disable=not-callable
+
+     except torch.linalg.LinAlgError as e:
+         if not retry_float64: raise e
+         dtype = A.dtype
+         if dtype == torch.float64: raise e
+         L, Q = eigh(A.to(torch.float64), UPLO=UPLO, retry_float64=False)
+         return L.to(dtype), Q.to(dtype)
+
+
+
+ class _SVDTuple(NamedTuple):
+     U: torch.Tensor
+     S: torch.Tensor
+     Vh: torch.Tensor
+
+ def svd(A: torch.Tensor, full_matrices=True, driver=None, retry_float64:bool=False) -> _SVDTuple:
+     """A - any matrix ``(*, m, n)``, but slows down if A isn't well conditioned; ``A = U @ torch.diag(S) @ Vh``
+
+     Don't forget to set ``full_matrices=False``
+
+     ### Returns:
+
+     U: ``(*, m, m)`` or ``(*, m, k)`` - orthogonal
+
+     S: ``(*, k,)`` - singular values
+
+     V^H: ``(*, n, n)`` or ``(*, k, n)`` - orthogonal
+
+     where ``k = min(m,n)``
+
+     ### Drivers
+
+     drivers are only supported on CUDA, so A is moved to CUDA by this function if needed
+
+     from docs:
+
+     If A is well-conditioned (its condition number is not too large), or you do not mind some precision loss:
+
+     For a general matrix: ‘gesvdj’ (Jacobi method)
+
+     If A is tall or wide (m >> n or m << n): ‘gesvda’ (Approximate method)
+
+     If A is not well-conditioned or precision is relevant: ‘gesvd’ (QR based)
+
+     By default (driver=None), we call ‘gesvdj’ and, if it fails, we fall back to ‘gesvd’.
+     """
+     # drivers are only for CUDA
+     # also the only one that doesn't freeze is ‘gesvda’
+     device = None
+     if driver is not None:
+         device = A.device
+         A = A.cuda()
+
+     try:
+         U, S, Vh = torch.linalg.svd(A, full_matrices=full_matrices, driver=driver) # pylint:disable=not-callable
+         if device is not None:
+             U = U.to(device); S = S.to(device); Vh = Vh.to(device)
+         return _SVDTuple(U=U, S=S, Vh=Vh)
+
+     except torch.linalg.LinAlgError as e:
+         if not retry_float64: raise e
+         dtype = A.dtype
+         if dtype == torch.float64: raise e
+         U, S, Vh = svd(A.to(torch.float64), full_matrices=full_matrices, driver=driver, retry_float64=False)
+         return _SVDTuple(U=U.to(dtype), S=S.to(dtype), Vh=Vh.to(dtype))
+
+ def solve(A: torch.Tensor, B: torch.Tensor, left:bool=True, retry_float64:bool=False) -> torch.Tensor:
+     """I think this uses LU"""
+     try:
+         return torch.linalg.solve(A, B, left=left) # pylint:disable=not-callable
+
+     except torch.linalg.LinAlgError as e:
+         if not retry_float64: raise e
+         dtype = A.dtype
+         if dtype == torch.float64: raise e
+         return solve(A.to(torch.float64), B.to(torch.float64), left=left, retry_float64=False).to(dtype)
+
+ class _SolveExTuple(NamedTuple):
+     result: torch.Tensor
+     info: int
+
+ def solve_ex(A: torch.Tensor, B: torch.Tensor, left:bool=True, retry_float64:bool=False) -> _SolveExTuple:
+     """I think this uses LU"""
+     result, info = torch.linalg.solve_ex(A, B, left=left) # pylint:disable=not-callable
+
+     if info != 0:
+         if not retry_float64: return _SolveExTuple(result, info)
+         dtype = A.dtype
+         if dtype == torch.float64: return _SolveExTuple(result, info)
+         result, info = solve_ex(A.to(torch.float64), B.to(torch.float64), left=left, retry_float64=False)
+         return _SolveExTuple(result.to(dtype), info)
+
+     return _SolveExTuple(result, info)
+
+ def inv(A: torch.Tensor, retry_float64:bool=False) -> torch.Tensor:
+     try:
+         return torch.linalg.inv(A) # pylint:disable=not-callable
+
+     except torch.linalg.LinAlgError as e:
+         if not retry_float64: raise e
+         dtype = A.dtype
+         if dtype == torch.float64: raise e
+         return inv(A.to(torch.float64), retry_float64=False).to(dtype)
+
+
+ class _InvExTuple(NamedTuple):
+     inverse: torch.Tensor
+     info: int
+
+ def inv_ex(A: torch.Tensor, *, check_errors=False, retry_float64:bool=False) -> _InvExTuple:
+     """retries in float64; if that also fails, ``info`` will be nonzero"""
+     inverse, info = torch.linalg.inv_ex(A, check_errors=check_errors) # pylint:disable=not-callable
+
+     if info != 0:
+         if not retry_float64: return _InvExTuple(inverse, info)
+         dtype = A.dtype
+         if dtype == torch.float64: return _InvExTuple(inverse, info)
+         inverse, info = inv_ex(A.to(torch.float64), retry_float64=False)
+         return _InvExTuple(inverse.to(dtype), info)
+
+     return _InvExTuple(inverse, info)
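
All of the wrappers above follow the same retry pattern: attempt the decomposition in the input dtype, and on ``LinAlgError`` (or a nonzero ``info`` for the ``_ex`` variants) redo it in float64 and cast the result back. A usage sketch (illustrative, not part of the diff):

    import torch
    from torchzero.linalg import torch_linalg

    A = torch.randn(64, 64)
    A = A @ A.T + 1e-3 * torch.eye(64)   # SPD, but possibly borderline in float32

    # torch.linalg.cholesky would raise LinAlgError if the float32 factorization
    # fails; with retry_float64=True it is redone in float64 and cast back
    L = torch_linalg.cholesky(A, retry_float64=True)
    print(L.dtype)  # torch.float32

    evals, evecs = torch_linalg.eigh(A, retry_float64=True)
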
torchzero/modules/__init__.py
@@ -1,4 +1,6 @@
  from . import experimental
+ from .adaptive import *
+ from .adaptive import lre_optimizers as lre
  from .clipping import *
  from .conjugate_gradient import *
  from .grad_approximation import *
@@ -7,9 +9,9 @@ from .line_search import *
  from .misc import *
  from .momentum import *
  from .ops import *
- from .adaptive import *
  from .projections import *
  from .quasi_newton import *
+ from .restarts import *
  from .second_order import *
  from .smoothing import *
  from .step_size import *
@@ -18,5 +20,4 @@ from .trust_region import *
  from .variance_reduction import *
  from .weight_decay import *
  from .wrappers import *
- from .restarts import *
- from .zeroth_order import *
+ from .zeroth_order import *
torchzero/modules/adaptive/__init__.py
@@ -1,4 +1,5 @@
- from .adagrad import Adagrad, FullMatrixAdagrad, AdagradNorm
+ from . import lre_optimizers
+ from .adagrad import Adagrad, AdagradNorm, FullMatrixAdagrad

  # from .curveball import CurveBall
  # from .spectral import SpectralPreconditioner
@@ -8,14 +9,21 @@ from .adan import Adan
  from .adaptive_heavyball import AdaptiveHeavyBall
  from .aegd import AEGD
  from .esgd import ESGD
- from .lmadagrad import LMAdagrad
  from .lion import Lion
+ from .ggt import GGT
  from .mars import MARSCorrection
  from .matrix_momentum import MatrixMomentum
- from .msam import MSAM, MSAMObjective
+ from .msam import MSAM, MSAMMomentum
  from .muon import DualNormCorrection, MuonAdjustLR, Orthogonalize, orthogonalize_grads_
  from .natural_gradient import NaturalGradient
  from .orthograd import OrthoGrad, orthograd_
+ from .psgd import (
+     PSGDDenseNewton,
+     PSGDKronNewton,
+     PSGDKronWhiten,
+     PSGDLRANewton,
+     PSGDLRAWhiten,
+ )
  from .rmsprop import RMSprop
  from .rprop import (
      BacktrackOnSignChange,