torchzero 0.3.14__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. tests/test_identical.py +2 -2
  2. tests/test_module_autograd.py +586 -0
  3. tests/test_objective.py +188 -0
  4. tests/test_opts.py +47 -36
  5. tests/test_tensorlist.py +0 -8
  6. tests/test_utils_optimizer.py +0 -1
  7. torchzero/__init__.py +1 -1
  8. torchzero/core/__init__.py +8 -2
  9. torchzero/core/chain.py +47 -0
  10. torchzero/core/functional.py +103 -0
  11. torchzero/core/modular.py +233 -0
  12. torchzero/core/module.py +132 -643
  13. torchzero/core/objective.py +948 -0
  14. torchzero/core/reformulation.py +56 -23
  15. torchzero/core/transform.py +261 -365
  16. torchzero/linalg/__init__.py +10 -0
  17. torchzero/linalg/eigh.py +34 -0
  18. torchzero/linalg/linalg_utils.py +14 -0
  19. torchzero/{utils/linalg → linalg}/linear_operator.py +132 -34
  20. torchzero/linalg/matrix_power.py +28 -0
  21. torchzero/linalg/orthogonalize.py +95 -0
  22. torchzero/{utils/linalg → linalg}/qr.py +4 -2
  23. torchzero/{utils/linalg → linalg}/solve.py +76 -88
  24. torchzero/linalg/svd.py +20 -0
  25. torchzero/linalg/torch_linalg.py +168 -0
  26. torchzero/modules/__init__.py +0 -1
  27. torchzero/modules/adaptive/__init__.py +1 -1
  28. torchzero/modules/adaptive/adagrad.py +163 -213
  29. torchzero/modules/adaptive/adahessian.py +74 -103
  30. torchzero/modules/adaptive/adam.py +53 -76
  31. torchzero/modules/adaptive/adan.py +49 -30
  32. torchzero/modules/adaptive/adaptive_heavyball.py +11 -6
  33. torchzero/modules/adaptive/aegd.py +12 -12
  34. torchzero/modules/adaptive/esgd.py +98 -119
  35. torchzero/modules/adaptive/lion.py +5 -10
  36. torchzero/modules/adaptive/lmadagrad.py +87 -32
  37. torchzero/modules/adaptive/mars.py +5 -5
  38. torchzero/modules/adaptive/matrix_momentum.py +47 -51
  39. torchzero/modules/adaptive/msam.py +70 -52
  40. torchzero/modules/adaptive/muon.py +59 -124
  41. torchzero/modules/adaptive/natural_gradient.py +33 -28
  42. torchzero/modules/adaptive/orthograd.py +11 -15
  43. torchzero/modules/adaptive/rmsprop.py +83 -75
  44. torchzero/modules/adaptive/rprop.py +48 -47
  45. torchzero/modules/adaptive/sam.py +55 -45
  46. torchzero/modules/adaptive/shampoo.py +123 -129
  47. torchzero/modules/adaptive/soap.py +207 -143
  48. torchzero/modules/adaptive/sophia_h.py +106 -130
  49. torchzero/modules/clipping/clipping.py +15 -18
  50. torchzero/modules/clipping/ema_clipping.py +31 -25
  51. torchzero/modules/clipping/growth_clipping.py +14 -17
  52. torchzero/modules/conjugate_gradient/cg.py +26 -37
  53. torchzero/modules/experimental/__init__.py +3 -6
  54. torchzero/modules/experimental/coordinate_momentum.py +36 -0
  55. torchzero/modules/experimental/curveball.py +25 -41
  56. torchzero/modules/experimental/gradmin.py +2 -2
  57. torchzero/modules/{higher_order → experimental}/higher_order_newton.py +14 -40
  58. torchzero/modules/experimental/newton_solver.py +22 -53
  59. torchzero/modules/experimental/newtonnewton.py +20 -17
  60. torchzero/modules/experimental/reduce_outward_lr.py +7 -7
  61. torchzero/modules/experimental/scipy_newton_cg.py +21 -24
  62. torchzero/modules/experimental/spsa1.py +5 -5
  63. torchzero/modules/experimental/structural_projections.py +1 -4
  64. torchzero/modules/functional.py +8 -1
  65. torchzero/modules/grad_approximation/forward_gradient.py +7 -7
  66. torchzero/modules/grad_approximation/grad_approximator.py +23 -16
  67. torchzero/modules/grad_approximation/rfdm.py +20 -17
  68. torchzero/modules/least_squares/gn.py +90 -42
  69. torchzero/modules/line_search/__init__.py +1 -1
  70. torchzero/modules/line_search/_polyinterp.py +3 -1
  71. torchzero/modules/line_search/adaptive.py +3 -3
  72. torchzero/modules/line_search/backtracking.py +3 -3
  73. torchzero/modules/line_search/interpolation.py +160 -0
  74. torchzero/modules/line_search/line_search.py +42 -51
  75. torchzero/modules/line_search/strong_wolfe.py +5 -5
  76. torchzero/modules/misc/debug.py +12 -12
  77. torchzero/modules/misc/escape.py +10 -10
  78. torchzero/modules/misc/gradient_accumulation.py +10 -78
  79. torchzero/modules/misc/homotopy.py +16 -8
  80. torchzero/modules/misc/misc.py +120 -122
  81. torchzero/modules/misc/multistep.py +63 -61
  82. torchzero/modules/misc/regularization.py +49 -44
  83. torchzero/modules/misc/split.py +30 -28
  84. torchzero/modules/misc/switch.py +37 -32
  85. torchzero/modules/momentum/averaging.py +14 -14
  86. torchzero/modules/momentum/cautious.py +34 -28
  87. torchzero/modules/momentum/momentum.py +11 -11
  88. torchzero/modules/ops/__init__.py +4 -4
  89. torchzero/modules/ops/accumulate.py +21 -21
  90. torchzero/modules/ops/binary.py +67 -66
  91. torchzero/modules/ops/higher_level.py +19 -19
  92. torchzero/modules/ops/multi.py +44 -41
  93. torchzero/modules/ops/reduce.py +26 -23
  94. torchzero/modules/ops/unary.py +53 -53
  95. torchzero/modules/ops/utility.py +47 -46
  96. torchzero/modules/projections/galore.py +1 -1
  97. torchzero/modules/projections/projection.py +43 -43
  98. torchzero/modules/quasi_newton/__init__.py +2 -0
  99. torchzero/modules/quasi_newton/damping.py +1 -1
  100. torchzero/modules/quasi_newton/lbfgs.py +7 -7
  101. torchzero/modules/quasi_newton/lsr1.py +7 -7
  102. torchzero/modules/quasi_newton/quasi_newton.py +25 -16
  103. torchzero/modules/quasi_newton/sg2.py +292 -0
  104. torchzero/modules/restarts/restars.py +26 -24
  105. torchzero/modules/second_order/__init__.py +6 -3
  106. torchzero/modules/second_order/ifn.py +58 -0
  107. torchzero/modules/second_order/inm.py +101 -0
  108. torchzero/modules/second_order/multipoint.py +40 -80
  109. torchzero/modules/second_order/newton.py +105 -228
  110. torchzero/modules/second_order/newton_cg.py +102 -154
  111. torchzero/modules/second_order/nystrom.py +158 -178
  112. torchzero/modules/second_order/rsn.py +237 -0
  113. torchzero/modules/smoothing/laplacian.py +13 -12
  114. torchzero/modules/smoothing/sampling.py +11 -10
  115. torchzero/modules/step_size/adaptive.py +23 -23
  116. torchzero/modules/step_size/lr.py +15 -15
  117. torchzero/modules/termination/termination.py +32 -30
  118. torchzero/modules/trust_region/cubic_regularization.py +2 -2
  119. torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
  120. torchzero/modules/trust_region/trust_cg.py +1 -1
  121. torchzero/modules/trust_region/trust_region.py +27 -22
  122. torchzero/modules/variance_reduction/svrg.py +21 -18
  123. torchzero/modules/weight_decay/__init__.py +2 -1
  124. torchzero/modules/weight_decay/reinit.py +83 -0
  125. torchzero/modules/weight_decay/weight_decay.py +12 -13
  126. torchzero/modules/wrappers/optim_wrapper.py +57 -50
  127. torchzero/modules/zeroth_order/cd.py +9 -6
  128. torchzero/optim/root.py +3 -3
  129. torchzero/optim/utility/split.py +2 -1
  130. torchzero/optim/wrappers/directsearch.py +27 -63
  131. torchzero/optim/wrappers/fcmaes.py +14 -35
  132. torchzero/optim/wrappers/mads.py +11 -31
  133. torchzero/optim/wrappers/moors.py +66 -0
  134. torchzero/optim/wrappers/nevergrad.py +4 -4
  135. torchzero/optim/wrappers/nlopt.py +31 -25
  136. torchzero/optim/wrappers/optuna.py +6 -13
  137. torchzero/optim/wrappers/pybobyqa.py +124 -0
  138. torchzero/optim/wrappers/scipy/__init__.py +7 -0
  139. torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
  140. torchzero/optim/wrappers/scipy/brute.py +48 -0
  141. torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
  142. torchzero/optim/wrappers/scipy/direct.py +69 -0
  143. torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
  144. torchzero/optim/wrappers/scipy/experimental.py +141 -0
  145. torchzero/optim/wrappers/scipy/minimize.py +151 -0
  146. torchzero/optim/wrappers/scipy/sgho.py +111 -0
  147. torchzero/optim/wrappers/wrapper.py +121 -0
  148. torchzero/utils/__init__.py +7 -25
  149. torchzero/utils/compile.py +2 -2
  150. torchzero/utils/derivatives.py +112 -88
  151. torchzero/utils/optimizer.py +4 -77
  152. torchzero/utils/python_tools.py +31 -0
  153. torchzero/utils/tensorlist.py +11 -5
  154. torchzero/utils/thoad_tools.py +68 -0
  155. {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/METADATA +1 -1
  156. torchzero-0.4.0.dist-info/RECORD +191 -0
  157. tests/test_vars.py +0 -185
  158. torchzero/modules/experimental/momentum.py +0 -160
  159. torchzero/modules/higher_order/__init__.py +0 -1
  160. torchzero/optim/wrappers/scipy.py +0 -572
  161. torchzero/utils/linalg/__init__.py +0 -12
  162. torchzero/utils/linalg/matrix_funcs.py +0 -87
  163. torchzero/utils/linalg/orthogonalize.py +0 -12
  164. torchzero/utils/linalg/svd.py +0 -20
  165. torchzero/utils/ops.py +0 -10
  166. torchzero-0.3.14.dist-info/RECORD +0 -167
  167. /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
  168. {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/WHEEL +0 -0
  169. {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/top_level.txt +0 -0
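The biggest structural change in 0.4.0 is the promotion of the linear-algebra utilities to a top-level package: items 16-25 (together with the removals in items 161-164, and the unchanged move of benchmark.py in item 167) show `torchzero/utils/linalg/` becoming `torchzero/linalg/`, with `linear_operator.py`, `qr.py`, and `solve.py` carried over. A minimal migration sketch for downstream code, assuming the moved modules keep their public names (only the package paths are taken from the file list above):

    # before, torchzero 0.3.14 (old path, per items 161-164)
    # from torchzero.utils.linalg.solve import cg

    # after, torchzero 0.4.0 (new path, per items 19-25)
    from torchzero.linalg.solve import cg

The diffs below cover a subset of these files: `solve.py` (item 23), the new `svd.py` (item 24) and `torch_linalg.py` (item 25), and the two `__init__.py` changes in items 26-27.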
--- a/torchzero/utils/linalg/solve.py
+++ b/torchzero/linalg/solve.py
@@ -1,3 +1,4 @@
+# pylint: disable = non-ascii-name
 # pyright: reportArgumentType=false
 import math
 from collections import deque
@@ -5,8 +6,8 @@ from collections.abc import Callable
 from typing import Any, NamedTuple, overload
 
 import torch
-
-from .. import (
+from .linalg_utils import mm
+from ..utils import (
     TensorList,
     generic_eq,
     generic_finfo_tiny,
@@ -15,88 +16,73 @@ from .. import (
     generic_zeros_like,
 )
 
-
-def _make_A_mm_reg(A_mm: Callable, reg):
-    def A_mm_reg(x): # A_mm with regularization
-        Ax = A_mm(x)
+def _make_A_mv_reg(A_mv: Callable, reg):
+    def A_mv_reg(x): # A_mm with regularization
+        Ax = A_mv(x)
         if not generic_eq(reg, 0): Ax += x*reg
         return Ax
-    return A_mm_reg
+    return A_mv_reg
 
 def _identity(x): return x
 
-
-# https://arxiv.org/pdf/2110.02820
-def nystrom_approximation(
-    A_mm: Callable[[torch.Tensor], torch.Tensor],
-    ndim: int,
-    rank: int,
-    device,
-    dtype = torch.float32,
-    generator = None,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    omega = torch.randn((ndim, rank), device=device, dtype=dtype, generator=generator) # Gaussian test matrix
-    omega, _ = torch.linalg.qr(omega) # Thin QR decomposition # pylint:disable=not-callable
-
-    # Y = AΩ
-    Y = torch.stack([A_mm(col) for col in omega.unbind(-1)], -1) # rank matvecs
-    v = torch.finfo(dtype).eps * torch.linalg.matrix_norm(Y, ord='fro') # Compute shift # pylint:disable=not-callable
-    Yv = Y + v*omega # Shift for stability
-    C = torch.linalg.cholesky_ex(omega.mT @ Yv)[0] # pylint:disable=not-callable
-    B = torch.linalg.solve_triangular(C, Yv.mT, upper=False, unitriangular=False).mT # pylint:disable=not-callable
-    U, S, _ = torch.linalg.svd(B, full_matrices=False) # pylint:disable=not-callable
-    lambd = (S.pow(2) - v).clip(min=0) #Remove shift, compute eigs
-    return U, lambd
-
+# TODO this is used in NystromSketchAndSolve
+# I need to add alternative to it where it just shifts eigenvalues by reg and uses their reciprocal
 def nystrom_sketch_and_solve(
-    A_mm: Callable[[torch.Tensor], torch.Tensor],
+    L: torch.Tensor,
+    Q: torch.Tensor,
     b: torch.Tensor,
-    rank: int,
     reg: float = 1e-3,
-    generator=None,
 ) -> torch.Tensor:
-    U, lambd = nystrom_approximation(
-        A_mm=A_mm,
-        ndim=b.size(-1),
-        rank=rank,
-        device=b.device,
-        dtype=b.dtype,
-        generator=generator,
-    )
+    """Solves (Q diag(L) Q.T + reg*I)x = b. Becomes super unstable with reg smaller than like 1e-5.
+
+    Args:
+        L (torch.Tensor): eigenvalues, like from ``nystrom_approximation``
+        Q (torch.Tensor): eigenvectors, like from ``nystrom_approximation``
+        b (torch.Tensor): right hand side
+        reg (float, optional): regularization. Defaults to 1e-3.
+    """
+
     b = b.unsqueeze(-1)
-    lambd += reg
+    L += reg
     # x = (A + μI)⁻¹ b
-    # (A + μI)⁻¹ = U(Λ + μI)⁻¹Uᵀ + (1/μ)(b - UUᵀ)
-    # x = U(Λ + μI)⁻¹Uᵀb + (1/μ)(b - UUᵀb)
-    Uᵀb = U.T @ b
-    term1 = U @ ((1/lambd).unsqueeze(-1) * Uᵀb)
-    term2 = (1.0 / reg) * (b - U @ Uᵀb)
+    # (A + μI)⁻¹ = Q(L + μI)⁻¹Qᵀ + (1/μ)(b - QQᵀ)
+    # x = Q(L + μI)⁻¹Qᵀb + (1/μ)(b - QQᵀb)
+    Qᵀb = Q.T @ b
+    term1 = Q @ ((1/L).unsqueeze(-1) * Qᵀb)
+    term2 = (1.0 / reg) * (b - Q @ Qᵀb)
     return (term1 + term2).squeeze(-1)
 
 def nystrom_pcg(
-    A_mm: Callable[[torch.Tensor], torch.Tensor],
+    L: torch.Tensor,
+    Q: torch.Tensor,
+    A_mv: Callable[[torch.Tensor], torch.Tensor],
     b: torch.Tensor,
-    sketch_size: int,
     reg: float = 1e-6,
     x0_: torch.Tensor | None = None,
-    tol: float | None = 1e-4,
+    tol: float | None = 1e-8,
     maxiter: int | None = None,
-    generator=None,
 ) -> torch.Tensor:
-    U, lambd = nystrom_approximation(
-        A_mm=A_mm,
-        ndim=b.size(-1),
-        rank=sketch_size,
-        device=b.device,
-        dtype=b.dtype,
-        generator=generator,
-    )
-    lambd += reg
+    """conjugate gradient preconditioned by nystrom approximation.
+
+    The preconditioner can be computed by one matrix-matrix multiplication with A.
+    If matrix-matrix is efficient, then this is good (e.g. batched hessian-vector products in pytorch)
+
+    Args:
+        L (torch.Tensor): eigenvalues of approximation of A, like from ``nystrom_approximation``
+        Q (torch.Tensor): eigenvectors of approximation of A, like from ``nystrom_approximation``
+        A_mv (Callable[[torch.Tensor], torch.Tensor]): mat-vec func with hessian
+        b (torch.Tensor): right hand side
+        reg (float, optional): regularization. Defaults to 1e-6.
+        x0_ (torch.Tensor | None, optional): initial guess (modified in-place). Defaults to None.
+        tol (float | None, optional): tolerance for convergence. Defaults to 1e-4.
+        maxiter (int | None, optional): maximum number of iterations. Defaults to None.
+    """
+    L += reg
     eps = torch.finfo(b.dtype).tiny * 2
     if tol is None: tol = eps
 
-    def A_mm_reg(x): # A_mm with regularization
-        Ax = A_mm(x)
+    def A_mv_reg(x): # A_mm with regularization
+        Ax = A_mv(x)
         if reg != 0: Ax += x*reg
         return Ax
 
@@ -104,10 +90,10 @@ def nystrom_pcg(
     if x0_ is None: x0_ = torch.zeros_like(b)
 
     x = x0_
-    residual = b - A_mm_reg(x)
+    residual = b - A_mv_reg(x)
     # z0 = P⁻¹ r0
-    term1 = lambd[...,-1] * U * (1/lambd.unsqueeze(-2)) @ U.mT
-    term2 = torch.eye(U.size(-2), device=U.device,dtype=U.dtype) - U@U.mT
+    term1 = L[...,-1] * Q * (1/L.unsqueeze(-2)) @ Q.mT
+    term2 = torch.eye(Q.size(-2), device=Q.device,dtype=Q.dtype) - Q@Q.mT
     P_inv = term1 + term2
     z = P_inv @ residual
     p = z.clone() # search direction
@@ -116,7 +102,7 @@ def nystrom_pcg(
     if init_norm < tol: return x
     k = 0
     while True:
-        Ap = A_mm_reg(p)
+        Ap = A_mv_reg(p)
         rz = residual.dot(z)
         step_size = rz / p.dot(Ap)
         x += step_size * p
@@ -138,7 +124,7 @@ def _safe_clip(x: torch.Tensor):
     if x.abs() < eps: return x.new_full(x.size(), eps).copysign(x)
     return x
 
-def _trust_tau(x,d,trust_radius):
+def _trust_tau(x, d, trust_radius):
     xx = x.dot(x)
     xd = x.dot(d)
     dd = _safe_clip(d.dot(d))
@@ -150,10 +136,10 @@ def _trust_tau(x,d,trust_radius):
 
 
 class CG:
-    """Conjugate gradient method.
+    """Conjugate gradient method optionally with norm constraint.
 
     Args:
-        A_mm (Callable[[torch.Tensor], torch.Tensor] | torch.Tensor): Callable that returns matvec ``Ax``.
+        A_mv (Callable[[torch.Tensor], torch.Tensor] | torch.Tensor): Callable that returns matvec ``Ax``.
         b (torch.Tensor): right hand side
         x0 (torch.Tensor | None, optional): initial guess, defaults to zeros. Defaults to None.
         tol (float | None, optional): tolerance for convergence. Defaults to 1e-8.
@@ -174,10 +160,10 @@ class CG:
     """
    def __init__(
        self,
-        A_mm: Callable,
+        A_mv: Callable,
        b: torch.Tensor | TensorList,
        x0: torch.Tensor | TensorList | None = None,
-        tol: float | None = 1e-4,
+        tol: float | None = 1e-8,
        maxiter: int | None = None,
        reg: float = 0,
        trust_radius: float | None = None,
@@ -187,7 +173,7 @@ class CG:
        P_mm: Callable | None = None,
    ):
        # --------------------------------- set attrs -------------------------------- #
-        self.A_mm = _make_A_mm_reg(A_mm, reg)
+        self.A_mv = _make_A_mv_reg(A_mv, reg)
        self.b = b
        if tol is None: tol = generic_finfo_tiny(b) * 2
        self.tol = tol
@@ -214,7 +200,7 @@ class CG:
            self.r = b
        else:
            self.x = x0
-            self.r = b - A_mm(self.x)
+            self.r = b - A_mv(self.x)
 
        self.z = self.P_mm(self.r)
        self.d = self.z
@@ -229,7 +215,7 @@ class CG:
        if self.iter >= self.maxiter:
            return x, True
 
-        Ad = self.A_mm(d)
+        Ad = self.A_mv(d)
        dAd = d.dot(Ad)
 
        # check negative curvature
@@ -289,7 +275,8 @@ class CG:
        return sol
 
 def find_within_trust_radius(history, trust_radius: float):
-    """find first ``x`` in history that exceeds trust radius, if no such ``x`` exists, returns ``None``"""
+    """find first ``x`` in history that exceeds trust radius and returns solution within,
+    if no such ``x`` exists, returns ``None``"""
    for x, x_norm, d in reversed(tuple(history)):
        if x_norm <= trust_radius:
            return _trust_tau(x, d, trust_radius)
@@ -306,7 +293,7 @@ class _TensorListSolution(NamedTuple):
 
 @overload
 def cg(
-    A_mm: Callable[[torch.Tensor], torch.Tensor],
+    A_mv: Callable[[torch.Tensor], torch.Tensor],
     b: torch.Tensor,
     x0: torch.Tensor | None = None,
     tol: float | None = 1e-8,
@@ -320,7 +307,7 @@ def cg(
 ) -> _TensorSolution: ...
 @overload
 def cg(
-    A_mm: Callable[[TensorList], TensorList],
+    A_mv: Callable[[TensorList], TensorList],
     b: TensorList,
     x0: TensorList | None = None,
     tol: float | None = 1e-8,
@@ -333,7 +320,7 @@ def cg(
     P_mm: Callable[[TensorList], TensorList] | None = None
 ) -> _TensorListSolution: ...
 def cg(
-    A_mm: Callable,
+    A_mv: Callable,
     b: torch.Tensor | TensorList,
     x0: torch.Tensor | TensorList | None = None,
     tol: float | None = 1e-8,
@@ -346,7 +333,7 @@ def cg(
     P_mm: Callable | None = None
 ):
     solver = CG(
-        A_mm=A_mm,
+        A_mv=A_mv,
         b=b,
         x0=x0,
         tol=tol,
@@ -370,10 +357,10 @@ def cg(
 # Liu, Yang, and Fred Roosta. "MINRES: From negative curvature detection to monotonicity properties." SIAM Journal on Optimization 32.4 (2022): 2636-2661.
 @overload
 def minres(
-    A_mm: Callable[[torch.Tensor], torch.Tensor] | torch.Tensor,
+    A_mv: Callable[[torch.Tensor], torch.Tensor] | torch.Tensor,
     b: torch.Tensor,
     x0: torch.Tensor | None = None,
-    tol: float | None = 1e-4,
+    tol: float | None = 1e-8,
     maxiter: int | None = None,
     reg: float = 0,
     npc_terminate: bool=True,
@@ -381,26 +368,27 @@ def minres(
 ) -> torch.Tensor: ...
 @overload
 def minres(
-    A_mm: Callable[[TensorList], TensorList],
+    A_mv: Callable[[TensorList], TensorList],
     b: TensorList,
     x0: TensorList | None = None,
-    tol: float | None = 1e-4,
+    tol: float | None = 1e-8,
     maxiter: int | None = None,
     reg: float | list[float] | tuple[float] = 0,
     npc_terminate: bool=True,
     trust_radius: float | None = None,
 ) -> TensorList: ...
 def minres(
-    A_mm,
+    A_mv,
     b,
     x0: torch.Tensor | TensorList | None = None,
-    tol: float | None = 1e-4,
+    tol: float | None = 1e-8,
     maxiter: int | None = None,
     reg: float | list[float] | tuple[float] = 0,
     npc_terminate: bool=True,
     trust_radius: float | None = None, #trust region is experimental
 ):
-    A_mm_reg = _make_A_mm_reg(A_mm, reg)
+    """MINRES (experimental)"""
+    A_mv_reg = _make_A_mv_reg(A_mv, reg)
     eps = math.sqrt(generic_finfo_tiny(b) * 2)
     if tol is None: tol = eps
 
@@ -409,7 +397,7 @@ def minres(
         R = b
         x0 = generic_zeros_like(b)
     else:
-        R = b - A_mm_reg(x0)
+        R = b - A_mv_reg(x0)
 
     X: Any = x0
     beta = b_norm = generic_vector_norm(b)
@@ -429,7 +417,7 @@ def minres(
 
     for _ in range(maxiter):
 
-        P = A_mm_reg(V)
+        P = A_mv_reg(V)
         alpha = V.dot(P)
         P -= beta*V_prev
         P -= alpha*V
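The hunks above are the diff of item 23, `torchzero/utils/linalg/solve.py` moving to `torchzero/linalg/solve.py`: the matrix argument is renamed from `A_mm` to `A_mv` (it is a matrix-vector product, not a matrix-matrix product), the default `tol` tightens from 1e-4 to 1e-8, and `nystrom_approximation` leaves this module, so `nystrom_sketch_and_solve` and `nystrom_pcg` now take precomputed eigenpairs `(L, Q)` instead of computing the sketch themselves. A short call sketch against the new `cg` signature shown above (the fields of the returned solution object are not visible in this diff, so only the call is illustrated):

    import torch
    from torchzero.linalg.solve import cg  # new module path, per item 23

    A = torch.tensor([[4.0, 1.0],
                      [1.0, 3.0]])   # small SPD system for illustration
    b = torch.tensor([1.0, 2.0])

    # A_mv is the renamed matvec callable (called A_mm in 0.3.14)
    sol = cg(A_mv=lambda v: A @ v, b=b, tol=1e-8)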
--- /dev/null
+++ b/torchzero/linalg/svd.py
@@ -0,0 +1,20 @@
+# import torch
+
+# # projected svd
+# # adapted from https://github.com/smortezavi/Randomized_SVD_GPU
+# def randomized_svd(M: torch.Tensor, k: int, driver=None):
+#     *_, m, n = M.shape
+#     transpose = False
+#     if m < n:
+#         transpose = True
+#         M = M.mT
+#         m,n = n,m
+
+#     rand_matrix = torch.randn(size=(n, k), device=M.device, dtype=M.dtype)
+#     Q, _ = torch.linalg.qr(M @ rand_matrix, mode='reduced') # pylint:disable=not-callable
+#     smaller_matrix = Q.mT @ M
+#     U_hat, s, V = torch.linalg.svd(smaller_matrix, driver=driver, full_matrices=False) # pylint:disable=not-callable
+#     U = Q @ U_hat
+
+#     if transpose: return V.mT, s, U.mT
+#     return U, s, V
--- /dev/null
+++ b/torchzero/linalg/torch_linalg.py
@@ -0,0 +1,168 @@
+"""torch linalg with correct typing and retries in float64"""
+from typing import NamedTuple
+
+import torch
+
+
+def cholesky(A: torch.Tensor, *, upper=False, retry_float64:bool=False) -> torch.Tensor:
+    """A - SPD, returns lower triangular L such that ``A = L @ L.mH`` also can pass L to ``torch.cholesky_solve``"""
+    try:
+        return torch.linalg.cholesky(A, upper=upper) # pylint:disable=not-callable
+
+    except torch.linalg.LinAlgError as e:
+        if not retry_float64: raise e
+        dtype = A.dtype
+        if dtype == torch.float64: raise e
+        return cholesky(A.to(torch.float64), upper=upper, retry_float64=False).to(dtype)
+
+
+class _QRTuple(NamedTuple):
+    Q: torch.Tensor
+    R: torch.Tensor
+
+def qr(A: torch.Tensor, mode='reduced', retry_float64:bool=False) -> _QRTuple:
+    """A - any matrix ``(*, m, n)`` (for some reason sometimes it takes ages on some matrices)
+
+    ### Returns (if mode = "reduced"):
+
+    Q: ``(*, m, k)`` - orthogonal
+
+    R: ``(*, k, n)`` - upper triangular
+
+    where ``k = min(m,n)``
+    """
+    try:
+        return torch.linalg.qr(A, mode=mode) # pylint:disable=not-callable
+
+    except torch.linalg.LinAlgError as e:
+        if not retry_float64: raise e
+        dtype = A.dtype
+        if dtype == torch.float64: raise e
+        Q, R = qr(A.to(torch.float64), mode=mode, retry_float64=False)
+        return _QRTuple(Q=Q.to(dtype), R=R.to(dtype))
+
+def eigh(A: torch.Tensor, UPLO="L", retry_float64:bool=False) -> tuple[torch.Tensor, torch.Tensor]:
+    """A - symmetric, returns ``(L, Q)``, ``A = Q @ torch.diag(L) @ Q.mH``, this is faster than SVD"""
+    try:
+        return torch.linalg.eigh(A, UPLO=UPLO) # pylint:disable=not-callable
+
+    except torch.linalg.LinAlgError as e:
+        if not retry_float64: raise e
+        dtype = A.dtype
+        if dtype == torch.float64: raise e
+        L, Q = eigh(A.to(torch.float64), UPLO=UPLO, retry_float64=False)
+        return L.to(dtype), Q.to(dtype)
+
+
+
+class _SVDTuple(NamedTuple):
+    U: torch.Tensor
+    S: torch.Tensor
+    Vh: torch.Tensor
+
+def svd(A: torch.Tensor, full_matrices=True, driver=None, retry_float64:bool=False) -> _SVDTuple:
+    """A - any matrix ``(*, n, m)``, but slows down if A isn't well conditioned, ``A = U @ torch.diag(S) @ Vh``
+
+    Don't forget to set ``full_matrices=False``
+
+    ### Returns:
+
+    U: ``(*, m, m)`` or ``(*, m, k)`` - orthogonal
+
+    S: ``(*, k,)`` - singular values
+
+    V^H: ``(*, n, n)`` or ``(*, n, k)`` - orthogonal
+
+    where ``k = min(m,n)``
+
+    ### Drivers
+
+    drivers are only supported on CUDA so A is moved to CUDA by this function if needed
+
+    from docs:
+
+    If A is well-conditioned (its condition number is not too large), or you do not mind some precision loss.
+
+    For a general matrix: ‘gesvdj’ (Jacobi method)
+
+    If A is tall or wide (m >> n or m << n): ‘gesvda’ (Approximate method)
+
+    If A is not well-conditioned or precision is relevant: ‘gesvd’ (QR based)
+
+    By default (driver= None), we call ‘gesvdj’ and, if it fails, we fallback to ‘gesvd’.
+    """
+    # drivers are only for CUDA
+    # also the only one that doesn't freeze is ‘gesvda’
+    device=None
+    if driver is not None:
+        device = A.device
+        A = A.cuda()
+
+    try:
+        U, S, Vh = torch.linalg.svd(A, full_matrices=full_matrices, driver=driver) # pylint:disable=not-callable
+        if device is not None:
+            U = U.to(device); S = S.to(device); Vh = Vh.to(device)
+        return _SVDTuple(U=U, S=S, Vh=Vh)
+
+    except torch.linalg.LinAlgError as e:
+        if not retry_float64: raise e
+        dtype = A.dtype
+        if dtype == torch.float64: raise e
+        U, S, Vh = svd(A.to(torch.float64), full_matrices=full_matrices, driver=driver, retry_float64=False)
+        return _SVDTuple(U=U.to(dtype), S=S.to(dtype), Vh=Vh.to(dtype))
+
+def solve(A: torch.Tensor, B: torch.Tensor, left:bool=True, retry_float64:bool=False) -> torch.Tensor:
+    """I think this uses LU"""
+    try:
+        return torch.linalg.solve(A, B, left=left) # pylint:disable=not-callable
+
+    except torch.linalg.LinAlgError as e:
+        if not retry_float64: raise e
+        dtype = A.dtype
+        if dtype == torch.float64: raise e
+        return solve(A.to(torch.float64), B.to(torch.float64), left=left, retry_float64=False).to(dtype)
+
+class _SolveExTuple(NamedTuple):
+    result: torch.Tensor
+    info: int
+
+def solve_ex(A: torch.Tensor, B: torch.Tensor, left:bool=True, retry_float64:bool=False) -> _SolveExTuple:
+    """I think this uses LU"""
+    result, info = torch.linalg.solve_ex(A, B, left=left) # pylint:disable=not-callable
+
+    if info != 0:
+        if not retry_float64: return _SolveExTuple(result, info)
+        dtype = A.dtype
+        if dtype == torch.float64: return _SolveExTuple(result, info)
+        result, info = solve_ex(A.to(torch.float64), B.to(torch.float64), retry_float64=False)
+        return _SolveExTuple(result.to(dtype), info)
+
+    return _SolveExTuple(result, info)
+
+def inv(A: torch.Tensor, retry_float64:bool=False) -> torch.Tensor:
+    try:
+        return torch.linalg.inv(A) # pylint:disable=not-callable
+
+    except torch.linalg.LinAlgError as e:
+        if not retry_float64: raise e
+        dtype = A.dtype
+        if dtype == torch.float64: raise e
+        return inv(A.to(torch.float64), retry_float64=False).to(dtype)
+
+
+class _InvExTuple(NamedTuple):
+    inverse: torch.Tensor
+    info: int
+
+def inv_ex(A: torch.Tensor, *, check_errors=False, retry_float64:bool=False) -> _InvExTuple:
+    """this retries in float64 but on fail info will be not 0"""
+    inverse, info = torch.linalg.inv_ex(A, check_errors=check_errors) # pylint:disable=not-callable
+
+    if info != 0:
+        if not retry_float64: return _InvExTuple(inverse, info)
+        dtype = A.dtype
+        if dtype == torch.float64: return _InvExTuple(inverse, info)
+        inverse, info = inv_ex(A.to(torch.float64), retry_float64=False)
+        return _InvExTuple(inverse.to(dtype), info)
+
+    return _InvExTuple(inverse, info)
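Every wrapper in this new module (item 25, `torchzero/linalg/torch_linalg.py`) follows the same pattern: run the `torch.linalg` routine in the input dtype and, when `retry_float64=True` and the input is not already float64, retry once in float64 on `LinAlgError` (or on nonzero `info` for the `_ex` variants) and cast the result back. A usage sketch of that pattern with the `cholesky` wrapper defined above (the test matrix is made up for illustration):

    import torch
    from torchzero.linalg.torch_linalg import cholesky  # path per item 25

    G = torch.randn(8, 8)
    A = G @ G.mT + 1e-6 * torch.eye(8)   # SPD, but borderline in float32

    # raises immediately by default; with retry_float64=True a float32
    # failure is retried in float64 and the factor is cast back to float32
    L = cholesky(A, retry_float64=True)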
--- a/torchzero/modules/__init__.py
+++ b/torchzero/modules/__init__.py
@@ -2,7 +2,6 @@ from . import experimental
 from .clipping import *
 from .conjugate_gradient import *
 from .grad_approximation import *
-from .higher_order import *
 from .least_squares import *
 from .line_search import *
 from .misc import *
--- a/torchzero/modules/adaptive/__init__.py
+++ b/torchzero/modules/adaptive/__init__.py
@@ -12,7 +12,7 @@ from .lmadagrad import LMAdagrad
 from .lion import Lion
 from .mars import MARSCorrection
 from .matrix_momentum import MatrixMomentum
-from .msam import MSAM, MSAMObjective
+from .msam import MSAMMomentum, MSAM
 from .muon import DualNormCorrection, MuonAdjustLR, Orthogonalize, orthogonalize_grads_
 from .natural_gradient import NaturalGradient
 from .orthograd import OrthoGrad, orthograd_
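This last hunk (item 27, `torchzero/modules/adaptive/__init__.py`) renames the MSAM exports. Reading the import line positionally, 0.3.14's `MSAM` appears to become `MSAMMomentum` and `MSAMObjective` appears to become the new `MSAM`; the rename itself happens inside `msam.py` (item 39), which this diff does not show, so the mapping is an assumption. A hypothetical downstream update under that assumption:

    # torchzero 0.3.14
    # from torchzero.modules.adaptive import MSAM, MSAMObjective

    # torchzero 0.4.0
    from torchzero.modules.adaptive import MSAMMomentum, MSAM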