torchzero 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +22 -22
- tests/test_opts.py +199 -198
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +1 -1
- torchzero/core/functional.py +1 -1
- torchzero/core/modular.py +5 -5
- torchzero/core/module.py +2 -2
- torchzero/core/objective.py +10 -10
- torchzero/core/transform.py +1 -1
- torchzero/linalg/__init__.py +3 -2
- torchzero/linalg/eigh.py +223 -4
- torchzero/linalg/orthogonalize.py +2 -4
- torchzero/linalg/qr.py +12 -0
- torchzero/linalg/solve.py +1 -3
- torchzero/linalg/svd.py +47 -20
- torchzero/modules/__init__.py +4 -3
- torchzero/modules/adaptive/__init__.py +11 -3
- torchzero/modules/adaptive/adagrad.py +10 -10
- torchzero/modules/adaptive/adahessian.py +2 -2
- torchzero/modules/adaptive/adam.py +1 -1
- torchzero/modules/adaptive/adan.py +1 -1
- torchzero/modules/adaptive/adaptive_heavyball.py +1 -1
- torchzero/modules/adaptive/esgd.py +2 -2
- torchzero/modules/adaptive/ggt.py +186 -0
- torchzero/modules/adaptive/lion.py +2 -1
- torchzero/modules/adaptive/lre_optimizers.py +299 -0
- torchzero/modules/adaptive/mars.py +2 -2
- torchzero/modules/adaptive/matrix_momentum.py +1 -1
- torchzero/modules/adaptive/msam.py +4 -4
- torchzero/modules/adaptive/muon.py +9 -6
- torchzero/modules/adaptive/natural_gradient.py +32 -15
- torchzero/modules/adaptive/psgd/__init__.py +5 -0
- torchzero/modules/adaptive/psgd/_psgd_utils.py +37 -0
- torchzero/modules/adaptive/psgd/psgd.py +1390 -0
- torchzero/modules/adaptive/psgd/psgd_dense_newton.py +174 -0
- torchzero/modules/adaptive/psgd/psgd_kron_newton.py +203 -0
- torchzero/modules/adaptive/psgd/psgd_kron_whiten.py +185 -0
- torchzero/modules/adaptive/psgd/psgd_lra_newton.py +118 -0
- torchzero/modules/adaptive/psgd/psgd_lra_whiten.py +116 -0
- torchzero/modules/adaptive/rprop.py +2 -2
- torchzero/modules/adaptive/sam.py +4 -4
- torchzero/modules/adaptive/shampoo.py +28 -3
- torchzero/modules/adaptive/soap.py +3 -3
- torchzero/modules/adaptive/sophia_h.py +2 -2
- torchzero/modules/clipping/clipping.py +7 -7
- torchzero/modules/conjugate_gradient/cg.py +2 -2
- torchzero/modules/experimental/__init__.py +5 -0
- torchzero/modules/experimental/adanystrom.py +258 -0
- torchzero/modules/experimental/common_directions_whiten.py +142 -0
- torchzero/modules/experimental/cubic_adam.py +160 -0
- torchzero/modules/experimental/eigen_sr1.py +182 -0
- torchzero/modules/experimental/eigengrad.py +207 -0
- torchzero/modules/experimental/l_infinity.py +1 -1
- torchzero/modules/experimental/matrix_nag.py +122 -0
- torchzero/modules/experimental/newton_solver.py +2 -2
- torchzero/modules/experimental/newtonnewton.py +34 -40
- torchzero/modules/grad_approximation/fdm.py +2 -2
- torchzero/modules/grad_approximation/rfdm.py +4 -4
- torchzero/modules/least_squares/gn.py +68 -45
- torchzero/modules/line_search/backtracking.py +2 -2
- torchzero/modules/line_search/line_search.py +1 -1
- torchzero/modules/line_search/strong_wolfe.py +2 -2
- torchzero/modules/misc/escape.py +1 -1
- torchzero/modules/misc/gradient_accumulation.py +1 -1
- torchzero/modules/misc/misc.py +1 -1
- torchzero/modules/misc/multistep.py +4 -7
- torchzero/modules/misc/regularization.py +2 -2
- torchzero/modules/misc/split.py +1 -1
- torchzero/modules/misc/switch.py +2 -2
- torchzero/modules/momentum/cautious.py +3 -3
- torchzero/modules/momentum/momentum.py +1 -1
- torchzero/modules/ops/higher_level.py +1 -1
- torchzero/modules/ops/multi.py +1 -1
- torchzero/modules/projections/projection.py +5 -2
- torchzero/modules/quasi_newton/__init__.py +1 -1
- torchzero/modules/quasi_newton/damping.py +1 -1
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +3 -3
- torchzero/modules/quasi_newton/lsr1.py +3 -3
- torchzero/modules/quasi_newton/quasi_newton.py +44 -29
- torchzero/modules/quasi_newton/sg2.py +69 -205
- torchzero/modules/restarts/restars.py +17 -17
- torchzero/modules/second_order/inm.py +33 -25
- torchzero/modules/second_order/newton.py +132 -130
- torchzero/modules/second_order/newton_cg.py +3 -3
- torchzero/modules/second_order/nystrom.py +83 -32
- torchzero/modules/second_order/rsn.py +41 -44
- torchzero/modules/smoothing/laplacian.py +1 -1
- torchzero/modules/smoothing/sampling.py +2 -3
- torchzero/modules/step_size/adaptive.py +6 -6
- torchzero/modules/step_size/lr.py +2 -2
- torchzero/modules/trust_region/cubic_regularization.py +1 -1
- torchzero/modules/trust_region/levenberg_marquardt.py +2 -2
- torchzero/modules/trust_region/trust_cg.py +1 -1
- torchzero/modules/variance_reduction/svrg.py +4 -5
- torchzero/modules/weight_decay/reinit.py +2 -2
- torchzero/modules/weight_decay/weight_decay.py +5 -5
- torchzero/modules/wrappers/optim_wrapper.py +4 -4
- torchzero/modules/zeroth_order/cd.py +1 -1
- torchzero/optim/mbs.py +291 -0
- torchzero/optim/wrappers/nevergrad.py +0 -9
- torchzero/optim/wrappers/optuna.py +2 -0
- torchzero/utils/benchmarks/__init__.py +0 -0
- torchzero/utils/benchmarks/logistic.py +122 -0
- torchzero/utils/derivatives.py +4 -4
- {torchzero-0.4.0.dist-info → torchzero-0.4.1.dist-info}/METADATA +1 -1
- torchzero-0.4.1.dist-info/RECORD +209 -0
- torchzero/modules/adaptive/lmadagrad.py +0 -241
- torchzero-0.4.0.dist-info/RECORD +0 -191
- /torchzero/modules/{functional.py → opt_utils.py} +0 -0
- {torchzero-0.4.0.dist-info → torchzero-0.4.1.dist-info}/WHEEL +0 -0
- {torchzero-0.4.0.dist-info → torchzero-0.4.1.dist-info}/top_level.txt +0 -0
torchzero/modules/second_order/newton.py

@@ -1,22 +1,22 @@
from collections.abc import Callable
-from typing import
+from typing import Any

import torch

-from ...core import Chainable, Transform, Objective, HessianMethod
-from ...utils import
-from ...linalg.linear_operator import Dense, DenseWithInverse
+from ...core import Chainable, Transform, Objective, HessianMethod
+from ...utils import vec_to_tensors_
+from ...linalg.linear_operator import Dense, DenseWithInverse, Eigendecomposition
+from ...linalg import torch_linalg

-
-def _lu_solve(H: torch.Tensor, g: torch.Tensor):
+def _try_lu_solve(H: torch.Tensor, g: torch.Tensor):
    try:
-        x, info =
+        x, info = torch_linalg.solve_ex(H, g, retry_float64=True)
        if info == 0: return x
        return None
    except RuntimeError:
        return None

-def _cholesky_solve(H: torch.Tensor, g: torch.Tensor):
+def _try_cholesky_solve(H: torch.Tensor, g: torch.Tensor):
    L, info = torch.linalg.cholesky_ex(H) # pylint:disable=not-callable
    if info == 0:
        return torch.cholesky_solve(g.unsqueeze(-1), L).squeeze(-1)
@@ -25,77 +25,91 @@ def _cholesky_solve(H: torch.Tensor, g: torch.Tensor):
def _least_squares_solve(H: torch.Tensor, g: torch.Tensor):
    return torch.linalg.lstsq(H, g)[0] # pylint:disable=not-callable

-def
-
-
-
-
-
-
-
-
-
-
-    except torch.linalg.LinAlgError:
-        return None
-
-def _newton_step(objective: Objective, H: torch.Tensor, damping:float, H_tfm, eigval_fn, use_lstsq:bool, g_proj: Callable | None = None, no_inner: Module | None = None) -> torch.Tensor:
-    """INNER SHOULD BE NONE IN MOST CASES! Because Transform already has inner.
-    Returns the update tensor, then do vec_to_tensor(update, params)"""
-    # -------------------------------- inner step -------------------------------- #
-    if no_inner is not None:
-        objective = no_inner.step(objective)
-
-    update = objective.get_updates()
-
-    g = torch.cat([t.ravel() for t in update])
-    if g_proj is not None: g = g_proj(g)
-
-    # ----------------------------------- solve ---------------------------------- #
-    update = None
-
+def _newton_update_state_(
+    state: dict,
+    H: torch.Tensor,
+    damping: float,
+    eigval_fn: Callable | None,
+    precompute_inverse: bool,
+    use_lstsq: bool,
+):
+    """used in most hessian-based modules"""
+    # add damping
    if damping != 0:
-
-
-        if H_tfm is not None:
-            ret = H_tfm(H, g)
-
-            if isinstance(ret, torch.Tensor):
-                update = ret
+        reg = torch.eye(H.size(0), device=H.device, dtype=H.dtype).mul_(damping)
+        H += reg

-
-            H, is_inv = ret
-            if is_inv: update = H @ g
-
-    if eigval_fn is not None:
-        update = _eigh_solve(H, g, eigval_fn, search_negative=False)
-
-    if update is None and use_lstsq: update = _least_squares_solve(H, g)
-    if update is None: update = _cholesky_solve(H, g)
-    if update is None: update = _lu_solve(H, g)
-    if update is None: update = _least_squares_solve(H, g)
-
-    return update
-
-def _get_H(H: torch.Tensor, eigval_fn):
+    # if eigval_fn is given, we don't need H or H_inv, we store factors
    if eigval_fn is not None:
-
-
-
-
-
-
-
-
-
+        L, Q = torch_linalg.eigh(H, retry_float64=True)
+        L = eigval_fn(L)
+        state["L"] = L
+        state["Q"] = Q
+        return
+
+    # pre-compute inverse if requested
+    # store H to as it is needed for trust regions
+    state["H"] = H
+    if precompute_inverse:
+        if use_lstsq:
+            H_inv = torch.linalg.pinv(H) # pylint:disable=not-callable
+        else:
+            H_inv, _ = torch_linalg.inv_ex(H)
+        state["H_inv"] = H_inv
+
+
+def _newton_solve(
+    b: torch.Tensor,
+    state: dict[str, torch.Tensor | Any],
+    use_lstsq: bool = False,
+):
+    """
+    used in most hessian-based modules. state is from ``_newton_update_state_``, in it:

-
+        H (torch.Tensor): hessian
+        H_inv (torch.Tensor | None): hessian inverse
+        L (torch.Tensor | None): eigenvalues (transformed)
+        Q (torch.Tensor | None): eigenvectors
+    """
+    # use eig if provided
+    if "L" in state:
+        Q = state["Q"]; L = state["L"]
+        assert Q is not None
+        return Q @ ((Q.mH @ b) / L)
+
+    # use inverse if cached
+    if "H_inv" in state:
+        return state["H_inv"] @ b
+
+    # use hessian
+    H = state["H"]
+    if use_lstsq: return _least_squares_solve(H, b)
+
+    dir = None
+    if dir is None: dir = _try_cholesky_solve(H, b)
+    if dir is None: dir = _try_lu_solve(H, b)
+    if dir is None: dir = _least_squares_solve(H, b)
+    return dir
+
+def _newton_get_H(state: dict[str, torch.Tensor | Any]):
+    """used in most hessian-based modules. state is from ``_newton_update_state_``"""
+    if "H_inv" in state:
+        return DenseWithInverse(state["H"], state["H_inv"])
+
+    if "L" in state:
+        # Eigendecomposition has sligthly different solve_plus_diag
+        # I am pretty sure it should be very close and it uses no solves
+        # best way to test is to try cubic regularization with this
+        return Eigendecomposition(state["L"], state["Q"], use_nystrom=False)
+
+    return Dense(state["H"])

class Newton(Transform):
-    """Exact
+    """Exact Newton's method via autograd.

    Newton's method produces a direction jumping to the stationary point of quadratic approximation of the target function.
    The update rule is given by ``(H + yI)⁻¹g``, where ``H`` is the hessian and ``g`` is the gradient, ``y`` is the ``damping`` parameter.
+
    ``g`` can be output of another module, if it is specifed in ``inner`` argument.

    Note:
@@ -107,27 +121,19 @@ class Newton(Transform):
    The closure must accept a ``backward`` argument (refer to documentation).

    Args:
-        damping (float, optional): tikhonov regularizer value.
-        search_negative (bool, Optional):
-            if True, whenever a negative eigenvalue is detected,
-            search direction is proposed along weighted sum of eigenvectors corresponding to negative eigenvalues.
-        use_lstsq (bool, Optional):
-            if True, least squares will be used to solve the linear system, this may generate reasonable directions
-            when hessian is not invertible. If False, tries cholesky, if it fails tries LU, and then least squares.
-            If ``eigval_fn`` is specified, eigendecomposition will always be used to solve the linear system and this
-            argument will be ignored.
-        H_tfm (Callable | None, optional):
-            optional hessian transforms, takes in two arguments - `(hessian, gradient)`.
-
-            must return either a tuple: `(hessian, is_inverted)` with transformed hessian and a boolean value
-            which must be True if transform inverted the hessian and False otherwise.
-
-            Or it returns a single tensor which is used as the update.
-
-            Defaults to None.
+        damping (float, optional): tikhonov regularizer value. Defaults to 0.
        eigval_fn (Callable | None, optional):
-
+            function to apply to eigenvalues, for example ``torch.abs`` or ``lambda L: torch.clip(L, min=1e-8)``.
            If this is specified, eigendecomposition will be used to invert the hessian.
+        update_freq (int, optional):
+            updates hessian every ``update_freq`` steps.
+        precompute_inverse (bool, optional):
+            if ``True``, whenever hessian is computed, also computes the inverse. This is more efficient
+            when ``update_freq`` is large. If ``None``, this is ``True`` if ``update_freq >= 10``.
+        use_lstsq (bool, Optional):
+            if True, least squares will be used to solve the linear system, this can prevent it from exploding
+            when hessian is indefinite. If False, tries cholesky, if it fails tries LU, and then least squares.
+            If ``eigval_fn`` is specified, eigendecomposition is always used and this argument is ignored.
        hessian_method (str):
            Determines how hessian is computed.

@@ -139,17 +145,19 @@ class Newton(Transform):
            - ``"gfd_forward"`` - computes ``ndim`` hessian-vector products via gradient finite difference using a less accurate forward formula which requires one extra gradient evaluation per hessian-vector product.
            - ``"gfd_central"`` - computes ``ndim`` hessian-vector products via gradient finite difference using a more accurate central formula which requires two gradient evaluations per hessian-vector product.
            - ``"fd"`` - uses function values to estimate gradient and hessian via finite difference. This uses less evaluations than chaining ``"gfd_*"`` after ``tz.m.FDM``.
+            - ``"thoad"`` - uses ``thoad`` library, can be significantly faster than pytorch but limited operator coverage.

            Defaults to ``"batched_autograd"``.
        h (float, optional):
-            finite difference step size
+            finite difference step size if hessian is compute via finite-difference.
        inner (Chainable | None, optional): modules to apply hessian preconditioner to. Defaults to None.

    # See also

-    * ``tz.m.NewtonCG``: uses a matrix-free conjugate gradient solver and hessian-vector products
+    * ``tz.m.NewtonCG``: uses a matrix-free conjugate gradient solver and hessian-vector products.
      useful for large scale problems as it doesn't form the full hessian.
    * ``tz.m.NewtonCGSteihaug``: trust region version of ``tz.m.NewtonCG``.
+    * ``tz.m.ImprovedNewton``: Newton with additional rank one correction to the hessian, can be faster than Newton.
    * ``tz.m.InverseFreeNewton``: an inverse-free variant of Newton's method.
    * ``tz.m.quasi_newton``: large collection of quasi-newton methods that estimate the hessian.

@@ -158,57 +166,48 @@ class Newton(Transform):
    ## Implementation details

    ``(H + yI)⁻¹g`` is calculated by solving the linear system ``(H + yI)x = g``.
-    The linear system is solved via cholesky decomposition, if that fails, LU decomposition, and if that fails, least squares.
-    Least squares can be forced by setting ``use_lstsq=True``, which may generate better search directions when linear system is overdetermined.
+    The linear system is solved via cholesky decomposition, if that fails, LU decomposition, and if that fails, least squares. Least squares can be forced by setting ``use_lstsq=True``.

    Additionally, if ``eigval_fn`` is specified, eigendecomposition of the hessian is computed,
-    ``eigval_fn`` is applied to the eigenvalues, and ``(H + yI)⁻¹`` is computed using the computed eigenvectors and transformed eigenvalues. This is more generally more computationally expensive
-    but not by much
+    ``eigval_fn`` is applied to the eigenvalues, and ``(H + yI)⁻¹`` is computed using the computed eigenvectors and transformed eigenvalues. This is more generally more computationally expensive but not by much.

    ## Handling non-convexity

    Standard Newton's method does not handle non-convexity well without some modifications.
    This is because it jumps to the stationary point, which may be the maxima of the quadratic approximation.

-
+    A modification to handle non-convexity is to modify the eignevalues to be positive,
    for example by setting ``eigval_fn = lambda L: L.abs().clip(min=1e-4)``.

-    Second modification is ``search_negative=True``, which will search along a negative curvature direction if one is detected.
-    This also requires an eigendecomposition.
-
-    The Newton direction can also be forced to be a descent direction by using ``tz.m.GradSign()`` or ``tz.m.Cautious``,
-    but that may be significantly less efficient.
-
    # Examples:

    Newton's method with backtracking line search

    ```py
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.Newton(),
        tz.m.Backtracking()
    )
    ```

-    Newton
+    Newton's method for non-convex optimization.

    ```py
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
-        tz.m.Newton(
-        tz.m.
+        tz.m.Newton(eigval_fn = lambda L: L.abs().clip(min=1e-4)),
+        tz.m.Backtracking()
    )
    ```

-
-    but if you wanted to see how diagonal newton behaves or compares to full newton, you can use this.
+    Newton preconditioning applied to momentum

    ```py
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
-        tz.m.Newton(
-        tz.m.
+        tz.m.Newton(inner=tz.m.EMA(0.9)),
+        tz.m.LR(0.1)
    )
    ```

@@ -216,10 +215,10 @@ class Newton(Transform):
    def __init__(
        self,
        damping: float = 0,
-        use_lstsq: bool = False,
-        update_freq: int = 1,
-        H_tfm: Callable[[torch.Tensor, torch.Tensor], tuple[torch.Tensor, bool]] | Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
        eigval_fn: Callable[[torch.Tensor], torch.Tensor] | None = None,
+        update_freq: int = 1,
+        precompute_inverse: bool | None = None,
+        use_lstsq: bool = False,
        hessian_method: HessianMethod = "batched_autograd",
        h: float = 1e-3,
        inner: Chainable | None = None,
@@ -232,29 +231,32 @@ class Newton(Transform):
    def update_states(self, objective, states, settings):
        fs = settings[0]

-
-
-
-
+        precompute_inverse = fs["precompute_inverse"]
+        if precompute_inverse is None:
+            precompute_inverse = fs["__update_freq"] >= 10
+
+        __, _, H = objective.hessian(hessian_method=fs["hessian_method"], h=fs["h"], at_x0=True)
+
+        _newton_update_state_(
+            state = self.global_state,
+            H=H,
+            damping = fs["damping"],
+            eigval_fn = fs["eigval_fn"],
+            precompute_inverse = precompute_inverse,
+            use_lstsq = fs["use_lstsq"]
        )

    @torch.no_grad
    def apply_states(self, objective, states, settings):
-
+        updates = objective.get_updates()
        fs = settings[0]

-
-
-            H = self.global_state["H"],
-            damping = fs["damping"],
-            H_tfm = fs["H_tfm"],
-            eigval_fn = fs["eigval_fn"],
-            use_lstsq = fs["use_lstsq"],
-        )
+        b = torch.cat([t.ravel() for t in updates])
+        sol = _newton_solve(b=b, state=self.global_state, use_lstsq=fs["use_lstsq"])

-
+        vec_to_tensors_(sol, updates)
        return objective

    def get_H(self,objective=...):
-        return
+        return _newton_get_H(self.global_state)

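The newton.py changes above split the old monolithic `_newton_step` into a cache-then-solve pair: `_newton_update_state_` stores either transformed eigenfactors (when `eigval_fn` is given) or the damped hessian plus an optional precomputed inverse, and `_newton_solve` reuses whatever was cached. A minimal, self-contained sketch of that pattern follows; `make_state` and `solve` are illustrative stand-ins, not torchzero's actual helpers.

```python
# Sketch of the cache-then-solve pattern, assuming plain PyTorch (not torchzero's API).
import torch

def make_state(H: torch.Tensor, damping: float = 0.0,
               eigval_fn=None, precompute_inverse: bool = False) -> dict:
    """Cache whichever factorization of (H + damping*I) later solves will reuse."""
    state = {}
    if damping != 0:
        H = H + damping * torch.eye(H.size(0), device=H.device, dtype=H.dtype)
    if eigval_fn is not None:
        # H = Q diag(L) Q^T, so H^-1 b = Q ((Q^T b) / L); eigval_fn can abs/clip L
        L, Q = torch.linalg.eigh(H)
        state["L"], state["Q"] = eigval_fn(L), Q
        return state
    state["H"] = H
    if precompute_inverse:  # pays off when the same hessian is reused for many steps
        state["H_inv"] = torch.linalg.inv(H)
    return state

def solve(state: dict, b: torch.Tensor) -> torch.Tensor:
    """Solve the cached system for b using whatever make_state stored."""
    if "L" in state:
        Q, L = state["Q"], state["L"]
        return Q @ ((Q.mH @ b) / L)
    if "H_inv" in state:
        return state["H_inv"] @ b
    H = state["H"]
    C, info = torch.linalg.cholesky_ex(H)
    if info == 0:  # SPD fast path
        return torch.cholesky_solve(b.unsqueeze(-1), C).squeeze(-1)
    return torch.linalg.lstsq(H, b).solution  # fallback for indefinite/singular H

# usage: x = solve(make_state(H, damping=1e-4, eigval_fn=torch.abs), g)
```

This is also why `eigval_fn` keeps repeated solves cheap: once `H = Q diag(L) Qᵀ` is stored, each solve reduces to two matrix-vector products and an elementwise division by the transformed eigenvalues.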
torchzero/modules/second_order/newton_cg.py

@@ -57,7 +57,7 @@ class NewtonCG(Transform):
    Newton-CG with a backtracking line search:

    ```python
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.NewtonCG(),
        tz.m.Backtracking()
@@ -66,7 +66,7 @@ class NewtonCG(Transform):

    Truncated Newton method (useful for large-scale problems):
    ```
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.NewtonCG(maxiter=10),
        tz.m.Backtracking()
@@ -198,7 +198,7 @@ class NewtonCGSteihaug(Transform):
    Trust-region Newton-CG:

    ```python
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.NewtonCGSteihaug(),
    )
torchzero/modules/second_order/nystrom.py

@@ -1,10 +1,11 @@
+import warnings
from typing import Literal

import torch

from ...core import Chainable, Transform, HVPMethod
from ...utils import TensorList, vec_to_tensors
-from ...linalg import nystrom_pcg, nystrom_sketch_and_solve, nystrom_approximation, cg
+from ...linalg import nystrom_pcg, nystrom_sketch_and_solve, nystrom_approximation, cg, regularize_eigh, OrthogonalizeMethod
from ...linalg.linear_operator import Eigendecomposition, ScaledIdentity

class NystromSketchAndSolve(Transform):
@@ -19,7 +20,18 @@ class NystromSketchAndSolve(Transform):

    Args:
        rank (int): size of the sketch, this many hessian-vector products will be evaluated per step.
-        reg (float, optional):
+        reg (float | None, optional):
+            scale of identity matrix added to hessian. Note that if this is specified, nystrom sketch-and-solve
+            is used to compute ``(Q diag(L) Q.T + reg*I)x = b``. It is very unstable when ``reg`` is small,
+            i.e. smaller than 1e-4. If this is None,``(Q diag(L) Q.T)x = b`` is computed by simply taking
+            reciprocal of eigenvalues. Defaults to 1e-3.
+        eigv_tol (float, optional):
+            all eigenvalues smaller than largest eigenvalue times ``eigv_tol`` are removed. Defaults to None.
+        truncate (int | None, optional):
+            keeps top ``truncate`` eigenvalues. Defaults to None.
+        damping (float, optional): scalar added to eigenvalues. Defaults to 0.
+        rdamping (float, optional): scalar multiplied by largest eigenvalue and added to eigenvalues. Defaults to 0.
+        update_freq (int, optional): frequency of updating preconditioner. Defaults to 1.
        hvp_method (str, optional):
            Determines how Hessian-vector products are computed.

@@ -40,7 +52,7 @@ class NystromSketchAndSolve(Transform):
    NystromSketchAndSolve with backtracking line search

    ```py
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.NystromSketchAndSolve(100),
        tz.m.Backtracking()
@@ -50,7 +62,7 @@ class NystromSketchAndSolve(Transform):
    Trust region NystromSketchAndSolve

    ```py
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.LevenbergMarquadt(tz.m.NystromSketchAndSolve(100)),
    )
@@ -64,10 +76,15 @@ class NystromSketchAndSolve(Transform):
    def __init__(
        self,
        rank: int,
-        reg: float = 1e-
+        reg: float | None = 1e-2,
+        eigv_tol: float = 0,
+        truncate: int | None = None,
+        damping: float = 0,
+        rdamping: float = 0,
+        update_freq: int = 1,
+        orthogonalize_method: OrthogonalizeMethod = 'qr',
        hvp_method: HVPMethod = "batched_autograd",
        h: float = 1e-3,
-        update_freq: int = 1,
        inner: Chainable | None = None,
        seed: int | None = None,
    ):
@@ -92,25 +109,53 @@ class NystromSketchAndSolve(Transform):

        generator = self.get_generator(params[0].device, seed=fs['seed'])
        try:
-
-
+            # compute the approximation
+            L, Q = nystrom_approximation(
+                A_mv=H_mv,
+                A_mm=H_mm,
+                ndim=ndim,
+                rank=min(fs["rank"], ndim),
+                eigv_tol=fs["eigv_tol"],
+                orthogonalize_method=fs["orthogonalize_method"],
+                dtype=dtype,
+                device=device,
+                generator=generator,
+            )
+
+            # regularize
+            L, Q = regularize_eigh(
+                L=L,
+                Q=Q,
+                truncate=fs["truncate"],
+                tol=fs["eigv_tol"],
+                damping=fs["damping"],
+                rdamping=fs["rdamping"],
+            )
+
+            # store
+            if L is not None:
+                self.global_state["L"] = L
+                self.global_state["Q"] = Q

-
-
-        except torch.linalg.LinAlgError:
-            pass
+        except torch.linalg.LinAlgError as e:
+            warnings.warn(f"Nystrom approximation failed with: {e}")

    def apply_states(self, objective, states, settings):
-        fs = settings[0]
-        b = objective.get_updates()
-
-        # ----------------------------------- solve ---------------------------------- #
        if "L" not in self.global_state:
            return objective

+        fs = settings[0]
+        updates = objective.get_updates()
+        b=torch.cat([t.ravel() for t in updates])
+
+        # ----------------------------------- solve ---------------------------------- #
        L = self.global_state["L"]
        Q = self.global_state["Q"]
-
+
+        if fs["reg"] is None:
+            x = Q @ ((Q.mH @ b) / L)
+        else:
+            x = nystrom_sketch_and_solve(L=L, Q=Q, b=b, reg=fs["reg"])

        # -------------------------------- set update -------------------------------- #
        objective.updates = vec_to_tensors(x, reference=objective.params)
@@ -127,8 +172,6 @@ class NystromSketchAndSolve(Transform):

class NystromPCG(Transform):
    """Newton's method with a Nyström-preconditioned conjugate gradient solver.
-    This tends to outperform NewtonCG but requires tuning sketch size.
-    An adaptive version exists in https://arxiv.org/abs/2110.02820, I might implement it too at some point.

    Notes:
        - This module requires the a closure passed to the optimizer step,
@@ -138,7 +181,7 @@ class NystromPCG(Transform):
        - In most cases NystromPCG should be the first module in the chain because it relies on autograd. Use the ``inner`` argument if you wish to apply Newton preconditioning to another module's output.

    Args:
-
+        rank (int):
            size of the sketch for preconditioning, this many hessian-vector products will be evaluated before
            running the conjugate gradient solver. Larger value improves the preconditioning and speeds up
            conjugate gradient.
@@ -169,7 +212,7 @@ class NystromPCG(Transform):
    NystromPCG with backtracking line search

    ```python
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.NystromPCG(10),
        tz.m.Backtracking()
@@ -187,6 +230,8 @@ class NystromPCG(Transform):
        tol=1e-8,
        reg: float = 1e-6,
        update_freq: int = 1, # here update_freq is within update_states
+        eigv_tol: float = 0,
+        orthogonalize_method: OrthogonalizeMethod = 'qr',
        hvp_method: HVPMethod = "batched_autograd",
        h=1e-3,
        inner: Chainable | None = None,
@@ -202,31 +247,36 @@ class NystromPCG(Transform):

        # ---------------------- Hessian vector product function --------------------- #
        # this should run on every update_states
-
-        h = fs['h']
-        _, H_mv, H_mm = objective.tensor_Hvp_function(hvp_method=hvp_method, h=h, at_x0=True)
+        _, H_mv, H_mm = objective.tensor_Hvp_function(hvp_method=fs['hvp_method'], h=fs['h'], at_x0=True)
        objective.temp = H_mv

        # --------------------------- update preconditioner -------------------------- #
        step = self.increment_counter("step", 0)
-
-
-        if step % update_freq == 0:
+        if step % fs["update_freq"] == 0:

-            rank = fs['rank']
            ndim = sum(t.numel() for t in objective.params)
            device = objective.params[0].device
            dtype = objective.params[0].dtype
            generator = self.get_generator(device, seed=fs['seed'])

            try:
-                L, Q = nystrom_approximation(
-
+                L, Q = nystrom_approximation(
+                    A_mv=None,
+                    A_mm=H_mm,
+                    ndim=ndim,
+                    rank=min(fs["rank"], ndim),
+                    eigv_tol=fs["eigv_tol"],
+                    orthogonalize_method=fs["orthogonalize_method"],
+                    dtype=dtype,
+                    device=device,
+                    generator=generator,
+                )

                self.global_state["L"] = L
                self.global_state["Q"] = Q
-
-
+
+            except torch.linalg.LinAlgError as e:
+                warnings.warn(f"Nystrom approximation failed with: {e}")

    @torch.no_grad
    def apply_states(self, objective, states, settings):
@@ -243,6 +293,7 @@ class NystromPCG(Transform):

        L = self.global_state["L"]
        Q = self.global_state["Q"]
+
        x = nystrom_pcg(L=L, Q=Q, A_mv=H_mv, b=torch.cat([t.ravel() for t in b]),
                        reg=fs['reg'], tol=fs["tol"], maxiter=fs["maxiter"])

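The new `eigv_tol`, `truncate`, `damping`, and `rdamping` arguments documented above all act on the eigenvalues of the Nyström approximation before the solve. A rough sketch of those documented semantics, using a hypothetical `regularize_eigenpairs` helper rather than torchzero's actual `regularize_eigh`:

```python
# Illustrative only: spells out the documented semantics of truncate / eigv_tol /
# damping / rdamping; not torchzero's regularize_eigh implementation.
import torch

def regularize_eigenpairs(L: torch.Tensor, Q: torch.Tensor,
                          truncate: int | None = None, tol: float = 0.0,
                          damping: float = 0.0, rdamping: float = 0.0):
    # keep only the top-`truncate` eigenpairs
    if truncate is not None and truncate < L.numel():
        idx = torch.argsort(L, descending=True)[:truncate]
        L, Q = L[idx], Q[:, idx]
    # drop eigenvalues below tol * (largest eigenvalue)
    if tol > 0:
        keep = L >= tol * L.max()
        L, Q = L[keep], Q[:, keep]
    # absolute damping plus damping relative to the largest eigenvalue
    if damping != 0 or rdamping != 0:
        L = L + damping + rdamping * L.max()
    return L, Q

# With reg=None the subsequent solve is just the reciprocal of the spectrum:
# x = Q @ ((Q.mH @ b) / L)
```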