torchzero 0.3.15__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +22 -22
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +225 -214
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +2 -2
- torchzero/core/__init__.py +7 -4
- torchzero/core/chain.py +20 -23
- torchzero/core/functional.py +90 -24
- torchzero/core/modular.py +53 -57
- torchzero/core/module.py +132 -52
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +55 -24
- torchzero/core/transform.py +261 -367
- torchzero/linalg/__init__.py +11 -0
- torchzero/linalg/eigh.py +253 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +93 -0
- torchzero/{utils/linalg → linalg}/qr.py +16 -2
- torchzero/{utils/linalg → linalg}/solve.py +74 -88
- torchzero/linalg/svd.py +47 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/__init__.py +4 -3
- torchzero/modules/adaptive/__init__.py +11 -3
- torchzero/modules/adaptive/adagrad.py +167 -217
- torchzero/modules/adaptive/adahessian.py +76 -105
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +50 -31
- torchzero/modules/adaptive/adaptive_heavyball.py +12 -7
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/ggt.py +186 -0
- torchzero/modules/adaptive/lion.py +7 -11
- torchzero/modules/adaptive/lre_optimizers.py +299 -0
- torchzero/modules/adaptive/mars.py +7 -7
- torchzero/modules/adaptive/matrix_momentum.py +48 -52
- torchzero/modules/adaptive/msam.py +71 -53
- torchzero/modules/adaptive/muon.py +67 -129
- torchzero/modules/adaptive/natural_gradient.py +63 -41
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/psgd/__init__.py +5 -0
- torchzero/modules/adaptive/psgd/_psgd_utils.py +37 -0
- torchzero/modules/adaptive/psgd/psgd.py +1390 -0
- torchzero/modules/adaptive/psgd/psgd_dense_newton.py +174 -0
- torchzero/modules/adaptive/psgd/psgd_kron_newton.py +203 -0
- torchzero/modules/adaptive/psgd/psgd_kron_whiten.py +185 -0
- torchzero/modules/adaptive/psgd/psgd_lra_newton.py +118 -0
- torchzero/modules/adaptive/psgd/psgd_lra_whiten.py +116 -0
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +149 -130
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +22 -25
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +27 -38
- torchzero/modules/experimental/__init__.py +7 -6
- torchzero/modules/experimental/adanystrom.py +258 -0
- torchzero/modules/experimental/common_directions_whiten.py +142 -0
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/cubic_adam.py +160 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/eigen_sr1.py +182 -0
- torchzero/modules/experimental/eigengrad.py +207 -0
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/higher_order_newton.py +14 -40
- torchzero/modules/experimental/l_infinity.py +1 -1
- torchzero/modules/experimental/matrix_nag.py +122 -0
- torchzero/modules/experimental/newton_solver.py +23 -54
- torchzero/modules/experimental/newtonnewton.py +45 -48
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +3 -3
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/grad_approximation/fdm.py +2 -2
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +24 -21
- torchzero/modules/least_squares/gn.py +121 -50
- torchzero/modules/line_search/backtracking.py +4 -4
- torchzero/modules/line_search/line_search.py +33 -33
- torchzero/modules/line_search/strong_wolfe.py +4 -4
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +11 -79
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +121 -123
- torchzero/modules/misc/multistep.py +52 -53
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +31 -29
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +37 -31
- torchzero/modules/momentum/momentum.py +12 -12
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +20 -20
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/{functional.py → opt_utils.py} +1 -1
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +46 -43
- torchzero/modules/quasi_newton/__init__.py +1 -1
- torchzero/modules/quasi_newton/damping.py +2 -2
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +10 -10
- torchzero/modules/quasi_newton/lsr1.py +10 -10
- torchzero/modules/quasi_newton/quasi_newton.py +54 -39
- torchzero/modules/quasi_newton/sg2.py +69 -205
- torchzero/modules/restarts/restars.py +39 -37
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/ifn.py +31 -62
- torchzero/modules/second_order/inm.py +57 -53
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +165 -196
- torchzero/modules/second_order/newton_cg.py +105 -157
- torchzero/modules/second_order/nystrom.py +216 -185
- torchzero/modules/second_order/rsn.py +132 -125
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +10 -10
- torchzero/modules/step_size/adaptive.py +24 -24
- torchzero/modules/step_size/lr.py +17 -17
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +3 -3
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +2 -2
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +23 -21
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +17 -18
- torchzero/modules/wrappers/optim_wrapper.py +14 -14
- torchzero/modules/zeroth_order/cd.py +10 -7
- torchzero/optim/mbs.py +291 -0
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -13
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +8 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/benchmarks/__init__.py +0 -0
- torchzero/utils/benchmarks/logistic.py +122 -0
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +97 -73
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/METADATA +1 -1
- torchzero-0.4.1.dist-info/RECORD +209 -0
- tests/test_vars.py +0 -185
- torchzero/core/var.py +0 -376
- torchzero/modules/adaptive/lmadagrad.py +0 -186
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.15.dist-info/RECORD +0 -175
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/WHEEL +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/top_level.txt +0 -0
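The removal of `torchzero/core/var.py` and the addition of `torchzero/core/objective.py` in the listing above show up concretely in the `nystrom.py` diff below: both Nyström modules now subclass `Transform` and split their work into `update_states` (evaluate Hessian-vector products and build or refresh the Nyström factors) and `apply_states` (use the cached factors to turn the current update into a Newton-like step), instead of the old single `var`-based method. A rough skeleton of that pattern, inferred only from the diff below; the method names, `objective.updates`, `get_updates`, and `global_state` appear there, while everything else is an illustrative assumption, not documented API:

```python
from torchzero.core import Transform  # Transform is exported from torchzero.core per the imports in the diff below


class MyCurvatureTransform(Transform):
    """Sketch of the update/apply split the 0.4.1 Nystrom modules follow."""

    def update_states(self, objective, states, settings):
        # expensive part: evaluate hessian-vector products and rebuild the
        # preconditioner, caching its factors in self.global_state
        ...

    def apply_states(self, objective, states, settings):
        # cheap part: read the cached factors, solve for the step, write it
        # back through objective.updates, and return the objective
        ...
        return objective
```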
torchzero/modules/second_order/nystrom.py

````diff
@@ -1,153 +1,187 @@
-from collections.abc import Callable
-from typing import Literal, overload
 import warnings
-import
+from typing import Literal

-
-from ...utils.derivatives import hvp, hvp_fd_central, hvp_fd_forward
+import torch

-from ...core import Chainable,
-from ...utils
+from ...core import Chainable, Transform, HVPMethod
+from ...utils import TensorList, vec_to_tensors
+from ...linalg import nystrom_pcg, nystrom_sketch_and_solve, nystrom_approximation, cg, regularize_eigh, OrthogonalizeMethod
+from ...linalg.linear_operator import Eigendecomposition, ScaledIdentity

-class NystromSketchAndSolve(
+class NystromSketchAndSolve(Transform):
     """Newton's method with a Nyström sketch-and-solve solver.

-
-    This module requires the a closure passed to the optimizer step,
-    as it needs to re-evaluate the loss and gradients for calculating HVPs.
-    The closure must accept a ``backward`` argument (refer to documentation).
-
-    .. note::
-        In most cases NystromSketchAndSolve should be the first module in the chain because it relies on autograd. Use the :code:`inner` argument if you wish to apply Newton preconditioning to another module's output.
+    Notes:
+        - This module requires the a closure passed to the optimizer step, as it needs to re-evaluate the loss and gradients for calculating HVPs. The closure must accept a ``backward`` argument (refer to documentation).

-
-    If this is unstable, increase the :code:`reg` parameter and tune the rank.
+        - In most cases NystromSketchAndSolve should be the first module in the chain because it relies on autograd. Use the ``inner`` argument if you wish to apply Newton preconditioning to another module's output.

-
-    :code:`tz.m.NystromPCG` usually outperforms this.
+        - If this is unstable, increase the ``reg`` parameter and tune the rank.

     Args:
         rank (int): size of the sketch, this many hessian-vector products will be evaluated per step.
-        reg (float, optional):
+        reg (float | None, optional):
+            scale of identity matrix added to hessian. Note that if this is specified, nystrom sketch-and-solve
+            is used to compute ``(Q diag(L) Q.T + reg*I)x = b``. It is very unstable when ``reg`` is small,
+            i.e. smaller than 1e-4. If this is None,``(Q diag(L) Q.T)x = b`` is computed by simply taking
+            reciprocal of eigenvalues. Defaults to 1e-3.
+        eigv_tol (float, optional):
+            all eigenvalues smaller than largest eigenvalue times ``eigv_tol`` are removed. Defaults to None.
+        truncate (int | None, optional):
+            keeps top ``truncate`` eigenvalues. Defaults to None.
+        damping (float, optional): scalar added to eigenvalues. Defaults to 0.
+        rdamping (float, optional): scalar multiplied by largest eigenvalue and added to eigenvalues. Defaults to 0.
+        update_freq (int, optional): frequency of updating preconditioner. Defaults to 1.
         hvp_method (str, optional):
-            Determines how Hessian-vector products are
-
-            - ``"
-
-            - ``"
-
-
-
-
-
-
+            Determines how Hessian-vector products are computed.
+
+            - ``"batched_autograd"`` - uses autograd with batched hessian-vector products to compute the preconditioner. Faster than ``"autograd"`` but uses more memory.
+            - ``"autograd"`` - uses autograd hessian-vector products, uses a for loop to compute the preconditioner. Slower than ``"batched_autograd"`` but uses less memory.
+            - ``"fd_forward"`` - uses gradient finite difference approximation with a less accurate forward formula which requires one extra gradient evaluation per hessian-vector product.
+            - ``"fd_central"`` - uses gradient finite difference approximation with a more accurate central formula which requires two gradient evaluations per hessian-vector product.
+
+            Defaults to ``"autograd"``.
+        h (float, optional):
+            The step size for finite difference if ``hvp_method`` is
+            ``"fd_forward"`` or ``"fd_central"``. Defaults to 1e-3.
         inner (Chainable | None, optional): modules to apply hessian preconditioner to. Defaults to None.
         seed (int | None, optional): seed for random generator. Defaults to None.

+
     Examples:
-
+        NystromSketchAndSolve with backtracking line search

-
+        ```py
+        opt = tz.Optimizer(
+            model.parameters(),
+            tz.m.NystromSketchAndSolve(100),
+            tz.m.Backtracking()
+        )
+        ```

-
-
-
-
-
+        Trust region NystromSketchAndSolve
+
+        ```py
+        opt = tz.Optimizer(
+            model.parameters(),
+            tz.m.LevenbergMarquadt(tz.m.NystromSketchAndSolve(100)),
+        )
+        ```
+
+    References:
+        - [Frangella, Z., Rathore, P., Zhao, S., & Udell, M. (2024). SketchySGD: Reliable Stochastic Optimization via Randomized Curvature Estimates. SIAM Journal on Mathematics of Data Science, 6(4), 1173-1204.](https://arxiv.org/pdf/2211.08597)
+        - [Frangella, Z., Tropp, J. A., & Udell, M. (2023). Randomized nyström preconditioning. SIAM Journal on Matrix Analysis and Applications, 44(2), 718-752](https://arxiv.org/abs/2110.02820)

-    Reference:
-        Frangella, Z., Tropp, J. A., & Udell, M. (2023). Randomized nyström preconditioning. SIAM Journal on Matrix Analysis and Applications, 44(2), 718-752. https://arxiv.org/abs/2110.02820
     """
     def __init__(
         self,
         rank: int,
-        reg: float = 1e-
-
+        reg: float | None = 1e-2,
+        eigv_tol: float = 0,
+        truncate: int | None = None,
+        damping: float = 0,
+        rdamping: float = 0,
+        update_freq: int = 1,
+        orthogonalize_method: OrthogonalizeMethod = 'qr',
+        hvp_method: HVPMethod = "batched_autograd",
         h: float = 1e-3,
         inner: Chainable | None = None,
         seed: int | None = None,
     ):
-        defaults =
-
-
-        if inner is not None:
-            self.set_child('inner', inner)
+        defaults = locals().copy()
+        del defaults['self'], defaults['inner'], defaults["update_freq"]
+        super().__init__(defaults, update_freq=update_freq, inner=inner)

     @torch.no_grad
-    def
-        params = TensorList(
-
-        closure = var.closure
-        if closure is None: raise RuntimeError('NewtonCG requires closure')
-
-        settings = self.settings[params[0]]
-        rank = settings['rank']
-        reg = settings['reg']
-        hvp_method = settings['hvp_method']
-        h = settings['h']
-
-        seed = settings['seed']
-        generator = None
-        if seed is not None:
-            if 'generator' not in self.global_state:
-                self.global_state['generator'] = torch.Generator(params[0].device).manual_seed(seed)
-            generator = self.global_state['generator']
+    def update_states(self, objective, states, settings):
+        params = TensorList(objective.params)
+        fs = settings[0]

         # ---------------------- Hessian vector product function --------------------- #
-
-
+        hvp_method = fs['hvp_method']
+        h = fs['h']
+        _, H_mv, H_mm = objective.tensor_Hvp_function(hvp_method=hvp_method, h=h, at_x0=True)
+
+        # ---------------------------------- sketch ---------------------------------- #
+        ndim = sum(t.numel() for t in objective.params)
+        device = params[0].device
+        dtype = params[0].dtype
+
+        generator = self.get_generator(params[0].device, seed=fs['seed'])
+        try:
+            # compute the approximation
+            L, Q = nystrom_approximation(
+                A_mv=H_mv,
+                A_mm=H_mm,
+                ndim=ndim,
+                rank=min(fs["rank"], ndim),
+                eigv_tol=fs["eigv_tol"],
+                orthogonalize_method=fs["orthogonalize_method"],
+                dtype=dtype,
+                device=device,
+                generator=generator,
+            )

-
-
-
-
+            # regularize
+            L, Q = regularize_eigh(
+                L=L,
+                Q=Q,
+                truncate=fs["truncate"],
+                tol=fs["eigv_tol"],
+                damping=fs["damping"],
+                rdamping=fs["rdamping"],
+            )

-
+            # store
+            if L is not None:
+                self.global_state["L"] = L
+                self.global_state["Q"] = Q

-
-
+        except torch.linalg.LinAlgError as e:
+            warnings.warn(f"Nystrom approximation failed with: {e}")

-
-
-
-            return torch.cat([t.ravel() for t in Hvp])
+    def apply_states(self, objective, states, settings):
+        if "L" not in self.global_state:
+            return objective

-
-
-
-            return torch.cat([t.ravel() for t in Hvp])
+        fs = settings[0]
+        updates = objective.get_updates()
+        b=torch.cat([t.ravel() for t in updates])

-
-
+        # ----------------------------------- solve ---------------------------------- #
+        L = self.global_state["L"]
+        Q = self.global_state["Q"]

+        if fs["reg"] is None:
+            x = Q @ ((Q.mH @ b) / L)
+        else:
+            x = nystrom_sketch_and_solve(L=L, Q=Q, b=b, reg=fs["reg"])

-        # --------------------------------
-
-
-            b = apply_transform(self.children['inner'], b, params=params, grads=grad, var=var)
+        # -------------------------------- set update -------------------------------- #
+        objective.updates = vec_to_tensors(x, reference=objective.params)
+        return objective

-
-
-
-        return var
+    def get_H(self, objective=...):
+        if "L" not in self.global_state:
+            return ScaledIdentity()

+        L = self.global_state["L"]
+        Q = self.global_state["Q"]
+        return Eigendecomposition(L, Q)


-class NystromPCG(
+class NystromPCG(Transform):
     """Newton's method with a Nyström-preconditioned conjugate gradient solver.
-    This tends to outperform NewtonCG but requires tuning sketch size.
-    An adaptive version exists in https://arxiv.org/abs/2110.02820, I might implement it too at some point.

-
-    This module requires the a closure passed to the optimizer step,
+    Notes:
+        - This module requires the a closure passed to the optimizer step,
     as it needs to re-evaluate the loss and gradients for calculating HVPs.
     The closure must accept a ``backward`` argument (refer to documentation).

-
-        In most cases NystromPCG should be the first module in the chain because it relies on autograd. Use the :code:`inner` argument if you wish to apply Newton preconditioning to another module's output.
+        - In most cases NystromPCG should be the first module in the chain because it relies on autograd. Use the ``inner`` argument if you wish to apply Newton preconditioning to another module's output.

     Args:
-
+        rank (int):
            size of the sketch for preconditioning, this many hessian-vector products will be evaluated before
            running the conjugate gradient solver. Larger value improves the preconditioning and speeds up
            conjugate gradient.
@@ -159,31 +193,31 @@ class NystromPCG(Module):
         tol (float, optional): relative tolerance for conjugate gradient solver. Defaults to 1e-4.
         reg (float, optional): regularization parameter. Defaults to 1e-8.
         hvp_method (str, optional):
-            Determines how Hessian-vector products are
-
-            - ``"
-
-            - ``"
-
-
-
-
-
-
+            Determines how Hessian-vector products are computed.
+
+            - ``"batched_autograd"`` - uses autograd with batched hessian-vector products to compute the preconditioner. Faster than ``"autograd"`` but uses more memory.
+            - ``"autograd"`` - uses autograd hessian-vector products, uses a for loop to compute the preconditioner. Slower than ``"batched_autograd"`` but uses less memory.
+            - ``"fd_forward"`` - uses gradient finite difference approximation with a less accurate forward formula which requires one extra gradient evaluation per hessian-vector product.
+            - ``"fd_central"`` - uses gradient finite difference approximation with a more accurate central formula which requires two gradient evaluations per hessian-vector product.
+
+            Defaults to ``"autograd"``.
+        h (float, optional):
+            The step size for finite difference if ``hvp_method`` is
+            ``"fd_forward"`` or ``"fd_central"``. Defaults to 1e-3.
         inner (Chainable | None, optional): modules to apply hessian preconditioner to. Defaults to None.
         seed (int | None, optional): seed for random generator. Defaults to None.

     Examples:

-
-
-        .. code-block:: python
+        NystromPCG with backtracking line search

-
-
-
-
-
-
+        ```python
+        opt = tz.Optimizer(
+            model.parameters(),
+            tz.m.NystromPCG(10),
+            tz.m.Backtracking()
+        )
+        ```

     Reference:
         Frangella, Z., Tropp, J. A., & Udell, M. (2023). Randomized nyström preconditioning. SIAM Journal on Matrix Analysis and Applications, 44(2), 718-752. https://arxiv.org/abs/2110.02820
@@ -191,81 +225,78 @@ class NystromPCG(Module):
     """
     def __init__(
         self,
-
+        rank: int,
         maxiter=None,
         tol=1e-8,
         reg: float = 1e-6,
-
+        update_freq: int = 1, # here update_freq is within update_states
+        eigv_tol: float = 0,
+        orthogonalize_method: OrthogonalizeMethod = 'qr',
+        hvp_method: HVPMethod = "batched_autograd",
         h=1e-3,
         inner: Chainable | None = None,
         seed: int | None = None,
     ):
-        defaults =
-
-
-        if inner is not None:
-            self.set_child('inner', inner)
+        defaults = locals().copy()
+        del defaults['self'], defaults['inner']
+        super().__init__(defaults, inner=inner)

     @torch.no_grad
-    def
-
-
-        closure = var.closure
-        if closure is None: raise RuntimeError('NewtonCG requires closure')
-
-        settings = self.settings[params[0]]
-        sketch_size = settings['sketch_size']
-        maxiter = settings['maxiter']
-        tol = settings['tol']
-        reg = settings['reg']
-        hvp_method = settings['hvp_method']
-        h = settings['h']
-
-
-        seed = settings['seed']
-        generator = None
-        if seed is not None:
-            if 'generator' not in self.global_state:
-                self.global_state['generator'] = torch.Generator(params[0].device).manual_seed(seed)
-            generator = self.global_state['generator']
-
+    def update_states(self, objective, states, settings):
+        fs = settings[0]

         # ---------------------- Hessian vector product function --------------------- #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        # ------------------------------ sketch&n&solve ------------------------------ #
-        x = nystrom_pcg(A_mm=H_mm, b=torch.cat([t.ravel() for t in b]), sketch_size=sketch_size, reg=reg, tol=tol, maxiter=maxiter, x0_=None, generator=generator)
-        var.update = vec_to_tensors(x, reference=params)
-        return var
-
+        # this should run on every update_states
+        _, H_mv, H_mm = objective.tensor_Hvp_function(hvp_method=fs['hvp_method'], h=fs['h'], at_x0=True)
+        objective.temp = H_mv
+
+        # --------------------------- update preconditioner -------------------------- #
+        step = self.increment_counter("step", 0)
+        if step % fs["update_freq"] == 0:
+
+            ndim = sum(t.numel() for t in objective.params)
+            device = objective.params[0].device
+            dtype = objective.params[0].dtype
+            generator = self.get_generator(device, seed=fs['seed'])
+
+            try:
+                L, Q = nystrom_approximation(
+                    A_mv=None,
+                    A_mm=H_mm,
+                    ndim=ndim,
+                    rank=min(fs["rank"], ndim),
+                    eigv_tol=fs["eigv_tol"],
+                    orthogonalize_method=fs["orthogonalize_method"],
+                    dtype=dtype,
+                    device=device,
+                    generator=generator,
+                )
+
+                self.global_state["L"] = L
+                self.global_state["Q"] = Q
+
+            except torch.linalg.LinAlgError as e:
+                warnings.warn(f"Nystrom approximation failed with: {e}")

+    @torch.no_grad
+    def apply_states(self, objective, states, settings):
+        b = objective.get_updates()
+        H_mv = objective.poptemp()
+        fs = self.settings[objective.params[0]]
+
+        # ----------------------------------- solve ---------------------------------- #
+        if "L" not in self.global_state:
+            # fallback on cg
+            sol = cg(A_mv=H_mv, b=TensorList(b), tol=fs["tol"], reg=fs["reg"], maxiter=fs["maxiter"])
+            objective.updates = sol.x
+            return objective
+
+        L = self.global_state["L"]
+        Q = self.global_state["Q"]
+
+        x = nystrom_pcg(L=L, Q=Q, A_mv=H_mv, b=torch.cat([t.ravel() for t in b]),
+                        reg=fs['reg'], tol=fs["tol"], maxiter=fs["maxiter"])
+
+        # -------------------------------- set update -------------------------------- #
+        objective.updates = vec_to_tensors(x, reference=objective.params)
+        return objective
````
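For a sense of how these modules are driven end to end, here is a minimal sketch based on the docstring examples added in this diff. It assumes `torchzero` is imported as `tz` (matching the `tz.` prefix in the docstrings) and follows the closure convention the docstrings require (a closure that accepts a `backward` argument); the toy model, data, and training loop are illustrative and not part of the diff:

```python
import torch
import torchzero as tz  # assumed alias, matching the "tz." prefix in the docstring examples

model = torch.nn.Sequential(torch.nn.Linear(10, 32), torch.nn.Tanh(), torch.nn.Linear(32, 1))
X, y = torch.randn(64, 10), torch.randn(64, 1)

# NystromSketchAndSolve followed by a backtracking line search, as in the added docstring example
opt = tz.Optimizer(
    model.parameters(),
    tz.m.NystromSketchAndSolve(100),
    tz.m.Backtracking(),
)

def closure(backward=True):
    # the module re-evaluates loss and gradients for hessian-vector products,
    # so the closure must accept a ``backward`` flag per the docstrings
    loss = torch.nn.functional.mse_loss(model(X), y)
    if backward:
        opt.zero_grad()
        loss.backward()
    return loss

for _ in range(20):
    opt.step(closure)
```

The same pattern applies to `tz.m.NystromPCG(10)`, or to the trust-region variant from the second docstring example, where the Nyström module is wrapped in a Levenberg-Marquardt trust region instead of being followed by a line search.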