torchzero 0.3.15__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +2 -2
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +43 -33
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +7 -4
- torchzero/core/chain.py +20 -23
- torchzero/core/functional.py +90 -24
- torchzero/core/modular.py +48 -52
- torchzero/core/module.py +130 -50
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +55 -24
- torchzero/core/transform.py +261 -367
- torchzero/linalg/__init__.py +10 -0
- torchzero/linalg/eigh.py +34 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +95 -0
- torchzero/{utils/linalg → linalg}/qr.py +4 -2
- torchzero/{utils/linalg → linalg}/solve.py +76 -88
- torchzero/linalg/svd.py +20 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/adaptive/__init__.py +1 -1
- torchzero/modules/adaptive/adagrad.py +163 -213
- torchzero/modules/adaptive/adahessian.py +74 -103
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +49 -30
- torchzero/modules/adaptive/adaptive_heavyball.py +11 -6
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/lion.py +5 -10
- torchzero/modules/adaptive/lmadagrad.py +87 -32
- torchzero/modules/adaptive/mars.py +5 -5
- torchzero/modules/adaptive/matrix_momentum.py +47 -51
- torchzero/modules/adaptive/msam.py +70 -52
- torchzero/modules/adaptive/muon.py +59 -124
- torchzero/modules/adaptive/natural_gradient.py +33 -28
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +123 -129
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +15 -18
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +26 -37
- torchzero/modules/experimental/__init__.py +2 -6
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/higher_order_newton.py +14 -40
- torchzero/modules/experimental/newton_solver.py +22 -53
- torchzero/modules/experimental/newtonnewton.py +15 -12
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +3 -3
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/functional.py +1 -1
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +20 -17
- torchzero/modules/least_squares/gn.py +90 -42
- torchzero/modules/line_search/backtracking.py +2 -2
- torchzero/modules/line_search/line_search.py +32 -32
- torchzero/modules/line_search/strong_wolfe.py +2 -2
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +10 -78
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +120 -122
- torchzero/modules/misc/multistep.py +50 -48
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +30 -28
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +34 -28
- torchzero/modules/momentum/momentum.py +11 -11
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +19 -19
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +43 -43
- torchzero/modules/quasi_newton/damping.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +7 -7
- torchzero/modules/quasi_newton/lsr1.py +7 -7
- torchzero/modules/quasi_newton/quasi_newton.py +10 -10
- torchzero/modules/quasi_newton/sg2.py +19 -19
- torchzero/modules/restarts/restars.py +26 -24
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/ifn.py +31 -62
- torchzero/modules/second_order/inm.py +49 -53
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +57 -90
- torchzero/modules/second_order/newton_cg.py +102 -154
- torchzero/modules/second_order/nystrom.py +157 -177
- torchzero/modules/second_order/rsn.py +106 -96
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +11 -10
- torchzero/modules/step_size/adaptive.py +23 -23
- torchzero/modules/step_size/lr.py +15 -15
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +2 -2
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +1 -1
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +21 -18
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +12 -13
- torchzero/modules/wrappers/optim_wrapper.py +10 -10
- torchzero/modules/zeroth_order/cd.py +9 -6
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -4
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +6 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +93 -69
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.0.dist-info}/METADATA +1 -1
- torchzero-0.4.0.dist-info/RECORD +191 -0
- tests/test_vars.py +0 -185
- torchzero/core/var.py +0 -376
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.15.dist-info/RECORD +0 -175
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.0.dist-info}/WHEEL +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.0.dist-info}/top_level.txt +0 -0
torchzero/modules/second_order/multipoint.py
@@ -1,19 +1,17 @@
-from collections.abc import Callable
-from contextlib import nullcontext
 from abc import ABC, abstractmethod
+from collections.abc import Callable, Mapping
+from typing import Any
+
 import numpy as np
 import torch
 
-from ...core import Chainable,
-from ...utils import TensorList, vec_to_tensors
-
-    flatten_jacobian,
-    jacobian_wrt,
-)
+from ...core import Chainable, DerivativesMethod, Objective, Transform
+from ...utils import TensorList, vec_to_tensors
+
 
-class HigherOrderMethodBase(Module, ABC):
-    def __init__(self, defaults: dict | None = None,
-        self.
+class HigherOrderMethodBase(Transform, ABC):
+    def __init__(self, defaults: dict | None = None, derivatives_method: DerivativesMethod = 'batched_autograd'):
+        self._derivatives_method: DerivativesMethod = derivatives_method
         super().__init__(defaults)
 
     @abstractmethod
@@ -21,61 +19,27 @@ class HigherOrderMethodBase(Module, ABC):
         self,
         x: torch.Tensor,
         evaluate: Callable[[torch.Tensor, int], tuple[torch.Tensor, ...]],
-
+        objective: Objective,
+        setting: Mapping[str, Any],
     ) -> torch.Tensor:
         """"""
 
     @torch.no_grad
-    def
-        params = TensorList(
-
-        closure =
+    def apply_states(self, objective, states, settings):
+        params = TensorList(objective.params)
+
+        closure = objective.closure
         if closure is None: raise RuntimeError('MultipointNewton requires closure')
-
+        derivatives_method = self._derivatives_method
 
         def evaluate(x, order) -> tuple[torch.Tensor, ...]:
             """order=0 - returns (loss,), order=1 - returns (loss, grad), order=2 - returns (loss, grad, hessian), etc."""
-
-
-            if order == 0:
-                loss = closure(False)
-                params.copy_(x0)
-                return (loss, )
-
-            if order == 1:
-                with torch.enable_grad():
-                    loss = closure()
-                grad = [p.grad if p.grad is not None else torch.zeros_like(p) for p in params]
-                params.copy_(x0)
-                return loss, torch.cat([g.ravel() for g in grad])
-
-            with torch.enable_grad():
-                loss = var.loss = var.loss_approx = closure(False)
-
-                g_list = torch.autograd.grad(loss, params, create_graph=True)
-                var.grad = list(g_list)
-
-                g = torch.cat([t.ravel() for t in g_list])
-                n = g.numel()
-                ret = [loss, g]
-                T = g # current derivatives tensor
-
-                # get all derivative up to order
-                for o in range(2, order + 1):
-                    is_last = o == order
-                    T_list = jacobian_wrt([T], params, create_graph=not is_last, batched=vectorize)
-                    with torch.no_grad() if is_last else nullcontext():
-                        # the shape is (ndim, ) * order
-                        T = flatten_jacobian(T_list).view(n, n, *T.shape[1:])
-                        ret.append(T)
-
-            params.copy_(x0)
-            return tuple(ret)
+            return objective.derivatives_at(x, order, method=derivatives_method)
 
         x = torch.cat([p.ravel() for p in params])
-        dir = self.one_iteration(x, evaluate,
-
-        return
+        dir = self.one_iteration(x, evaluate, objective, settings[0])
+        objective.updates = vec_to_tensors(dir, objective.params)
+        return objective
 
 def _inv(A: torch.Tensor, lstsq:bool) -> torch.Tensor:
     if lstsq: return torch.linalg.pinv(A) # pylint:disable=not-callable
@@ -106,16 +70,15 @@ class SixthOrder3P(HigherOrderMethodBase):
 
     Abro, Hameer Akhtar, and Muhammad Mujtaba Shaikh. "A new time-efficient and convergent nonlinear solver." Applied Mathematics and Computation 355 (2019): 516-536.
     """
-    def __init__(self, lstsq: bool=False,
+    def __init__(self, lstsq: bool=False, derivatives_method: DerivativesMethod = 'batched_autograd'):
         defaults=dict(lstsq=lstsq)
-        super().__init__(defaults=defaults,
+        super().__init__(defaults=defaults, derivatives_method=derivatives_method)
 
-
-
-        lstsq = settings['lstsq']
+    @torch.no_grad
+    def one_iteration(self, x, evaluate, objective, setting):
        def f(x): return evaluate(x, 1)[1]
        def f_j(x): return evaluate(x, 2)[1:]
-        x_star = sixth_order_3p(x, f, f_j, lstsq)
+        x_star = sixth_order_3p(x, f, f_j, setting['lstsq'])
        return x - x_star
 
 # I don't think it works (I tested root finding with this and it goes all over the place)
@@ -173,15 +136,14 @@ def sixth_order_5p(x:torch.Tensor, f_j, lstsq:bool=False):
 
 class SixthOrder5P(HigherOrderMethodBase):
     """Argyros, Ioannis K., et al. "Extended convergence for two sixth order methods under the same weak conditions." Foundations 3.1 (2023): 127-139."""
-    def __init__(self, lstsq: bool=False,
+    def __init__(self, lstsq: bool=False, derivatives_method: DerivativesMethod = 'batched_autograd'):
         defaults=dict(lstsq=lstsq)
-        super().__init__(defaults=defaults,
+        super().__init__(defaults=defaults, derivatives_method=derivatives_method)
 
-
-
-        lstsq = settings['lstsq']
+    @torch.no_grad
+    def one_iteration(self, x, evaluate, objective, setting):
        def f_j(x): return evaluate(x, 2)[1:]
-        x_star = sixth_order_5p(x, f_j, lstsq)
+        x_star = sixth_order_5p(x, f_j, setting['lstsq'])
        return x - x_star
 
 # 2f 1J 2 solves
@@ -196,16 +158,15 @@ class TwoPointNewton(HigherOrderMethodBase):
     """two-point Newton method with frozen derivative with third order convergence.
 
     Sharma, Janak Raj, and Deepak Kumar. "A fast and efficient composite Newton–Chebyshev method for systems of nonlinear equations." Journal of Complexity 49 (2018): 56-73."""
-    def __init__(self, lstsq: bool=False,
+    def __init__(self, lstsq: bool=False, derivatives_method: DerivativesMethod = 'batched_autograd'):
         defaults=dict(lstsq=lstsq)
-        super().__init__(defaults=defaults,
+        super().__init__(defaults=defaults, derivatives_method=derivatives_method)
 
-
-
-        lstsq = settings['lstsq']
+    @torch.no_grad
+    def one_iteration(self, x, evaluate, objective, setting):
        def f(x): return evaluate(x, 1)[1]
        def f_j(x): return evaluate(x, 2)[1:]
-        x_star = two_point_newton(x, f, f_j, lstsq)
+        x_star = two_point_newton(x, f, f_j, setting['lstsq'])
        return x - x_star
 
 #3f 2J 1inv
@@ -224,15 +185,14 @@ def sixth_order_3pm2(x:torch.Tensor, f, f_j, lstsq:bool=False):
 
 class SixthOrder3PM2(HigherOrderMethodBase):
     """Wang, Xiaofeng, and Yang Li. "An efficient sixth-order Newton-type method for solving nonlinear systems." Algorithms 10.2 (2017): 45."""
-    def __init__(self, lstsq: bool=False,
+    def __init__(self, lstsq: bool=False, derivatives_method: DerivativesMethod = 'batched_autograd'):
         defaults=dict(lstsq=lstsq)
-        super().__init__(defaults=defaults,
+        super().__init__(defaults=defaults, derivatives_method=derivatives_method)
 
-
-
-        lstsq = settings['lstsq']
+    @torch.no_grad
+    def one_iteration(self, x, evaluate, objective, setting):
        def f_j(x): return evaluate(x, 2)[1:]
        def f(x): return evaluate(x, 1)[1]
-        x_star = sixth_order_3pm2(x, f, f_j, lstsq)
+        x_star = sixth_order_3pm2(x, f, f_j, setting['lstsq'])
        return x - x_star
 
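For orientation, here is a minimal sketch of a custom subclass written against the refactored base class above. `PlainNewtonStep` is a hypothetical example, not a class from the package; it assumes `evaluate(x, 2)` returns `(loss, grad, hessian)` as documented in the diffed docstring, and that the tensor returned by `one_iteration` becomes the update, mirroring the `x - x_star` pattern of the bundled methods.

# Hypothetical sketch, not part of torchzero: a plain Newton step on top of the
# new HigherOrderMethodBase API (import path taken from the file list above).
import torch
from torchzero.modules.second_order.multipoint import HigherOrderMethodBase

class PlainNewtonStep(HigherOrderMethodBase):
    def __init__(self, derivatives_method='batched_autograd'):
        super().__init__(defaults=None, derivatives_method=derivatives_method)

    @torch.no_grad
    def one_iteration(self, x, evaluate, objective, setting):
        _, g, H = evaluate(x, 2)         # loss, flat gradient, (n, n) hessian at x
        return torch.linalg.solve(H, g)  # Newton update; becomes objective.updates

Note how `derivatives_method` replaces the old per-module autograd plumbing: derivative computation now goes through `objective.derivatives_at`, so every subclass shares the same set of derivative backends instead of duplicating the `jacobian_wrt` loop that was removed above.

torchzero/modules/second_order/newton.py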
@@ -1,21 +1,12 @@
-import warnings
 from collections.abc import Callable
-from functools import partial
 from typing import Literal
 
 import torch
 
-from ...core import Chainable,
-from ...utils import
-from ...
-
-    hessian_mat,
-    hvp,
-    hvp_fd_central,
-    hvp_fd_forward,
-    jacobian_and_hessian_wrt,
-)
-from ...utils.linalg.linear_operator import DenseWithInverse, Dense
+from ...core import Chainable, Transform, Objective, HessianMethod, Module
+from ...utils import vec_to_tensors
+from ...linalg.linear_operator import Dense, DenseWithInverse
+
 
 def _lu_solve(H: torch.Tensor, g: torch.Tensor):
     try:
@@ -26,10 +17,9 @@ def _lu_solve(H: torch.Tensor, g: torch.Tensor):
         return None
 
 def _cholesky_solve(H: torch.Tensor, g: torch.Tensor):
-
+    L, info = torch.linalg.cholesky_ex(H) # pylint:disable=not-callable
     if info == 0:
-        g.
-        return torch.cholesky_solve(g, x)
+        return torch.cholesky_solve(g.unsqueeze(-1), L).squeeze(-1)
     return None
 
 def _least_squares_solve(H: torch.Tensor, g: torch.Tensor):
@@ -49,49 +39,14 @@ def _eigh_solve(H: torch.Tensor, g: torch.Tensor, tfm: Callable | None, search_n
     except torch.linalg.LinAlgError:
         return None
 
-
-
-
-    If hessian_method isn't 'autograd', loss is not set and returned as None"""
-    closure = var.closure
-    if closure is None:
-        raise RuntimeError("Second order methods requires a closure to be provided to the `step` method.")
-
-    params = var.params
-
-    # ------------------------ calculate grad and hessian ------------------------ #
-    loss = None
-    if hessian_method == 'autograd':
-        with torch.enable_grad():
-            loss = var.loss = var.loss_approx = closure(False)
-            g_list, H_list = jacobian_and_hessian_wrt([loss], params, batched=vectorize)
-            g_list = [t[0] for t in g_list] # remove leading dim from loss
-            var.grad = g_list
-            H = flatten_jacobian(H_list)
-
-    elif hessian_method in ('func', 'autograd.functional'):
-        strat = 'forward-mode' if vectorize else 'reverse-mode'
-        with torch.enable_grad():
-            g_list = var.get_grad(retain_graph=True)
-            H = hessian_mat(partial(closure, backward=False), params,
-                method=hessian_method, vectorize=vectorize, outer_jacobian_strategy=strat) # pyright:ignore[reportAssignmentType]
-
-    else:
-        raise ValueError(hessian_method)
-
-    return loss, g_list, H
-
-def _newton_step(var: Var, H: torch.Tensor, damping:float, inner: Module | None, H_tfm, eigval_fn, use_lstsq:bool, g_proj: Callable | None = None) -> torch.Tensor:
-    """returns the update tensor, then do vec_to_tensor(update, params)"""
-    params = var.params
-
-    if damping != 0:
-        H = H + torch.eye(H.size(-1), dtype=H.dtype, device=H.device).mul_(damping)
-
+def _newton_step(objective: Objective, H: torch.Tensor, damping:float, H_tfm, eigval_fn, use_lstsq:bool, g_proj: Callable | None = None, no_inner: Module | None = None) -> torch.Tensor:
+    """INNER SHOULD BE NONE IN MOST CASES! Because Transform already has inner.
+    Returns the update tensor, then do vec_to_tensor(update, params)"""
     # -------------------------------- inner step -------------------------------- #
-
-
-
+    if no_inner is not None:
+        objective = no_inner.step(objective)
+
+    update = objective.get_updates()
 
     g = torch.cat([t.ravel() for t in update])
     if g_proj is not None: g = g_proj(g)
@@ -99,6 +54,9 @@ def _newton_step(var: Var, H: torch.Tensor, damping:float, inner: Module | None,
     # ----------------------------------- solve ---------------------------------- #
     update = None
 
+    if damping != 0:
+        H = H + torch.eye(H.size(-1), dtype=H.dtype, device=H.device).mul_(damping)
+
     if H_tfm is not None:
         ret = H_tfm(H, g)
 
@@ -133,7 +91,7 @@ def _get_H(H: torch.Tensor, eigval_fn):
 
     return Dense(H)
 
-class Newton(Module):
+class Newton(Transform):
     """Exact newton's method via autograd.
 
     Newton's method produces a direction jumping to the stationary point of quadratic approximation of the target function.
@@ -141,7 +99,7 @@ class Newton(Module):
     ``g`` can be output of another module, if it is specifed in ``inner`` argument.
 
     Note:
-        In most cases Newton should be the first module in the chain because it relies on autograd. Use the
+        In most cases Newton should be the first module in the chain because it relies on autograd. Use the ``inner`` argument if you wish to apply Newton preconditioning to another module's output.
 
     Note:
         This module requires the a closure passed to the optimizer step,
@@ -158,10 +116,6 @@ class Newton(Module):
            when hessian is not invertible. If False, tries cholesky, if it fails tries LU, and then least squares.
            If ``eigval_fn`` is specified, eigendecomposition will always be used to solve the linear system and this
            argument will be ignored.
-        hessian_method (str):
-            how to calculate hessian. Defaults to "autograd".
-        vectorize (bool, optional):
-            whether to enable vectorized hessian. Defaults to True.
         H_tfm (Callable | None, optional):
            optional hessian transforms, takes in two arguments - `(hessian, gradient)`.
 
@@ -174,6 +128,21 @@ class Newton(Module):
         eigval_fn (Callable | None, optional):
             optional eigenvalues transform, for example ``torch.abs`` or ``lambda L: torch.clip(L, min=1e-8)``.
             If this is specified, eigendecomposition will be used to invert the hessian.
+        hessian_method (str):
+            Determines how hessian is computed.
+
+            - ``"batched_autograd"`` - uses autograd to compute ``ndim`` batched hessian-vector products. Faster than ``"autograd"`` but uses more memory.
+            - ``"autograd"`` - uses autograd to compute ``ndim`` hessian-vector products using for loop. Slower than ``"batched_autograd"`` but uses less memory.
+            - ``"functional_revrev"`` - uses ``torch.autograd.functional`` with "reverse-over-reverse" strategy and a for-loop. This is generally equivalent to ``"autograd"``.
+            - ``"functional_fwdrev"`` - uses ``torch.autograd.functional`` with vectorized "forward-over-reverse" strategy. Faster than ``"functional_fwdrev"`` but uses more memory (``"batched_autograd"`` seems to be faster)
+            - ``"func"`` - uses ``torch.func.hessian`` which uses "forward-over-reverse" strategy. This method is the fastest and is recommended, however it is more restrictive and fails with some operators which is why it isn't the default.
+            - ``"gfd_forward"`` - computes ``ndim`` hessian-vector products via gradient finite difference using a less accurate forward formula which requires one extra gradient evaluation per hessian-vector product.
+            - ``"gfd_central"`` - computes ``ndim`` hessian-vector products via gradient finite difference using a more accurate central formula which requires two gradient evaluations per hessian-vector product.
+            - ``"fd"`` - uses function values to estimate gradient and hessian via finite difference. This uses less evaluations than chaining ``"gfd_*"`` after ``tz.m.FDM``.
+
+            Defaults to ``"batched_autograd"``.
+        h (float, optional):
+            finite difference step size for "fd_forward" and "fd_central".
         inner (Chainable | None, optional): modules to apply hessian preconditioner to. Defaults to None.
 
     # See also
@@ -249,45 +218,43 @@ class Newton(Module):
         damping: float = 0,
         use_lstsq: bool = False,
         update_freq: int = 1,
-        hessian_method: Literal["autograd", "func", "autograd.functional"] = "autograd",
-        vectorize: bool = True,
         H_tfm: Callable[[torch.Tensor, torch.Tensor], tuple[torch.Tensor, bool]] | Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
         eigval_fn: Callable[[torch.Tensor], torch.Tensor] | None = None,
+        hessian_method: HessianMethod = "batched_autograd",
+        h: float = 1e-3,
         inner: Chainable | None = None,
     ):
-        defaults =
-
-
-        if inner is not None:
-            self.set_child('inner', inner)
+        defaults = locals().copy()
+        del defaults['self'], defaults['update_freq'], defaults["inner"]
+        super().__init__(defaults, update_freq=update_freq, inner=inner)
 
     @torch.no_grad
-    def
-
-        self.global_state['step'] = step + 1
+    def update_states(self, objective, states, settings):
+        fs = settings[0]
 
-
-
-
-
+        _, _, self.global_state['H'] = objective.hessian(
+            hessian_method=fs['hessian_method'],
+            h=fs['h'],
+            at_x0=True
+        )
 
     @torch.no_grad
-    def
-        params =
+    def apply_states(self, objective, states, settings):
+        params = objective.params
+        fs = settings[0]
+
         update = _newton_step(
-
+            objective=objective,
            H = self.global_state["H"],
-            damping=
-
-
-
-            use_lstsq=self.defaults["use_lstsq"],
+            damping = fs["damping"],
+            H_tfm = fs["H_tfm"],
+            eigval_fn = fs["eigval_fn"],
+            use_lstsq = fs["use_lstsq"],
         )
 
-
-
-        return var
+        objective.updates = vec_to_tensors(update, params)
+        return objective
 
-    def get_H(self,
+    def get_H(self,objective=...):
        return _get_H(self.global_state["H"], self.defaults["eigval_fn"])
 
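To see the reworked options end to end, here is a hedged usage sketch: the `Newton` arguments are taken from the diff above, while `tz.Modular`, the `tz.m` namespace, and the `closure(backward)` convention are assumed to carry over from earlier torchzero releases (the docstring itself refers to `tz.m.FDM`).

# Hedged usage sketch. Newton's arguments come from the diff above; tz.Modular,
# tz.m and the closure(backward) convention are assumed from earlier releases.
import torch
import torchzero as tz

model = torch.nn.Linear(8, 1)
X, y = torch.randn(64, 8), torch.randn(64, 1)

opt = tz.Modular(
    model.parameters(),
    tz.m.Newton(hessian_method="batched_autograd", eigval_fn=torch.abs),
)

def closure(backward=True):
    loss = (model(X) - y).square().mean()
    if backward:
        opt.zero_grad()
        loss.backward()
    return loss

for _ in range(20):
    opt.step(closure)

The `h` argument should only matter for the finite-difference hessian methods; the autograd-based backends listed in the new docstring compute exact hessian-vector products and do not use a step size.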