torchzero 0.3.15__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in the public registry.
- tests/test_identical.py +22 -22
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +225 -214
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +2 -2
- torchzero/core/__init__.py +7 -4
- torchzero/core/chain.py +20 -23
- torchzero/core/functional.py +90 -24
- torchzero/core/modular.py +53 -57
- torchzero/core/module.py +132 -52
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +55 -24
- torchzero/core/transform.py +261 -367
- torchzero/linalg/__init__.py +11 -0
- torchzero/linalg/eigh.py +253 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +93 -0
- torchzero/{utils/linalg → linalg}/qr.py +16 -2
- torchzero/{utils/linalg → linalg}/solve.py +74 -88
- torchzero/linalg/svd.py +47 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/__init__.py +4 -3
- torchzero/modules/adaptive/__init__.py +11 -3
- torchzero/modules/adaptive/adagrad.py +167 -217
- torchzero/modules/adaptive/adahessian.py +76 -105
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +50 -31
- torchzero/modules/adaptive/adaptive_heavyball.py +12 -7
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/ggt.py +186 -0
- torchzero/modules/adaptive/lion.py +7 -11
- torchzero/modules/adaptive/lre_optimizers.py +299 -0
- torchzero/modules/adaptive/mars.py +7 -7
- torchzero/modules/adaptive/matrix_momentum.py +48 -52
- torchzero/modules/adaptive/msam.py +71 -53
- torchzero/modules/adaptive/muon.py +67 -129
- torchzero/modules/adaptive/natural_gradient.py +63 -41
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/psgd/__init__.py +5 -0
- torchzero/modules/adaptive/psgd/_psgd_utils.py +37 -0
- torchzero/modules/adaptive/psgd/psgd.py +1390 -0
- torchzero/modules/adaptive/psgd/psgd_dense_newton.py +174 -0
- torchzero/modules/adaptive/psgd/psgd_kron_newton.py +203 -0
- torchzero/modules/adaptive/psgd/psgd_kron_whiten.py +185 -0
- torchzero/modules/adaptive/psgd/psgd_lra_newton.py +118 -0
- torchzero/modules/adaptive/psgd/psgd_lra_whiten.py +116 -0
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +149 -130
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +22 -25
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +27 -38
- torchzero/modules/experimental/__init__.py +7 -6
- torchzero/modules/experimental/adanystrom.py +258 -0
- torchzero/modules/experimental/common_directions_whiten.py +142 -0
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/cubic_adam.py +160 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/eigen_sr1.py +182 -0
- torchzero/modules/experimental/eigengrad.py +207 -0
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/higher_order_newton.py +14 -40
- torchzero/modules/experimental/l_infinity.py +1 -1
- torchzero/modules/experimental/matrix_nag.py +122 -0
- torchzero/modules/experimental/newton_solver.py +23 -54
- torchzero/modules/experimental/newtonnewton.py +45 -48
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +3 -3
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/grad_approximation/fdm.py +2 -2
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +24 -21
- torchzero/modules/least_squares/gn.py +121 -50
- torchzero/modules/line_search/backtracking.py +4 -4
- torchzero/modules/line_search/line_search.py +33 -33
- torchzero/modules/line_search/strong_wolfe.py +4 -4
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +11 -79
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +121 -123
- torchzero/modules/misc/multistep.py +52 -53
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +31 -29
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +37 -31
- torchzero/modules/momentum/momentum.py +12 -12
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +20 -20
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/{functional.py → opt_utils.py} +1 -1
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +46 -43
- torchzero/modules/quasi_newton/__init__.py +1 -1
- torchzero/modules/quasi_newton/damping.py +2 -2
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +10 -10
- torchzero/modules/quasi_newton/lsr1.py +10 -10
- torchzero/modules/quasi_newton/quasi_newton.py +54 -39
- torchzero/modules/quasi_newton/sg2.py +69 -205
- torchzero/modules/restarts/restars.py +39 -37
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/ifn.py +31 -62
- torchzero/modules/second_order/inm.py +57 -53
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +165 -196
- torchzero/modules/second_order/newton_cg.py +105 -157
- torchzero/modules/second_order/nystrom.py +216 -185
- torchzero/modules/second_order/rsn.py +132 -125
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +10 -10
- torchzero/modules/step_size/adaptive.py +24 -24
- torchzero/modules/step_size/lr.py +17 -17
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +3 -3
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +2 -2
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +23 -21
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +17 -18
- torchzero/modules/wrappers/optim_wrapper.py +14 -14
- torchzero/modules/zeroth_order/cd.py +10 -7
- torchzero/optim/mbs.py +291 -0
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -13
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +8 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/benchmarks/__init__.py +0 -0
- torchzero/utils/benchmarks/logistic.py +122 -0
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +97 -73
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/METADATA +1 -1
- torchzero-0.4.1.dist-info/RECORD +209 -0
- tests/test_vars.py +0 -185
- torchzero/core/var.py +0 -376
- torchzero/modules/adaptive/lmadagrad.py +0 -186
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.15.dist-info/RECORD +0 -175
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/WHEEL +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/top_level.txt +0 -0
torchzero/modules/restarts/restars.py:

@@ -4,12 +4,14 @@ from typing import final, Literal, cast
 
 import torch
 
-from ...core import Chainable, Module,
+from ...core import Chainable, Module, Objective
 from ...utils import TensorList
 from ..termination import TerminationCriteriaBase
 
-def _reset_except_self(
-    for m in
+def _reset_except_self(objective, modules, self: Module):
+    for m in modules:
+        if m is not self:
+            m.reset()
 
 class RestartStrategyBase(Module, ABC):
     """Base class for restart strategies.

@@ -24,38 +26,38 @@ class RestartStrategyBase(Module, ABC):
         self.set_child('modules', modules)
 
     @abstractmethod
-    def should_reset(self,
+    def should_reset(self, objective: Objective) -> bool:
         """returns whether reset should occur"""
 
-    def _reset_on_condition(self,
+    def _reset_on_condition(self, objective: Objective):
         modules = self.children.get('modules', None)
 
-        if self.should_reset(
+        if self.should_reset(objective):
             if modules is None:
-
+                objective.post_step_hooks.append(partial(_reset_except_self, self=self))
             else:
                 modules.reset()
 
         return modules
 
     @final
-    def update(self,
-        modules = self._reset_on_condition(
+    def update(self, objective):
+        modules = self._reset_on_condition(objective)
         if modules is not None:
-            modules.update(
+            modules.update(objective)
 
     @final
-    def apply(self,
+    def apply(self, objective):
         # don't check here because it was check in `update`
         modules = self.children.get('modules', None)
-        if modules is None: return
-        return modules.apply(
+        if modules is None: return objective
+        return modules.apply(objective.clone(clone_updates=False))
 
     @final
-    def step(self,
-        modules = self._reset_on_condition(
-        if modules is None: return
-        return modules.step(
+    def step(self, objective):
+        modules = self._reset_on_condition(objective)
+        if modules is None: return objective
+        return modules.step(objective.clone(clone_updates=False))
 
 
 

@@ -76,11 +78,11 @@ class RestartOnStuck(RestartStrategyBase):
         super().__init__(defaults, modules)
 
     @torch.no_grad
-    def should_reset(self,
+    def should_reset(self, objective):
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1
 
-        params = TensorList(
+        params = TensorList(objective.params)
         tol = self.defaults['tol']
         if tol is None: tol = torch.finfo(params[0].dtype).tiny * 2
         n_tol = self.defaults['n_tol']

@@ -122,12 +124,12 @@ class RestartEvery(RestartStrategyBase):
         defaults = dict(steps=steps)
         super().__init__(defaults, modules)
 
-    def should_reset(self,
+    def should_reset(self, objective):
         step = self.global_state.get('step', 0) + 1
         self.global_state['step'] = step
 
         n = self.defaults['steps']
-        if isinstance(n, str): n = sum(p.numel() for p in
+        if isinstance(n, str): n = sum(p.numel() for p in objective.params if p.requires_grad)
 
         # reset every n steps
         if step % n == 0:

@@ -141,9 +143,9 @@ class RestartOnTerminationCriteria(RestartStrategyBase):
         super().__init__(None, modules)
         self.set_child('criteria', criteria)
 
-    def should_reset(self,
+    def should_reset(self, objective):
         criteria = cast(TerminationCriteriaBase, self.children['criteria'])
-        return criteria.should_terminate(
+        return criteria.should_terminate(objective)
 
 class PowellRestart(RestartStrategyBase):
     """Powell's two restarting criterions for conjugate gradient methods.

@@ -169,14 +171,14 @@ class PowellRestart(RestartStrategyBase):
         defaults=dict(cond1=cond1, cond2=cond2)
         super().__init__(defaults, modules)
 
-    def should_reset(self,
-        g = TensorList(
+    def should_reset(self, objective):
+        g = TensorList(objective.get_grads())
         cond1 = self.defaults['cond1']; cond2 = self.defaults['cond2']
 
         # -------------------------------- initialize -------------------------------- #
         if 'initialized' not in self.global_state:
             self.global_state['initialized'] = 0
-            g_prev = self.get_state(
+            g_prev = self.get_state(objective.params, 'g_prev', init=g)
             return False
 
         g_g = g.dot(g)

@@ -184,7 +186,7 @@ class PowellRestart(RestartStrategyBase):
         reset = False
         # ------------------------------- 1st condition ------------------------------ #
         if cond1 is not None:
-            g_prev = self.get_state(
+            g_prev = self.get_state(objective.params, 'g_prev', must_exist=True, cls=TensorList)
             g_g_prev = g_prev.dot(g)
 
             if g_g_prev.abs() >= cond1 * g_g:

@@ -192,7 +194,7 @@ class PowellRestart(RestartStrategyBase):
 
         # ------------------------------- 2nd condition ------------------------------ #
         if (cond2 is not None) and (not reset):
-            d_g = TensorList(
+            d_g = TensorList(objective.get_updates()).dot(g)
             if (-1-cond2) * g_g < d_g < (-1 + cond2) * g_g:
                 reset = True
 

@@ -229,17 +231,17 @@ class BirginMartinezRestart(Module):
 
         self.set_child("module", module)
 
-    def update(self,
+    def update(self, objective):
         module = self.children['module']
-        module.update(
+        module.update(objective)
 
-    def apply(self,
+    def apply(self, objective):
         module = self.children['module']
-
+        objective = module.apply(objective.clone(clone_updates=False))
 
         cond = self.defaults['cond']
-        g = TensorList(
-        d = TensorList(
+        g = TensorList(objective.get_grads())
+        d = TensorList(objective.get_updates())
         d_g = d.dot(g)
         d_norm = d.global_vector_norm()
         g_norm = g.global_vector_norm()

@@ -247,7 +249,7 @@ class BirginMartinezRestart(Module):
         # d in our case is same direction as g so it has a minus sign
         if -d_g > -cond * d_norm * g_norm:
             module.reset()
-
-            return
+            objective.updates = g.clone()
+            return objective
 
-        return
+        return objective
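The hunks above capture the 0.4.x migration in which restart strategies receive an `Objective` instead of the removed `Var` (`torchzero/core/var.py` is deleted and `torchzero/core/objective.py` added in the file list). As a rough illustration, here is a minimal sketch of a custom restart criterion written against the new `should_reset(objective)` signature. Only `RestartStrategyBase`, `Objective`, `TensorList`, and the accessors visible in the diff are taken from the source; the import paths, class name, and the gradient-norm heuristic itself are assumptions.

```python
# Hypothetical sketch against the 0.4.x Objective-based API shown above.
# RestartStrategyBase, Objective, TensorList, objective.get_grads(),
# global_vector_norm() and self.global_state all appear in the diff;
# the import paths and the criterion are assumptions for illustration.
import torch

from torchzero.core import Objective
from torchzero.utils import TensorList
from torchzero.modules.restarts.restars import RestartStrategyBase


class RestartOnGradNormIncrease(RestartStrategyBase):
    """Reset the wrapped modules whenever the gradient norm grows."""

    def __init__(self, modules=None):
        super().__init__(None, modules)

    @torch.no_grad
    def should_reset(self, objective: Objective) -> bool:
        g_norm = TensorList(objective.get_grads()).global_vector_norm()
        prev = self.global_state.get('g_norm_prev', None)
        self.global_state['g_norm_prev'] = g_norm
        return prev is not None and bool(g_norm > prev)
```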
torchzero/modules/second_order/__init__.py:

@@ -1,7 +1,7 @@
 from .ifn import InverseFreeNewton
-from .inm import
+from .inm import ImprovedNewton
 from .multipoint import SixthOrder3P, SixthOrder3PM2, SixthOrder5P, TwoPointNewton
 from .newton import Newton
 from .newton_cg import NewtonCG, NewtonCGSteihaug
 from .nystrom import NystromPCG, NystromSketchAndSolve
-from .rsn import
+from .rsn import SubspaceNewton
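For downstream code the practical effect of this hunk is a rename of the classes exported by `torchzero.modules.second_order`. A minimal import sketch under the new names (assuming the subpackage is imported directly, as its `__init__.py` above suggests):

```python
# 0.4.1 export names from the __init__.py hunk above; the truncated old
# exports from .inm and .rsn (e.g. the INM class visible in a later hunk
# header) are no longer available.
from torchzero.modules.second_order import (
    ImprovedNewton,    # exported from .inm under a new name
    SubspaceNewton,    # exported from .rsn under a new name
    InverseFreeNewton,
    Newton,
    NewtonCG,
)
```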
torchzero/modules/second_order/ifn.py:

@@ -1,89 +1,58 @@
-import warnings
-from collections.abc import Callable
-from functools import partial
-from typing import Literal
-
 import torch
 
-from ...core import Chainable,
+from ...core import Chainable, Transform, HessianMethod
 from ...utils import TensorList, vec_to_tensors
-from ...
-from .newton import _get_H, _get_loss_grad_and_hessian, _newton_step
+from ...linalg.linear_operator import DenseWithInverse
 
 
-class InverseFreeNewton(
+class InverseFreeNewton(Transform):
     """Inverse-free newton's method
 
-    .. note::
-        In most cases Newton should be the first module in the chain because it relies on autograd. Use the :code:`inner` argument if you wish to apply Newton preconditioning to another module's output.
-
-    .. note::
-        This module requires the a closure passed to the optimizer step,
-        as it needs to re-evaluate the loss and gradients for calculating the hessian.
-        The closure must accept a ``backward`` argument (refer to documentation).
-
-    .. warning::
-        this uses roughly O(N^2) memory.
-
     Reference
         [Massalski, Marcin, and Magdalena Nockowska-Rosiak. "INVERSE-FREE NEWTON'S METHOD." Journal of Applied Analysis & Computation 15.4 (2025): 2238-2257.](https://www.jaac-online.com/article/doi/10.11948/20240428)
     """
     def __init__(
         self,
         update_freq: int = 1,
-        hessian_method:
-
+        hessian_method: HessianMethod = "batched_autograd",
+        h: float = 1e-3,
         inner: Chainable | None = None,
     ):
-        defaults = dict(hessian_method=hessian_method,
-        super().__init__(defaults)
-
-        if inner is not None:
-            self.set_child('inner', inner)
+        defaults = dict(hessian_method=hessian_method, h=h)
+        super().__init__(defaults, update_freq=update_freq, inner=inner)
 
     @torch.no_grad
-    def
-
+    def update_states(self, objective, states, settings):
+        fs = settings[0]
 
-
-
+        _, _, H = objective.hessian(
+            hessian_method=fs['hessian_method'],
+            h=fs['h'],
+            at_x0=True
+        )
 
-
-        loss, g_list, H = _get_loss_grad_and_hessian(
-            var, self.defaults['hessian_method'], self.defaults['vectorize']
-        )
-        self.global_state["H"] = H
+        self.global_state["H"] = H
 
-
-
-
-
+        # inverse free part
+        if 'Y' not in self.global_state:
+            num = H.T
+            denom = (torch.linalg.norm(H, 1) * torch.linalg.norm(H, float('inf'))) # pylint:disable=not-callable
 
-
-
+            finfo = torch.finfo(H.dtype)
+            self.global_state['Y'] = num.div_(denom.clip(min=finfo.tiny * 2, max=finfo.max / 2))
 
-
-
-
-
-
+        else:
+            Y = self.global_state['Y']
+            I2 = torch.eye(Y.size(0), device=Y.device, dtype=Y.dtype).mul_(2)
+            I2 -= H @ Y
+            self.global_state['Y'] = Y @ I2
 
 
-    def
+    def apply_states(self, objective, states, settings):
         Y = self.global_state["Y"]
-
-
-
-        update = var.get_update()
-        if 'inner' in self.children:
-            update = apply_transform(self.children['inner'], update, params=params, grads=var.grad, var=var)
-
-        g = torch.cat([t.ravel() for t in update])
-
-        # ----------------------------------- solve ---------------------------------- #
-        var.update = vec_to_tensors(Y@g, params)
-
-        return var
+        g = torch.cat([t.ravel() for t in objective.get_updates()])
+        objective.updates = vec_to_tensors(Y@g, objective.params)
+        return objective
 
-    def get_H(self,
+    def get_H(self,objective=...):
         return DenseWithInverse(A = self.global_state["H"], A_inv=self.global_state["Y"])
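The rewritten `update_states` above maintains an explicit inverse estimate `Y`: it is seeded as `H.T / (||H||_1 * ||H||_inf)` and then refined with `Y <- Y (2I - H Y)`, the Newton-Schulz iteration, so the inverse Hessian is approximated without ever calling a solver. A small self-contained check of that recurrence in plain PyTorch (independent of torchzero; the SPD test matrix and iteration count are arbitrary choices for this sketch):

```python
# Standalone check of the inverse-free recurrence used in update_states:
# Y_0 = H^T / (||H||_1 * ||H||_inf),  Y_{k+1} = Y_k (2I - H Y_k)  ->  H^{-1}
import torch

torch.manual_seed(0)
n = 5
A = torch.randn(n, n, dtype=torch.float64)
H = A @ A.T + torch.eye(n, dtype=torch.float64)   # SPD stand-in for a Hessian

# same initialization as the diff: transpose scaled by the 1-norm * inf-norm
Y = H.T / (torch.linalg.norm(H, 1) * torch.linalg.norm(H, float('inf')))
I = torch.eye(n, dtype=torch.float64)
for _ in range(50):
    Y = Y @ (2 * I - H @ Y)                        # Newton-Schulz step

print(torch.allclose(Y, torch.linalg.inv(H), atol=1e-8))   # True
```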
torchzero/modules/second_order/inm.py:

@@ -1,12 +1,11 @@
 from collections.abc import Callable
-from typing import Literal
 
 import torch
 
-from ...core import Chainable,
-from ...utils import TensorList,
-from ..
-from .newton import
+from ...core import Chainable, Transform, HessianMethod
+from ...utils import TensorList, vec_to_tensors_, unpack_states
+from ..opt_utils import safe_clip
+from .newton import _newton_update_state_, _newton_solve, _newton_get_H
 
 @torch.no_grad
 def inm(f:torch.Tensor, J:torch.Tensor, s:torch.Tensor, y:torch.Tensor):

@@ -25,7 +24,7 @@ def _eigval_fn(J: torch.Tensor, fn) -> torch.Tensor:
     L, Q = torch.linalg.eigh(J) # pylint:disable=not-callable
     return (Q * L.unsqueeze(-2)) @ Q.mH
 
-class
+class ImprovedNewton(Transform):
     """Improved Newton's Method (INM).
 
     Reference:

@@ -35,71 +34,76 @@ class INM(Module):
     def __init__(
         self,
         damping: float = 0,
-
+        eigval_fn: Callable[[torch.Tensor], torch.Tensor] | None = None,
         update_freq: int = 1,
-
-
+        precompute_inverse: bool | None = None,
+        use_lstsq: bool = False,
+        hessian_method: HessianMethod = "batched_autograd",
+        h: float = 1e-3,
         inner: Chainable | None = None,
-        H_tfm: Callable[[torch.Tensor, torch.Tensor], tuple[torch.Tensor, bool]] | Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
-        eigval_fn: Callable[[torch.Tensor], torch.Tensor] | None = None,
     ):
-        defaults =
-
-
-        if inner is not None:
-            self.set_child("inner", inner)
+        defaults = locals().copy()
+        del defaults['self'], defaults['inner'], defaults["update_freq"]
+        super().__init__(defaults, update_freq=update_freq, inner=inner, )
 
     @torch.no_grad
-    def
-
-
-        step = self.global_state.get('step', 0)
-        self.global_state['step'] = step + 1
+    def update_states(self, objective, states, settings):
+        fs = settings[0]
 
-
-
-
-
+        _, f_list, J = objective.hessian(
+            hessian_method=fs['hessian_method'],
+            h=fs['h'],
+            at_x0=True
+        )
+        if f_list is None: f_list = objective.get_grads()
 
-
-
+        f = torch.cat([t.ravel() for t in f_list])
+        J = _eigval_fn(J, fs["eigval_fn"])
 
-
-
-
+        x_list = TensorList(objective.params)
+        f_list = TensorList(objective.get_grads())
+        x_prev, f_prev = unpack_states(states, objective.params, "x_prev", "f_prev", cls=TensorList)
 
-
-
-
-
-
-            return
+        # initialize on 1st step, do Newton step
+        if "H" not in self.global_state:
+            x_prev.copy_(x_list)
+            f_prev.copy_(f_list)
+            P = J
 
-
+        # INM update
+        else:
             s_list = x_list - x_prev
             y_list = f_list - f_prev
             x_prev.copy_(x_list)
             f_prev.copy_(f_list)
 
-
+            P = inm(f, J, s=s_list.to_vec(), y=y_list.to_vec())
 
+        # update state
+        precompute_inverse = fs["precompute_inverse"]
+        if precompute_inverse is None:
+            precompute_inverse = fs["__update_freq"] >= 10
 
-
-
-
-
-
-
-
-            inner=self.children.get("inner", None),
-            H_tfm=self.defaults["H_tfm"],
-            eigval_fn=None, # it is applied in `update`
-            use_lstsq=self.defaults["use_lstsq"],
+        _newton_update_state_(
+            H=P,
+            state = self.global_state,
+            damping = fs["damping"],
+            eigval_fn = fs["eigval_fn"],
+            precompute_inverse = precompute_inverse,
+            use_lstsq = fs["use_lstsq"]
         )
 
-
+    @torch.no_grad
+    def apply_states(self, objective, states, settings):
+        updates = objective.get_updates()
+        fs = settings[0]
+
+        b = torch.cat([t.ravel() for t in updates])
+        sol = _newton_solve(b=b, state=self.global_state, use_lstsq=fs["use_lstsq"])
+
+        vec_to_tensors_(sol, updates)
+        return objective
 
-        return var
 
-    def get_H(self,
-        return
+    def get_H(self,objective=...):
+        return _newton_get_H(self.global_state)
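The `ImprovedNewton` constructor now takes its options as plain keyword arguments (captured via `locals()`), and `precompute_inverse=None` is resolved at update time from the transform's `update_freq` (the `fs["__update_freq"] >= 10` check above). A constructor sketch with the keyword names taken directly from the hunk; the values are just the diff's defaults, the trailing comments are my reading of them, and wiring the module into an optimizer is left to the library's modular interface:

```python
# Keyword names and defaults come from the __init__ signature in the hunk
# above; this only constructs the module and does not claim a full setup.
from torchzero.modules.second_order import ImprovedNewton

module = ImprovedNewton(
    damping=0.0,
    eigval_fn=None,            # optional transform applied to the eigenvalues
    update_freq=1,             # how often update_states recomputes the Hessian
    precompute_inverse=None,   # None -> enabled automatically when update_freq >= 10
    use_lstsq=False,           # fall back to least-squares solves if True
    hessian_method="batched_autograd",
    h=1e-3,                    # presumably a finite-difference step for non-autograd hessian_method values
)
```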
torchzero/modules/second_order/multipoint.py:

@@ -1,19 +1,17 @@
-from collections.abc import Callable
-from contextlib import nullcontext
 from abc import ABC, abstractmethod
+from collections.abc import Callable, Mapping
+from typing import Any
+
 import numpy as np
 import torch
 
-from ...core import Chainable,
-from ...utils import TensorList, vec_to_tensors
-
-    flatten_jacobian,
-    jacobian_wrt,
-)
+from ...core import Chainable, DerivativesMethod, Objective, Transform
+from ...utils import TensorList, vec_to_tensors
+
 
-class HigherOrderMethodBase(
-    def __init__(self, defaults: dict | None = None,
-        self.
+class HigherOrderMethodBase(Transform, ABC):
+    def __init__(self, defaults: dict | None = None, derivatives_method: DerivativesMethod = 'batched_autograd'):
+        self._derivatives_method: DerivativesMethod = derivatives_method
         super().__init__(defaults)
 
     @abstractmethod

@@ -21,61 +19,27 @@ class HigherOrderMethodBase(Module, ABC):
         self,
         x: torch.Tensor,
         evaluate: Callable[[torch.Tensor, int], tuple[torch.Tensor, ...]],
-
+        objective: Objective,
+        setting: Mapping[str, Any],
     ) -> torch.Tensor:
         """"""
 
     @torch.no_grad
-    def
-        params = TensorList(
-
-        closure =
+    def apply_states(self, objective, states, settings):
+        params = TensorList(objective.params)
+
+        closure = objective.closure
         if closure is None: raise RuntimeError('MultipointNewton requires closure')
-
+        derivatives_method = self._derivatives_method
 
         def evaluate(x, order) -> tuple[torch.Tensor, ...]:
            """order=0 - returns (loss,), order=1 - returns (loss, grad), order=2 - returns (loss, grad, hessian), etc."""
-
-
-            if order == 0:
-                loss = closure(False)
-                params.copy_(x0)
-                return (loss, )
-
-            if order == 1:
-                with torch.enable_grad():
-                    loss = closure()
-                    grad = [p.grad if p.grad is not None else torch.zeros_like(p) for p in params]
-                params.copy_(x0)
-                return loss, torch.cat([g.ravel() for g in grad])
-
-            with torch.enable_grad():
-                loss = var.loss = var.loss_approx = closure(False)
-
-                g_list = torch.autograd.grad(loss, params, create_graph=True)
-                var.grad = list(g_list)
-
-                g = torch.cat([t.ravel() for t in g_list])
-                n = g.numel()
-                ret = [loss, g]
-                T = g # current derivatives tensor
-
-                # get all derivative up to order
-                for o in range(2, order + 1):
-                    is_last = o == order
-                    T_list = jacobian_wrt([T], params, create_graph=not is_last, batched=vectorize)
-                    with torch.no_grad() if is_last else nullcontext():
-                        # the shape is (ndim, ) * order
-                        T = flatten_jacobian(T_list).view(n, n, *T.shape[1:])
-                    ret.append(T)
-
-            params.copy_(x0)
-            return tuple(ret)
+            return objective.derivatives_at(x, order, method=derivatives_method)
 
         x = torch.cat([p.ravel() for p in params])
-        dir = self.one_iteration(x, evaluate,
-
-        return
+        dir = self.one_iteration(x, evaluate, objective, settings[0])
+        objective.updates = vec_to_tensors(dir, objective.params)
+        return objective
 
 def _inv(A: torch.Tensor, lstsq:bool) -> torch.Tensor:
     if lstsq: return torch.linalg.pinv(A) # pylint:disable=not-callable

@@ -106,16 +70,15 @@ class SixthOrder3P(HigherOrderMethodBase):
 
     Abro, Hameer Akhtar, and Muhammad Mujtaba Shaikh. "A new time-efficient and convergent nonlinear solver." Applied Mathematics and Computation 355 (2019): 516-536.
     """
-    def __init__(self, lstsq: bool=False,
+    def __init__(self, lstsq: bool=False, derivatives_method: DerivativesMethod = 'batched_autograd'):
         defaults=dict(lstsq=lstsq)
-        super().__init__(defaults=defaults,
+        super().__init__(defaults=defaults, derivatives_method=derivatives_method)
 
-
-
-        lstsq = settings['lstsq']
+    @torch.no_grad
+    def one_iteration(self, x, evaluate, objective, setting):
         def f(x): return evaluate(x, 1)[1]
         def f_j(x): return evaluate(x, 2)[1:]
-        x_star = sixth_order_3p(x, f, f_j, lstsq)
+        x_star = sixth_order_3p(x, f, f_j, setting['lstsq'])
         return x - x_star
 
 # I don't think it works (I tested root finding with this and it goes all over the place)

@@ -173,15 +136,14 @@ def sixth_order_5p(x:torch.Tensor, f_j, lstsq:bool=False):
 
 class SixthOrder5P(HigherOrderMethodBase):
     """Argyros, Ioannis K., et al. "Extended convergence for two sixth order methods under the same weak conditions." Foundations 3.1 (2023): 127-139."""
-    def __init__(self, lstsq: bool=False,
+    def __init__(self, lstsq: bool=False, derivatives_method: DerivativesMethod = 'batched_autograd'):
         defaults=dict(lstsq=lstsq)
-        super().__init__(defaults=defaults,
+        super().__init__(defaults=defaults, derivatives_method=derivatives_method)
 
-
-
-        lstsq = settings['lstsq']
+    @torch.no_grad
+    def one_iteration(self, x, evaluate, objective, setting):
         def f_j(x): return evaluate(x, 2)[1:]
-        x_star = sixth_order_5p(x, f_j, lstsq)
+        x_star = sixth_order_5p(x, f_j, setting['lstsq'])
         return x - x_star
 
 # 2f 1J 2 solves

@@ -196,16 +158,15 @@ class TwoPointNewton(HigherOrderMethodBase):
     """two-point Newton method with frozen derivative with third order convergence.
 
     Sharma, Janak Raj, and Deepak Kumar. "A fast and efficient composite Newton–Chebyshev method for systems of nonlinear equations." Journal of Complexity 49 (2018): 56-73."""
-    def __init__(self, lstsq: bool=False,
+    def __init__(self, lstsq: bool=False, derivatives_method: DerivativesMethod = 'batched_autograd'):
         defaults=dict(lstsq=lstsq)
-        super().__init__(defaults=defaults,
+        super().__init__(defaults=defaults, derivatives_method=derivatives_method)
 
-
-
-        lstsq = settings['lstsq']
+    @torch.no_grad
+    def one_iteration(self, x, evaluate, objective, setting):
         def f(x): return evaluate(x, 1)[1]
         def f_j(x): return evaluate(x, 2)[1:]
-        x_star = two_point_newton(x, f, f_j, lstsq)
+        x_star = two_point_newton(x, f, f_j, setting['lstsq'])
         return x - x_star
 
 #3f 2J 1inv

@@ -224,15 +185,14 @@ def sixth_order_3pm2(x:torch.Tensor, f, f_j, lstsq:bool=False):
 
 class SixthOrder3PM2(HigherOrderMethodBase):
     """Wang, Xiaofeng, and Yang Li. "An efficient sixth-order Newton-type method for solving nonlinear systems." Algorithms 10.2 (2017): 45."""
-    def __init__(self, lstsq: bool=False,
+    def __init__(self, lstsq: bool=False, derivatives_method: DerivativesMethod = 'batched_autograd'):
         defaults=dict(lstsq=lstsq)
-        super().__init__(defaults=defaults,
+        super().__init__(defaults=defaults, derivatives_method=derivatives_method)
 
-
-
-        lstsq = settings['lstsq']
+    @torch.no_grad
+    def one_iteration(self, x, evaluate, objective, setting):
         def f_j(x): return evaluate(x, 2)[1:]
         def f(x): return evaluate(x, 1)[1]
-        x_star = sixth_order_3pm2(x, f, f_j, lstsq)
+        x_star = sixth_order_3pm2(x, f, f_j, setting['lstsq'])
         return x - x_star
 
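The refactored `HigherOrderMethodBase` now hands each subclass a flat parameter vector `x`, an `evaluate(x, order)` callback (order=1 returns `(loss, grad)`, order=2 adds the Hessian, per the docstring kept in the hunk), the `Objective`, and its settings mapping, and expects `one_iteration` to return the update direction `x - x_star`. A hypothetical subclass sketch in that style, a plain Newton step: the `one_iteration` signature, the `evaluate()` semantics, and the `_inv` helper come from the hunks above, while the class itself is made up for illustration.

```python
# Hypothetical subclass of the refactored HigherOrderMethodBase; only the
# signatures and helpers visible in the diff are assumed to exist.
import torch

from torchzero.modules.second_order.multipoint import HigherOrderMethodBase, _inv


class PlainNewton(HigherOrderMethodBase):
    """A single undamped Newton step phrased as a higher-order method."""

    def __init__(self, lstsq: bool = False, derivatives_method='batched_autograd'):
        super().__init__(defaults=dict(lstsq=lstsq), derivatives_method=derivatives_method)

    @torch.no_grad
    def one_iteration(self, x, evaluate, objective, setting):
        _, g, H = evaluate(x, 2)                    # loss, flat gradient, dense Hessian
        x_star = x - _inv(H, setting['lstsq']) @ g  # Newton iterate
        return x - x_star                           # base class expects the update direction
```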