torchzero 0.3.11__py3-none-any.whl → 0.3.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_opts.py +95 -76
- tests/test_tensorlist.py +8 -7
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +2 -2
- torchzero/core/module.py +229 -72
- torchzero/core/reformulation.py +65 -0
- torchzero/core/transform.py +44 -24
- torchzero/modules/__init__.py +13 -5
- torchzero/modules/{optimizers → adaptive}/__init__.py +5 -2
- torchzero/modules/adaptive/adagrad.py +356 -0
- torchzero/modules/{optimizers → adaptive}/adahessian.py +53 -52
- torchzero/modules/{optimizers → adaptive}/adam.py +0 -3
- torchzero/modules/{optimizers → adaptive}/adan.py +26 -40
- torchzero/modules/{optimizers → adaptive}/adaptive_heavyball.py +3 -6
- torchzero/modules/adaptive/aegd.py +54 -0
- torchzero/modules/{optimizers → adaptive}/esgd.py +1 -1
- torchzero/modules/{optimizers/ladagrad.py → adaptive/lmadagrad.py} +42 -39
- torchzero/modules/{optimizers → adaptive}/mars.py +24 -36
- torchzero/modules/adaptive/matrix_momentum.py +146 -0
- torchzero/modules/{optimizers → adaptive}/msam.py +14 -12
- torchzero/modules/{optimizers → adaptive}/muon.py +19 -20
- torchzero/modules/adaptive/natural_gradient.py +175 -0
- torchzero/modules/{optimizers → adaptive}/rprop.py +0 -2
- torchzero/modules/{optimizers → adaptive}/sam.py +1 -1
- torchzero/modules/{optimizers → adaptive}/shampoo.py +8 -4
- torchzero/modules/{optimizers → adaptive}/soap.py +27 -50
- torchzero/modules/{optimizers → adaptive}/sophia_h.py +2 -3
- torchzero/modules/clipping/clipping.py +85 -92
- torchzero/modules/clipping/ema_clipping.py +5 -5
- torchzero/modules/conjugate_gradient/__init__.py +11 -0
- torchzero/modules/{quasi_newton → conjugate_gradient}/cg.py +355 -369
- torchzero/modules/experimental/__init__.py +9 -32
- torchzero/modules/experimental/dct.py +2 -2
- torchzero/modules/experimental/fft.py +2 -2
- torchzero/modules/experimental/gradmin.py +4 -3
- torchzero/modules/experimental/l_infinity.py +111 -0
- torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +3 -40
- torchzero/modules/experimental/newton_solver.py +79 -17
- torchzero/modules/experimental/newtonnewton.py +27 -14
- torchzero/modules/experimental/scipy_newton_cg.py +105 -0
- torchzero/modules/experimental/spsa1.py +93 -0
- torchzero/modules/experimental/structural_projections.py +1 -1
- torchzero/modules/functional.py +50 -14
- torchzero/modules/grad_approximation/__init__.py +1 -1
- torchzero/modules/grad_approximation/fdm.py +19 -20
- torchzero/modules/grad_approximation/forward_gradient.py +6 -7
- torchzero/modules/grad_approximation/grad_approximator.py +43 -47
- torchzero/modules/grad_approximation/rfdm.py +114 -175
- torchzero/modules/higher_order/__init__.py +1 -1
- torchzero/modules/higher_order/higher_order_newton.py +31 -23
- torchzero/modules/least_squares/__init__.py +1 -0
- torchzero/modules/least_squares/gn.py +161 -0
- torchzero/modules/line_search/__init__.py +2 -2
- torchzero/modules/line_search/_polyinterp.py +289 -0
- torchzero/modules/line_search/adaptive.py +69 -44
- torchzero/modules/line_search/backtracking.py +83 -70
- torchzero/modules/line_search/line_search.py +159 -68
- torchzero/modules/line_search/scipy.py +16 -4
- torchzero/modules/line_search/strong_wolfe.py +319 -220
- torchzero/modules/misc/__init__.py +8 -0
- torchzero/modules/misc/debug.py +4 -4
- torchzero/modules/misc/escape.py +9 -7
- torchzero/modules/misc/gradient_accumulation.py +88 -22
- torchzero/modules/misc/homotopy.py +59 -0
- torchzero/modules/misc/misc.py +82 -15
- torchzero/modules/misc/multistep.py +47 -11
- torchzero/modules/misc/regularization.py +5 -9
- torchzero/modules/misc/split.py +55 -35
- torchzero/modules/misc/switch.py +1 -1
- torchzero/modules/momentum/__init__.py +1 -5
- torchzero/modules/momentum/averaging.py +3 -3
- torchzero/modules/momentum/cautious.py +42 -47
- torchzero/modules/momentum/momentum.py +35 -1
- torchzero/modules/ops/__init__.py +9 -1
- torchzero/modules/ops/binary.py +9 -8
- torchzero/modules/{momentum/ema.py → ops/higher_level.py} +10 -33
- torchzero/modules/ops/multi.py +15 -15
- torchzero/modules/ops/reduce.py +1 -1
- torchzero/modules/ops/utility.py +12 -8
- torchzero/modules/projections/projection.py +4 -4
- torchzero/modules/quasi_newton/__init__.py +1 -16
- torchzero/modules/quasi_newton/damping.py +105 -0
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -163
- torchzero/modules/quasi_newton/lbfgs.py +256 -200
- torchzero/modules/quasi_newton/lsr1.py +167 -132
- torchzero/modules/quasi_newton/quasi_newton.py +346 -446
- torchzero/modules/restarts/__init__.py +7 -0
- torchzero/modules/restarts/restars.py +253 -0
- torchzero/modules/second_order/__init__.py +2 -1
- torchzero/modules/second_order/multipoint.py +238 -0
- torchzero/modules/second_order/newton.py +133 -88
- torchzero/modules/second_order/newton_cg.py +207 -170
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/sampling.py +300 -0
- torchzero/modules/step_size/__init__.py +1 -1
- torchzero/modules/step_size/adaptive.py +312 -47
- torchzero/modules/termination/__init__.py +14 -0
- torchzero/modules/termination/termination.py +207 -0
- torchzero/modules/trust_region/__init__.py +5 -0
- torchzero/modules/trust_region/cubic_regularization.py +170 -0
- torchzero/modules/trust_region/dogleg.py +92 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
- torchzero/modules/trust_region/trust_cg.py +99 -0
- torchzero/modules/trust_region/trust_region.py +350 -0
- torchzero/modules/variance_reduction/__init__.py +1 -0
- torchzero/modules/variance_reduction/svrg.py +208 -0
- torchzero/modules/weight_decay/weight_decay.py +65 -64
- torchzero/modules/zeroth_order/__init__.py +1 -0
- torchzero/modules/zeroth_order/cd.py +122 -0
- torchzero/optim/root.py +65 -0
- torchzero/optim/utility/split.py +8 -8
- torchzero/optim/wrappers/directsearch.py +0 -1
- torchzero/optim/wrappers/fcmaes.py +3 -2
- torchzero/optim/wrappers/nlopt.py +0 -2
- torchzero/optim/wrappers/optuna.py +2 -2
- torchzero/optim/wrappers/scipy.py +81 -22
- torchzero/utils/__init__.py +40 -4
- torchzero/utils/compile.py +1 -1
- torchzero/utils/derivatives.py +123 -111
- torchzero/utils/linalg/__init__.py +9 -2
- torchzero/utils/linalg/linear_operator.py +329 -0
- torchzero/utils/linalg/matrix_funcs.py +2 -2
- torchzero/utils/linalg/orthogonalize.py +2 -1
- torchzero/utils/linalg/qr.py +2 -2
- torchzero/utils/linalg/solve.py +226 -154
- torchzero/utils/metrics.py +83 -0
- torchzero/utils/optimizer.py +2 -2
- torchzero/utils/python_tools.py +7 -0
- torchzero/utils/tensorlist.py +105 -34
- torchzero/utils/torch_tools.py +9 -4
- torchzero-0.3.14.dist-info/METADATA +14 -0
- torchzero-0.3.14.dist-info/RECORD +167 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.14.dist-info}/top_level.txt +0 -1
- docs/source/conf.py +0 -59
- docs/source/docstring template.py +0 -46
- torchzero/modules/experimental/absoap.py +0 -253
- torchzero/modules/experimental/adadam.py +0 -118
- torchzero/modules/experimental/adamY.py +0 -131
- torchzero/modules/experimental/adam_lambertw.py +0 -149
- torchzero/modules/experimental/adaptive_step_size.py +0 -90
- torchzero/modules/experimental/adasoap.py +0 -177
- torchzero/modules/experimental/cosine.py +0 -214
- torchzero/modules/experimental/cubic_adam.py +0 -97
- torchzero/modules/experimental/eigendescent.py +0 -120
- torchzero/modules/experimental/etf.py +0 -195
- torchzero/modules/experimental/exp_adam.py +0 -113
- torchzero/modules/experimental/expanded_lbfgs.py +0 -141
- torchzero/modules/experimental/hnewton.py +0 -85
- torchzero/modules/experimental/modular_lbfgs.py +0 -265
- torchzero/modules/experimental/parabolic_search.py +0 -220
- torchzero/modules/experimental/subspace_preconditioners.py +0 -145
- torchzero/modules/experimental/tensor_adagrad.py +0 -42
- torchzero/modules/line_search/polynomial.py +0 -233
- torchzero/modules/momentum/matrix_momentum.py +0 -193
- torchzero/modules/optimizers/adagrad.py +0 -165
- torchzero/modules/quasi_newton/trust_region.py +0 -397
- torchzero/modules/smoothing/gaussian.py +0 -198
- torchzero-0.3.11.dist-info/METADATA +0 -404
- torchzero-0.3.11.dist-info/RECORD +0 -159
- torchzero-0.3.11.dist-info/licenses/LICENSE +0 -21
- /torchzero/modules/{optimizers → adaptive}/lion.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/orthograd.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/rmsprop.py +0 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.14.dist-info}/WHEEL +0 -0
torchzero/modules/second_order/newton_cg.py

@@ -1,30 +1,24 @@
-
+import warnings
+import math
+from typing import Literal, cast
+from operator import itemgetter
 import torch
 
-from ...
+from ...core import Chainable, Module, apply_transform
+from ...utils import TensorList, as_tensorlist, tofloat
 from ...utils.derivatives import hvp, hvp_fd_central, hvp_fd_forward
-
-from
-from ...utils.linalg.solve import cg, steihaug_toint_cg, minres
+from ...utils.linalg.solve import cg, minres, find_within_trust_radius
+from ..trust_region.trust_region import default_radius
 
 class NewtonCG(Module):
     """Newton's method with a matrix-free conjugate gradient or minimial-residual solver.
 
-
-
-    forming the full Hessian matrix, it only requires Hessian-vector products
-    (HVPs). These can be calculated efficiently using automatic
-    differentiation or approximated using finite differences.
-
-    .. note::
-        In most cases NewtonCG should be the first module in the chain because it relies on autograd. Use the :code:`inner` argument if you wish to apply Newton preconditioning to another module's output.
+    Notes:
+        * In most cases NewtonCGSteihaug should be the first module in the chain because it relies on autograd. Use the ``inner`` argument if you wish to apply Newton preconditioning to another module's output.
 
-
-        This module requires the a closure passed to the optimizer step,
-        as it needs to re-evaluate the loss and gradients for calculating HVPs.
-        The closure must accept a ``backward`` argument (refer to documentation).
+        * This module requires the a closure passed to the optimizer step, as it needs to re-evaluate the loss and gradients for calculating HVPs. The closure must accept a ``backward`` argument (refer to documentation).
 
-
+    Warning:
         CG may fail if hessian is not positive-definite.
 
     Args:
@@ -63,45 +57,48 @@ class NewtonCG(Module):
             NewtonCG will attempt to apply preconditioning to the output of this module.
 
     Examples:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        )
-
+        Newton-CG with a backtracking line search:
+
+        ```python
+        opt = tz.Modular(
+            model.parameters(),
+            tz.m.NewtonCG(),
+            tz.m.Backtracking()
+        )
+        ```
+
+        Truncated Newton method (useful for large-scale problems):
+        ```
+        opt = tz.Modular(
+            model.parameters(),
+            tz.m.NewtonCG(maxiter=10),
+            tz.m.Backtracking()
+        )
+        ```
 
     """
     def __init__(
         self,
         maxiter: int | None = None,
-        tol: float = 1e-
+        tol: float = 1e-8,
         reg: float = 1e-8,
         hvp_method: Literal["forward", "central", "autograd"] = "autograd",
         solver: Literal['cg', 'minres', 'minres_npc'] = 'cg',
-        h: float = 1e-3,
+        h: float = 1e-3, # tuned 1e-4 or 1e-3
+        miniter:int = 1,
         warm_start=False,
         inner: Chainable | None = None,
     ):
-        defaults =
+        defaults = locals().copy()
+        del defaults['self'], defaults['inner']
         super().__init__(defaults,)
 
         if inner is not None:
             self.set_child('inner', inner)
 
+        self._num_hvps = 0
+        self._num_hvps_last_step = 0
+
     @torch.no_grad
     def step(self, var):
         params = TensorList(var.params)
@@ -117,11 +114,13 @@ class NewtonCG(Module):
         h = settings['h']
         warm_start = settings['warm_start']
 
+        self._num_hvps_last_step = 0
         # ---------------------- Hessian vector product function --------------------- #
         if hvp_method == 'autograd':
             grad = var.get_grad(create_graph=True)
 
             def H_mm(x):
+                self._num_hvps_last_step += 1
                 with torch.enable_grad():
                     return TensorList(hvp(params, grad, x, retain_graph=True))
 
@@ -132,10 +131,12 @@ class NewtonCG(Module):
 
         if hvp_method == 'forward':
             def H_mm(x):
+                self._num_hvps_last_step += 1
                 return TensorList(hvp_fd_forward(closure, params, x, h=h, g_0=grad, normalize=True)[1])
 
         elif hvp_method == 'central':
             def H_mm(x):
+                self._num_hvps_last_step += 1
                 return TensorList(hvp_fd_central(closure, params, x, h=h, normalize=True)[1])
 
         else:
@@ -153,141 +154,154 @@ class NewtonCG(Module):
         if warm_start: x0 = self.get_state(params, 'prev_x', cls=TensorList) # initialized to 0 which is default anyway
 
         if solver == 'cg':
-
+            d, _ = cg(A_mm=H_mm, b=b, x0=x0, tol=tol, maxiter=maxiter, miniter=self.defaults["miniter"],reg=reg)
 
         elif solver == 'minres':
-
+            d = minres(A_mm=H_mm, b=b, x0=x0, tol=tol, maxiter=maxiter, reg=reg, npc_terminate=False)
 
         elif solver == 'minres_npc':
-
+            d = minres(A_mm=H_mm, b=b, x0=x0, tol=tol, maxiter=maxiter, reg=reg, npc_terminate=True)
 
         else:
             raise ValueError(f"Unknown solver {solver}")
 
         if warm_start:
             assert x0 is not None
-            x0.copy_(
-
-        var.update = x
-        return var
+            x0.copy_(d)
 
+        var.update = d
 
-
-
+        self._num_hvps += self._num_hvps_last_step
+        return var
 
-    This optimizer implements Newton's method using a matrix-free conjugate
-    gradient (CG) solver to approximate the search direction. Instead of
-    forming the full Hessian matrix, it only requires Hessian-vector products
-    (HVPs). These can be calculated efficiently using automatic
-    differentiation or approximated using finite differences.
 
-
-
+class NewtonCGSteihaug(Module):
+    """Newton's method with trust region and a matrix-free Steihaug-Toint conjugate gradient solver.
 
-
-
-    as it needs to re-evaluate the loss and gradients for calculating HVPs.
-    The closure must accept a ``backward`` argument (refer to documentation).
+    Notes:
+        * In most cases NewtonCGSteihaug should be the first module in the chain because it relies on autograd. Use the ``inner`` argument if you wish to apply Newton preconditioning to another module's output.
 
-
-        CG may fail if hessian is not positive-definite.
+        * This module requires the a closure passed to the optimizer step, as it needs to re-evaluate the loss and gradients for calculating HVPs. The closure must accept a ``backward`` argument (refer to documentation).
 
     Args:
-        maxiter (int | None, optional):
-            Maximum number of iterations for the conjugate gradient solver.
-            By default, this is set to the number of dimensions in the
-            objective function, which is the theoretical upper bound for CG
-            convergence. Setting this to a smaller value (truncated Newton)
-            can still generate good search directions. Defaults to None.
         eta (float, optional):
-
-        nplus (float, optional):
-
-
-            trust region
-
+            if ratio of actual to predicted rediction is larger than this, step is accepted. Defaults to 0.0.
+        nplus (float, optional): increase factor on successful steps. Defaults to 1.5.
+        nminus (float, optional): decrease factor on unsuccessful steps. Defaults to 0.75.
+        rho_good (float, optional):
+            if ratio of actual to predicted rediction is larger than this, trust region size is multiplied by `nplus`.
+        rho_bad (float, optional):
+            if ratio of actual to predicted rediction is less than this, trust region size is multiplied by `nminus`.
+        init (float, optional): Initial trust region value. Defaults to 1.
+        max_attempts (max_attempts, optional):
+            maximum number of trust radius reductions per step. A zero update vector is returned when
+            this limit is exceeded. Defaults to 10.
+        max_history (int, optional):
+            CG will store this many intermediate solutions, reusing them when trust radius is reduced
+            instead of re-running CG. Each solution storage requires 2N memory. Defaults to 100.
+        boundary_tol (float | None, optional):
+            The trust region only increases when suggested step's norm is at least `(1-boundary_tol)*trust_region`.
+            This prevents increasing trust region when solution is not on the boundary. Defaults to 1e-2.
+
+        maxiter (int | None, optional):
+            maximum number of CG iterations per step. Each iteration requies one backward pass if `hvp_method="forward"`, two otherwise. Defaults to None.
+        miniter (int, optional):
+            minimal number of CG iterations. This prevents making no progress
         tol (float, optional):
-
-
-        reg (float, optional):
-
-
+            terminates CG when norm of the residual is less than this value. Defaults to 1e-8.
+            when initial guess is below tolerance. Defaults to 1.
+        reg (float, optional): hessian regularization. Defaults to 1e-8.
+        solver (str, optional): solver, "cg" or "minres". "cg" is recommended. Defaults to 'cg'.
+        adapt_tol (bool, optional):
+            if True, whenever trust radius collapses to smallest representable number,
+            the tolerance is multiplied by 0.1. Defaults to True.
+        npc_terminate (bool, optional):
+            whether to terminate CG/MINRES whenever negative curvature is detected. Defaults to False.
+
         hvp_method (str, optional):
-
+            either "forward" to use forward formula which requires one backward pass per Hvp, or "central" to use a more accurate central formula which requires two backward passes. "forward" is usually accurate enough. Defaults to "forward".
+        h (float, optional): finite difference step size. Defaults to 1e-3.
 
-            - ``"autograd"``: Use PyTorch's autograd to calculate exact HVPs.
-              This requires creating a graph for the gradient.
-            - ``"forward"``: Use a forward finite difference formula to
-              approximate the HVP. This requires one extra gradient evaluation.
-            - ``"central"``: Use a central finite difference formula for a
-              more accurate HVP approximation. This requires two extra
-              gradient evaluations.
-            Defaults to "autograd".
-        h (float, optional):
-            The step size for finite differences if :code:`hvp_method` is
-            ``"forward"`` or ``"central"``. Defaults to 1e-3.
         inner (Chainable | None, optional):
-
-
-    Examples:
-        Trust-region Newton-CG:
+            applies preconditioning to output of this module. Defaults to None.
 
-
+    ### Examples:
+    Trust-region Newton-CG:
 
-
-
-
-
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.NewtonCGSteihaug(),
+    )
+    ```
 
-    Reference:
+    ### Reference:
         Steihaug, Trond. "The conjugate gradient method and trust regions in large scale optimization." SIAM Journal on Numerical Analysis 20.3 (1983): 626-637.
     """
     def __init__(
         self,
-
-        eta: float=
-        nplus: float =
+        # trust region settings
+        eta: float= 0.0,
+        nplus: float = 3.5,
         nminus: float = 0.25,
+        rho_good: float = 0.99,
+        rho_bad: float = 1e-4,
         init: float = 1,
-
+        max_attempts: int = 100,
+        max_history: int = 100,
+        boundary_tol: float = 1e-6, # tuned
+
+        # cg settings
+        maxiter: int | None = None,
+        miniter: int = 1,
+        tol: float = 1e-8,
         reg: float = 1e-8,
-
-
-
-
+        solver: Literal['cg', "minres"] = 'cg',
+        adapt_tol: bool = True,
+        npc_terminate: bool = False,
+
+        # hvp settings
+        hvp_method: Literal["forward", "central"] = "central",
+        h: float = 1e-3, # tuned 1e-4 or 1e-3
+
+        # inner
         inner: Chainable | None = None,
     ):
-        defaults =
+        defaults = locals().copy()
+        del defaults['self'], defaults['inner']
         super().__init__(defaults,)
 
         if inner is not None:
             self.set_child('inner', inner)
 
+        self._num_hvps = 0
+        self._num_hvps_last_step = 0
+
     @torch.no_grad
     def step(self, var):
         params = TensorList(var.params)
         closure = var.closure
         if closure is None: raise RuntimeError('NewtonCG requires closure')
 
-
-
-
-        maxiter
-
-
-
-
+        tol = self.defaults['tol'] * self.global_state.get('tol_mul', 1)
+        solver = self.defaults['solver'].lower().strip()
+
+        (reg, maxiter, hvp_method, h, max_attempts, boundary_tol,
+         eta, nplus, nminus, rho_good, rho_bad, init, npc_terminate,
+         miniter, max_history, adapt_tol) = itemgetter(
+            "reg", "maxiter", "hvp_method", "h", "max_attempts", "boundary_tol",
+            "eta", "nplus", "nminus", "rho_good", "rho_bad", "init", "npc_terminate",
+            "miniter", "max_history", "adapt_tol",
+        )(self.defaults)
 
-
-        nplus = settings['nplus']
-        nminus = settings['nminus']
-        init = settings['init']
+        self._num_hvps_last_step = 0
 
         # ---------------------- Hessian vector product function --------------------- #
         if hvp_method == 'autograd':
             grad = var.get_grad(create_graph=True)
 
             def H_mm(x):
+                self._num_hvps_last_step += 1
                 with torch.enable_grad():
                     return TensorList(hvp(params, grad, x, retain_graph=True))
 
@@ -298,10 +312,12 @@ class TruncatedNewtonCG(Module):
 
         if hvp_method == 'forward':
             def H_mm(x):
+                self._num_hvps_last_step += 1
                 return TensorList(hvp_fd_forward(closure, params, x, h=h, g_0=grad, normalize=True)[1])
 
         elif hvp_method == 'central':
             def H_mm(x):
+                self._num_hvps_last_step += 1
                 return TensorList(hvp_fd_central(closure, params, x, h=h, normalize=True)[1])
 
         else:
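
The `hvp_fd_forward` and `hvp_fd_central` helpers whose calls are counted above approximate Hessian-vector products by finite differences of the gradient, which is what the new docstring means by one backward pass per Hvp for `"forward"` and two for `"central"`. The sketch below is only an illustration of those two formulas under simplified assumptions: it uses a plain `grad_fn(x)` callable rather than torchzero's closure-and-TensorList interface, and the helper names `hvp_forward` / `hvp_central` are made up for the example.

```python
import torch

# Illustrative finite-difference HVP formulas (not torchzero's actual helpers).
def hvp_forward(grad_fn, x, v, h=1e-3, g0=None):
    # forward difference: Hv ~ (grad(x + h*v) - grad(x)) / h; one extra gradient
    # evaluation when g0 (the gradient at x) is already available.
    if g0 is None:
        g0 = grad_fn(x)
    return (grad_fn(x + h * v) - g0) / h

def hvp_central(grad_fn, x, v, h=1e-3):
    # central difference: Hv ~ (grad(x + h*v) - grad(x - h*v)) / (2h);
    # two extra gradient evaluations, but more accurate.
    return (grad_fn(x + h * v) - grad_fn(x - h * v)) / (2 * h)

# quadratic sanity check: f(x) = 0.5 * x^T A x, so the exact HVP is A @ v
A = torch.tensor([[2.0, 0.3], [0.3, 1.0]])
grad_fn = lambda x: A @ x
x, v = torch.tensor([1.0, -2.0]), torch.tensor([0.5, 1.0])
print(hvp_forward(grad_fn, x, v), hvp_central(grad_fn, x, v), A @ v)
```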
@@ -314,61 +330,82 @@ class TruncatedNewtonCG(Module):
             b = apply_transform(self.children['inner'], b, params=params, grads=grad, var=var)
         b = as_tensorlist(b)
 
-        #
+        # ------------------------------- trust region ------------------------------- #
         success = False
-
+        d = None
+        x0 = [p.clone() for p in params]
+        solution = None
+
         while not success:
             max_attempts -= 1
             if max_attempts < 0: break
 
-
-
-
-
-            if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            #
-
-
+            trust_radius = self.global_state.get('trust_radius', init)
+
+            # -------------- make sure trust radius isn't too small or large ------------- #
+            finfo = torch.finfo(x0[0].dtype)
+            if trust_radius < finfo.tiny * 2:
+                trust_radius = self.global_state['trust_radius'] = init
+                if adapt_tol:
+                    self.global_state["tol_mul"] = self.global_state.get("tol_mul", 1) * 0.1
+
+            elif trust_radius > finfo.max / 2:
+                trust_radius = self.global_state['trust_radius'] = init
+
+            # ----------------------------------- solve ---------------------------------- #
+            d = None
+            if solution is not None and solution.history is not None:
+                d = find_within_trust_radius(solution.history, trust_radius)
+
+            if d is None:
+                if solver == 'cg':
+                    d, solution = cg(
+                        A_mm=H_mm,
+                        b=b,
+                        tol=tol,
+                        maxiter=maxiter,
+                        reg=reg,
+                        trust_radius=trust_radius,
+                        miniter=miniter,
+                        npc_terminate=npc_terminate,
+                        history_size=max_history,
+                    )
+
+                elif solver == 'minres':
+                    d = minres(A_mm=H_mm, b=b, trust_radius=trust_radius, tol=tol, maxiter=maxiter, reg=reg, npc_terminate=npc_terminate)
+
+                else:
+                    raise ValueError(f"unknown solver {solver}")
+
+            # ---------------------------- update trust radius --------------------------- #
+            self.global_state["trust_radius"], success = default_radius(
+                params=params,
+                closure=closure,
+                f=tofloat(var.get_loss(False)),
+                g=b,
+                H=H_mm,
+                d=d,
+                trust_radius=trust_radius,
+                eta=eta,
+                nplus=nplus,
+                nminus=nminus,
+                rho_good=rho_good,
+                rho_bad=rho_bad,
+                boundary_tol=boundary_tol,
+
+                init=init, # init isn't used because check_overflow=False
+                state=self.global_state, # not used
+                settings=self.defaults, # not used
+                check_overflow=False, # this is checked manually to adapt tolerance
+            )
 
-
+        # --------------------------- assign new direction --------------------------- #
+        assert d is not None
         if success:
-            var.update =
+            var.update = d
 
         else:
             var.update = params.zeros_like()
 
-
-
-
+        self._num_hvps += self._num_hvps_last_step
+        return var
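
The `default_radius` call above drives the trust-region loop using the parameters documented in the new `NewtonCGSteihaug` docstring (`eta`, `nplus`, `nminus`, `rho_good`, `rho_bad`, `boundary_tol`). Its real implementation lives in the new `torchzero/modules/trust_region/trust_region.py` and is not shown in this diff; the sketch below is only an assumption-based illustration of the documented accept/grow/shrink rule, with defaults taken from the new `__init__`, and `update_trust_radius` is a hypothetical name.

```python
# Illustrative sketch of the documented trust-region rule; not torchzero's
# actual default_radius.
def update_trust_radius(f_old, f_new, predicted_reduction, step_norm, trust_radius,
                        eta=0.0, nplus=3.5, nminus=0.25,
                        rho_good=0.99, rho_bad=1e-4, boundary_tol=1e-6):
    # ratio of actual to predicted reduction of the loss
    rho = (f_old - f_new) / max(predicted_reduction, 1e-32)

    accepted = rho > eta  # step is accepted when rho exceeds eta (0.0 by default)
    if rho > rho_good and step_norm >= (1 - boundary_tol) * trust_radius:
        trust_radius *= nplus    # grow only if the step actually reached the boundary
    elif rho < rho_bad:
        trust_radius *= nminus   # shrink when the model badly over-predicted the reduction
    return trust_radius, accepted
```

On rejection the surrounding loop retries with the smaller radius, up to `max_attempts` times, reusing stored intermediate CG solutions via `find_within_trust_radius` instead of re-running CG when possible.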
torchzero/modules/smoothing/__init__.py

@@ -1,2 +1,2 @@
 from .laplacian import LaplacianSmoothing
-from .
+from .sampling import GradientSampling
|