PyPI - torchzero - Versions diffs - 0.3.9__py3-none-any.whl → 0.3.10__py3-none-any.whl - Mend

torchzero 0.3.9__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (108) hide show

tests/test_opts.py +54 -21
tests/test_tensorlist.py +2 -2
tests/test_vars.py +61 -61
torchzero/core/__init__.py +2 -3
torchzero/core/module.py +49 -49
torchzero/core/transform.py +219 -158
torchzero/modules/__init__.py +1 -0
torchzero/modules/clipping/clipping.py +10 -10
torchzero/modules/clipping/ema_clipping.py +14 -13
torchzero/modules/clipping/growth_clipping.py +16 -18
torchzero/modules/experimental/__init__.py +12 -3
torchzero/modules/experimental/absoap.py +50 -156
torchzero/modules/experimental/adadam.py +15 -14
torchzero/modules/experimental/adamY.py +17 -27
torchzero/modules/experimental/adasoap.py +19 -129
torchzero/modules/experimental/curveball.py +12 -12
torchzero/modules/experimental/diagonal_higher_order_newton.py +225 -0
torchzero/modules/experimental/eigendescent.py +117 -0
torchzero/modules/experimental/etf.py +172 -0
torchzero/modules/experimental/gradmin.py +2 -2
torchzero/modules/experimental/newton_solver.py +11 -11
torchzero/modules/experimental/newtonnewton.py +88 -0
torchzero/modules/experimental/reduce_outward_lr.py +8 -5
torchzero/modules/experimental/soapy.py +19 -146
torchzero/modules/experimental/spectral.py +79 -204
torchzero/modules/experimental/structured_newton.py +12 -12
torchzero/modules/experimental/subspace_preconditioners.py +13 -10
torchzero/modules/experimental/tada.py +38 -0
torchzero/modules/grad_approximation/fdm.py +2 -2
torchzero/modules/grad_approximation/forward_gradient.py +5 -5
torchzero/modules/grad_approximation/grad_approximator.py +21 -21
torchzero/modules/grad_approximation/rfdm.py +28 -15
torchzero/modules/higher_order/__init__.py +1 -0
torchzero/modules/higher_order/higher_order_newton.py +256 -0
torchzero/modules/line_search/backtracking.py +42 -23
torchzero/modules/line_search/line_search.py +40 -40
torchzero/modules/line_search/scipy.py +18 -3
torchzero/modules/line_search/strong_wolfe.py +21 -32
torchzero/modules/line_search/trust_region.py +18 -6
torchzero/modules/lr/__init__.py +1 -1
torchzero/modules/lr/{step_size.py → adaptive.py} +22 -26
torchzero/modules/lr/lr.py +20 -16
torchzero/modules/momentum/averaging.py +25 -10
torchzero/modules/momentum/cautious.py +73 -35
torchzero/modules/momentum/ema.py +92 -41
torchzero/modules/momentum/experimental.py +21 -13
torchzero/modules/momentum/matrix_momentum.py +96 -54
torchzero/modules/momentum/momentum.py +24 -4
torchzero/modules/ops/accumulate.py +51 -21
torchzero/modules/ops/binary.py +36 -36
torchzero/modules/ops/debug.py +7 -7
torchzero/modules/ops/misc.py +128 -129
torchzero/modules/ops/multi.py +19 -19
torchzero/modules/ops/reduce.py +16 -16
torchzero/modules/ops/split.py +26 -26
torchzero/modules/ops/switch.py +4 -4
torchzero/modules/ops/unary.py +20 -20
torchzero/modules/ops/utility.py +37 -37
torchzero/modules/optimizers/adagrad.py +33 -24
torchzero/modules/optimizers/adam.py +31 -34
torchzero/modules/optimizers/lion.py +4 -4
torchzero/modules/optimizers/muon.py +6 -6
torchzero/modules/optimizers/orthograd.py +4 -5
torchzero/modules/optimizers/rmsprop.py +13 -16
torchzero/modules/optimizers/rprop.py +52 -49
torchzero/modules/optimizers/shampoo.py +17 -23
torchzero/modules/optimizers/soap.py +12 -19
torchzero/modules/optimizers/sophia_h.py +13 -13
torchzero/modules/projections/dct.py +4 -4
torchzero/modules/projections/fft.py +6 -6
torchzero/modules/projections/galore.py +1 -1
torchzero/modules/projections/projection.py +57 -57
torchzero/modules/projections/structural.py +17 -17
torchzero/modules/quasi_newton/__init__.py +33 -4
torchzero/modules/quasi_newton/cg.py +67 -17
torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +24 -24
torchzero/modules/quasi_newton/lbfgs.py +12 -12
torchzero/modules/quasi_newton/lsr1.py +11 -11
torchzero/modules/quasi_newton/olbfgs.py +19 -19
torchzero/modules/quasi_newton/quasi_newton.py +254 -47
torchzero/modules/second_order/newton.py +32 -20
torchzero/modules/second_order/newton_cg.py +13 -12
torchzero/modules/second_order/nystrom.py +21 -21
torchzero/modules/smoothing/gaussian.py +21 -21
torchzero/modules/smoothing/laplacian.py +7 -9
torchzero/modules/weight_decay/__init__.py +1 -1
torchzero/modules/weight_decay/weight_decay.py +43 -9
torchzero/modules/wrappers/optim_wrapper.py +11 -11
torchzero/optim/wrappers/directsearch.py +244 -0
torchzero/optim/wrappers/fcmaes.py +97 -0
torchzero/optim/wrappers/mads.py +90 -0
torchzero/optim/wrappers/nevergrad.py +4 -4
torchzero/optim/wrappers/nlopt.py +28 -14
torchzero/optim/wrappers/optuna.py +70 -0
torchzero/optim/wrappers/scipy.py +162 -13
torchzero/utils/__init__.py +2 -6
torchzero/utils/derivatives.py +2 -1
torchzero/utils/optimizer.py +55 -74
torchzero/utils/python_tools.py +17 -4
{torchzero-0.3.9.dist-info → torchzero-0.3.10.dist-info}/METADATA +14 -14
torchzero-0.3.10.dist-info/RECORD +139 -0
{torchzero-0.3.9.dist-info → torchzero-0.3.10.dist-info}/WHEEL +1 -1
torchzero/core/preconditioner.py +0 -138
torchzero/modules/experimental/algebraic_newton.py +0 -145
torchzero/modules/experimental/tropical_newton.py +0 -136
torchzero-0.3.9.dist-info/RECORD +0 -131
{torchzero-0.3.9.dist-info → torchzero-0.3.10.dist-info}/licenses/LICENSE +0 -0
{torchzero-0.3.9.dist-info → torchzero-0.3.10.dist-info}/top_level.txt +0 -0

torchzero/modules/grad_approximation/grad_approximator.py CHANGED Viewed

@@ -5,7 +5,7 @@ from typing import Any, Literal
 import torch
-from ...core import Module, Vars
+from ...core import Module, Var
 GradTarget = Literal['update', 'grad', 'closure']
 _Scalar = torch.Tensor | float
@@ -17,50 +17,50 @@ class GradApproximator(Module, ABC):
     Args:
         defaults (dict[str, Any] | None, optional): dict with defaults. Defaults to None.
         target (str, optional):
-            whether to set `vars.grad`, `vars.update` or 'vars.closure`. Defaults to 'closure'.
+            whether to set `var.grad`, `var.update` or `var.closure`. Defaults to 'closure'.
     """
     def __init__(self, defaults: dict[str, Any] | None = None, target: GradTarget = 'closure'):
         super().__init__(defaults)
         self._target: GradTarget = target
     @abstractmethod
-    def approximate(self, closure: Callable, params: list[torch.Tensor], loss: _Scalar | None, vars: Vars) -> tuple[Iterable[torch.Tensor], _Scalar | None, _Scalar | None]:
+    def approximate(self, closure: Callable, params: list[torch.Tensor], loss: _Scalar | None, var: Var) -> tuple[Iterable[torch.Tensor], _Scalar | None, _Scalar | None]:
         """Returns a tuple: (grad, loss, loss_approx), make sure this resets parameters to their original values!"""
-    def pre_step(self, vars: Vars) -> Vars | None:
+    def pre_step(self, var: Var) -> Var | None:
         """This runs once before each step, whereas `approximate` may run multiple times per step if further modules
         evaluate gradients at multiple points. This is useful for example to pre-generate new random perturbations."""
-        return vars
+        return var
     @torch.no_grad
-    def step(self, vars):
-        ret = self.pre_step(vars)
-        if isinstance(ret, Vars): vars = ret
+    def step(self, var):
+        ret = self.pre_step(var)
+        if isinstance(ret, Var): var = ret
-        if vars.closure is None: raise RuntimeError("Gradient approximation requires closure")
-        params, closure, loss = vars.params, vars.closure, vars.loss
+        if var.closure is None: raise RuntimeError("Gradient approximation requires closure")
+        params, closure, loss = var.params, var.closure, var.loss
         if self._target == 'closure':
             def approx_closure(backward=True):
                 if backward:
                     # set loss to None because closure might be evaluated at different points
-                    grad, l, l_approx = self.approximate(closure=closure, params=params, loss=None, vars=vars)
+                    grad, l, l_approx = self.approximate(closure=closure, params=params, loss=None, var=var)
                     for p, g in zip(params, grad): p.grad = g
                     return l if l is not None else l_approx
                 return closure(False)
-            vars.closure = approx_closure
-            return vars
+            var.closure = approx_closure
+            return var
-        # if vars.grad is not None:
-        #     warnings.warn('Using grad approximator when `vars.grad` is already set.')
-        grad,loss,loss_approx = self.approximate(closure=closure, params=params, loss=loss, vars=vars)
-        if loss_approx is not None: vars.loss_approx = loss_approx
-        if loss is not None: vars.loss = vars.loss_approx = loss
-        if self._target == 'grad': vars.grad = list(grad)
-        elif self._target == 'update': vars.update = list(grad)
+        # if var.grad is not None:
+        #     warnings.warn('Using grad approximator when `var.grad` is already set.')
+        grad,loss,loss_approx = self.approximate(closure=closure, params=params, loss=loss, var=var)
+        if loss_approx is not None: var.loss_approx = loss_approx
+        if loss is not None: var.loss = var.loss_approx = loss
+        if self._target == 'grad': var.grad = list(grad)
+        elif self._target == 'update': var.update = list(grad)
         else: raise ValueError(self._target)
-        return vars
+        return var
 _FD_Formula = Literal['forward2', 'backward2', 'forward3', 'backward3', 'central2', 'central4']

torchzero/modules/grad_approximation/rfdm.py CHANGED Viewed

@@ -90,6 +90,19 @@ _RFD_FUNCS = {
 class RandomizedFDM(GradApproximator):
+    """Gradient approximation via randomized finite differences.
+    Args:
+        h (float, optional): finite difference step size. Defaults to 1e-3.
+        n_samples (int, optional): number of random gradient samples. Defaults to 1.
+        formula (_FD_Formula, optional): finite difference formula. Defaults to 'central2'.
+        distribution (Distributions, optional): distribution that random perturbations are sampled from. Defaults to "rademacher".
+        beta (float, optional):
+            If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
+        pre_generate (bool, optional):
+            whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
+        seed (int | None | torch.Generator, optional): Seed for random generator. Defaults to None.
+        target (GradTarget, optional): what to set on var. Defaults to "closure".
+    """
     PRE_MULTIPLY_BY_H = True
     def __init__(
         self,
@@ -99,8 +112,8 @@ class RandomizedFDM(GradApproximator):
         distribution: Distributions = "rademacher",
         beta: float = 0,
         pre_generate = True,
-        target: GradTarget = "closure",
         seed: int | None | torch.Generator = None,
+        target: GradTarget = "closure",
     ):
         defaults = dict(h=h, formula=formula, n_samples=n_samples, distribution=distribution, beta=beta, pre_generate=pre_generate, seed=seed)
         super().__init__(defaults, target=target)
@@ -118,16 +131,16 @@ class RandomizedFDM(GradApproximator):
             else: self.global_state['generator'] = None
         return self.global_state['generator']
-    def pre_step(self, vars):
-        h, beta = self.get_settings('h', 'beta', params=vars.params)
-        settings = self.settings[vars.params[0]]
+    def pre_step(self, var):
+        h, beta = self.get_settings(var.params, 'h', 'beta')
+        settings = self.settings[var.params[0]]
         n_samples = settings['n_samples']
         distribution = settings['distribution']
         pre_generate = settings['pre_generate']
         if pre_generate:
-            params = TensorList(vars.params)
-            generator = self._get_generator(settings['seed'], vars.params)
+            params = TensorList(var.params)
+            generator = self._get_generator(settings['seed'], var.params)
             perturbations = [params.sample_like(distribution=distribution, generator=generator) for _ in range(n_samples)]
             if self.PRE_MULTIPLY_BY_H:
@@ -152,11 +165,11 @@ class RandomizedFDM(GradApproximator):
                 torch._foreach_lerp_(cur_flat, new_flat, betas)
     @torch.no_grad
-    def approximate(self, closure, params, loss, vars):
+    def approximate(self, closure, params, loss, var):
         params = TensorList(params)
         loss_approx = None
-        h = self.get_settings('h', params=vars.params, cls=NumberList)
+        h = NumberList(self.settings[p]['h'] for p in params)
         settings = self.settings[params[0]]
         n_samples = settings['n_samples']
         fd_fn = _RFD_FUNCS[settings['formula']]
@@ -220,29 +233,29 @@ class MeZO(GradApproximator):
             distribution=distribution, generator=torch.Generator(params[0].device).manual_seed(seed)
         ).mul_(h)
-    def pre_step(self, vars):
-        h = self.get_settings('h', params=vars.params)
-        settings = self.settings[vars.params[0]]
+    def pre_step(self, var):
+        h = NumberList(self.settings[p]['h'] for p in var.params)
+        settings = self.settings[var.params[0]]
         n_samples = settings['n_samples']
         distribution = settings['distribution']
-        step = vars.current_step
+        step = var.current_step
         # create functions that generate a deterministic perturbation from seed based on current step
         prt_fns = []
         for i in range(n_samples):
-            prt_fn = partial(self._seeded_perturbation, params=vars.params, distribution=distribution, seed=1_000_000*step + i, h=h)
+            prt_fn = partial(self._seeded_perturbation, params=var.params, distribution=distribution, seed=1_000_000*step + i, h=h)
             prt_fns.append(prt_fn)
         self.global_state['prt_fns'] = prt_fns
     @torch.no_grad
-    def approximate(self, closure, params, loss, vars):
+    def approximate(self, closure, params, loss, var):
         params = TensorList(params)
         loss_approx = None
-        h = self.get_settings('h', params=vars.params, cls=NumberList)
+        h = NumberList(self.settings[p]['h'] for p in params)
         settings = self.settings[params[0]]
         n_samples = settings['n_samples']
         fd_fn = _RFD_FUNCS[settings['formula']]

torchzero/modules/higher_order/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from .higher_order_newton import HigherOrderNewton

torchzero/modules/higher_order/higher_order_newton.py ADDED Viewed

@@ -0,0 +1,256 @@
+import itertools
+import math
+import warnings
+from collections.abc import Callable
+from contextlib import nullcontext
+from functools import partial
+from typing import Any, Literal
+import numpy as np
+import scipy.optimize
+import torch
+from ...core import Chainable, Module, apply_transform
+from ...utils import TensorList, vec_to_tensors, vec_to_tensors_
+from ...utils.derivatives import (
+    hessian_list_to_mat,
+    jacobian_wrt,
+)
+_LETTERS = 'abcdefghijklmnopqrstuvwxyz'
+def _poly_eval(s: np.ndarray, c, derivatives):
+    """Evaluate the taylor polynomial at offset ``s`` from the series center.
+    Args:
+        s: offset from the center; leading batch dimensions are allowed because
+            every einsum operand is prefixed with an ellipsis.
+        c: constant term (function value at the center).
+        derivatives: derivative tensors ordered gradient, hessian, third order, ...
+    Returns:
+        ``c`` plus the sum of ``T_k[s, ..., s] / k!`` over all supplied orders.
+    """
+    total = float(c)
+    for order, tensor in enumerate(derivatives, start=1):
+        axes = _LETTERS[:order]
+        # for order 4 this builds '...abcd,...a,...b,...c,...d'
+        subscripts = f"...{axes}," + ",".join(f"...{axis}" for axis in axes)
+        total += np.einsum(subscripts, tensor, *([s] * order)) / math.factorial(order)
+    return total
+def _proximal_poly_v(x: np.ndarray, c, prox, x0: np.ndarray, derivatives):
+    """Value of the taylor model at ``x`` plus the optional proximal penalty.
+    Args:
+        x: evaluation point; a 2D input is transposed first because vectorized
+            differential evolution passes ``(ndim, batch_size)``.
+        c: constant term of the model.
+        prox: proximal penalty weight, 0 disables the penalty.
+        x0: taylor series center.
+        derivatives: derivative tensors at ``x0``, forwarded to ``_poly_eval``.
+    Returns:
+        model value plus ``(prox / 2) * ||x - x0||^2`` when ``prox`` is non-zero.
+    """
+    if x.ndim == 2:
+        x = x.T
+    offset = x - x0
+    model_value = _poly_eval(offset, c, derivatives)
+    # squared distance is reduced over the last axis so batched input stays batched
+    penalty = (prox / 2) * (offset**2).sum(-1) if prox != 0 else 0
+    return model_value + penalty
+def _proximal_poly_g(x: np.ndarray, c, prox, x0: np.ndarray, derivatives):
+    """Gradient of the taylor model (plus proximal penalty) at ``x``.
+    Args:
+        x: evaluation point.
+        c: unused here; present so value, gradient and hessian callbacks share one signature.
+        prox: proximal penalty weight, 0 disables the penalty.
+        x0: taylor series center.
+        derivatives: derivative tensors at ``x0`` ordered gradient, hessian, third order, ...
+    Returns:
+        ``derivatives[0]`` plus the contraction of each higher-order tensor with the
+        offset, plus ``prox * (x - x0)`` when ``prox`` is non-zero.
+    """
+    offset = x - x0
+    grad = derivatives[0].copy()
+    # slicing past the gradient yields nothing when only first order is available
+    for order, tensor in enumerate(derivatives[1:], start=2):
+        axes = _LETTERS[:order]
+        contracted = ",".join(axes[1:])
+        # for order 4 this is einsum('abcd,b,c,d->a', T, s, s, s)
+        term = np.einsum(f"{axes},{contracted}->a", tensor, *([offset] * (order - 1)))
+        grad += term / math.factorial(order - 1)
+    prox_term = prox * offset if prox != 0 else 0
+    return grad + prox_term
+def _proximal_poly_H(x: np.ndarray, c, prox, x0: np.ndarray, derivatives):
+    """Hessian of the taylor model (plus proximal penalty) at ``x``.
+    Args:
+        x: evaluation point; ``x.shape[0]`` is taken as the number of variables.
+        c: unused here; present so value, gradient and hessian callbacks share one signature.
+        prox: proximal penalty weight, 0 disables the penalty.
+        x0: taylor series center.
+        derivatives: derivative tensors at ``x0`` ordered gradient, hessian, third order, ...
+    Returns:
+        ``(n, n)`` array: zeros when only the gradient is available, otherwise
+        ``derivatives[1]`` plus the contraction of each higher-order tensor with the
+        offset; ``prox * I`` is added when ``prox`` is non-zero.
+    """
+    s = x - x0
+    n = x.shape[0]
+    if len(derivatives) == 1:
+        # a first order model has no curvature; the shape has to be a tuple,
+        # ``np.zeros(n, n)`` would interpret the second ``n`` as a dtype and raise
+        H = np.zeros((n, n))
+    else:
+        H = derivatives[1].copy()
+        if len(derivatives) > 2:
+            for i, T in enumerate(derivatives[2:], 3):
+                s1 = ''.join(_LETTERS[:i]) # abcd
+                s2 = ','.join(_LETTERS[2:i]) # c,d
+                # this would make einsum('abcd,c,d->ab', T, s, s)
+                H += np.einsum(f"{s1},{s2}->ab", T, *(s for _ in range(i-2))) / math.factorial(i - 2)
+    H_prox = 0
+    if prox != 0: H_prox = np.eye(n) * prox
+    return H + H_prox
+def _poly_minimize(trust_region, prox, de_iters: Any, c, x: torch.Tensor, derivatives):
+    """Minimize the taylor model around ``x`` with scipy and return the minimizer.
+    Args:
+        trust_region: half-width of the box around ``x`` used as bounds, or None for no bounds.
+        prox: proximal penalty weight, 0 disables the penalty.
+        de_iters: None or 0 skips differential evolution, -1 leaves ``maxiter`` to scipy,
+            any other value is passed as ``maxiter``.
+        c: constant term of the model (function value at ``x``).
+        x: flat tensor with the current point, used as taylor series center.
+        derivatives: torch derivative tensors ordered gradient, hessian, third order, ...
+    Returns:
+        tuple of (minimizer converted back to a tensor like ``x``, model value at the minimizer).
+    """
+    # everything is solved in float64 numpy on cpu
+    np_derivatives = [T.detach().cpu().numpy().astype(np.float64) for T in derivatives]
+    center = x.detach().cpu().numpy().astype(np.float64) # taylor series center
+    if trust_region is None: bounds = None
+    else: bounds = list(zip(center - trust_region, center + trust_region))
+    # with a single derivative only the gradient is available, which is used to test
+    # proximal penalty and bounds, so quasi-newton solvers are picked in that case
+    gradient_only = len(np_derivatives) == 1
+    if bounds is None: method = 'bfgs' if gradient_only else 'trust-exact'
+    else: method = 'l-bfgs-b' if gradient_only else 'trust-constr'
+    start = center.copy()
+    value_at_center = _proximal_poly_v(center, c, prox, center, np_derivatives)
+    if de_iters is not None and de_iters != 0:
+        de_result = scipy.optimize.differential_evolution(
+            _proximal_poly_v,
+            bounds if bounds is not None else list(zip(center - 10, center + 10)),
+            args=(c, prox, center.copy(), np_derivatives),
+            maxiter=None if de_iters == -1 else de_iters, # None lets scipy decide
+            vectorized=True,
+        )
+        # only start from the global search result if it beats the current point
+        if de_result.fun < value_at_center: start = de_result.x
+    result = scipy.optimize.minimize(
+        _proximal_poly_v,
+        start,
+        method=method,
+        args=(c, prox, center.copy(), np_derivatives),
+        jac=_proximal_poly_g,
+        hess=_proximal_poly_H,
+        bounds=bounds
+    )
+    return torch.from_numpy(result.x).to(x), result.fun
+class HigherOrderNewton(Module):
+    """
+    A basic arbitrary order newton's method with optional trust region and proximal penalty.
+    It is recommended to enable at least one of trust region or proximal penalty.
+    This constructs an nth order taylor approximation via autograd and minimizes it with
+    scipy.optimize.minimize trust region newton solvers with optional proximal penalty.
+    This uses n^order memory, where n is number of decision variables, and I am not aware
+    of any problems where this is more efficient than newton's method. It can minimize
+    rosenbrock in a single step, but that step probably takes more time than newton.
+    And there are way more efficient tensor methods out there but they tend to be
+    significantly more complex.
+    Args:
+        order (int, optional):
+            Order of the method, number of taylor series terms (orders of derivatives) used to approximate the function. Defaults to 4.
+        trust_method (str | None, optional):
+            Method used for trust region.
+            - "bounds" - the model is minimized within bounds defined by trust region.
+            - "proximal" - the model is minimized with penalty for going too far from current point.
+            - "none" - disables trust region.
+            Defaults to 'bounds'.
+        increase (float, optional): trust region multiplier on good steps. Defaults to 1.5.
+        decrease (float, optional): trust region multiplier on bad steps. Defaults to 0.75.
+        trust_init (float | None, optional):
+            initial trust region size. If none, defaults to 1 on :code:`trust_method="bounds"` and 0.1 on :code:`"proximal"`. Defaults to None.
+        trust_tol (float, optional):
+            Maximum ratio of expected loss reduction to actual reduction for trust region increase.
+            Should be 1 or higher. Defaults to 2.
+        de_iters (int | None, optional):
+            If this is specified, the model is minimized via differential evolution first to possibly escape local minima,
+            then it is passed to scipy.optimize.minimize. Defaults to None.
+        vectorize (bool, optional): whether to enable vectorized jacobians (usually faster). Defaults to True.
+    """
+    def __init__(
+        self,
+        order: int = 4,
+        trust_method: Literal['bounds', 'proximal', 'none'] | None = 'bounds',
+        increase: float = 1.5,
+        decrease: float = 0.75,
+        trust_init: float | None = None,
+        trust_tol: float = 2,
+        de_iters: int | None = None,
+        vectorize: bool = True,
+    ):
+        # resolve the method-dependent default for the initial trust region size
+        if trust_init is None:
+            if trust_method == 'bounds': trust_init = 1
+            else: trust_init = 0.1
+        defaults = dict(order=order, trust_method=trust_method, increase=increase, decrease=decrease, trust_tol=trust_tol, trust_init=trust_init, vectorize=vectorize, de_iters=de_iters)
+        super().__init__(defaults)
+    @torch.no_grad
+    def step(self, var):
+        """Build the taylor model at the current point, minimize it and store the step in ``var.update``.
+        Sets ``var.loss``, ``var.loss_approx`` and ``var.grad`` as a side effect and, when a trust
+        method is active, updates ``trust_value`` in the global state.
+        Raises:
+            RuntimeError: if ``var.closure`` is None.
+            ValueError: if ``trust_method`` is not one of 'bounds', 'proximal', 'none' or None.
+        """
+        params = TensorList(var.params)
+        closure = var.closure
+        if closure is None: raise RuntimeError('HigherOrderNewton requires closure')
+        # settings are read from the first parameter only
+        settings = self.settings[params[0]]
+        order = settings['order']
+        increase = settings['increase']
+        decrease = settings['decrease']
+        trust_tol = settings['trust_tol']
+        trust_init = settings['trust_init']
+        trust_method = settings['trust_method']
+        de_iters = settings['de_iters']
+        vectorize = settings['vectorize']
+        trust_value = self.global_state.get('trust_value', trust_init)
+        # ------------------- calculate derivatives up to `order` ------------------- #
+        with torch.enable_grad():
+            loss = var.loss = var.loss_approx = closure(False)
+            g_list = torch.autograd.grad(loss, params, create_graph=True)
+            var.grad = list(g_list)
+            g = torch.cat([t.ravel() for t in g_list])
+            n = g.numel()
+            derivatives = [g]
+            T = g # current derivatives tensor
+            # get all derivative up to order
+            for o in range(2, order + 1):
+                is_last = o == order
+                # the graph is only kept while another differentiation pass is still needed
+                T_list = jacobian_wrt([T], params, create_graph=not is_last, batched=vectorize)
+                with torch.no_grad() if is_last else nullcontext():
+                    # the shape is (ndim, ) * order
+                    T = hessian_list_to_mat(T_list).view(n, n, *T.shape[1:])
+                    derivatives.append(T)
+        x0 = torch.cat([p.ravel() for p in params])
+        if trust_method is None: trust_method = 'none'
+        else: trust_method = trust_method.lower()
+        if trust_method == 'none':
+            trust_region = None
+            prox = 0
+        elif trust_method == 'bounds':
+            trust_region = trust_value
+            prox = 0
+        elif trust_method == 'proximal':
+            trust_region = None
+            # a larger trust value means a weaker penalty
+            prox = 1 / trust_value
+        else:
+            raise ValueError(trust_method)
+        x_star, expected_loss = _poly_minimize(
+            trust_region=trust_region,
+            prox=prox,
+            de_iters=de_iters,
+            c=loss.item(),
+            x=x0,
+            derivatives=derivatives,
+        )
+        # trust region
+        if trust_method != 'none':
+            expected_reduction = loss - expected_loss
+            # evaluate the true loss at the model minimizer, then put x0 back
+            # (presumably vec_to_tensors_ writes the flat vector into params in place - TODO confirm)
+            vec_to_tensors_(x_star, params)
+            loss_star = closure(False)
+            vec_to_tensors_(x0, params)
+            reduction = loss - loss_star
+            # failed step
+            if reduction <= 0:
+                # discard the step (update becomes zero) and shrink the trust value
+                x_star = x0
+                self.global_state['trust_value'] = trust_value * decrease
+            # very good step
+            elif expected_reduction / reduction <= trust_tol:
+                self.global_state['trust_value'] = trust_value * increase
+        # the update is expressed as current point minus new point
+        difference = vec_to_tensors(x0 - x_star, params)
+        var.update = list(difference)
+        return var

torchzero/modules/line_search/backtracking.py CHANGED Viewed

@@ -14,7 +14,6 @@ def backtracking_line_search(
     beta: float = 0.5,
     c: float = 1e-4,
     maxiter: int = 10,
-    a_min: float | None = None,
     try_negative: bool = False,
 ) -> float | None:
     """
@@ -26,7 +25,6 @@ def backtracking_line_search(
         beta: The factor by which to decrease alpha in each iteration
         c: The constant for the Armijo sufficient decrease condition
         max_iter: Maximum number of backtracking iterations (default: 10).
-        min_alpha: Minimum allowable step size to prevent near-zero values (default: 1e-16).
     Returns:
         step size
@@ -45,10 +43,6 @@ def backtracking_line_search(
         # decrease alpha
         a *= beta
-        # alpha too small
-        if a_min is not None and a < a_min:
-            return a_min
     # fail
     if try_negative:
         def inv_objective(alpha): return f(-alpha)
@@ -59,7 +53,6 @@ def backtracking_line_search(
             beta=beta,
             c=c,
             maxiter=maxiter,
-            a_min=a_min,
             try_negative=False,
         )
         if v is not None: return -v
@@ -67,17 +60,28 @@ def backtracking_line_search(
     return None
 class Backtracking(LineSearch):
+    """Backtracking line search satisfying the Armijo condition.
+    Args:
+        init (float, optional): initial step size. Defaults to 1.0.
+        beta (float, optional): multiplies each consecutive step size by this value. Defaults to 0.5.
+        c (float, optional): acceptance value for Armijo condition. Defaults to 1e-4.
+        maxiter (int, optional): Maximum line search function evaluations. Defaults to 10.
+        adaptive (bool, optional):
+            when enabled, if line search failed, initial step size is reduced.
+            Otherwise it is reset to initial value. Defaults to True.
+        try_negative (bool, optional): Whether to perform line search in opposite direction on fail. Defaults to False.
+    """
     def __init__(
         self,
         init: float = 1.0,
         beta: float = 0.5,
         c: float = 1e-4,
         maxiter: int = 10,
-        min_alpha: float | None = None,
         adaptive=True,
         try_negative: bool = False,
     ):
-        defaults=dict(init=init,beta=beta,c=c,maxiter=maxiter,min_alpha=min_alpha,adaptive=adaptive, try_negative=try_negative)
+        defaults=dict(init=init,beta=beta,c=c,maxiter=maxiter,adaptive=adaptive, try_negative=try_negative)
         super().__init__(defaults=defaults)
         self.global_state['beta_scale'] = 1.0
@@ -86,20 +90,20 @@ class Backtracking(LineSearch):
         self.global_state['beta_scale'] = 1.0
     @torch.no_grad
-    def search(self, update, vars):
-        init, beta, c, maxiter, min_alpha, adaptive, try_negative = itemgetter(
-            'init', 'beta', 'c', 'maxiter', 'min_alpha', 'adaptive', 'try_negative')(self.settings[vars.params[0]])
+    def search(self, update, var):
+        init, beta, c, maxiter, adaptive, try_negative = itemgetter(
+            'init', 'beta', 'c', 'maxiter', 'adaptive', 'try_negative')(self.settings[var.params[0]])
-        objective = self.make_objective(vars=vars)
+        objective = self.make_objective(var=var)
         # # directional derivative
-        d = -sum(t.sum() for t in torch._foreach_mul(vars.get_grad(), vars.get_update()))
+        d = -sum(t.sum() for t in torch._foreach_mul(var.get_grad(), var.get_update()))
         # scale beta (beta is multiplicative and i think may be better than scaling initial step size)
         if adaptive: beta = beta * self.global_state['beta_scale']
         step_size = backtracking_line_search(objective, d, init=init,beta=beta,
-                                        c=c,maxiter=maxiter,a_min=min_alpha, try_negative=try_negative)
+                                        c=c,maxiter=maxiter, try_negative=try_negative)
         # found an alpha that reduces loss
         if step_size is not None:
@@ -114,19 +118,34 @@ def _lerp(start,end,weight):
     return start + weight * (end - start)
 class AdaptiveBacktracking(LineSearch):
+    """Adaptive backtracking line search. After each line search procedure, a new initial step size is set
+    such that optimal step size in the procedure would be found on the second line search iteration.
+    Args:
+        init (float, optional): step size for the first step. Defaults to 1.0.
+        beta (float, optional): multiplies each consecutive step size by this value. Defaults to 0.5.
+        c (float, optional): acceptance value for Armijo condition. Defaults to 1e-4.
+        maxiter (int, optional): Maximum line search function evaluations. Defaults to 10.
+        target_iters (int, optional):
+            target number of iterations that would be performed until optimal step size is found. Defaults to 1.
+        nplus (float, optional):
+            Multiplier to initial step size if it was found to be the optimal step size. Defaults to 2.0.
+        scale_beta (float, optional):
+            Momentum for initial step size, at 0 disables momentum. Defaults to 0.0.
+        try_negative (bool, optional): Whether to perform line search in opposite direction on fail. Defaults to False.
+    """
     def __init__(
         self,
         init: float = 1.0,
         beta: float = 0.5,
         c: float = 1e-4,
         maxiter: int = 20,
-        min_alpha: float | None = None,
         target_iters = 1,
         nplus = 2.0,
         scale_beta = 0.0,
         try_negative: bool = False,
     ):
-        defaults=dict(init=init,beta=beta,c=c,maxiter=maxiter,min_alpha=min_alpha,target_iters=target_iters,nplus=nplus,scale_beta=scale_beta, try_negative=try_negative)
+        defaults=dict(init=init,beta=beta,c=c,maxiter=maxiter,target_iters=target_iters,nplus=nplus,scale_beta=scale_beta, try_negative=try_negative)
         super().__init__(defaults=defaults)
         self.global_state['beta_scale'] = 1.0
@@ -138,15 +157,15 @@ class AdaptiveBacktracking(LineSearch):
         self.global_state['initial_scale'] = 1.0
     @torch.no_grad
-    def search(self, update, vars):
-        init, beta, c, maxiter, min_alpha, target_iters, nplus, scale_beta, try_negative=itemgetter(
-            'init','beta','c','maxiter','min_alpha','target_iters','nplus','scale_beta', 'try_negative')(self.settings[vars.params[0]])
+    def search(self, update, var):
+        init, beta, c, maxiter, target_iters, nplus, scale_beta, try_negative=itemgetter(
+            'init','beta','c','maxiter','target_iters','nplus','scale_beta', 'try_negative')(self.settings[var.params[0]])
-        objective = self.make_objective(vars=vars)
+        objective = self.make_objective(var=var)
         # directional derivative (0 if c = 0 because it is not needed)
         if c == 0: d = 0
-        else: d = -sum(t.sum() for t in torch._foreach_mul(vars.get_grad(), update))
+        else: d = -sum(t.sum() for t in torch._foreach_mul(var.get_grad(), update))
         # scale beta
         beta = beta * self.global_state['beta_scale']
@@ -155,7 +174,7 @@ class AdaptiveBacktracking(LineSearch):
         init = init * self.global_state['initial_scale']
         step_size = backtracking_line_search(objective, d, init=init, beta=beta,
-                                        c=c,maxiter=maxiter,a_min=min_alpha, try_negative=try_negative)
+                                        c=c,maxiter=maxiter, try_negative=try_negative)
         # found an alpha that reduces loss
         if step_size is not None:

torchzero 0.3.9__py3-none-any.whl → 0.3.10__py3-none-any.whl

torchzero 0.3.9__py3-none-any.whl → 0.3.10__py3-none-any.whl