torchzero 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/source/conf.py +6 -4
- docs/source/docstring template.py +46 -0
- tests/test_identical.py +2 -3
- tests/test_opts.py +115 -68
- tests/test_tensorlist.py +2 -2
- tests/test_vars.py +62 -61
- torchzero/core/__init__.py +2 -3
- torchzero/core/module.py +185 -53
- torchzero/core/transform.py +327 -159
- torchzero/modules/__init__.py +3 -1
- torchzero/modules/clipping/clipping.py +120 -23
- torchzero/modules/clipping/ema_clipping.py +37 -22
- torchzero/modules/clipping/growth_clipping.py +20 -21
- torchzero/modules/experimental/__init__.py +30 -4
- torchzero/modules/experimental/absoap.py +53 -156
- torchzero/modules/experimental/adadam.py +22 -15
- torchzero/modules/experimental/adamY.py +21 -25
- torchzero/modules/experimental/adam_lambertw.py +149 -0
- torchzero/modules/{line_search/trust_region.py → experimental/adaptive_step_size.py} +37 -8
- torchzero/modules/experimental/adasoap.py +24 -129
- torchzero/modules/experimental/cosine.py +214 -0
- torchzero/modules/experimental/cubic_adam.py +97 -0
- torchzero/modules/experimental/curveball.py +12 -12
- torchzero/modules/{projections → experimental}/dct.py +11 -11
- torchzero/modules/experimental/eigendescent.py +120 -0
- torchzero/modules/experimental/etf.py +195 -0
- torchzero/modules/experimental/exp_adam.py +113 -0
- torchzero/modules/experimental/expanded_lbfgs.py +141 -0
- torchzero/modules/{projections → experimental}/fft.py +10 -10
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/hnewton.py +85 -0
- torchzero/modules/{quasi_newton/experimental → experimental}/modular_lbfgs.py +49 -50
- torchzero/modules/experimental/newton_solver.py +11 -11
- torchzero/modules/experimental/newtonnewton.py +92 -0
- torchzero/modules/experimental/parabolic_search.py +220 -0
- torchzero/modules/experimental/reduce_outward_lr.py +10 -7
- torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +12 -54
- torchzero/modules/experimental/subspace_preconditioners.py +20 -10
- torchzero/modules/experimental/tensor_adagrad.py +42 -0
- torchzero/modules/functional.py +12 -2
- torchzero/modules/grad_approximation/fdm.py +31 -4
- torchzero/modules/grad_approximation/forward_gradient.py +17 -7
- torchzero/modules/grad_approximation/grad_approximator.py +69 -24
- torchzero/modules/grad_approximation/rfdm.py +310 -50
- torchzero/modules/higher_order/__init__.py +1 -0
- torchzero/modules/higher_order/higher_order_newton.py +319 -0
- torchzero/modules/line_search/__init__.py +4 -4
- torchzero/modules/line_search/adaptive.py +99 -0
- torchzero/modules/line_search/backtracking.py +75 -31
- torchzero/modules/line_search/line_search.py +107 -49
- torchzero/modules/line_search/polynomial.py +233 -0
- torchzero/modules/line_search/scipy.py +20 -5
- torchzero/modules/line_search/strong_wolfe.py +52 -36
- torchzero/modules/misc/__init__.py +27 -0
- torchzero/modules/misc/debug.py +48 -0
- torchzero/modules/misc/escape.py +60 -0
- torchzero/modules/misc/gradient_accumulation.py +70 -0
- torchzero/modules/misc/misc.py +316 -0
- torchzero/modules/misc/multistep.py +158 -0
- torchzero/modules/misc/regularization.py +171 -0
- torchzero/modules/misc/split.py +103 -0
- torchzero/modules/{ops → misc}/switch.py +48 -7
- torchzero/modules/momentum/__init__.py +1 -1
- torchzero/modules/momentum/averaging.py +25 -10
- torchzero/modules/momentum/cautious.py +115 -40
- torchzero/modules/momentum/ema.py +92 -41
- torchzero/modules/momentum/experimental.py +21 -13
- torchzero/modules/momentum/matrix_momentum.py +145 -76
- torchzero/modules/momentum/momentum.py +25 -4
- torchzero/modules/ops/__init__.py +3 -31
- torchzero/modules/ops/accumulate.py +51 -25
- torchzero/modules/ops/binary.py +108 -62
- torchzero/modules/ops/multi.py +95 -34
- torchzero/modules/ops/reduce.py +31 -23
- torchzero/modules/ops/unary.py +37 -21
- torchzero/modules/ops/utility.py +53 -45
- torchzero/modules/optimizers/__init__.py +12 -3
- torchzero/modules/optimizers/adagrad.py +48 -29
- torchzero/modules/optimizers/adahessian.py +223 -0
- torchzero/modules/optimizers/adam.py +35 -37
- torchzero/modules/optimizers/adan.py +110 -0
- torchzero/modules/optimizers/adaptive_heavyball.py +57 -0
- torchzero/modules/optimizers/esgd.py +171 -0
- torchzero/modules/optimizers/ladagrad.py +183 -0
- torchzero/modules/optimizers/lion.py +4 -4
- torchzero/modules/optimizers/mars.py +91 -0
- torchzero/modules/optimizers/msam.py +186 -0
- torchzero/modules/optimizers/muon.py +32 -7
- torchzero/modules/optimizers/orthograd.py +4 -5
- torchzero/modules/optimizers/rmsprop.py +19 -19
- torchzero/modules/optimizers/rprop.py +89 -52
- torchzero/modules/optimizers/sam.py +163 -0
- torchzero/modules/optimizers/shampoo.py +55 -27
- torchzero/modules/optimizers/soap.py +40 -37
- torchzero/modules/optimizers/sophia_h.py +82 -25
- torchzero/modules/projections/__init__.py +2 -4
- torchzero/modules/projections/cast.py +51 -0
- torchzero/modules/projections/galore.py +4 -2
- torchzero/modules/projections/projection.py +212 -118
- torchzero/modules/quasi_newton/__init__.py +44 -5
- torchzero/modules/quasi_newton/cg.py +190 -39
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +163 -0
- torchzero/modules/quasi_newton/lbfgs.py +154 -97
- torchzero/modules/quasi_newton/lsr1.py +102 -58
- torchzero/modules/quasi_newton/quasi_newton.py +1032 -177
- torchzero/modules/quasi_newton/trust_region.py +397 -0
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/newton.py +245 -54
- torchzero/modules/second_order/newton_cg.py +311 -21
- torchzero/modules/second_order/nystrom.py +124 -21
- torchzero/modules/smoothing/gaussian.py +55 -21
- torchzero/modules/smoothing/laplacian.py +20 -12
- torchzero/modules/step_size/__init__.py +2 -0
- torchzero/modules/step_size/adaptive.py +122 -0
- torchzero/modules/step_size/lr.py +154 -0
- torchzero/modules/weight_decay/__init__.py +1 -1
- torchzero/modules/weight_decay/weight_decay.py +126 -10
- torchzero/modules/wrappers/optim_wrapper.py +40 -12
- torchzero/optim/wrappers/directsearch.py +281 -0
- torchzero/optim/wrappers/fcmaes.py +105 -0
- torchzero/optim/wrappers/mads.py +89 -0
- torchzero/optim/wrappers/nevergrad.py +20 -5
- torchzero/optim/wrappers/nlopt.py +28 -14
- torchzero/optim/wrappers/optuna.py +70 -0
- torchzero/optim/wrappers/scipy.py +167 -16
- torchzero/utils/__init__.py +3 -7
- torchzero/utils/derivatives.py +5 -4
- torchzero/utils/linalg/__init__.py +1 -1
- torchzero/utils/linalg/solve.py +251 -12
- torchzero/utils/numberlist.py +2 -0
- torchzero/utils/optimizer.py +55 -74
- torchzero/utils/python_tools.py +27 -4
- torchzero/utils/tensorlist.py +40 -28
- {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/METADATA +76 -51
- torchzero-0.3.11.dist-info/RECORD +159 -0
- {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/WHEEL +1 -1
- torchzero/core/preconditioner.py +0 -138
- torchzero/modules/experimental/algebraic_newton.py +0 -145
- torchzero/modules/experimental/soapy.py +0 -290
- torchzero/modules/experimental/spectral.py +0 -288
- torchzero/modules/experimental/structured_newton.py +0 -111
- torchzero/modules/experimental/tropical_newton.py +0 -136
- torchzero/modules/lr/__init__.py +0 -2
- torchzero/modules/lr/lr.py +0 -59
- torchzero/modules/lr/step_size.py +0 -97
- torchzero/modules/ops/debug.py +0 -25
- torchzero/modules/ops/misc.py +0 -419
- torchzero/modules/ops/split.py +0 -75
- torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
- torchzero/modules/quasi_newton/olbfgs.py +0 -196
- torchzero-0.3.9.dist-info/RECORD +0 -131
- {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/licenses/LICENSE +0 -0
- {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/top_level.txt +0 -0
torchzero/modules/higher_order/higher_order_newton.py (new file)

@@ -0,0 +1,319 @@
+import itertools
+import math
+import warnings
+from collections.abc import Callable
+from contextlib import nullcontext
+from functools import partial
+from typing import Any, Literal
+
+import numpy as np
+import scipy.optimize
+import torch
+
+from ...core import Chainable, Module, apply_transform
+from ...utils import TensorList, vec_to_tensors, vec_to_tensors_
+from ...utils.derivatives import (
+    hessian_list_to_mat,
+    jacobian_wrt,
+)
+
+_LETTERS = 'abcdefghijklmnopqrstuvwxyz'
+def _poly_eval(s: np.ndarray, c, derivatives):
+    val = float(c)
+    for i,T in enumerate(derivatives, 1):
+        s1 = ''.join(_LETTERS[:i]) # abcd
+        s2 = ',...'.join(_LETTERS[:i]) # a,b,c,d
+        # this would make einsum('abcd,a,b,c,d', T, x, x, x, x)
+        val += np.einsum(f"...{s1},...{s2}", T, *(s for _ in range(i))) / math.factorial(i)
+    return val
+
+def _proximal_poly_v(x: np.ndarray, c, prox, x0: np.ndarray, derivatives):
+    if x.ndim == 2: x = x.T # DE passes (ndim, batch_size)
+    s = x - x0
+    val = _poly_eval(s, c, derivatives)
+    penalty = 0
+    if prox != 0: penalty = (prox / 2) * (s**2).sum(-1) # proximal penalty
+    return val + penalty
+
+def _proximal_poly_g(x: np.ndarray, c, prox, x0: np.ndarray, derivatives):
+    s = x - x0
+    g = derivatives[0].copy()
+    if len(derivatives) > 1:
+        for i, T in enumerate(derivatives[1:], 2):
+            s1 = ''.join(_LETTERS[:i]) # abcd
+            s2 = ','.join(_LETTERS[1:i]) # b,c,d
+            # this would make einsum('abcd,b,c,d->a', T, x, x, x)
+            g += np.einsum(f"{s1},{s2}->a", T, *(s for _ in range(i-1))) / math.factorial(i - 1)
+
+    g_prox = 0
+    if prox != 0: g_prox = prox * s
+    return g + g_prox
+
+def _proximal_poly_H(x: np.ndarray, c, prox, x0: np.ndarray, derivatives):
+    s = x - x0
+    n = x.shape[0]
+    if len(derivatives) == 1:
+        H = np.zeros((n, n))
+    else:
+        H = derivatives[1].copy()
+    if len(derivatives) > 2:
+        for i, T in enumerate(derivatives[2:], 3):
+            s1 = ''.join(_LETTERS[:i]) # abcd
+            s2 = ','.join(_LETTERS[2:i]) # c,d
+            # this would make einsum('abcd,c,d->ab', T, x, x, x)
+            H += np.einsum(f"{s1},{s2}->ab", T, *(s for _ in range(i-2))) / math.factorial(i - 2)
+
+    H_prox = 0
+    if prox != 0: H_prox = np.eye(n) * prox
+    return H + H_prox
+
+def _poly_minimize(trust_region, prox, de_iters: Any, c, x: torch.Tensor, derivatives):
+    derivatives = [T.detach().cpu().numpy().astype(np.float64) for T in derivatives]
+    x0 = x.detach().cpu().numpy().astype(np.float64) # taylor series center
+
+    # notes
+    # 1. since we have exact hessian we use trust methods
+
+    # 2. if len(derivatives) is 1, only gradient is available,
+    # thus use slsqp depending on whether trust region is enabled
+    # this is just so that I can test that trust region works
+    if trust_region is None:
+        if len(derivatives) == 1: raise RuntimeError("trust region must be enabled because 1st order has no minima")
+        method = 'trust-exact'
+        de_bounds = list(zip(x0 - 10, x0 + 10))
+        constraints = None
+
+    else:
+        if len(derivatives) == 1: method = 'slsqp'
+        else: method = 'trust-constr'
+        de_bounds = list(zip(x0 - trust_region, x0 + trust_region))
+
+        def l2_bound_f(x):
+            if x.ndim == 2: return np.sum((x - x0[:,None])**2, axis=0)[None,:] # DE passes (ndim, batch_size) and expects (M, S)
+            return np.sum((x - x0)**2, axis=0)
+
+        def l2_bound_g(x):
+            return 2 * (x - x0)
+
+        def l2_bound_h(x, v):
+            return v[0] * 2 * np.eye(x0.shape[0])
+
+        constraint = scipy.optimize.NonlinearConstraint(
+            fun=l2_bound_f,
+            lb=0, # 0 <= ||x-x0||^2
+            ub=trust_region**2, # ||x-x0||^2 <= R^2
+            jac=l2_bound_g, # pyright:ignore[reportArgumentType]
+            hess=l2_bound_h,
+            keep_feasible=False
+        )
+        constraints = [constraint]
+
+    x_init = x0.copy()
+    v0 = _proximal_poly_v(x0, c, prox, x0, derivatives)
+
+    # ---------------------------------- run DE ---------------------------------- #
+    if de_iters is not None and de_iters != 0:
+        if de_iters == -1: de_iters = None # let scipy decide
+
+        # DE needs bounds so use linf ig
+        res = scipy.optimize.differential_evolution(
+            _proximal_poly_v,
+            de_bounds,
+            args=(c, prox, x0.copy(), derivatives),
+            maxiter=de_iters,
+            vectorized=True,
+            constraints = constraints,
+            updating='deferred',
+        )
+        if res.fun < v0 and np.all(np.isfinite(res.x)): x_init = res.x
+
+    # ------------------------------- run minimize ------------------------------- #
+    try:
+        res = scipy.optimize.minimize(
+            _proximal_poly_v,
+            x_init,
+            method=method,
+            args=(c, prox, x0.copy(), derivatives),
+            jac=_proximal_poly_g,
+            hess=_proximal_poly_H,
+            constraints = constraints,
+        )
+    except ValueError:
+        return x, -float('inf')
+    return torch.from_numpy(res.x).to(x), res.fun
+
+
+
+class HigherOrderNewton(Module):
+    """A basic arbitrary order Newton's method with optional trust region and proximal penalty.
+
+    This constructs an nth order Taylor approximation via autograd and minimizes it with
+    scipy.optimize.minimize trust region Newton solvers, with an optional proximal penalty.
+
+    .. note::
+        In most cases HigherOrderNewton should be the first module in the chain because it relies on extra autograd. Use the :code:`inner` argument if you wish to apply Newton preconditioning to another module's output.
+
+    .. note::
+        This module requires a closure to be passed to the optimizer step,
+        as it needs to re-evaluate the loss and gradients for calculating higher order derivatives.
+        The closure must accept a ``backward`` argument (refer to documentation).
+
+    .. warning::
+        This uses roughly O(N^order) memory, and solving the subproblem can be very expensive.
+
+    .. warning::
+        "none" and "proximal" trust methods may generate subproblems that have no minima, causing divergence.
+
+    Args:
+
+        order (int, optional):
+            Order of the method, number of taylor series terms (orders of derivatives) used to approximate the function. Defaults to 4.
+        trust_method (str | None, optional):
+            Method used for trust region.
+
+            - "bounds" - the model is minimized within bounds defined by trust region.
+            - "proximal" - the model is minimized with a penalty for going too far from the current point.
+            - "none" - disables trust region.
+
+            Defaults to 'bounds'.
+        increase (float, optional): trust region multiplier on good steps. Defaults to 1.5.
+        decrease (float, optional): trust region multiplier on bad steps. Defaults to 0.75.
+        trust_init (float | None, optional):
+            initial trust region size. If none, defaults to 1 on :code:`trust_method="bounds"` and 0.1 on :code:`"proximal"`. Defaults to None.
+        trust_tol (float, optional):
+            Maximum ratio of expected loss reduction to actual reduction for trust region increase.
+            Should be 1 or higher. Defaults to 2.
+        de_iters (int | None, optional):
+            If this is specified, the model is minimized via differential evolution first to possibly escape local minima,
+            then it is passed to scipy.optimize.minimize. Defaults to None.
+        vectorize (bool, optional): whether to enable vectorized jacobians (usually faster). Defaults to True.
+    """
+    def __init__(
+        self,
+        order: int = 4,
+        trust_method: Literal['bounds', 'proximal', 'none'] | None = 'bounds',
+        nplus: float = 2,
+        nminus: float = 0.25,
+        init: float | None = None,
+        eta: float = 1e-6,
+        max_attempts = 10,
+        de_iters: int | None = None,
+        vectorize: bool = True,
+    ):
+        if init is None:
+            if trust_method == 'bounds': init = 1
+            else: init = 0.1
+
+        defaults = dict(order=order, trust_method=trust_method, nplus=nplus, nminus=nminus, eta=eta, init=init, vectorize=vectorize, de_iters=de_iters, max_attempts=max_attempts)
+        super().__init__(defaults)
+
+    @torch.no_grad
+    def step(self, var):
+        params = TensorList(var.params)
+        closure = var.closure
+        if closure is None: raise RuntimeError('HigherOrderNewton requires closure')
+
+        settings = self.settings[params[0]]
+        order = settings['order']
+        nplus = settings['nplus']
+        nminus = settings['nminus']
+        eta = settings['eta']
+        init = settings['init']
+        trust_method = settings['trust_method']
+        de_iters = settings['de_iters']
+        max_attempts = settings['max_attempts']
+        vectorize = settings['vectorize']
+
+        # ------------------------ calculate grad and hessian ------------------------ #
+        with torch.enable_grad():
+            loss = var.loss = var.loss_approx = closure(False)
+
+            g_list = torch.autograd.grad(loss, params, create_graph=True)
+            var.grad = list(g_list)
+
+            g = torch.cat([t.ravel() for t in g_list])
+            n = g.numel()
+            derivatives = [g]
+            T = g # current derivatives tensor
+
+            # get all derivatives up to order
+            for o in range(2, order + 1):
+                is_last = o == order
+                T_list = jacobian_wrt([T], params, create_graph=not is_last, batched=vectorize)
+                with torch.no_grad() if is_last else nullcontext():
+                    # the shape is (ndim, ) * order
+                    T = hessian_list_to_mat(T_list).view(n, n, *T.shape[1:])
+                    derivatives.append(T)
+
+        x0 = torch.cat([p.ravel() for p in params])
+
+        success = False
+        x_star = None
+        while not success:
+            max_attempts -= 1
+            if max_attempts < 0: break
+
+            # load trust region value
+            trust_value = self.global_state.get('trust_region', init)
+            if trust_value < 1e-8 or trust_value > 1e16: trust_value = self.global_state['trust_region'] = settings['init']
+
+            if trust_method is None: trust_method = 'none'
+            else: trust_method = trust_method.lower()
+
+            if trust_method == 'none':
+                trust_region = None
+                prox = 0
+
+            elif trust_method == 'bounds':
+                trust_region = trust_value
+                prox = 0
+
+            elif trust_method == 'proximal':
+                trust_region = None
+                prox = 1 / trust_value
+
+            else:
+                raise ValueError(trust_method)
+
+            # minimize the model
+            x_star, expected_loss = _poly_minimize(
+                trust_region=trust_region,
+                prox=prox,
+                de_iters=de_iters,
+                c=loss.item(),
+                x=x0,
+                derivatives=derivatives,
+            )
+
+            # update trust region
+            if trust_method == 'none':
+                success = True
+            else:
+                pred_reduction = loss - expected_loss
+
+                vec_to_tensors_(x_star, params)
+                loss_star = closure(False)
+                vec_to_tensors_(x0, params)
+                reduction = loss - loss_star
+
+                rho = reduction / (max(pred_reduction, 1e-8))
+                # failed step
+                if rho < 0.25:
+                    self.global_state['trust_region'] = trust_value * nminus
+
+                # very good step
+                elif rho > 0.75:
+                    diff = trust_value - (x0 - x_star).abs_()
+                    if (diff.amin() / trust_value) > 1e-4: # hits boundary
+                        self.global_state['trust_region'] = trust_value * nplus
+
+                # if the ratio is high enough then accept the proposed step
+                success = rho > eta
+
+        assert x_star is not None
+        if success:
+            difference = vec_to_tensors(x0 - x_star, params)
+            var.update = list(difference)
+        else:
+            var.update = params.zeros_like()
+        return var
+
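A note on wiring up the new module: the docstring above says HigherOrderNewton should come first in the chain and that the closure must accept a ``backward`` argument. A minimal sketch of one way this might look, assuming the class is re-exported as ``tz.m.HigherOrderNewton`` in the same way the Backtracking docstring examples later in this diff use ``tz.m`` (the two-parameter problem and the loop are hypothetical, not taken from the package):

import torch
import torchzero as tz

# hypothetical two-parameter problem (Rosenbrock)
x = torch.tensor([-1.1, 2.5], requires_grad=True)
opt = tz.Modular([x], tz.m.HigherOrderNewton(order=4))

def closure(backward=True):
    # HigherOrderNewton calls closure(False) and differentiates the returned
    # loss itself, so backward=False must skip the .backward() call
    loss = (1 - x[0])**2 + 100 * (x[1] - x[0]**2)**2
    if backward:
        opt.zero_grad()
        loss.backward()
    return loss

for _ in range(20):
    opt.step(closure)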
torchzero/modules/line_search/__init__.py

@@ -1,5 +1,5 @@
-from .
-from .backtracking import
-from .
+from .adaptive import AdaptiveLineSearch
+from .backtracking import AdaptiveBacktracking, Backtracking
+from .line_search import LineSearchBase
 from .scipy import ScipyMinimizeScalar
-from .
+from .strong_wolfe import StrongWolfe
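For reference, after this change the line search subpackage exposes these names directly; a small sketch (whether they are additionally re-exported elsewhere, e.g. under ``tz.m``, is not shown in this hunk):

from torchzero.modules.line_search import (
    AdaptiveBacktracking,
    AdaptiveLineSearch,
    Backtracking,
    LineSearchBase,
    ScipyMinimizeScalar,
    StrongWolfe,
)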
torchzero/modules/line_search/adaptive.py (new file)

@@ -0,0 +1,99 @@
+import math
+from collections.abc import Callable
+from operator import itemgetter
+
+import torch
+
+from .line_search import LineSearchBase
+
+
+
+def adaptive_tracking(
+    f,
+    x_0,
+    maxiter: int,
+    nplus: float = 2,
+    nminus: float = 0.5,
+):
+    f_0 = f(0)
+
+    t = x_0
+    f_t = f(t)
+
+    # backtrack
+    if f_t > f_0:
+        while f_t > f_0:
+            maxiter -= 1
+            if maxiter < 0: return 0, f_0
+            t = t*nminus
+            f_t = f(t)
+        return t, f_t
+
+    # forwardtrack
+    f_prev = f_t
+    t *= nplus
+    f_t = f(t)
+    if f_prev < f_t: return t / nplus, f_prev
+    while f_prev >= f_t:
+        maxiter -= 1
+        if maxiter < 0: return t, f_t
+        f_prev = f_t
+        t *= nplus
+        f_t = f(t)
+    return t / nplus, f_prev
+
+class AdaptiveLineSearch(LineSearchBase):
+    """Adaptive line search, similar to backtracking but also has a forward tracking mode.
+    Currently doesn't check the weak curvature condition.
+
+    Args:
+        init (float, optional): initial step size. Defaults to 1.0.
+        beta (float, optional): multiplies each consecutive step size by this value. Defaults to 0.5.
+        maxiter (int, optional): Maximum line search function evaluations. Defaults to 10.
+        adaptive (bool, optional):
+            when enabled, if line search failed, beta size is reduced.
+            Otherwise it is reset to initial value. Defaults to True.
+    """
+    def __init__(
+        self,
+        init: float = 1.0,
+        nplus: float = 2,
+        nminus: float = 0.5,
+        maxiter: int = 10,
+        adaptive=True,
+    ):
+        defaults=dict(init=init,nplus=nplus,nminus=nminus,maxiter=maxiter,adaptive=adaptive,)
+        super().__init__(defaults=defaults)
+        self.global_state['beta_scale'] = 1.0
+
+    def reset(self):
+        super().reset()
+        self.global_state['beta_scale'] = 1.0
+
+    @torch.no_grad
+    def search(self, update, var):
+        init, nplus, nminus, maxiter, adaptive = itemgetter(
+            'init', 'nplus', 'nminus', 'maxiter', 'adaptive')(self.settings[var.params[0]])
+
+        objective = self.make_objective(var=var)
+
+        # # directional derivative
+        # d = -sum(t.sum() for t in torch._foreach_mul(var.get_grad(), var.get_update()))
+
+        # scale beta (beta is multiplicative and i think may be better than scaling initial step size)
+        beta_scale = self.global_state.get('beta_scale', 1)
+        x_prev = self.global_state.get('prev_x', 1)
+
+        if adaptive: nminus = nminus * beta_scale
+
+
+        step_size, f = adaptive_tracking(objective, x_prev, maxiter, nplus=nplus, nminus=nminus)
+
+        # found an alpha that reduces loss
+        if step_size != 0:
+            self.global_state['beta_scale'] = min(1.0, self.global_state['beta_scale'] * math.sqrt(1.5))
+            return step_size
+
+        # on fail reduce beta scale value
+        self.global_state['beta_scale'] /= 1.5
+        return 0
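To illustrate the bracketing logic of ``adaptive_tracking`` above: it backtracks (multiplying by ``nminus``) while the trial value is worse than ``f(0)``, otherwise it forward-tracks (multiplying by ``nplus``) until the value stops improving and returns the last improving step. A standalone sketch on a 1-D quadratic, using the module path from the file list (the helper may not be part of the public API):

from torchzero.modules.line_search.adaptive import adaptive_tracking

def f(t):
    # loss along the search direction, minimum at t = 3
    return (t - 3.0) ** 2

# start from the previous step size 1.0 and allow 10 evaluations
t, f_t = adaptive_tracking(f, 1.0, maxiter=10, nplus=2, nminus=0.5)
print(t, f_t)  # forward-tracks 1 -> 2 -> 4 -> 8, then returns (4.0, 1.0)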
torchzero/modules/line_search/backtracking.py

@@ -4,7 +4,7 @@ from operator import itemgetter
 
 import torch
 
-from .line_search import
+from .line_search import LineSearchBase
 
 
 def backtracking_line_search(
@@ -14,19 +14,17 @@ def backtracking_line_search(
     beta: float = 0.5,
     c: float = 1e-4,
     maxiter: int = 10,
-    a_min: float | None = None,
     try_negative: bool = False,
 ) -> float | None:
     """
 
     Args:
-
-
-
+        f: evaluates step size along some descent direction.
+        g_0: directional derivative along the descent direction.
+        init: initial step size.
         beta: The factor by which to decrease alpha in each iteration
         c: The constant for the Armijo sufficient decrease condition
-
-        min_alpha: Minimum allowable step size to prevent near-zero values (default: 1e-16).
+        maxiter: Maximum number of backtracking iterations (default: 10).
 
     Returns:
         step size
@@ -34,21 +32,21 @@ def backtracking_line_search(
 
     a = init
     f_x = f(0)
+    f_prev = None
 
     for iteration in range(maxiter):
         f_a = f(a)
 
-        if
+        if (f_prev is not None) and (f_a > f_prev) and (f_prev < f_x): return a / beta
+        f_prev = f_a
+
+        if f_a < f_x + c * a * min(g_0, 0): # pyright: ignore[reportArgumentType]
             # found an acceptable alpha
             return a
 
         # decrease alpha
         a *= beta
 
-        # alpha too small
-        if a_min is not None and a < a_min:
-            return a_min
-
     # fail
     if try_negative:
         def inv_objective(alpha): return f(-alpha)
@@ -59,25 +57,56 @@ def backtracking_line_search(
             beta=beta,
             c=c,
             maxiter=maxiter,
-            a_min=a_min,
             try_negative=False,
         )
         if v is not None: return -v
 
     return None
 
-class Backtracking(
+class Backtracking(LineSearchBase):
+    """Backtracking line search satisfying the Armijo condition.
+
+    Args:
+        init (float, optional): initial step size. Defaults to 1.0.
+        beta (float, optional): multiplies each consecutive step size by this value. Defaults to 0.5.
+        c (float, optional): acceptance value for Armijo condition. Defaults to 1e-4.
+        maxiter (int, optional): Maximum line search function evaluations. Defaults to 10.
+        adaptive (bool, optional):
+            when enabled, if line search failed, beta is reduced.
+            Otherwise it is reset to initial value. Defaults to True.
+        try_negative (bool, optional): Whether to perform line search in opposite direction on fail. Defaults to False.
+
+    Examples:
+        Gradient descent with backtracking line search:
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.Backtracking()
+            )
+
+        LBFGS with backtracking line search:
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.LBFGS(),
+                tz.m.Backtracking()
+            )
+
+    """
     def __init__(
         self,
         init: float = 1.0,
         beta: float = 0.5,
         c: float = 1e-4,
         maxiter: int = 10,
-        min_alpha: float | None = None,
         adaptive=True,
         try_negative: bool = False,
     ):
-        defaults=dict(init=init,beta=beta,c=c,maxiter=maxiter,
+        defaults=dict(init=init,beta=beta,c=c,maxiter=maxiter,adaptive=adaptive, try_negative=try_negative)
         super().__init__(defaults=defaults)
         self.global_state['beta_scale'] = 1.0
 
@@ -86,20 +115,20 @@ class Backtracking(LineSearch):
         self.global_state['beta_scale'] = 1.0
 
     @torch.no_grad
-    def search(self, update,
-        init, beta, c, maxiter,
-        'init', 'beta', 'c', 'maxiter', '
+    def search(self, update, var):
+        init, beta, c, maxiter, adaptive, try_negative = itemgetter(
+            'init', 'beta', 'c', 'maxiter', 'adaptive', 'try_negative')(self.settings[var.params[0]])
 
-        objective = self.make_objective(
+        objective = self.make_objective(var=var)
 
         # # directional derivative
-        d = -sum(t.sum() for t in torch._foreach_mul(
+        d = -sum(t.sum() for t in torch._foreach_mul(var.get_grad(), var.get_update()))
 
         # scale beta (beta is multiplicative and i think may be better than scaling initial step size)
         if adaptive: beta = beta * self.global_state['beta_scale']
 
         step_size = backtracking_line_search(objective, d, init=init,beta=beta,
-            c=c,maxiter=maxiter,
+            c=c,maxiter=maxiter, try_negative=try_negative)
 
         # found an alpha that reduces loss
         if step_size is not None:
@@ -113,20 +142,35 @@ class Backtracking(LineSearch):
 def _lerp(start,end,weight):
     return start + weight * (end - start)
 
-class AdaptiveBacktracking(
+class AdaptiveBacktracking(LineSearchBase):
+    """Adaptive backtracking line search. After each line search procedure, a new initial step size is set
+    such that the optimal step size from that procedure would be found on the second line search iteration.
+
+    Args:
+        init (float, optional): step size for the first step. Defaults to 1.0.
+        beta (float, optional): multiplies each consecutive step size by this value. Defaults to 0.5.
+        c (float, optional): acceptance value for Armijo condition. Defaults to 1e-4.
+        maxiter (int, optional): Maximum line search function evaluations. Defaults to 10.
+        target_iters (int, optional):
+            target number of iterations that would be performed until the optimal step size is found. Defaults to 1.
+        nplus (float, optional):
+            Multiplier to initial step size if it was found to be the optimal step size. Defaults to 2.0.
+        scale_beta (float, optional):
+            Momentum for initial step size, at 0 disables momentum. Defaults to 0.0.
+        try_negative (bool, optional): Whether to perform line search in opposite direction on fail. Defaults to False.
+    """
     def __init__(
         self,
         init: float = 1.0,
         beta: float = 0.5,
         c: float = 1e-4,
         maxiter: int = 20,
-        min_alpha: float | None = None,
         target_iters = 1,
         nplus = 2.0,
         scale_beta = 0.0,
         try_negative: bool = False,
     ):
-        defaults=dict(init=init,beta=beta,c=c,maxiter=maxiter,
+        defaults=dict(init=init,beta=beta,c=c,maxiter=maxiter,target_iters=target_iters,nplus=nplus,scale_beta=scale_beta, try_negative=try_negative)
         super().__init__(defaults=defaults)
 
         self.global_state['beta_scale'] = 1.0
@@ -138,15 +182,15 @@ class AdaptiveBacktracking(LineSearch):
         self.global_state['initial_scale'] = 1.0
 
     @torch.no_grad
-    def search(self, update,
-        init, beta, c, maxiter,
-        'init','beta','c','maxiter','
+    def search(self, update, var):
+        init, beta, c, maxiter, target_iters, nplus, scale_beta, try_negative=itemgetter(
+            'init','beta','c','maxiter','target_iters','nplus','scale_beta', 'try_negative')(self.settings[var.params[0]])
 
-        objective = self.make_objective(
+        objective = self.make_objective(var=var)
 
         # directional derivative (0 if c = 0 because it is not needed)
         if c == 0: d = 0
-        else: d = -sum(t.sum() for t in torch._foreach_mul(
+        else: d = -sum(t.sum() for t in torch._foreach_mul(var.get_grad(), update))
 
         # scale beta
         beta = beta * self.global_state['beta_scale']
@@ -155,7 +199,7 @@ class AdaptiveBacktracking(LineSearch):
         init = init * self.global_state['initial_scale']
 
         step_size = backtracking_line_search(objective, d, init=init, beta=beta,
-            c=c,maxiter=maxiter,
+            c=c,maxiter=maxiter, try_negative=try_negative)
 
         # found an alpha that reduces loss
         if step_size is not None: