torchzero 0.3.11__py3-none-any.whl → 0.3.14__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in the public registry.
- tests/test_opts.py +95 -76
- tests/test_tensorlist.py +8 -7
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +2 -2
- torchzero/core/module.py +229 -72
- torchzero/core/reformulation.py +65 -0
- torchzero/core/transform.py +44 -24
- torchzero/modules/__init__.py +13 -5
- torchzero/modules/{optimizers → adaptive}/__init__.py +5 -2
- torchzero/modules/adaptive/adagrad.py +356 -0
- torchzero/modules/{optimizers → adaptive}/adahessian.py +53 -52
- torchzero/modules/{optimizers → adaptive}/adam.py +0 -3
- torchzero/modules/{optimizers → adaptive}/adan.py +26 -40
- torchzero/modules/{optimizers → adaptive}/adaptive_heavyball.py +3 -6
- torchzero/modules/adaptive/aegd.py +54 -0
- torchzero/modules/{optimizers → adaptive}/esgd.py +1 -1
- torchzero/modules/{optimizers/ladagrad.py → adaptive/lmadagrad.py} +42 -39
- torchzero/modules/{optimizers → adaptive}/mars.py +24 -36
- torchzero/modules/adaptive/matrix_momentum.py +146 -0
- torchzero/modules/{optimizers → adaptive}/msam.py +14 -12
- torchzero/modules/{optimizers → adaptive}/muon.py +19 -20
- torchzero/modules/adaptive/natural_gradient.py +175 -0
- torchzero/modules/{optimizers → adaptive}/rprop.py +0 -2
- torchzero/modules/{optimizers → adaptive}/sam.py +1 -1
- torchzero/modules/{optimizers → adaptive}/shampoo.py +8 -4
- torchzero/modules/{optimizers → adaptive}/soap.py +27 -50
- torchzero/modules/{optimizers → adaptive}/sophia_h.py +2 -3
- torchzero/modules/clipping/clipping.py +85 -92
- torchzero/modules/clipping/ema_clipping.py +5 -5
- torchzero/modules/conjugate_gradient/__init__.py +11 -0
- torchzero/modules/{quasi_newton → conjugate_gradient}/cg.py +355 -369
- torchzero/modules/experimental/__init__.py +9 -32
- torchzero/modules/experimental/dct.py +2 -2
- torchzero/modules/experimental/fft.py +2 -2
- torchzero/modules/experimental/gradmin.py +4 -3
- torchzero/modules/experimental/l_infinity.py +111 -0
- torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +3 -40
- torchzero/modules/experimental/newton_solver.py +79 -17
- torchzero/modules/experimental/newtonnewton.py +27 -14
- torchzero/modules/experimental/scipy_newton_cg.py +105 -0
- torchzero/modules/experimental/spsa1.py +93 -0
- torchzero/modules/experimental/structural_projections.py +1 -1
- torchzero/modules/functional.py +50 -14
- torchzero/modules/grad_approximation/__init__.py +1 -1
- torchzero/modules/grad_approximation/fdm.py +19 -20
- torchzero/modules/grad_approximation/forward_gradient.py +6 -7
- torchzero/modules/grad_approximation/grad_approximator.py +43 -47
- torchzero/modules/grad_approximation/rfdm.py +114 -175
- torchzero/modules/higher_order/__init__.py +1 -1
- torchzero/modules/higher_order/higher_order_newton.py +31 -23
- torchzero/modules/least_squares/__init__.py +1 -0
- torchzero/modules/least_squares/gn.py +161 -0
- torchzero/modules/line_search/__init__.py +2 -2
- torchzero/modules/line_search/_polyinterp.py +289 -0
- torchzero/modules/line_search/adaptive.py +69 -44
- torchzero/modules/line_search/backtracking.py +83 -70
- torchzero/modules/line_search/line_search.py +159 -68
- torchzero/modules/line_search/scipy.py +16 -4
- torchzero/modules/line_search/strong_wolfe.py +319 -220
- torchzero/modules/misc/__init__.py +8 -0
- torchzero/modules/misc/debug.py +4 -4
- torchzero/modules/misc/escape.py +9 -7
- torchzero/modules/misc/gradient_accumulation.py +88 -22
- torchzero/modules/misc/homotopy.py +59 -0
- torchzero/modules/misc/misc.py +82 -15
- torchzero/modules/misc/multistep.py +47 -11
- torchzero/modules/misc/regularization.py +5 -9
- torchzero/modules/misc/split.py +55 -35
- torchzero/modules/misc/switch.py +1 -1
- torchzero/modules/momentum/__init__.py +1 -5
- torchzero/modules/momentum/averaging.py +3 -3
- torchzero/modules/momentum/cautious.py +42 -47
- torchzero/modules/momentum/momentum.py +35 -1
- torchzero/modules/ops/__init__.py +9 -1
- torchzero/modules/ops/binary.py +9 -8
- torchzero/modules/{momentum/ema.py → ops/higher_level.py} +10 -33
- torchzero/modules/ops/multi.py +15 -15
- torchzero/modules/ops/reduce.py +1 -1
- torchzero/modules/ops/utility.py +12 -8
- torchzero/modules/projections/projection.py +4 -4
- torchzero/modules/quasi_newton/__init__.py +1 -16
- torchzero/modules/quasi_newton/damping.py +105 -0
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -163
- torchzero/modules/quasi_newton/lbfgs.py +256 -200
- torchzero/modules/quasi_newton/lsr1.py +167 -132
- torchzero/modules/quasi_newton/quasi_newton.py +346 -446
- torchzero/modules/restarts/__init__.py +7 -0
- torchzero/modules/restarts/restars.py +253 -0
- torchzero/modules/second_order/__init__.py +2 -1
- torchzero/modules/second_order/multipoint.py +238 -0
- torchzero/modules/second_order/newton.py +133 -88
- torchzero/modules/second_order/newton_cg.py +207 -170
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/sampling.py +300 -0
- torchzero/modules/step_size/__init__.py +1 -1
- torchzero/modules/step_size/adaptive.py +312 -47
- torchzero/modules/termination/__init__.py +14 -0
- torchzero/modules/termination/termination.py +207 -0
- torchzero/modules/trust_region/__init__.py +5 -0
- torchzero/modules/trust_region/cubic_regularization.py +170 -0
- torchzero/modules/trust_region/dogleg.py +92 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
- torchzero/modules/trust_region/trust_cg.py +99 -0
- torchzero/modules/trust_region/trust_region.py +350 -0
- torchzero/modules/variance_reduction/__init__.py +1 -0
- torchzero/modules/variance_reduction/svrg.py +208 -0
- torchzero/modules/weight_decay/weight_decay.py +65 -64
- torchzero/modules/zeroth_order/__init__.py +1 -0
- torchzero/modules/zeroth_order/cd.py +122 -0
- torchzero/optim/root.py +65 -0
- torchzero/optim/utility/split.py +8 -8
- torchzero/optim/wrappers/directsearch.py +0 -1
- torchzero/optim/wrappers/fcmaes.py +3 -2
- torchzero/optim/wrappers/nlopt.py +0 -2
- torchzero/optim/wrappers/optuna.py +2 -2
- torchzero/optim/wrappers/scipy.py +81 -22
- torchzero/utils/__init__.py +40 -4
- torchzero/utils/compile.py +1 -1
- torchzero/utils/derivatives.py +123 -111
- torchzero/utils/linalg/__init__.py +9 -2
- torchzero/utils/linalg/linear_operator.py +329 -0
- torchzero/utils/linalg/matrix_funcs.py +2 -2
- torchzero/utils/linalg/orthogonalize.py +2 -1
- torchzero/utils/linalg/qr.py +2 -2
- torchzero/utils/linalg/solve.py +226 -154
- torchzero/utils/metrics.py +83 -0
- torchzero/utils/optimizer.py +2 -2
- torchzero/utils/python_tools.py +7 -0
- torchzero/utils/tensorlist.py +105 -34
- torchzero/utils/torch_tools.py +9 -4
- torchzero-0.3.14.dist-info/METADATA +14 -0
- torchzero-0.3.14.dist-info/RECORD +167 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.14.dist-info}/top_level.txt +0 -1
- docs/source/conf.py +0 -59
- docs/source/docstring template.py +0 -46
- torchzero/modules/experimental/absoap.py +0 -253
- torchzero/modules/experimental/adadam.py +0 -118
- torchzero/modules/experimental/adamY.py +0 -131
- torchzero/modules/experimental/adam_lambertw.py +0 -149
- torchzero/modules/experimental/adaptive_step_size.py +0 -90
- torchzero/modules/experimental/adasoap.py +0 -177
- torchzero/modules/experimental/cosine.py +0 -214
- torchzero/modules/experimental/cubic_adam.py +0 -97
- torchzero/modules/experimental/eigendescent.py +0 -120
- torchzero/modules/experimental/etf.py +0 -195
- torchzero/modules/experimental/exp_adam.py +0 -113
- torchzero/modules/experimental/expanded_lbfgs.py +0 -141
- torchzero/modules/experimental/hnewton.py +0 -85
- torchzero/modules/experimental/modular_lbfgs.py +0 -265
- torchzero/modules/experimental/parabolic_search.py +0 -220
- torchzero/modules/experimental/subspace_preconditioners.py +0 -145
- torchzero/modules/experimental/tensor_adagrad.py +0 -42
- torchzero/modules/line_search/polynomial.py +0 -233
- torchzero/modules/momentum/matrix_momentum.py +0 -193
- torchzero/modules/optimizers/adagrad.py +0 -165
- torchzero/modules/quasi_newton/trust_region.py +0 -397
- torchzero/modules/smoothing/gaussian.py +0 -198
- torchzero-0.3.11.dist-info/METADATA +0 -404
- torchzero-0.3.11.dist-info/RECORD +0 -159
- torchzero-0.3.11.dist-info/licenses/LICENSE +0 -21
- /torchzero/modules/{optimizers → adaptive}/lion.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/orthograd.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/rmsprop.py +0 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.14.dist-info}/WHEEL +0 -0
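Beyond the per-file counts, the listing above documents a package-level reorganization: the `optimizers` subpackage becomes `adaptive`, conjugate-gradient and trust-region code move into their own subpackages, and a number of experimental modules are removed. A minimal sketch of what the rename means for deep imports, assuming the moved files remain importable by module path (the flat `tz.m` namespace used in the docstrings below is presumably unaffected):

```python
import importlib

# 0.3.14 layout: files listed under torchzero/modules/{optimizers -> adaptive}/
# are now importable from the adaptive subpackage, e.g. shampoo.py:
shampoo = importlib.import_module("torchzero.modules.adaptive.shampoo")
print(shampoo.__name__)  # -> torchzero.modules.adaptive.shampoo

# The 0.3.11 path "torchzero.modules.optimizers.shampoo" would no longer resolve
# after the rename; importing it in 0.3.14 raises ModuleNotFoundError.
```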
torchzero/modules/line_search/strong_wolfe.py
CHANGED

@@ -1,276 +1,375 @@
-"""this needs to be reworked maybe but it also works"""
 import math
 import warnings
 from operator import itemgetter
+from typing import Literal

+import numpy as np
 import torch
 from torch.optim.lbfgs import _cubic_interpolate

-from
-from
+from ...utils import as_tensorlist, totensor
+from ._polyinterp import polyinterp, polyinterp2
+from .line_search import LineSearchBase, TerminationCondition, termination_condition
+from ..step_size.adaptive import _bb_geom
+
+def _totensor(x):
+    if not isinstance(x, torch.Tensor): return torch.tensor(x, dtype=torch.float32)
+    return x
+
+def _within_bounds(x, bounds):
+    if bounds is None: return True
+    lb,ub = bounds
+    if lb is not None and x < lb: return False
+    if ub is not None and x > ub: return False
+    return True
+
+def _apply_bounds(x, bounds):
+    if bounds is None: return True
+    lb,ub = bounds
+    if lb is not None and x < lb: return lb
+    if ub is not None and x > ub: return ub
+    return x
+
+class _StrongWolfe:
+    def __init__(
+        self,
+        f,
+        f_0,
+        g_0,
+        d_norm,
+        a_init,
+        a_max,
+        c1,
+        c2,
+        maxiter,
+        maxeval,
+        maxzoom,
+        tol_change,
+        interpolation: Literal["quadratic", "cubic", "bisection", "polynomial", "polynomial2"],
+    ):
+        self._f = f
+        self.f_0 = f_0
+        self.g_0 = g_0
+        self.d_norm = d_norm
+        self.a_init = a_init
+        self.a_max = a_max
+        self.c1 = c1
+        self.c2 = c2
+        self.maxiter = maxiter
+        if maxeval is None: maxeval = float('inf')
+        self.maxeval = maxeval
+        self.tol_change = tol_change
+        self.num_evals = 0
+        self.maxzoom = maxzoom
+        self.interpolation = interpolation
+
+        self.history = {}
+
+    def f(self, a):
+        if a in self.history: return self.history[a]
+        self.num_evals += 1
+        f_a, g_a = self._f(a)
+        self.history[a] = (f_a, g_a)
+        return f_a, g_a
+
+    def interpolate(self, a_lo, f_lo, g_lo, a_hi, f_hi, g_hi, bounds=None):
+        if self.interpolation == 'cubic':
+            # pytorch cubic interpolate needs tensors
+            a_lo = _totensor(a_lo); f_lo = _totensor(f_lo); g_lo = _totensor(g_lo)
+            a_hi = _totensor(a_hi); f_hi = _totensor(f_hi); g_hi = _totensor(g_hi)
+            return float(_cubic_interpolate(x1=a_lo, f1=f_lo, g1=g_lo, x2=a_hi, f2=f_hi, g2=g_hi, bounds=bounds))
+
+        if self.interpolation == 'bisection':
+            return _apply_bounds(a_lo + 0.5 * (a_hi - a_lo), bounds)
+
+        if self.interpolation == 'quadratic':
+            a = a_hi - a_lo
+            denom = 2 * (f_hi - f_lo - g_lo*a)
+            if denom > 1e-32:
+                num = g_lo * a**2
+                a_min = num / -denom
+                return _apply_bounds(a_min, bounds)
+            return _apply_bounds(a_lo + 0.5 * (a_hi - a_lo), bounds)
+
+        if self.interpolation in ('polynomial', 'polynomial2'):
+            finite_history = [(a, f, g) for a, (f,g) in self.history.items() if math.isfinite(a) and math.isfinite(f) and math.isfinite(g)]
+            if bounds is None: bounds = (None, None)
+            polyinterp_fn = polyinterp if self.interpolation == 'polynomial' else polyinterp2
+            try:
+                return _apply_bounds(polyinterp_fn(np.array(finite_history), *bounds), bounds) # pyright:ignore[reportArgumentType]
+            except torch.linalg.LinAlgError:
+                return _apply_bounds(a_lo + 0.5 * (a_hi - a_lo), bounds)
+        else:
+            raise ValueError(self.interpolation)
+
+    def zoom(self, a_lo, f_lo, g_lo, a_hi, f_hi, g_hi):
+        if a_lo >= a_hi:
+            a_hi, f_hi, g_hi, a_lo, f_lo, g_lo = a_lo, f_lo, g_lo, a_hi, f_hi, g_hi
+
+        insuf_progress = False
+        for _ in range(self.maxzoom):
+            if self.num_evals >= self.maxeval: break
+            if (a_hi - a_lo) * self.d_norm < self.tol_change: break # small bracket
+
+            if not (math.isfinite(f_hi) and math.isfinite(g_hi)):
+                a_hi = a_hi / 2
+                f_hi, g_hi = self.f(a_hi)
+                continue
+
+            a_j = self.interpolate(a_lo, f_lo, g_lo, a_hi, f_hi, g_hi, bounds=(a_lo, min(a_hi, self.a_max)))
+
+            # this part is from https://github.com/pytorch/pytorch/blob/main/torch/optim/lbfgs.py:
+            eps = 0.1 * (a_hi - a_lo)
+            if min(a_hi - a_j, a_j - a_lo) < eps:
+                # interpolation close to boundary
+                if insuf_progress or a_j >= a_hi or a_j <= a_lo:
+                    # evaluate at 0.1 away from boundary
+                    if abs(a_j - a_hi) < abs(a_j - a_lo):
+                        a_j = a_hi - eps
+                    else:
+                        a_j = a_lo + eps
+                    insuf_progress = False
+                else:
+                    insuf_progress = True
+            else:
+                insuf_progress = False

+            f_j, g_j = self.f(a_j)

-
-
-            f_l, g_l,
-            f_h, g_h,
-            f_0, g_0,
-            c1, c2,
-            maxzoom):
+            if f_j > self.f_0 + self.c1*a_j*self.g_0 or f_j > f_lo:
+                a_hi, f_hi, g_hi = a_j, f_j, g_j

-
-
-
+            else:
+                if abs(g_j) <= -self.c2 * self.g_0:
+                    return a_j, f_j, g_j

-
+                if g_j * (a_hi - a_lo) >= 0:
+                    a_hi, f_hi, g_hi = a_lo, f_lo, g_lo

-
-        delta = abs(a_h - a_l)
-        if a_j is None or a_j == a_l or a_j == a_h:
-            a_j = a_l + 0.5 * delta
+                a_lo, f_lo, g_lo = a_j, f_j, g_j

+        # fail
+        return None, None, None

-
+    def search(self):
+        a_i = min(self.a_init, self.a_max)
+        f_i = g_i = None
+        a_prev = 0
+        f_prev = self.f_0
+        g_prev = self.g_0
+        for i in range(self.maxiter):
+            if self.num_evals >= self.maxeval: break
+            f_i, g_i = self.f(a_i)

-
-
+            if f_i > self.f_0 + self.c1*a_i*self.g_0 or (i > 0 and f_i > f_prev):
+                return self.zoom(a_prev, f_prev, g_prev, a_i, f_i, g_i)

-
-
+            if abs(g_i) <= -self.c2 * self.g_0:
+                return a_i, f_i, g_i

+            if g_i >= 0:
+                return self.zoom(a_i, f_i, g_i, a_prev, f_prev, g_prev)
+
+            # from pytorch
+            min_step = a_i + 0.01 * (a_i - a_prev)
+            max_step = a_i * 10
+            a_i_next = self.interpolate(a_prev, f_prev, g_prev, a_i, f_i, g_i, bounds=(min_step, min(max_step, self.a_max)))
+            # a_i_next = self.interpolate(a_prev, f_prev, g_prev, a_i, f_i, g_i, bounds=(0, self.a_max))
+
+            a_prev, f_prev, g_prev = a_i, f_i, g_i
+            a_i = a_i_next
+
+        if self.num_evals < self.maxeval:
+            assert f_i is not None and g_i is not None
+            return self.zoom(0, self.f_0, self.g_0, a_i, f_i, g_i)
+
+        return None, None, None

-        # minimum between alpha_low and alpha_j
-        if not armijo or f_j >= f_l:
-            a_h = a_j
-            f_h = f_j
-            g_h = g_j
-        else:
-            # alpha_j satisfies armijo
-            if wolfe:
-                return a_j, f_j
-
-            # minimum between alpha_j and alpha_high
-            if g_j * (a_h - a_l) >= 0:
-                # between alpha_low and alpha_j
-                # a_h = a_l
-                # f_h = f_l
-                # g_h = g_l
-                a_h = a_j
-                f_h = f_j
-                g_h = g_j
-
-                # is this messing it up?
-            else:
-                a_l = a_j
-                f_l = f_j
-                g_l = g_j
-
-
-
-
-        # check if interval too small
-        delta = abs(a_h - a_l)
-        if delta <= 1e-9 or delta <= 1e-6 * max(abs(a_l), abs(a_h)):
-            l_satisfies_wolfe = (f_l <= f_0 + c1 * a_l * g_0) and (abs(g_l) <= c2 * abs(g_0))
-            h_satisfies_wolfe = (f_h <= f_0 + c1 * a_h * g_0) and (abs(g_h) <= c2 * abs(g_0))
-
-            if l_satisfies_wolfe and h_satisfies_wolfe: return a_l if f_l <= f_h else a_h, f_h
-            if l_satisfies_wolfe: return a_l, f_l
-            if h_satisfies_wolfe: return a_h, f_h
-            if f_l <= f_0 + c1 * a_l * g_0: return a_l, f_l
-            return None,None
-
-        if a_j is None or a_j == a_l or a_j == a_h:
-            a_j = a_l + 0.5 * delta
-
-
-    return None,None
-
-
-def strong_wolfe(
-    f,
-    f_0,
-    g_0,
-    init: float = 1.0,
-    c1: float = 1e-4,
-    c2: float = 0.9,
-    maxiter: int = 25,
-    maxzoom: int = 15,
-    # a_max: float = 1e30,
-    expand: float = 2.0, # Factor to increase alpha in bracketing
-    plus_minus: bool = False,
-) -> tuple[float,float] | tuple[None,None]:
-    a_prev = 0.0
-
-    if g_0 == 0: return None,None
-    if g_0 > 0:
-        # if direction is not a descent direction, perform line search in opposite direction
-        if plus_minus:
-            def inverted_objective(alpha):
-                l, g = f(-alpha)
-                return l, -g
-            a, v = strong_wolfe(
-                inverted_objective,
-                init=init,
-                f_0=f_0,
-                g_0=-g_0,
-                c1=c1,
-                c2=c2,
-                maxiter=maxiter,
-                # a_max=a_max,
-                expand=expand,
-                plus_minus=False,
-            )
-            if a is not None and v is not None: return -a, v
-        return None, None
-
-    f_prev = f_0
-    g_prev = g_0
-    a_cur = init
-
-    # bracket
-    for i in range(maxiter):
-
-        f_cur, g_cur = f(a_cur)
-
-        # check armijo
-        armijo_violated = f_cur > f_0 + c1 * a_cur * g_0
-        func_increased = f_cur >= f_prev and i > 0
-
-        if armijo_violated or func_increased:
-            return _zoom(f,
-                a_prev, a_cur,
-                f_prev, g_prev,
-                f_cur, g_cur,
-                f_0, g_0,
-                c1, c2,
-                maxzoom=maxzoom,
-            )
-
-
-
-        # check strong wolfe
-        if abs(g_cur) <= c2 * abs(g_0):
-            return a_cur, f_cur
-
-        # minimum is bracketed
-        if g_cur >= 0:
-            return _zoom(f,
-                #alpha_curr, alpha_prev,
-                a_prev, a_cur,
-                #phi_curr, phi_prime_curr,
-                f_prev, g_prev,
-                f_cur, g_cur,
-                f_0, g_0,
-                c1, c2,
-                maxzoom=maxzoom,)
-
-        # otherwise continue bracketing
-        a_next = a_cur * expand
-
-        # update previous point and continue loop with increased step size
-        a_prev = a_cur
-        f_prev = f_cur
-        g_prev = g_cur
-        a_cur = a_next
-
-
-    # max iters reached
-    return None, None
-
-def _notfinite(x):
-    if isinstance(x, torch.Tensor): return not torch.isfinite(x).all()
-    return not math.isfinite(x)

 class StrongWolfe(LineSearchBase):
-    """
+    """Interpolation line search satisfying Strong Wolfe condition.

     Args:
-
-
-
-
-
-
-
-
+        c1 (float, optional): sufficient descent condition. Defaults to 1e-4.
+        c2 (float, optional): strong curvature condition. For CG set to 0.1. Defaults to 0.9.
+        a_init (str, optional):
+            strategy for initializing the initial step size guess.
+            - "fixed" - uses a fixed value specified in `init_value` argument.
+            - "first-order" - assumes first-order change in the function at iterate will be the same as that obtained at the previous step.
+            - "quadratic" - interpolates quadratic to f(x_{-1}) and f_x.
+            - "quadratic-clip" - same as quad, but uses min(1, 1.01*alpha) as described in Numerical Optimization.
+            - "previous" - uses final step size found on previous iteration.
+
+            For 2nd order methods it is usually best to leave at "fixed".
+            For methods that do not produce well scaled search directions, e.g. conjugate gradient,
+            "first-order" or "quadratic-clip" are recommended. Defaults to 'init'.
+        a_max (float, optional): upper bound for the proposed step sizes. Defaults to 1e12.
+        init_value (float, optional):
+            initial step size. Used when ``a_init``="fixed", and with other strategies as fallback value. Defaults to 1.
+        maxiter (int, optional): maximum number of line search iterations. Defaults to 25.
+        maxzoom (int, optional): maximum number of zoom iterations. Defaults to 10.
+        maxeval (int | None, optional): maximum number of function evaluations. Defaults to None.
+        tol_change (float, optional): tolerance, terminates on small brackets. Defaults to 1e-9.
+        interpolation (str, optional):
+            What type of interpolation to use.
+            - "bisection" - uses the middle point. This is robust, especially if the objective function is non-smooth, however it may need more function evaluations.
+            - "quadratic" - minimizes a quadratic model, generally outperformed by "cubic".
+            - "cubic" - minimizes a cubic model - this is the most widely used interpolation strategy.
+            - "polynomial" - fits a a polynomial to all points obtained during line search.
+            - "polynomial2" - alternative polynomial fit, where if a point is outside of bounds, a lower degree polynomial is tried.
+              This may have faster convergence than "cubic" and "polynomial".
+
+            Defaults to 'cubic'.
         adaptive (bool, optional):
-
-
+            if True, the initial step size will be halved when line search failed to find a good direction.
+            When a good direction is found, initial step size is reset to the original value. Defaults to True.
+        fallback (bool, optional):
+            if True, when no point satisfied strong wolfe criteria,
+            returns a point with value lower than initial value that doesn't satisfy the criteria. Defaults to False.
         plus_minus (bool, optional):
-
-
+            if True, enables the plus-minus variant, where if curvature is negative, line search is performed
+            in the opposite direction. Defaults to False.

-    Examples:
-        Conjugate gradient method with strong wolfe line search. Nocedal, Wright recommend setting c2 to 0.1 for CG.

-
+    ## Examples:

-
-            model.parameters(),
-            tz.m.PolakRibiere(),
-            tz.m.StrongWolfe(c2=0.1)
-        )
+    Conjugate gradient method with strong wolfe line search. Nocedal, Wright recommend setting c2 to 0.1 for CG. Since CG doesn't produce well scaled directions, initial alpha can be determined from function values by ``a_init="first-order"``.

-
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.PolakRibiere(),
+        tz.m.StrongWolfe(c2=0.1, a_init="first-order")
+    )
+    ```

-
-
-
-
-
-
-
+    LBFGS strong wolfe line search:
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.LBFGS(),
+        tz.m.StrongWolfe()
+    )
+    ```

     """
     def __init__(
         self,
-        init: float = 1.0,
         c1: float = 1e-4,
         c2: float = 0.9,
+        a_init: Literal['first-order', 'quadratic', 'quadratic-clip', 'previous', 'fixed'] = 'fixed',
+        a_max: float = 1e12,
+        init_value: float = 1,
         maxiter: int = 25,
         maxzoom: int = 10,
-
-
-
+        maxeval: int | None = None,
+        tol_change: float = 1e-9,
+        interpolation: Literal["quadratic", "cubic", "bisection", "polynomial", 'polynomial2'] = 'cubic',
         adaptive = True,
+        fallback:bool = False,
         plus_minus = False,
     ):
-        defaults=dict(init=
-
+        defaults=dict(init_value=init_value,init=a_init,a_max=a_max,c1=c1,c2=c2,maxiter=maxiter,maxzoom=maxzoom, fallback=fallback,
+                      maxeval=maxeval, adaptive=adaptive, interpolation=interpolation, plus_minus=plus_minus, tol_change=tol_change)
         super().__init__(defaults=defaults)

         self.global_state['initial_scale'] = 1.0
-        self.global_state['beta_scale'] = 1.0

     @torch.no_grad
     def search(self, update, var):
+        self._g_prev = self._f_prev = None
         objective = self.make_objective_with_derivative(var=var)

-        init, c1, c2, maxiter, maxzoom,
-        'init', 'c1', 'c2', 'maxiter', 'maxzoom',
-        '
-
-
-
-
-
-
-
-
+        init_value, init, c1, c2, a_max, maxiter, maxzoom, maxeval, interpolation, adaptive, plus_minus, fallback, tol_change = itemgetter(
+            'init_value', 'init', 'c1', 'c2', 'a_max', 'maxiter', 'maxzoom',
+            'maxeval', 'interpolation', 'adaptive', 'plus_minus', 'fallback', 'tol_change')(self.defaults)
+
+        dir = as_tensorlist(var.get_update())
+        grad_list = var.get_grad()
+
+        g_0 = -sum(t.sum() for t in torch._foreach_mul(grad_list, dir))
+        f_0 = var.get_loss(False)
+        dir_norm = dir.global_vector_norm()
+
+        inverted = False
+        if plus_minus and g_0 > 0:
+            original_objective = objective
+            def inverted_objective(a):
+                l, g_a = original_objective(-a)
+                return l, -g_a
+            objective = inverted_objective
+            inverted = True
+
+        # --------------------- determine initial step size guess -------------------- #
+        init = init.lower().strip()
+
+        a_init = init_value
+        if init == 'fixed':
+            pass # use init_value
+
+        elif init == 'previous':
+            if 'a_prev' in self.global_state:
+                a_init = self.global_state['a_prev']
+
+        elif init == 'first-order':
+            if 'g_prev' in self.global_state and g_0 < -torch.finfo(dir[0].dtype).tiny * 2:
+                a_prev = self.global_state['a_prev']
+                g_prev = self.global_state['g_prev']
+                if g_prev < 0:
+                    a_init = a_prev * g_prev / g_0
+
+        elif init in ('quadratic', 'quadratic-clip'):
+            if 'f_prev' in self.global_state and g_0 < -torch.finfo(dir[0].dtype).tiny * 2:
+                f_prev = self.global_state['f_prev']
+                if f_0 < f_prev:
+                    a_init = 2 * (f_0 - f_prev) / g_0
+                    if init == 'quadratic-clip': a_init = min(1, 1.01*a_init)
+        else:
+            raise ValueError(init)
+
+        if adaptive:
+            a_init *= self.global_state.get('initial_scale', 1)
+
+        strong_wolfe = _StrongWolfe(
+            f=objective,
+            f_0=f_0,
+            g_0=g_0,
+            d_norm=dir_norm,
+            a_init=a_init,
+            a_max=a_max,
             c1=c1,
             c2=c2,
             maxiter=maxiter,
             maxzoom=maxzoom,
-
-
+            maxeval=maxeval,
+            tol_change=tol_change,
+            interpolation=interpolation,
         )

-
-        if
-
-
-
+        a, f_a, g_a = strong_wolfe.search()
+        if inverted and a is not None: a = -a
+        if f_a is not None and (f_a > f_0 or not math.isfinite(f_a)): a = None
+
+        if fallback:
+            if a is None or a==0 or not math.isfinite(a):
+                lowest = min(strong_wolfe.history.items(), key=lambda x: x[1][0])
+                if lowest[1][0] < f_0:
+                    a = lowest[0]
+                    f_a, g_a = lowest[1]
+                    if inverted: a = -a
+
+        if a is not None and a != 0 and math.isfinite(a):
+            self.global_state['initial_scale'] = 1
+            self.global_state['a_prev'] = a
+            self.global_state['f_prev'] = f_0
+            self.global_state['g_prev'] = g_0
+            return a
+
+        # fail
+        if adaptive:
+            self.global_state['initial_scale'] = self.global_state.get('initial_scale', 1) * 0.5
+            finfo = torch.finfo(dir[0].dtype)
+            if self.global_state['initial_scale'] < finfo.tiny * 2:
+                self.global_state['initial_scale'] = finfo.max / 2

-        if adaptive: self.global_state['initial_scale'] *= 0.5
         return 0
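The rewrite above replaces the old recursive `strong_wolfe`/`_zoom` helpers with a stateful `_StrongWolfe` class and widens the `StrongWolfe` module's signature (`a_init`, `a_max`, `init_value`, `maxeval`, `tol_change`, `interpolation`, and `fallback` are new). A hedged construction sketch based on that signature and the docstring examples shown in the diff; the `model` object is a stand-in and the surrounding training loop is omitted:

```python
import torch
import torchzero as tz

model = torch.nn.Linear(10, 1)  # placeholder model for illustration

# Conjugate gradient with the new initial-step-size and fallback options.
opt = tz.Modular(
    model.parameters(),
    tz.m.PolakRibiere(),
    tz.m.StrongWolfe(
        c2=0.1,                 # looser curvature condition, recommended for CG
        a_init="first-order",   # scale the first trial step from the previous accepted step
        interpolation="cubic",  # default; "polynomial"/"polynomial2" fit all evaluated points
        fallback=True,          # if no point satisfies strong Wolfe, take the best evaluated one
    ),
)
```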
torchzero/modules/misc/__init__.py
CHANGED

@@ -1,6 +1,13 @@
 from .debug import PrintLoss, PrintParams, PrintShape, PrintUpdate
 from .escape import EscapeAnnealing
 from .gradient_accumulation import GradientAccumulation
+from .homotopy import (
+    ExpHomotopy,
+    LambdaHomotopy,
+    LogHomotopy,
+    SqrtHomotopy,
+    SquareHomotopy,
+)
 from .misc import (
     DivByLoss,
     FillLoss,
@@ -20,6 +27,7 @@ from .misc import (
     RandomHvp,
     Relative,
     UpdateSign,
+    SaveBest,
 )
 from .multistep import Multistep, NegateOnLossIncrease, Online, Sequential
 from .regularization import Dropout, PerturbWeights, WeightDropout
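These `__init__.py` hunks only widen the re-export surface of the `misc` subpackage; the homotopy classes and `SaveBest` become importable directly from it (their constructors are not shown in this diff, so no usage is assumed):

```python
# Names added to torchzero.modules.misc by the hunks above.
from torchzero.modules.misc import (
    ExpHomotopy,
    LambdaHomotopy,
    LogHomotopy,
    SqrtHomotopy,
    SquareHomotopy,
    SaveBest,
)
```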
torchzero/modules/misc/debug.py
CHANGED

@@ -12,7 +12,7 @@ class PrintUpdate(Module):
         super().__init__(defaults)

     def step(self, var):
-        self.
+        self.defaults["print_fn"](f'{self.defaults["text"]}{var.update}')
         return var

 class PrintShape(Module):
@@ -23,7 +23,7 @@ class PrintShape(Module):

     def step(self, var):
         shapes = [u.shape for u in var.update] if var.update is not None else None
-        self.
+        self.defaults["print_fn"](f'{self.defaults["text"]}{shapes}')
         return var

 class PrintParams(Module):
@@ -33,7 +33,7 @@ class PrintParams(Module):
         super().__init__(defaults)

     def step(self, var):
-        self.
+        self.defaults["print_fn"](f'{self.defaults["text"]}{var.params}')
         return var


@@ -44,5 +44,5 @@ class PrintLoss(Module):
         super().__init__(defaults)

     def step(self, var):
-        self.
+        self.defaults["print_fn"](f'{self.defaults["text"]}{var.get_loss(False)}')
         return var
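All four `debug.py` hunks make the same change: the `Print*` modules now read a `print_fn` callable and a `text` prefix from `self.defaults` inside `step(self, var)`. A minimal sketch of a custom module written in the same style; the `PrintUpdateNorm` class, its keyword arguments, and the `Module` import path are illustrative assumptions rather than part of the package:

```python
from torchzero.core.module import Module  # path taken from the file listing above; assumed importable


class PrintUpdateNorm(Module):
    """Hypothetical debug module following the pattern of the Print* modules in the diff."""

    def __init__(self, text="update norm: ", print_fn=print):
        # settings live in the defaults dict, as in the diff above
        super().__init__(dict(text=text, print_fn=print_fn))

    def step(self, var):
        # var.update is a list of tensors (see PrintShape above); compute its global L2 norm
        norm = None
        if var.update is not None:
            norm = sum(u.pow(2).sum() for u in var.update) ** 0.5
        self.defaults["print_fn"](f'{self.defaults["text"]}{norm}')
        return var
```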