torchzero 0.3.10__py3-none-any.whl → 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. tests/test_identical.py +2 -3
  2. tests/test_opts.py +140 -100
  3. tests/test_tensorlist.py +8 -7
  4. tests/test_vars.py +1 -0
  5. torchzero/__init__.py +1 -1
  6. torchzero/core/__init__.py +2 -2
  7. torchzero/core/module.py +335 -50
  8. torchzero/core/reformulation.py +65 -0
  9. torchzero/core/transform.py +197 -70
  10. torchzero/modules/__init__.py +13 -4
  11. torchzero/modules/adaptive/__init__.py +30 -0
  12. torchzero/modules/adaptive/adagrad.py +356 -0
  13. torchzero/modules/adaptive/adahessian.py +224 -0
  14. torchzero/modules/{optimizers → adaptive}/adam.py +6 -8
  15. torchzero/modules/adaptive/adan.py +96 -0
  16. torchzero/modules/adaptive/adaptive_heavyball.py +54 -0
  17. torchzero/modules/adaptive/aegd.py +54 -0
  18. torchzero/modules/adaptive/esgd.py +171 -0
  19. torchzero/modules/{optimizers → adaptive}/lion.py +1 -1
  20. torchzero/modules/{experimental/spectral.py → adaptive/lmadagrad.py} +94 -71
  21. torchzero/modules/adaptive/mars.py +79 -0
  22. torchzero/modules/adaptive/matrix_momentum.py +146 -0
  23. torchzero/modules/adaptive/msam.py +188 -0
  24. torchzero/modules/{optimizers → adaptive}/muon.py +29 -5
  25. torchzero/modules/adaptive/natural_gradient.py +175 -0
  26. torchzero/modules/{optimizers → adaptive}/orthograd.py +1 -1
  27. torchzero/modules/{optimizers → adaptive}/rmsprop.py +7 -4
  28. torchzero/modules/{optimizers → adaptive}/rprop.py +42 -10
  29. torchzero/modules/adaptive/sam.py +163 -0
  30. torchzero/modules/{optimizers → adaptive}/shampoo.py +47 -9
  31. torchzero/modules/{optimizers → adaptive}/soap.py +52 -65
  32. torchzero/modules/adaptive/sophia_h.py +185 -0
  33. torchzero/modules/clipping/clipping.py +115 -25
  34. torchzero/modules/clipping/ema_clipping.py +31 -17
  35. torchzero/modules/clipping/growth_clipping.py +8 -7
  36. torchzero/modules/conjugate_gradient/__init__.py +11 -0
  37. torchzero/modules/conjugate_gradient/cg.py +355 -0
  38. torchzero/modules/experimental/__init__.py +13 -19
  39. torchzero/modules/{projections → experimental}/dct.py +11 -11
  40. torchzero/modules/{projections → experimental}/fft.py +10 -10
  41. torchzero/modules/experimental/gradmin.py +4 -3
  42. torchzero/modules/experimental/l_infinity.py +111 -0
  43. torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +5 -42
  44. torchzero/modules/experimental/newton_solver.py +79 -17
  45. torchzero/modules/experimental/newtonnewton.py +32 -15
  46. torchzero/modules/experimental/reduce_outward_lr.py +4 -4
  47. torchzero/modules/experimental/scipy_newton_cg.py +105 -0
  48. torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +13 -55
  49. torchzero/modules/functional.py +52 -6
  50. torchzero/modules/grad_approximation/fdm.py +30 -4
  51. torchzero/modules/grad_approximation/forward_gradient.py +16 -4
  52. torchzero/modules/grad_approximation/grad_approximator.py +51 -10
  53. torchzero/modules/grad_approximation/rfdm.py +321 -52
  54. torchzero/modules/higher_order/__init__.py +1 -1
  55. torchzero/modules/higher_order/higher_order_newton.py +164 -93
  56. torchzero/modules/least_squares/__init__.py +1 -0
  57. torchzero/modules/least_squares/gn.py +161 -0
  58. torchzero/modules/line_search/__init__.py +4 -4
  59. torchzero/modules/line_search/_polyinterp.py +289 -0
  60. torchzero/modules/line_search/adaptive.py +124 -0
  61. torchzero/modules/line_search/backtracking.py +95 -57
  62. torchzero/modules/line_search/line_search.py +171 -22
  63. torchzero/modules/line_search/scipy.py +3 -3
  64. torchzero/modules/line_search/strong_wolfe.py +327 -199
  65. torchzero/modules/misc/__init__.py +35 -0
  66. torchzero/modules/misc/debug.py +48 -0
  67. torchzero/modules/misc/escape.py +62 -0
  68. torchzero/modules/misc/gradient_accumulation.py +136 -0
  69. torchzero/modules/misc/homotopy.py +59 -0
  70. torchzero/modules/misc/misc.py +383 -0
  71. torchzero/modules/misc/multistep.py +194 -0
  72. torchzero/modules/misc/regularization.py +167 -0
  73. torchzero/modules/misc/split.py +123 -0
  74. torchzero/modules/{ops → misc}/switch.py +45 -4
  75. torchzero/modules/momentum/__init__.py +1 -5
  76. torchzero/modules/momentum/averaging.py +9 -9
  77. torchzero/modules/momentum/cautious.py +51 -19
  78. torchzero/modules/momentum/momentum.py +37 -2
  79. torchzero/modules/ops/__init__.py +11 -31
  80. torchzero/modules/ops/accumulate.py +6 -10
  81. torchzero/modules/ops/binary.py +81 -34
  82. torchzero/modules/{momentum/ema.py → ops/higher_level.py} +16 -39
  83. torchzero/modules/ops/multi.py +82 -21
  84. torchzero/modules/ops/reduce.py +16 -8
  85. torchzero/modules/ops/unary.py +29 -13
  86. torchzero/modules/ops/utility.py +30 -18
  87. torchzero/modules/projections/__init__.py +2 -4
  88. torchzero/modules/projections/cast.py +51 -0
  89. torchzero/modules/projections/galore.py +3 -1
  90. torchzero/modules/projections/projection.py +190 -96
  91. torchzero/modules/quasi_newton/__init__.py +9 -14
  92. torchzero/modules/quasi_newton/damping.py +105 -0
  93. torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -0
  94. torchzero/modules/quasi_newton/lbfgs.py +286 -173
  95. torchzero/modules/quasi_newton/lsr1.py +185 -106
  96. torchzero/modules/quasi_newton/quasi_newton.py +816 -268
  97. torchzero/modules/restarts/__init__.py +7 -0
  98. torchzero/modules/restarts/restars.py +252 -0
  99. torchzero/modules/second_order/__init__.py +3 -2
  100. torchzero/modules/second_order/multipoint.py +238 -0
  101. torchzero/modules/second_order/newton.py +292 -68
  102. torchzero/modules/second_order/newton_cg.py +365 -15
  103. torchzero/modules/second_order/nystrom.py +104 -1
  104. torchzero/modules/smoothing/__init__.py +1 -1
  105. torchzero/modules/smoothing/laplacian.py +14 -4
  106. torchzero/modules/smoothing/sampling.py +300 -0
  107. torchzero/modules/step_size/__init__.py +2 -0
  108. torchzero/modules/step_size/adaptive.py +387 -0
  109. torchzero/modules/step_size/lr.py +154 -0
  110. torchzero/modules/termination/__init__.py +14 -0
  111. torchzero/modules/termination/termination.py +207 -0
  112. torchzero/modules/trust_region/__init__.py +5 -0
  113. torchzero/modules/trust_region/cubic_regularization.py +170 -0
  114. torchzero/modules/trust_region/dogleg.py +92 -0
  115. torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
  116. torchzero/modules/trust_region/trust_cg.py +97 -0
  117. torchzero/modules/trust_region/trust_region.py +350 -0
  118. torchzero/modules/variance_reduction/__init__.py +1 -0
  119. torchzero/modules/variance_reduction/svrg.py +208 -0
  120. torchzero/modules/weight_decay/__init__.py +1 -1
  121. torchzero/modules/weight_decay/weight_decay.py +94 -11
  122. torchzero/modules/wrappers/optim_wrapper.py +29 -1
  123. torchzero/modules/zeroth_order/__init__.py +1 -0
  124. torchzero/modules/zeroth_order/cd.py +359 -0
  125. torchzero/optim/root.py +65 -0
  126. torchzero/optim/utility/split.py +8 -8
  127. torchzero/optim/wrappers/directsearch.py +39 -3
  128. torchzero/optim/wrappers/fcmaes.py +24 -15
  129. torchzero/optim/wrappers/mads.py +5 -6
  130. torchzero/optim/wrappers/nevergrad.py +16 -1
  131. torchzero/optim/wrappers/nlopt.py +0 -2
  132. torchzero/optim/wrappers/optuna.py +3 -3
  133. torchzero/optim/wrappers/scipy.py +86 -25
  134. torchzero/utils/__init__.py +40 -4
  135. torchzero/utils/compile.py +1 -1
  136. torchzero/utils/derivatives.py +126 -114
  137. torchzero/utils/linalg/__init__.py +9 -2
  138. torchzero/utils/linalg/linear_operator.py +329 -0
  139. torchzero/utils/linalg/matrix_funcs.py +2 -2
  140. torchzero/utils/linalg/orthogonalize.py +2 -1
  141. torchzero/utils/linalg/qr.py +2 -2
  142. torchzero/utils/linalg/solve.py +369 -58
  143. torchzero/utils/metrics.py +83 -0
  144. torchzero/utils/numberlist.py +2 -0
  145. torchzero/utils/python_tools.py +16 -0
  146. torchzero/utils/tensorlist.py +134 -51
  147. torchzero/utils/torch_tools.py +9 -4
  148. torchzero-0.3.13.dist-info/METADATA +14 -0
  149. torchzero-0.3.13.dist-info/RECORD +166 -0
  150. {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
  151. docs/source/conf.py +0 -57
  152. torchzero/modules/experimental/absoap.py +0 -250
  153. torchzero/modules/experimental/adadam.py +0 -112
  154. torchzero/modules/experimental/adamY.py +0 -125
  155. torchzero/modules/experimental/adasoap.py +0 -172
  156. torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
  157. torchzero/modules/experimental/eigendescent.py +0 -117
  158. torchzero/modules/experimental/etf.py +0 -172
  159. torchzero/modules/experimental/soapy.py +0 -163
  160. torchzero/modules/experimental/structured_newton.py +0 -111
  161. torchzero/modules/experimental/subspace_preconditioners.py +0 -138
  162. torchzero/modules/experimental/tada.py +0 -38
  163. torchzero/modules/line_search/trust_region.py +0 -73
  164. torchzero/modules/lr/__init__.py +0 -2
  165. torchzero/modules/lr/adaptive.py +0 -93
  166. torchzero/modules/lr/lr.py +0 -63
  167. torchzero/modules/momentum/matrix_momentum.py +0 -166
  168. torchzero/modules/ops/debug.py +0 -25
  169. torchzero/modules/ops/misc.py +0 -418
  170. torchzero/modules/ops/split.py +0 -75
  171. torchzero/modules/optimizers/__init__.py +0 -18
  172. torchzero/modules/optimizers/adagrad.py +0 -155
  173. torchzero/modules/optimizers/sophia_h.py +0 -129
  174. torchzero/modules/quasi_newton/cg.py +0 -268
  175. torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
  176. torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +0 -266
  177. torchzero/modules/quasi_newton/olbfgs.py +0 -196
  178. torchzero/modules/smoothing/gaussian.py +0 -164
  179. torchzero-0.3.10.dist-info/METADATA +0 -379
  180. torchzero-0.3.10.dist-info/RECORD +0 -139
  181. torchzero-0.3.10.dist-info/licenses/LICENSE +0 -21
  182. {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
torchzero/modules/line_search/strong_wolfe.py
@@ -1,249 +1,377 @@
  import math
  import warnings
  from operator import itemgetter
+ from typing import Literal

+ import numpy as np
  import torch
  from torch.optim.lbfgs import _cubic_interpolate

- from .line_search import LineSearch
- from .backtracking import backtracking_line_search
- from ...utils import totensor
+ from ...utils import as_tensorlist, totensor
+ from ._polyinterp import polyinterp, polyinterp2
+ from .line_search import LineSearchBase, TerminationCondition, termination_condition
+ from ..step_size.adaptive import _bb_geom
+
+ def _totensor(x):
+     if not isinstance(x, torch.Tensor): return torch.tensor(x, dtype=torch.float32)
+     return x
+
+ def _within_bounds(x, bounds):
+     if bounds is None: return True
+     lb,ub = bounds
+     if lb is not None and x < lb: return False
+     if ub is not None and x > ub: return False
+     return True
+
+ def _apply_bounds(x, bounds):
+     if bounds is None: return True
+     lb,ub = bounds
+     if lb is not None and x < lb: return lb
+     if ub is not None and x > ub: return ub
+     return x
+
+ class _StrongWolfe:
+     def __init__(
+         self,
+         f,
+         f_0,
+         g_0,
+         d_norm,
+         a_init,
+         a_max,
+         c1,
+         c2,
+         maxiter,
+         maxeval,
+         maxzoom,
+         tol_change,
+         interpolation: Literal["quadratic", "cubic", "bisection", "polynomial", "polynomial2"],
+     ):
+         self._f = f
+         self.f_0 = f_0
+         self.g_0 = g_0
+         self.d_norm = d_norm
+         self.a_init = a_init
+         self.a_max = a_max
+         self.c1 = c1
+         self.c2 = c2
+         self.maxiter = maxiter
+         if maxeval is None: maxeval = float('inf')
+         self.maxeval = maxeval
+         self.tol_change = tol_change
+         self.num_evals = 0
+         self.maxzoom = maxzoom
+         self.interpolation = interpolation
+
+         self.history = {}
+
+     def f(self, a):
+         if a in self.history: return self.history[a]
+         self.num_evals += 1
+         f_a, g_a = self._f(a)
+         self.history[a] = (f_a, g_a)
+         return f_a, g_a
+
+     def interpolate(self, a_lo, f_lo, g_lo, a_hi, f_hi, g_hi, bounds=None):
+         if self.interpolation == 'cubic':
+             # pytorch cubic interpolate needs tensors
+             a_lo = _totensor(a_lo); f_lo = _totensor(f_lo); g_lo = _totensor(g_lo)
+             a_hi = _totensor(a_hi); f_hi = _totensor(f_hi); g_hi = _totensor(g_hi)
+             return float(_cubic_interpolate(x1=a_lo, f1=f_lo, g1=g_lo, x2=a_hi, f2=f_hi, g2=g_hi, bounds=bounds))
+
+         if self.interpolation == 'bisection':
+             return _apply_bounds(a_lo + 0.5 * (a_hi - a_lo), bounds)
+
+         if self.interpolation == 'quadratic':
+             a = a_hi - a_lo
+             denom = 2 * (f_hi - f_lo - g_lo*a)
+             if denom > 1e-32:
+                 num = g_lo * a**2
+                 a_min = num / -denom
+                 return _apply_bounds(a_min, bounds)
+             return _apply_bounds(a_lo + 0.5 * (a_hi - a_lo), bounds)
+
+         if self.interpolation in ('polynomial', 'polynomial2'):
+             finite_history = [(a, f, g) for a, (f,g) in self.history.items() if math.isfinite(a) and math.isfinite(f) and math.isfinite(g)]
+             if bounds is None: bounds = (None, None)
+             polyinterp_fn = polyinterp if self.interpolation == 'polynomial' else polyinterp2
+             try:
+                 return _apply_bounds(polyinterp_fn(np.array(finite_history), *bounds), bounds) # pyright:ignore[reportArgumentType]
+             except torch.linalg.LinAlgError:
+                 return _apply_bounds(a_lo + 0.5 * (a_hi - a_lo), bounds)
+         else:
+             raise ValueError(self.interpolation)
+
+     def zoom(self, a_lo, f_lo, g_lo, a_hi, f_hi, g_hi):
+         if a_lo >= a_hi:
+             a_hi, f_hi, g_hi, a_lo, f_lo, g_lo = a_lo, f_lo, g_lo, a_hi, f_hi, g_hi
+
+         insuf_progress = False
+         for _ in range(self.maxzoom):
+             if self.num_evals >= self.maxeval: break
+             if (a_hi - a_lo) * self.d_norm < self.tol_change: break # small bracket
+
+             if not (math.isfinite(f_hi) and math.isfinite(g_hi)):
+                 a_hi = a_hi / 2
+                 f_hi, g_hi = self.f(a_hi)
+                 continue
+
+             a_j = self.interpolate(a_lo, f_lo, g_lo, a_hi, f_hi, g_hi, bounds=(a_lo, min(a_hi, self.a_max)))
+
+             # this part is from https://github.com/pytorch/pytorch/blob/main/torch/optim/lbfgs.py:
+             eps = 0.1 * (a_hi - a_lo)
+             if min(a_hi - a_j, a_j - a_lo) < eps:
+                 # interpolation close to boundary
+                 if insuf_progress or a_j >= a_hi or a_j <= a_lo:
+                     # evaluate at 0.1 away from boundary
+                     if abs(a_j - a_hi) < abs(a_j - a_lo):
+                         a_j = a_hi - eps
+                     else:
+                         a_j = a_lo + eps
+                     insuf_progress = False
+                 else:
+                     insuf_progress = True
+             else:
+                 insuf_progress = False

+             f_j, g_j = self.f(a_j)

- def _zoom(f,
-           a_l, a_h,
-           f_l, g_l,
-           f_h, g_h,
-           f_0, g_0,
-           c1, c2,
-           maxzoom):
+             if f_j > self.f_0 + self.c1*a_j*self.g_0 or f_j > f_lo:
+                 a_hi, f_hi, g_hi = a_j, f_j, g_j

-     for i in range(maxzoom):
-         a_j = _cubic_interpolate(
-             *(totensor(i) for i in (a_l, f_l, g_l, a_h, f_h, g_h))
+             else:
+                 if abs(g_j) <= -self.c2 * self.g_0:
+                     return a_j, f_j, g_j

-         )
+                 if g_j * (a_hi - a_lo) >= 0:
+                     a_hi, f_hi, g_hi = a_lo, f_lo, g_lo

-         # if interpolation fails or produces endpoint, bisect
-         delta = abs(a_h - a_l)
-         if a_j is None or a_j == a_l or a_j == a_h:
-             a_j = a_l + 0.5 * delta
+                 a_lo, f_lo, g_lo = a_j, f_j, g_j

+         # fail
+         return None, None, None

-         f_j, g_j = f(a_j)
+     def search(self):
+         a_i = min(self.a_init, self.a_max)
+         f_i = g_i = None
+         a_prev = 0
+         f_prev = self.f_0
+         g_prev = self.g_0
+         for i in range(self.maxiter):
+             if self.num_evals >= self.maxeval: break
+             f_i, g_i = self.f(a_i)

-         # check armijo
-         armijo = f_j <= f_0 + c1 * a_j * g_0
+             if f_i > self.f_0 + self.c1*a_i*self.g_0 or (i > 0 and f_i > f_prev):
+                 return self.zoom(a_prev, f_prev, g_prev, a_i, f_i, g_i)

-         # check strong wolfe
-         wolfe = abs(g_j) <= c2 * abs(g_0)
+             if abs(g_i) <= -self.c2 * self.g_0:
+                 return a_i, f_i, g_i

+             if g_i >= 0:
+                 return self.zoom(a_i, f_i, g_i, a_prev, f_prev, g_prev)

-         # minimum between alpha_low and alpha_j
-         if not armijo or f_j >= f_l:
-             a_h = a_j
-             f_h = f_j
-             g_h = g_j
-         else:
-             # alpha_j satisfies armijo
-             if wolfe:
-                 return a_j, f_j
-
-             # minimum between alpha_j and alpha_high
-             if g_j * (a_h - a_l) >= 0:
-                 # between alpha_low and alpha_j
-                 # a_h = a_l
-                 # f_h = f_l
-                 # g_h = g_l
-                 a_h = a_j
-                 f_h = f_j
-                 g_h = g_j
-
-                 # is this messing it up?
-             else:
-                 a_l = a_j
-                 f_l = f_j
-                 g_l = g_j
-
-
-
-
-         # check if interval too small
-         delta = abs(a_h - a_l)
-         if delta <= 1e-9 or delta <= 1e-6 * max(abs(a_l), abs(a_h)):
-             l_satisfies_wolfe = (f_l <= f_0 + c1 * a_l * g_0) and (abs(g_l) <= c2 * abs(g_0))
-             h_satisfies_wolfe = (f_h <= f_0 + c1 * a_h * g_0) and (abs(g_h) <= c2 * abs(g_0))
-
-             if l_satisfies_wolfe and h_satisfies_wolfe: return a_l if f_l <= f_h else a_h, f_h
-             if l_satisfies_wolfe: return a_l, f_l
-             if h_satisfies_wolfe: return a_h, f_h
-             if f_l <= f_0 + c1 * a_l * g_0: return a_l, f_l
-             return None,None
-
-         if a_j is None or a_j == a_l or a_j == a_h:
-             a_j = a_l + 0.5 * delta
-
-
-     return None,None
-
-
- def strong_wolfe(
-     f,
-     f_0,
-     g_0,
-     init: float = 1.0,
-     c1: float = 1e-4,
-     c2: float = 0.9,
-     maxiter: int = 25,
-     maxzoom: int = 15,
-     # a_max: float = 1e30,
-     expand: float = 2.0, # Factor to increase alpha in bracketing
-     plus_minus: bool = False,
- ) -> tuple[float,float] | tuple[None,None]:
-     a_prev = 0.0
-
-     if g_0 == 0: return None,None
-     if g_0 > 0:
-         # if direction is not a descent direction, perform line search in opposite direction
-         if plus_minus:
-             def inverted_objective(alpha):
-                 l, g = f(-alpha)
-                 return l, -g
-             a, v = strong_wolfe(
-                 inverted_objective,
-                 init=init,
-                 f_0=f_0,
-                 g_0=-g_0,
-                 c1=c1,
-                 c2=c2,
-                 maxiter=maxiter,
-                 # a_max=a_max,
-                 expand=expand,
-                 plus_minus=False,
-             )
-             if a is not None and v is not None: return -a, v
-         return None, None
-
-     f_prev = f_0
-     g_prev = g_0
-     a_cur = init
-
-     # bracket
-     for i in range(maxiter):
-
-         f_cur, g_cur = f(a_cur)
-
-         # check armijo
-         armijo_violated = f_cur > f_0 + c1 * a_cur * g_0
-         func_increased = f_cur >= f_prev and i > 0
-
-         if armijo_violated or func_increased:
-             return _zoom(f,
-                 a_prev, a_cur,
-                 f_prev, g_prev,
-                 f_cur, g_cur,
-                 f_0, g_0,
-                 c1, c2,
-                 maxzoom=maxzoom,
-             )
-
-
-         # check strong wolfe
-         if abs(g_cur) <= c2 * abs(g_0):
-             return a_cur, f_cur
-
-         # minimum is bracketed
-         if g_cur >= 0:
-             return _zoom(f,
-                 #alpha_curr, alpha_prev,
-                 a_prev, a_cur,
-                 #phi_curr, phi_prime_curr,
-                 f_prev, g_prev,
-                 f_cur, g_cur,
-                 f_0, g_0,
-                 c1, c2,
-                 maxzoom=maxzoom,)
-
-         # otherwise continue bracketing
-         a_next = a_cur * expand
-
-         # update previous point and continue loop with increased step size
-         a_prev = a_cur
-         f_prev = f_cur
-         g_prev = g_cur
-         a_cur = a_next
-
-
-     # max iters reached
-     return None, None
-
- def _notfinite(x):
-     if isinstance(x, torch.Tensor): return not torch.isfinite(x).all()
-     return not math.isfinite(x)
-
- class StrongWolfe(LineSearch):
-     """Cubic interpolation line search satisfying Strong Wolfe condition.
+             # from pytorch
+             min_step = a_i + 0.01 * (a_i - a_prev)
+             max_step = a_i * 10
+             a_i_next = self.interpolate(a_prev, f_prev, g_prev, a_i, f_i, g_i, bounds=(min_step, min(max_step, self.a_max)))
+             # a_i_next = self.interpolate(a_prev, f_prev, g_prev, a_i, f_i, g_i, bounds=(0, self.a_max))
+
+             a_prev, f_prev, g_prev = a_i, f_i, g_i
+             a_i = a_i_next
+
+         if self.num_evals < self.maxeval:
+             assert f_i is not None and g_i is not None
+             return self.zoom(0, self.f_0, self.g_0, a_i, f_i, g_i)
+
+         return None, None, None
+
+
+ class StrongWolfe(LineSearchBase):
+     """Interpolation line search satisfying the Strong Wolfe condition.

      Args:
-         init (float, optional): Initial step size. Defaults to 1.0.
-         c1 (float, optional): Acceptance value for weak wolfe condition. Defaults to 1e-4.
-         c2 (float, optional): Acceptance value for strong wolfe condition (set to 0.1 for conjugate gradient). Defaults to 0.9.
-         maxiter (int, optional): Maximum number of line search iterations. Defaults to 25.
-         maxzoom (int, optional): Maximum number of zoom iterations. Defaults to 10.
-         expand (float, optional): Expansion factor (multiplier to step size when weak condition not satisfied). Defaults to 2.0.
+         c1 (float, optional): sufficient descent condition. Defaults to 1e-4.
+         c2 (float, optional): strong curvature condition. For CG set to 0.1. Defaults to 0.9.
+         a_init (str, optional):
+             strategy for initializing the initial step size guess.
+             - "fixed" - uses a fixed value specified in the ``init_value`` argument.
+             - "first-order" - assumes the first-order change in the function at the current iterate will be the same as that obtained at the previous step.
+             - "quadratic" - interpolates a quadratic to f(x_{-1}) and f_x.
+             - "quadratic-clip" - same as "quadratic", but uses min(1, 1.01*alpha) as described in Numerical Optimization.
+             - "previous" - uses the final step size found on the previous iteration.
+
+             For 2nd order methods it is usually best to leave at "fixed".
+             For methods that do not produce well scaled search directions, e.g. conjugate gradient,
+             "first-order" or "quadratic-clip" are recommended. Defaults to 'fixed'.
+         a_max (float, optional): upper bound for the proposed step sizes. Defaults to 1e12.
+         init_value (float, optional):
+             initial step size. Used when ``a_init="fixed"``, and as a fallback value with the other strategies. Defaults to 1.
+         maxiter (int, optional): maximum number of line search iterations. Defaults to 25.
+         maxzoom (int, optional): maximum number of zoom iterations. Defaults to 10.
+         maxeval (int | None, optional): maximum number of function evaluations. Defaults to None.
+         tol_change (float, optional): tolerance, terminates on small brackets. Defaults to 1e-9.
+         interpolation (str, optional):
+             what type of interpolation to use.
+             - "bisection" - uses the middle point. This is robust, especially if the objective function is non-smooth, however it may need more function evaluations.
+             - "quadratic" - minimizes a quadratic model, generally outperformed by "cubic".
+             - "cubic" - minimizes a cubic model - this is the most widely used interpolation strategy.
+             - "polynomial" - fits a polynomial to all points obtained during the line search.
+             - "polynomial2" - alternative polynomial fit, where if a point is outside of bounds, a lower degree polynomial is tried.
+               This may have faster convergence than "cubic" and "polynomial".
+
+             Defaults to 'cubic'.
          adaptive (bool, optional):
-             when enabled, if line search failed, initial step size is reduced.
-             Otherwise it is reset to initial value. Defaults to True.
+             if True, the initial step size is halved whenever the line search fails to find a good step.
+             When a good step is found, the initial step size is reset to the original value. Defaults to True.
+         fallback (bool, optional):
+             if True, when no point satisfies the strong Wolfe criteria,
+             returns a point with a value lower than the initial value even though it doesn't satisfy the criteria. Defaults to False.
         plus_minus (bool, optional):
-             If enabled and the direction is not a descent direction, performs line search in the opposite direction. Defaults to False.
+             if True, enables the plus-minus variant, where if curvature is negative, the line search is performed
+             in the opposite direction. Defaults to False.
+
+
+     ## Examples:
+
+     Conjugate gradient method with strong Wolfe line search. Nocedal & Wright recommend setting c2 to 0.1 for CG. Since CG doesn't produce well scaled directions, the initial alpha can be determined from function values by ``a_init="first-order"``.
+
+     ```python
+     opt = tz.Modular(
+         model.parameters(),
+         tz.m.PolakRibiere(),
+         tz.m.StrongWolfe(c2=0.1, a_init="first-order")
+     )
+     ```
+
+     LBFGS with strong Wolfe line search:
+     ```python
+     opt = tz.Modular(
+         model.parameters(),
+         tz.m.LBFGS(),
+         tz.m.StrongWolfe()
+     )
+     ```
+
      """
      def __init__(
          self,
-         init: float = 1.0,
          c1: float = 1e-4,
          c2: float = 0.9,
+         a_init: Literal['first-order', 'quadratic', 'quadratic-clip', 'previous', 'fixed'] = 'fixed',
+         a_max: float = 1e12,
+         init_value: float = 1,
          maxiter: int = 25,
          maxzoom: int = 10,
-         # a_max: float = 1e10,
-         expand: float = 2.0,
+         maxeval: int | None = None,
+         tol_change: float = 1e-9,
+         interpolation: Literal["quadratic", "cubic", "bisection", "polynomial", 'polynomial2'] = 'cubic',
          adaptive = True,
+         fallback: bool = False,
          plus_minus = False,
      ):
-         defaults=dict(init=init,c1=c1,c2=c2,maxiter=maxiter,maxzoom=maxzoom,
-                       expand=expand, adaptive=adaptive, plus_minus=plus_minus)
+         defaults=dict(init_value=init_value,init=a_init,a_max=a_max,c1=c1,c2=c2,maxiter=maxiter,maxzoom=maxzoom, fallback=fallback,
+                       maxeval=maxeval, adaptive=adaptive, interpolation=interpolation, plus_minus=plus_minus, tol_change=tol_change)
          super().__init__(defaults=defaults)

          self.global_state['initial_scale'] = 1.0
-         self.global_state['beta_scale'] = 1.0

      @torch.no_grad
      def search(self, update, var):
+         self._g_prev = self._f_prev = None
          objective = self.make_objective_with_derivative(var=var)

-         init, c1, c2, maxiter, maxzoom, expand, adaptive, plus_minus = itemgetter(
-             'init', 'c1', 'c2', 'maxiter', 'maxzoom',
-             'expand', 'adaptive', 'plus_minus')(self.settings[var.params[0]])
+         init_value, init, c1, c2, a_max, maxiter, maxzoom, maxeval, interpolation, adaptive, plus_minus, fallback, tol_change = itemgetter(
+             'init_value', 'init', 'c1', 'c2', 'a_max', 'maxiter', 'maxzoom',
+             'maxeval', 'interpolation', 'adaptive', 'plus_minus', 'fallback', 'tol_change')(self.defaults)
+
+         dir = as_tensorlist(var.get_update())
+         grad_list = var.get_grad()
+
+         g_0 = -sum(t.sum() for t in torch._foreach_mul(grad_list, dir))
+         f_0 = var.get_loss(False)
+         dir_norm = dir.global_vector_norm()
+
+         inverted = False
+         if plus_minus and g_0 > 0:
+             original_objective = objective
+             def inverted_objective(a):
+                 l, g_a = original_objective(-a)
+                 return l, -g_a
+             objective = inverted_objective
+             inverted = True
+
+         # --------------------- determine initial step size guess -------------------- #
+         init = init.lower().strip()
+
+         a_init = init_value
+         if init == 'fixed':
+             pass # use init_value
+
+         elif init == 'previous':
+             if 'a_prev' in self.global_state:
+                 a_init = self.global_state['a_prev']
+
+         elif init == 'first-order':
+             if 'g_prev' in self.global_state and g_0 < -torch.finfo(dir[0].dtype).tiny * 2:
+                 a_prev = self.global_state['a_prev']
+                 g_prev = self.global_state['g_prev']
+                 if g_prev < 0:
+                     a_init = a_prev * g_prev / g_0
+
+         elif init in ('quadratic', 'quadratic-clip'):
+             if 'f_prev' in self.global_state and g_0 < -torch.finfo(dir[0].dtype).tiny * 2:
+                 f_prev = self.global_state['f_prev']
+                 if f_0 < f_prev:
+                     a_init = 2 * (f_0 - f_prev) / g_0
+                     if init == 'quadratic-clip': a_init = min(1, 1.01*a_init)
+         else:
+             raise ValueError(init)
+
+         if adaptive:
+             a_init *= self.global_state.get('initial_scale', 1)

-         f_0, g_0 = objective(0)

-         step_size,f_a = strong_wolfe(
-             objective,
-             f_0=f_0, g_0=g_0,
-             init=init * self.global_state.setdefault("initial_scale", 1),
+         strong_wolfe = _StrongWolfe(
+             f=objective,
+             f_0=f_0,
+             g_0=g_0,
+             d_norm=dir_norm,
+             a_init=a_init,
+             a_max=a_max,
              c1=c1,
              c2=c2,
              maxiter=maxiter,
              maxzoom=maxzoom,
-             expand=expand,
-             plus_minus=plus_minus,
+             maxeval=maxeval,
+             tol_change=tol_change,
+             interpolation=interpolation,
          )

-         if f_a is not None and (f_a > f_0 or _notfinite(f_a)): step_size = None
-         if step_size is not None and step_size != 0 and not _notfinite(step_size):
-             self.global_state['initial_scale'] = min(1.0, self.global_state['initial_scale'] * math.sqrt(2))
-             return step_size
+         a, f_a, g_a = strong_wolfe.search()
+         if inverted and a is not None: a = -a
+         if f_a is not None and (f_a > f_0 or not math.isfinite(f_a)): a = None
+
+         if fallback:
+             if a is None or a==0 or not math.isfinite(a):
+                 lowest = min(strong_wolfe.history.items(), key=lambda x: x[1][0])
+                 if lowest[1][0] < f_0:
+                     a = lowest[0]
+                     f_a, g_a = lowest[1]
+                     if inverted: a = -a
+
+         if a is not None and a != 0 and math.isfinite(a):
+             #self.global_state['initial_scale'] = min(1.0, self.global_state.get('initial_scale', 1) * math.sqrt(2))
+             self.global_state['initial_scale'] = 1
+             self.global_state['a_prev'] = a
+             self.global_state['f_prev'] = f_0
+             self.global_state['g_prev'] = g_0
+             return a
+
+         # fail
+         if adaptive:
+             self.global_state['initial_scale'] = self.global_state.get('initial_scale', 1) * 0.5
+             finfo = torch.finfo(dir[0].dtype)
+             if self.global_state['initial_scale'] < finfo.tiny * 2:
+                 self.global_state['initial_scale'] = finfo.max / 2

-         # fallback to backtracking on fail
-         if adaptive: self.global_state['initial_scale'] *= 0.5
          return 0
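
Note: the `a_init` warm-start strategies added above ("previous", "first-order", "quadratic", "quadratic-clip") follow the standard step-size initialization rules from Nocedal & Wright, *Numerical Optimization* (section 3.5), which the docstring cites. A minimal standalone sketch of the arithmetic, using illustrative names that are not part of the torchzero API:

```python
def initial_step(strategy: str, a_prev: float, f_prev: float, g_prev: float,
                 f_0: float, g_0: float, fallback: float = 1.0) -> float:
    """Warm-start guess for the first trial step of a line search.

    g_prev and g_0 are directional derivatives (g.T @ d) at the previous and
    current iterates; both are negative along descent directions. f_prev and
    f_0 are the corresponding loss values.
    """
    if strategy == "previous":                    # reuse the last accepted step
        return a_prev
    if strategy == "first-order" and g_prev < 0:  # match the previous first-order decrease
        return a_prev * g_prev / g_0
    if strategy in ("quadratic", "quadratic-clip") and f_0 < f_prev:
        a = 2 * (f_0 - f_prev) / g_0              # minimizer of the quadratic fit to f_prev, f_0, g_0
        return min(1.0, 1.01 * a) if strategy == "quadratic-clip" else a
    return fallback                               # "fixed", or no usable history yet
```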
torchzero/modules/misc/__init__.py (new file)
@@ -0,0 +1,35 @@
+ from .debug import PrintLoss, PrintParams, PrintShape, PrintUpdate
+ from .escape import EscapeAnnealing
+ from .gradient_accumulation import GradientAccumulation
+ from .homotopy import (
+     ExpHomotopy,
+     LambdaHomotopy,
+     LogHomotopy,
+     SqrtHomotopy,
+     SquareHomotopy,
+ )
+ from .misc import (
+     DivByLoss,
+     FillLoss,
+     GradSign,
+     GraftGradToUpdate,
+     GraftToGrad,
+     GraftToParams,
+     HpuEstimate,
+     LastAbsoluteRatio,
+     LastDifference,
+     LastGradDifference,
+     LastProduct,
+     LastRatio,
+     MulByLoss,
+     NoiseSign,
+     Previous,
+     RandomHvp,
+     Relative,
+     UpdateSign,
+     SaveBest,
+ )
+ from .multistep import Multistep, NegateOnLossIncrease, Online, Sequential
+ from .regularization import Dropout, PerturbWeights, WeightDropout
+ from .split import Split
+ from .switch import Alternate, Switch
torchzero/modules/misc/debug.py (new file)
@@ -0,0 +1,48 @@
+ from collections import deque
+
+ import torch
+
+ from ...core import Module
+ from ...utils.tensorlist import Distributions
+
+ class PrintUpdate(Module):
+     """Prints the current update."""
+     def __init__(self, text = 'update = ', print_fn = print):
+         defaults = dict(text=text, print_fn=print_fn)
+         super().__init__(defaults)
+
+     def step(self, var):
+         self.defaults["print_fn"](f'{self.defaults["text"]}{var.update}')
+         return var
+
+ class PrintShape(Module):
+     """Prints shapes of the update."""
+     def __init__(self, text = 'shapes = ', print_fn = print):
+         defaults = dict(text=text, print_fn=print_fn)
+         super().__init__(defaults)
+
+     def step(self, var):
+         shapes = [u.shape for u in var.update] if var.update is not None else None
+         self.defaults["print_fn"](f'{self.defaults["text"]}{shapes}')
+         return var
+
+ class PrintParams(Module):
+     """Prints the current parameters."""
+     def __init__(self, text = 'params = ', print_fn = print):
+         defaults = dict(text=text, print_fn=print_fn)
+         super().__init__(defaults)
+
+     def step(self, var):
+         self.defaults["print_fn"](f'{self.defaults["text"]}{var.params}')
+         return var
+
+
+ class PrintLoss(Module):
+     """Prints var.get_loss()."""
+     def __init__(self, text = 'loss = ', print_fn = print):
+         defaults = dict(text=text, print_fn=print_fn)
+         super().__init__(defaults)
+
+     def step(self, var):
+         self.defaults["print_fn"](f'{self.defaults["text"]}{var.get_loss(False)}')
+         return var
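
These debug modules are ordinary chain members: each one prints something and passes `var` through unchanged. A hypothetical usage sketch (it assumes the new misc modules are re-exported under the `tz.m` namespace like the modules in the docstring examples above; the surrounding pipeline is illustrative, not prescribed by the diff):

```python
import torchzero as tz

# assumed re-export: PrintLoss from torchzero/modules/misc/debug.py under tz.m
opt = tz.Modular(
    model.parameters(),
    tz.m.PrintLoss(text="loss = "),  # logs the closure loss every step
    tz.m.LBFGS(),
    tz.m.StrongWolfe(),
)
```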