torchzero 0.3.10__py3-none-any.whl → 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. tests/test_identical.py +2 -3
  2. tests/test_opts.py +140 -100
  3. tests/test_tensorlist.py +8 -7
  4. tests/test_vars.py +1 -0
  5. torchzero/__init__.py +1 -1
  6. torchzero/core/__init__.py +2 -2
  7. torchzero/core/module.py +335 -50
  8. torchzero/core/reformulation.py +65 -0
  9. torchzero/core/transform.py +197 -70
  10. torchzero/modules/__init__.py +13 -4
  11. torchzero/modules/adaptive/__init__.py +30 -0
  12. torchzero/modules/adaptive/adagrad.py +356 -0
  13. torchzero/modules/adaptive/adahessian.py +224 -0
  14. torchzero/modules/{optimizers → adaptive}/adam.py +6 -8
  15. torchzero/modules/adaptive/adan.py +96 -0
  16. torchzero/modules/adaptive/adaptive_heavyball.py +54 -0
  17. torchzero/modules/adaptive/aegd.py +54 -0
  18. torchzero/modules/adaptive/esgd.py +171 -0
  19. torchzero/modules/{optimizers → adaptive}/lion.py +1 -1
  20. torchzero/modules/{experimental/spectral.py → adaptive/lmadagrad.py} +94 -71
  21. torchzero/modules/adaptive/mars.py +79 -0
  22. torchzero/modules/adaptive/matrix_momentum.py +146 -0
  23. torchzero/modules/adaptive/msam.py +188 -0
  24. torchzero/modules/{optimizers → adaptive}/muon.py +29 -5
  25. torchzero/modules/adaptive/natural_gradient.py +175 -0
  26. torchzero/modules/{optimizers → adaptive}/orthograd.py +1 -1
  27. torchzero/modules/{optimizers → adaptive}/rmsprop.py +7 -4
  28. torchzero/modules/{optimizers → adaptive}/rprop.py +42 -10
  29. torchzero/modules/adaptive/sam.py +163 -0
  30. torchzero/modules/{optimizers → adaptive}/shampoo.py +47 -9
  31. torchzero/modules/{optimizers → adaptive}/soap.py +52 -65
  32. torchzero/modules/adaptive/sophia_h.py +185 -0
  33. torchzero/modules/clipping/clipping.py +115 -25
  34. torchzero/modules/clipping/ema_clipping.py +31 -17
  35. torchzero/modules/clipping/growth_clipping.py +8 -7
  36. torchzero/modules/conjugate_gradient/__init__.py +11 -0
  37. torchzero/modules/conjugate_gradient/cg.py +355 -0
  38. torchzero/modules/experimental/__init__.py +13 -19
  39. torchzero/modules/{projections → experimental}/dct.py +11 -11
  40. torchzero/modules/{projections → experimental}/fft.py +10 -10
  41. torchzero/modules/experimental/gradmin.py +4 -3
  42. torchzero/modules/experimental/l_infinity.py +111 -0
  43. torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +5 -42
  44. torchzero/modules/experimental/newton_solver.py +79 -17
  45. torchzero/modules/experimental/newtonnewton.py +32 -15
  46. torchzero/modules/experimental/reduce_outward_lr.py +4 -4
  47. torchzero/modules/experimental/scipy_newton_cg.py +105 -0
  48. torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +13 -55
  49. torchzero/modules/functional.py +52 -6
  50. torchzero/modules/grad_approximation/fdm.py +30 -4
  51. torchzero/modules/grad_approximation/forward_gradient.py +16 -4
  52. torchzero/modules/grad_approximation/grad_approximator.py +51 -10
  53. torchzero/modules/grad_approximation/rfdm.py +321 -52
  54. torchzero/modules/higher_order/__init__.py +1 -1
  55. torchzero/modules/higher_order/higher_order_newton.py +164 -93
  56. torchzero/modules/least_squares/__init__.py +1 -0
  57. torchzero/modules/least_squares/gn.py +161 -0
  58. torchzero/modules/line_search/__init__.py +4 -4
  59. torchzero/modules/line_search/_polyinterp.py +289 -0
  60. torchzero/modules/line_search/adaptive.py +124 -0
  61. torchzero/modules/line_search/backtracking.py +95 -57
  62. torchzero/modules/line_search/line_search.py +171 -22
  63. torchzero/modules/line_search/scipy.py +3 -3
  64. torchzero/modules/line_search/strong_wolfe.py +327 -199
  65. torchzero/modules/misc/__init__.py +35 -0
  66. torchzero/modules/misc/debug.py +48 -0
  67. torchzero/modules/misc/escape.py +62 -0
  68. torchzero/modules/misc/gradient_accumulation.py +136 -0
  69. torchzero/modules/misc/homotopy.py +59 -0
  70. torchzero/modules/misc/misc.py +383 -0
  71. torchzero/modules/misc/multistep.py +194 -0
  72. torchzero/modules/misc/regularization.py +167 -0
  73. torchzero/modules/misc/split.py +123 -0
  74. torchzero/modules/{ops → misc}/switch.py +45 -4
  75. torchzero/modules/momentum/__init__.py +1 -5
  76. torchzero/modules/momentum/averaging.py +9 -9
  77. torchzero/modules/momentum/cautious.py +51 -19
  78. torchzero/modules/momentum/momentum.py +37 -2
  79. torchzero/modules/ops/__init__.py +11 -31
  80. torchzero/modules/ops/accumulate.py +6 -10
  81. torchzero/modules/ops/binary.py +81 -34
  82. torchzero/modules/{momentum/ema.py → ops/higher_level.py} +16 -39
  83. torchzero/modules/ops/multi.py +82 -21
  84. torchzero/modules/ops/reduce.py +16 -8
  85. torchzero/modules/ops/unary.py +29 -13
  86. torchzero/modules/ops/utility.py +30 -18
  87. torchzero/modules/projections/__init__.py +2 -4
  88. torchzero/modules/projections/cast.py +51 -0
  89. torchzero/modules/projections/galore.py +3 -1
  90. torchzero/modules/projections/projection.py +190 -96
  91. torchzero/modules/quasi_newton/__init__.py +9 -14
  92. torchzero/modules/quasi_newton/damping.py +105 -0
  93. torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -0
  94. torchzero/modules/quasi_newton/lbfgs.py +286 -173
  95. torchzero/modules/quasi_newton/lsr1.py +185 -106
  96. torchzero/modules/quasi_newton/quasi_newton.py +816 -268
  97. torchzero/modules/restarts/__init__.py +7 -0
  98. torchzero/modules/restarts/restars.py +252 -0
  99. torchzero/modules/second_order/__init__.py +3 -2
  100. torchzero/modules/second_order/multipoint.py +238 -0
  101. torchzero/modules/second_order/newton.py +292 -68
  102. torchzero/modules/second_order/newton_cg.py +365 -15
  103. torchzero/modules/second_order/nystrom.py +104 -1
  104. torchzero/modules/smoothing/__init__.py +1 -1
  105. torchzero/modules/smoothing/laplacian.py +14 -4
  106. torchzero/modules/smoothing/sampling.py +300 -0
  107. torchzero/modules/step_size/__init__.py +2 -0
  108. torchzero/modules/step_size/adaptive.py +387 -0
  109. torchzero/modules/step_size/lr.py +154 -0
  110. torchzero/modules/termination/__init__.py +14 -0
  111. torchzero/modules/termination/termination.py +207 -0
  112. torchzero/modules/trust_region/__init__.py +5 -0
  113. torchzero/modules/trust_region/cubic_regularization.py +170 -0
  114. torchzero/modules/trust_region/dogleg.py +92 -0
  115. torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
  116. torchzero/modules/trust_region/trust_cg.py +97 -0
  117. torchzero/modules/trust_region/trust_region.py +350 -0
  118. torchzero/modules/variance_reduction/__init__.py +1 -0
  119. torchzero/modules/variance_reduction/svrg.py +208 -0
  120. torchzero/modules/weight_decay/__init__.py +1 -1
  121. torchzero/modules/weight_decay/weight_decay.py +94 -11
  122. torchzero/modules/wrappers/optim_wrapper.py +29 -1
  123. torchzero/modules/zeroth_order/__init__.py +1 -0
  124. torchzero/modules/zeroth_order/cd.py +359 -0
  125. torchzero/optim/root.py +65 -0
  126. torchzero/optim/utility/split.py +8 -8
  127. torchzero/optim/wrappers/directsearch.py +39 -3
  128. torchzero/optim/wrappers/fcmaes.py +24 -15
  129. torchzero/optim/wrappers/mads.py +5 -6
  130. torchzero/optim/wrappers/nevergrad.py +16 -1
  131. torchzero/optim/wrappers/nlopt.py +0 -2
  132. torchzero/optim/wrappers/optuna.py +3 -3
  133. torchzero/optim/wrappers/scipy.py +86 -25
  134. torchzero/utils/__init__.py +40 -4
  135. torchzero/utils/compile.py +1 -1
  136. torchzero/utils/derivatives.py +126 -114
  137. torchzero/utils/linalg/__init__.py +9 -2
  138. torchzero/utils/linalg/linear_operator.py +329 -0
  139. torchzero/utils/linalg/matrix_funcs.py +2 -2
  140. torchzero/utils/linalg/orthogonalize.py +2 -1
  141. torchzero/utils/linalg/qr.py +2 -2
  142. torchzero/utils/linalg/solve.py +369 -58
  143. torchzero/utils/metrics.py +83 -0
  144. torchzero/utils/numberlist.py +2 -0
  145. torchzero/utils/python_tools.py +16 -0
  146. torchzero/utils/tensorlist.py +134 -51
  147. torchzero/utils/torch_tools.py +9 -4
  148. torchzero-0.3.13.dist-info/METADATA +14 -0
  149. torchzero-0.3.13.dist-info/RECORD +166 -0
  150. {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
  151. docs/source/conf.py +0 -57
  152. torchzero/modules/experimental/absoap.py +0 -250
  153. torchzero/modules/experimental/adadam.py +0 -112
  154. torchzero/modules/experimental/adamY.py +0 -125
  155. torchzero/modules/experimental/adasoap.py +0 -172
  156. torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
  157. torchzero/modules/experimental/eigendescent.py +0 -117
  158. torchzero/modules/experimental/etf.py +0 -172
  159. torchzero/modules/experimental/soapy.py +0 -163
  160. torchzero/modules/experimental/structured_newton.py +0 -111
  161. torchzero/modules/experimental/subspace_preconditioners.py +0 -138
  162. torchzero/modules/experimental/tada.py +0 -38
  163. torchzero/modules/line_search/trust_region.py +0 -73
  164. torchzero/modules/lr/__init__.py +0 -2
  165. torchzero/modules/lr/adaptive.py +0 -93
  166. torchzero/modules/lr/lr.py +0 -63
  167. torchzero/modules/momentum/matrix_momentum.py +0 -166
  168. torchzero/modules/ops/debug.py +0 -25
  169. torchzero/modules/ops/misc.py +0 -418
  170. torchzero/modules/ops/split.py +0 -75
  171. torchzero/modules/optimizers/__init__.py +0 -18
  172. torchzero/modules/optimizers/adagrad.py +0 -155
  173. torchzero/modules/optimizers/sophia_h.py +0 -129
  174. torchzero/modules/quasi_newton/cg.py +0 -268
  175. torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
  176. torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +0 -266
  177. torchzero/modules/quasi_newton/olbfgs.py +0 -196
  178. torchzero/modules/smoothing/gaussian.py +0 -164
  179. torchzero-0.3.10.dist-info/METADATA +0 -379
  180. torchzero-0.3.10.dist-info/RECORD +0 -139
  181. torchzero-0.3.10.dist-info/licenses/LICENSE +0 -21
  182. {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
torchzero/modules/line_search/_polyinterp.py (added)
@@ -0,0 +1,289 @@
+ import numpy as np
+ import torch
+
+ from .line_search import LineSearchBase
+
+
+ # polynomial interpolation
+ # this code is from https://github.com/hjmshi/PyTorch-LBFGS/blob/master/functions/LBFGS.py
+ # PyTorch-LBFGS: A PyTorch Implementation of L-BFGS
+ def polyinterp(points, x_min_bound=None, x_max_bound=None, plot=False):
+     """
+     Gives the minimizer and minimum of the interpolating polynomial over given points
+     based on function and derivative information. Defaults to bisection if no critical
+     points are valid.
+
+     Based on polyinterp.m Matlab function in minFunc by Mark Schmidt with some slight
+     modifications.
+
+     Implemented by: Hao-Jun Michael Shi and Dheevatsa Mudigere
+     Last edited 12/6/18.
+
+     Inputs:
+         points (nparray): two-dimensional array with each point of form [x f g]
+         x_min_bound (float): minimum value that brackets minimum (default: minimum of points)
+         x_max_bound (float): maximum value that brackets minimum (default: maximum of points)
+         plot (bool): plot interpolating polynomial
+
+     Outputs:
+         x_sol (float): minimizer of interpolating polynomial
+         F_min (float): minimum of interpolating polynomial
+
+     Note:
+         . Set f or g to np.nan if they are unknown
+
+     """
+     no_points = points.shape[0]
+     order = np.sum(1 - np.isnan(points[:, 1:3]).astype('int')) - 1
+
+     x_min = np.min(points[:, 0])
+     x_max = np.max(points[:, 0])
+
+     # compute bounds of interpolation area
+     if x_min_bound is None:
+         x_min_bound = x_min
+     if x_max_bound is None:
+         x_max_bound = x_max
+
+     # explicit formula for quadratic interpolation
+     if no_points == 2 and order == 2 and plot is False:
+         # Solution to quadratic interpolation is given by:
+         # a = -(f1 - f2 - g1(x1 - x2))/(x1 - x2)^2
+         # x_min = x1 - g1/(2a)
+         # if x1 = 0, then is given by:
+         # x_min = - (g1*x2^2)/(2(f2 - f1 - g1*x2))
+
+         if points[0, 0] == 0:
+             x_sol = -points[0, 2] * points[1, 0] ** 2 / (2 * (points[1, 1] - points[0, 1] - points[0, 2] * points[1, 0]))
+         else:
+             a = -(points[0, 1] - points[1, 1] - points[0, 2] * (points[0, 0] - points[1, 0])) / (points[0, 0] - points[1, 0]) ** 2
+             x_sol = points[0, 0] - points[0, 2]/(2*a)
+
+         x_sol = np.minimum(np.maximum(x_min_bound, x_sol), x_max_bound)
+
+     # explicit formula for cubic interpolation
+     elif no_points == 2 and order == 3 and plot is False:
+         # Solution to cubic interpolation is given by:
+         # d1 = g1 + g2 - 3((f1 - f2)/(x1 - x2))
+         # d2 = sqrt(d1^2 - g1*g2)
+         # x_min = x2 - (x2 - x1)*((g2 + d2 - d1)/(g2 - g1 + 2*d2))
+         d1 = points[0, 2] + points[1, 2] - 3 * ((points[0, 1] - points[1, 1]) / (points[0, 0] - points[1, 0]))
+         value = d1 ** 2 - points[0, 2] * points[1, 2]
+         if value > 0:
+             d2 = np.sqrt(value)
+             x_sol = points[1, 0] - (points[1, 0] - points[0, 0]) * ((points[1, 2] + d2 - d1) / (points[1, 2] - points[0, 2] + 2 * d2))
+             x_sol = np.minimum(np.maximum(x_min_bound, x_sol), x_max_bound)
+         else:
+             x_sol = (x_max_bound + x_min_bound)/2
+
+     # solve linear system
+     else:
+         # define linear constraints
+         A = np.zeros((0, order + 1))
+         b = np.zeros((0, 1))
+
+         # add linear constraints on function values
+         for i in range(no_points):
+             if not np.isnan(points[i, 1]):
+                 constraint = np.zeros((1, order + 1))
+                 for j in range(order, -1, -1):
+                     constraint[0, order - j] = points[i, 0] ** j
+                 A = np.append(A, constraint, 0)
+                 b = np.append(b, points[i, 1])
+
+         # add linear constraints on gradient values
+         for i in range(no_points):
+             if not np.isnan(points[i, 2]):
+                 constraint = np.zeros((1, order + 1))
+                 for j in range(order):
+                     constraint[0, j] = (order - j) * points[i, 0] ** (order - j - 1)
+                 A = np.append(A, constraint, 0)
+                 b = np.append(b, points[i, 2])
+
+         # check if system is solvable
+         if A.shape[0] != A.shape[1] or np.linalg.matrix_rank(A) != A.shape[0]:
+             x_sol = (x_min_bound + x_max_bound)/2
+             f_min = np.inf
+         else:
+             # solve linear system for interpolating polynomial
+             coeff = np.linalg.solve(A, b)
+
+             # compute critical points
+             dcoeff = np.zeros(order)
+             for i in range(len(coeff) - 1):
+                 dcoeff[i] = coeff[i] * (order - i)
+
+             crit_pts = np.array([x_min_bound, x_max_bound])
+             crit_pts = np.append(crit_pts, points[:, 0])
+
+             if not np.isinf(dcoeff).any():
+                 roots = np.roots(dcoeff)
+                 crit_pts = np.append(crit_pts, roots)
+
+             # test critical points
+             f_min = np.inf
+             x_sol = (x_min_bound + x_max_bound) / 2 # defaults to bisection
+             for crit_pt in crit_pts:
+                 if np.isreal(crit_pt):
+                     if not np.isrealobj(crit_pt): crit_pt = crit_pt.real
+                     if crit_pt >= x_min_bound and crit_pt <= x_max_bound:
+                         F_cp = np.polyval(coeff, crit_pt)
+                         if np.isreal(F_cp) and F_cp < f_min:
+                             x_sol = np.real(crit_pt)
+                             f_min = np.real(F_cp)
+
+             if(plot):
+                 import matplotlib.pyplot as plt
+                 plt.figure()
+                 x = np.arange(x_min_bound, x_max_bound, (x_max_bound - x_min_bound)/10000)
+                 f = np.polyval(coeff, x)
+                 plt.plot(x, f)
+                 plt.plot(x_sol, f_min, 'x')
+
+     return x_sol
+
+
+ # polynomial interpolation
+ # this code is based on https://github.com/hjmshi/PyTorch-LBFGS/blob/master/functions/LBFGS.py
+ # PyTorch-LBFGS: A PyTorch Implementation of L-BFGS
+ # this one is modified where instead of clipping the solution by bounds, it tries a lower degree polynomial
+ # all the way to bisection
+ def _within_bounds(x, lb, ub):
+     if lb is not None and x < lb: return False
+     if ub is not None and x > ub: return False
+     return True
+
+ def _quad_interp(points):
+     assert points.shape[0] == 2, points.shape
+     if points[0, 0] == 0:
+         denom = 2 * (points[1, 1] - points[0, 1] - points[0, 2] * points[1, 0])
+         if abs(denom) > 1e-32:
+             return -points[0, 2] * points[1, 0] ** 2 / denom
+     else:
+         denom = (points[0, 0] - points[1, 0]) ** 2
+         if denom > 1e-32:
+             a = -(points[0, 1] - points[1, 1] - points[0, 2] * (points[0, 0] - points[1, 0])) / denom
+             if a > 1e-32:
+                 return points[0, 0] - points[0, 2]/(2*a)
+     return None
+
+ def _cubic_interp(points, lb, ub):
+     assert points.shape[0] == 2, points.shape
+     denom = points[0, 0] - points[1, 0]
+     if abs(denom) > 1e-32:
+         d1 = points[0, 2] + points[1, 2] - 3 * ((points[0, 1] - points[1, 1]) / denom)
+         value = d1 ** 2 - points[0, 2] * points[1, 2]
+         if value > 0:
+             d2 = np.sqrt(value)
+             denom = points[1, 2] - points[0, 2] + 2 * d2
+             if abs(denom) > 1e-32:
+                 x_sol = points[1, 0] - (points[1, 0] - points[0, 0]) * ((points[1, 2] + d2 - d1) / denom)
+                 if _within_bounds(x_sol, lb, ub): return x_sol
+
+     # try quadratic interpolations
+     x_sol = _quad_interp(points)
+     if x_sol is not None and _within_bounds(x_sol, lb, ub): return x_sol
+
+     return None
+
+ def _poly_interp(points, lb, ub):
+     no_points = points.shape[0]
+     assert no_points > 2, points.shape
+     order = np.sum(1 - np.isnan(points[:, 1:3]).astype('int')) - 1
+
+     # define linear constraints
+     A = np.zeros((0, order + 1))
+     b = np.zeros((0, 1))
+
+     # add linear constraints on function values
+     for i in range(no_points):
+         if not np.isnan(points[i, 1]):
+             constraint = np.zeros((1, order + 1))
+             for j in range(order, -1, -1):
+                 constraint[0, order - j] = points[i, 0] ** j
+             A = np.append(A, constraint, 0)
+             b = np.append(b, points[i, 1])
+
+     # add linear constraints on gradient values
+     for i in range(no_points):
+         if not np.isnan(points[i, 2]):
+             constraint = np.zeros((1, order + 1))
+             for j in range(order):
+                 constraint[0, j] = (order - j) * points[i, 0] ** (order - j - 1)
+             A = np.append(A, constraint, 0)
+             b = np.append(b, points[i, 2])
+
+     # check if system is solvable
+     if A.shape[0] != A.shape[1] or np.linalg.matrix_rank(A) != A.shape[0]:
+         return None
+
+     # solve linear system for interpolating polynomial
+     coeff = np.linalg.solve(A, b)
+
+     # compute critical points
+     dcoeff = np.zeros(order)
+     for i in range(len(coeff) - 1):
+         dcoeff[i] = coeff[i] * (order - i)
+
+     lower = np.min(points[:, 0]) if lb is None else lb
+     upper = np.max(points[:, 0]) if ub is None else ub
+
+     crit_pts = np.array([lower, upper])
+     crit_pts = np.append(crit_pts, points[:, 0])
+
+     if not np.isinf(dcoeff).any():
+         roots = np.roots(dcoeff)
+         crit_pts = np.append(crit_pts, roots)
+
+     # test critical points
+     f_min = np.inf
+     x_sol = None
+     for crit_pt in crit_pts:
+         if np.isreal(crit_pt):
+             if not np.isrealobj(crit_pt): crit_pt = crit_pt.real
+             if _within_bounds(crit_pt, lb, ub):
+                 F_cp = np.polyval(coeff, crit_pt)
+                 if np.isreal(F_cp) and F_cp < f_min:
+                     x_sol = np.real(crit_pt)
+                     f_min = np.real(F_cp)
+
+     return x_sol
+
+ def polyinterp2(points, lb, ub, unbounded: bool = False):
+     no_points = points.shape[0]
+     if no_points <= 1:
+         return (lb + ub)/2
+
+     order = np.sum(1 - np.isnan(points[:, 1:3]).astype('int')) - 1
+
+     x_min = np.min(points[:, 0])
+     x_max = np.max(points[:, 0])
+
+     # compute bounds of interpolation area
+     if not unbounded:
+         if lb is None:
+             lb = x_min
+         if ub is None:
+             ub = x_max
+
+     if no_points == 2 and order == 2:
+         x_sol = _quad_interp(points)
+         if x_sol is not None and _within_bounds(x_sol, lb, ub): return x_sol
+         return (lb + ub)/2
+
+     if no_points == 2 and order == 3:
+         x_sol = _cubic_interp(points, lb, ub) # includes fallback on _quad_interp
+         if x_sol is not None and _within_bounds(x_sol, lb, ub): return x_sol
+         return (lb + ub)/2
+
+     if no_points <= 2: # order < 2
+         return (lb + ub)/2
+
+     if no_points == 3:
+         for p in (points[:2], points[1:], points[::2]):
+             x_sol = _cubic_interp(p, lb, ub)
+             if x_sol is not None and _within_bounds(x_sol, lb, ub): return x_sol
+
+     x_sol = _poly_interp(points, lb, ub)
+     if x_sol is not None and _within_bounds(x_sol, lb, ub): return x_sol
+     return polyinterp2(points[1:], lb, ub)
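
The quadratic branch of `polyinterp` and `_quad_interp` uses the closed form noted in the comments above: given two points of the form `[x, f, g]` with `x1 = 0`, the minimizer of the interpolating parabola is `x_min = -(g1*x2^2) / (2*(f2 - f1 - g1*x2))`. A minimal standalone check of that formula (the sample function below is illustrative, not part of the package):

```python
# f(x) = (x - 1)**2 + 3 has its true minimizer at x = 1
f = lambda x: (x - 1) ** 2 + 3
g = lambda x: 2 * (x - 1)          # derivative of f

x1, x2 = 0.0, 2.0                  # two trial step sizes, with x1 == 0
f1, g1, f2 = f(x1), g(x1), f(x2)   # f1 = 4, g1 = -2, f2 = 4

# closed-form quadratic minimizer for the x1 == 0 case (see comment in polyinterp above)
x_min = -(g1 * x2 ** 2) / (2 * (f2 - f1 - g1 * x2))
print(x_min)                       # 1.0, recovers the true minimizer exactly
```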
torchzero/modules/line_search/adaptive.py (added)
@@ -0,0 +1,124 @@
+ import math
+ from bisect import insort
+ from collections import deque
+ from collections.abc import Callable
+ from operator import itemgetter
+
+ import numpy as np
+ import torch
+
+ from .line_search import LineSearchBase, TerminationCondition, termination_condition
+
+
+ def adaptive_tracking(
+     f,
+     a_init,
+     maxiter: int,
+     nplus: float = 2,
+     nminus: float = 0.5,
+     f_0 = None,
+ ):
+     niter = 0
+     if f_0 is None: f_0 = f(0)
+
+     a = a_init
+     f_a = f(a)
+
+     # backtrack
+     a_prev = a
+     f_prev = math.inf
+     if (f_a > f_0) or (not math.isfinite(f_a)):
+         while (f_a < f_prev) or not math.isfinite(f_a):
+             a_prev, f_prev = a, f_a
+             maxiter -= 1
+             if maxiter < 0: break
+
+             a = a*nminus
+             f_a = f(a)
+             niter += 1
+
+         if f_prev < f_0: return a_prev, f_prev, niter
+         return 0, f_0, niter
+
+     # forwardtrack
+     a_prev = a
+     f_prev = math.inf
+     while (f_a <= f_prev) and math.isfinite(f_a):
+         a_prev, f_prev = a, f_a
+         maxiter -= 1
+         if maxiter < 0: break
+
+         a *= nplus
+         f_a = f(a)
+         niter+= 1
+
+     if f_prev < f_0: return a_prev, f_prev, niter
+     return 0, f_0, niter
+
+
+ class AdaptiveTracking(LineSearchBase):
+     """A line search that evaluates previous step size, if value increased, backtracks until the value stops decreasing,
+     otherwise forward-tracks until value stops decreasing.
+
+     Args:
+         init (float, optional): initial step size. Defaults to 1.0.
+         nplus (float, optional): multiplier to step size if initial step size is optimal. Defaults to 2.
+         nminus (float, optional): multiplier to step size if initial step size is too big. Defaults to 0.5.
+         maxiter (int, optional): maximum number of function evaluations per step. Defaults to 10.
+         adaptive (bool, optional):
+             when enabled, if line search failed, step size will continue decreasing on the next step.
+             Otherwise it will restart the line search from ``init`` step size. Defaults to True.
+     """
+     def __init__(
+         self,
+         init: float = 1.0,
+         nplus: float = 2,
+         nminus: float = 0.5,
+         maxiter: int = 10,
+         adaptive=True,
+     ):
+         defaults=dict(init=init,nplus=nplus,nminus=nminus,maxiter=maxiter,adaptive=adaptive)
+         super().__init__(defaults=defaults)
+
+     def reset(self):
+         super().reset()
+
+     @torch.no_grad
+     def search(self, update, var):
+         init, nplus, nminus, maxiter, adaptive = itemgetter(
+             'init', 'nplus', 'nminus', 'maxiter', 'adaptive')(self.defaults)
+
+         objective = self.make_objective(var=var)
+
+         # scale a_prev
+         a_prev = self.global_state.get('a_prev', init)
+         if adaptive: a_prev = a_prev * self.global_state.get('init_scale', 1)
+
+         a_init = a_prev
+         if a_init < torch.finfo(var.params[0].dtype).tiny * 2:
+             a_init = torch.finfo(var.params[0].dtype).max / 2
+
+         step_size, f, niter = adaptive_tracking(
+             objective,
+             a_init=a_init,
+             maxiter=maxiter,
+             nplus=nplus,
+             nminus=nminus,
+         )
+
+         # found an alpha that reduces loss
+         if step_size != 0:
+             assert (var.loss is None) or (math.isfinite(f) and f < var.loss)
+             self.global_state['init_scale'] = 1
+
+             # if niter == 1, forward tracking failed to decrease function value compared to f_a_prev
+             if niter == 1 and step_size >= a_init: step_size *= nminus
+
+             self.global_state['a_prev'] = step_size
+             return step_size
+
+         # on fail reduce beta scale value
+         self.global_state['init_scale'] = self.global_state.get('init_scale', 1) * nminus**maxiter
+         self.global_state['a_prev'] = init
+         return 0
+
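
The new `AdaptiveTracking` module starts from the previous step size and either backtracks (multiplying by `nminus`) when the objective got worse, or forward-tracks (multiplying by `nplus`) while it keeps improving. A hedged usage sketch, assuming the class is exported under `tz.m` like the other line-search modules in this release:

```python
import torch
import torchzero as tz

model = torch.nn.Linear(10, 1)

# assumes AdaptiveTracking is exposed as tz.m.AdaptiveTracking,
# mirroring the tz.m.Backtracking examples in the new docstrings
opt = tz.Modular(
    model.parameters(),
    tz.m.LBFGS(),                                                      # direction module
    tz.m.AdaptiveTracking(init=1.0, nplus=2, nminus=0.5, maxiter=10),  # step size module
)
```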
torchzero/modules/line_search/backtracking.py
@@ -4,7 +4,7 @@ from operator import itemgetter
  
  import torch
  
- from .line_search import LineSearch
+ from .line_search import LineSearchBase, TerminationCondition, termination_condition
  
  
  def backtracking_line_search(
@@ -14,29 +14,37 @@ def backtracking_line_search(
      beta: float = 0.5,
      c: float = 1e-4,
      maxiter: int = 10,
-     try_negative: bool = False,
+     condition: TerminationCondition = 'armijo',
  ) -> float | None:
      """
  
      Args:
-         objective_fn: evaluates step size along some descent direction.
-         dir_derivative: directional derivative along the descent direction.
-         alpha_init: initial step size.
+         f: evaluates step size along some descent direction.
+         g_0: directional derivative along the descent direction.
+         init: initial step size.
          beta: The factor by which to decrease alpha in each iteration
          c: The constant for the Armijo sufficient decrease condition
-         max_iter: Maximum number of backtracking iterations (default: 10).
+         maxiter: Maximum number of backtracking iterations (default: 10).
  
      Returns:
          step size
      """
  
      a = init
-     f_x = f(0)
+     f_0 = f(0)
+     f_prev = None
  
      for iteration in range(maxiter):
          f_a = f(a)
+         if not math.isfinite(f_a):
+             a *= beta
+             continue
  
-         if f_a <= f_x + c * a * min(g_0, 0): # pyright: ignore[reportArgumentType]
+         if (f_prev is not None) and (f_a > f_prev) and (f_prev < f_0):
+             return a / beta # new value is larger than previous value
+         f_prev = f_a
+
+         if termination_condition(condition, f_0=f_0, g_0=g_0, f_a=f_a, g_a=None, a=a, c=c):
              # found an acceptable alpha
              return a
  
@@ -44,108 +52,134 @@ def backtracking_line_search(
          a *= beta
  
      # fail
-     if try_negative:
-         def inv_objective(alpha): return f(-alpha)
-
-         v = backtracking_line_search(
-             inv_objective,
-             g_0=-g_0,
-             beta=beta,
-             c=c,
-             maxiter=maxiter,
-             try_negative=False,
-         )
-         if v is not None: return -v
-
      return None
  
- class Backtracking(LineSearch):
-     """Backtracking line search satisfying the Armijo condition.
+ class Backtracking(LineSearchBase):
+     """Backtracking line search.
  
      Args:
          init (float, optional): initial step size. Defaults to 1.0.
          beta (float, optional): multiplies each consecutive step size by this value. Defaults to 0.5.
-         c (float, optional): acceptance value for Armijo condition. Defaults to 1e-4.
-         maxiter (int, optional): Maximum line search function evaluations. Defaults to 10.
+         c (float, optional): sufficient decrease condition. Defaults to 1e-4.
+         condition (TerminationCondition, optional):
+             termination condition, only ones that do not use gradient at f(x+a*d) can be specified.
+             - "armijo" - sufficient decrease condition.
+             - "decrease" - any decrease in objective function value satisfies the condition.
+
+             "goldstein" can techincally be specified but it doesn't make sense because there is not zoom stage.
+             Defaults to 'armijo'.
+         maxiter (int, optional): maximum number of function evaluations per step. Defaults to 10.
          adaptive (bool, optional):
-             when enabled, if line search failed, initial step size is reduced.
-             Otherwise it is reset to initial value. Defaults to True.
-         try_negative (bool, optional): Whether to perform line search in opposite direction on fail. Defaults to False.
+             when enabled, if line search failed, step size will continue decreasing on the next step.
+             Otherwise it will restart the line search from ``init`` step size. Defaults to True.
+
+     Examples:
+         Gradient descent with backtracking line search:
+
+         ```python
+         opt = tz.Modular(
+             model.parameters(),
+             tz.m.Backtracking()
+         )
+         ```
+
+         L-BFGS with backtracking line search:
+         ```python
+         opt = tz.Modular(
+             model.parameters(),
+             tz.m.LBFGS(),
+             tz.m.Backtracking()
+         )
+         ```
+
      """
      def __init__(
          self,
          init: float = 1.0,
          beta: float = 0.5,
          c: float = 1e-4,
+         condition: TerminationCondition = 'armijo',
          maxiter: int = 10,
          adaptive=True,
-         try_negative: bool = False,
      ):
-         defaults=dict(init=init,beta=beta,c=c,maxiter=maxiter,adaptive=adaptive, try_negative=try_negative)
+         defaults=dict(init=init,beta=beta,c=c,condition=condition,maxiter=maxiter,adaptive=adaptive)
          super().__init__(defaults=defaults)
-         self.global_state['beta_scale'] = 1.0
  
      def reset(self):
          super().reset()
-         self.global_state['beta_scale'] = 1.0
  
      @torch.no_grad
      def search(self, update, var):
-         init, beta, c, maxiter, adaptive, try_negative = itemgetter(
-             'init', 'beta', 'c', 'maxiter', 'adaptive', 'try_negative')(self.settings[var.params[0]])
+         init, beta, c, condition, maxiter, adaptive = itemgetter(
+             'init', 'beta', 'c', 'condition', 'maxiter', 'adaptive')(self.defaults)
  
          objective = self.make_objective(var=var)
  
          # # directional derivative
-         d = -sum(t.sum() for t in torch._foreach_mul(var.get_grad(), var.get_update()))
+         if c == 0: d = 0
+         else: d = -sum(t.sum() for t in torch._foreach_mul(var.get_grad(), var.get_update()))
  
-         # scale beta (beta is multiplicative and i think may be better than scaling initial step size)
-         if adaptive: beta = beta * self.global_state['beta_scale']
+         # scale init
+         init_scale = self.global_state.get('init_scale', 1)
+         if adaptive: init = init * init_scale
  
-         step_size = backtracking_line_search(objective, d, init=init,beta=beta,
-             c=c,maxiter=maxiter, try_negative=try_negative)
+         step_size = backtracking_line_search(objective, d, init=init, beta=beta,c=c, condition=condition, maxiter=maxiter)
  
          # found an alpha that reduces loss
          if step_size is not None:
-             self.global_state['beta_scale'] = min(1.0, self.global_state['beta_scale'] * math.sqrt(1.5))
+             #self.global_state['beta_scale'] = min(1.0, self.global_state['beta_scale'] * math.sqrt(1.5))
+             self.global_state['init_scale'] = 1
              return step_size
  
-         # on fail reduce beta scale value
-         self.global_state['beta_scale'] /= 1.5
+         # on fail set init_scale to continue decreasing the step size
+         # or set to large step size when it becomes too small
+         if adaptive:
+             finfo = torch.finfo(var.params[0].dtype)
+             if init_scale <= finfo.tiny * 2:
+                 self.global_state["init_scale"] = finfo.max / 2
+             else:
+                 self.global_state['init_scale'] = init_scale * beta**maxiter
          return 0
  
  def _lerp(start,end,weight):
      return start + weight * (end - start)
  
- class AdaptiveBacktracking(LineSearch):
+ class AdaptiveBacktracking(LineSearchBase):
      """Adaptive backtracking line search. After each line search procedure, a new initial step size is set
      such that optimal step size in the procedure would be found on the second line search iteration.
  
      Args:
-         init (float, optional): step size for the first step. Defaults to 1.0.
+         init (float, optional): initial step size. Defaults to 1.0.
          beta (float, optional): multiplies each consecutive step size by this value. Defaults to 0.5.
-         c (float, optional): acceptance value for Armijo condition. Defaults to 1e-4.
-         maxiter (int, optional): Maximum line search function evaluations. Defaults to 10.
+         c (float, optional): sufficient decrease condition. Defaults to 1e-4.
+         condition (TerminationCondition, optional):
+             termination condition, only ones that do not use gradient at f(x+a*d) can be specified.
+             - "armijo" - sufficient decrease condition.
+             - "decrease" - any decrease in objective function value satisfies the condition.
+
+             "goldstein" can techincally be specified but it doesn't make sense because there is not zoom stage.
+             Defaults to 'armijo'.
+         maxiter (int, optional): maximum number of function evaluations per step. Defaults to 10.
          target_iters (int, optional):
-             target number of iterations that would be performed until optimal step size is found. Defaults to 1.
+             sets next step size such that this number of iterations are expected
+             to be performed until optimal step size is found. Defaults to 1.
          nplus (float, optional):
-             Multiplier to initial step size if it was found to be the optimal step size. Defaults to 2.0.
+             if initial step size is optimal, it is multiplied by this value. Defaults to 2.0.
          scale_beta (float, optional):
-             Momentum for initial step size, at 0 disables momentum. Defaults to 0.0.
-         try_negative (bool, optional): Whether to perform line search in opposite direction on fail. Defaults to False.
+             momentum for initial step size, at 0 disables momentum. Defaults to 0.0.
      """
      def __init__(
          self,
          init: float = 1.0,
          beta: float = 0.5,
          c: float = 1e-4,
+         condition: TerminationCondition = 'armijo',
          maxiter: int = 20,
          target_iters = 1,
          nplus = 2.0,
          scale_beta = 0.0,
-         try_negative: bool = False,
      ):
-         defaults=dict(init=init,beta=beta,c=c,maxiter=maxiter,target_iters=target_iters,nplus=nplus,scale_beta=scale_beta, try_negative=try_negative)
+         defaults=dict(init=init,beta=beta,c=c,condition=condition,maxiter=maxiter,target_iters=target_iters,nplus=nplus,scale_beta=scale_beta)
          super().__init__(defaults=defaults)
  
          self.global_state['beta_scale'] = 1.0
@@ -158,8 +192,8 @@ class AdaptiveBacktracking(LineSearch):
  
      @torch.no_grad
      def search(self, update, var):
-         init, beta, c, maxiter, target_iters, nplus, scale_beta, try_negative=itemgetter(
-             'init','beta','c','maxiter','target_iters','nplus','scale_beta', 'try_negative')(self.settings[var.params[0]])
+         init, beta, c,condition, maxiter, target_iters, nplus, scale_beta=itemgetter(
+             'init','beta','c','condition', 'maxiter','target_iters','nplus','scale_beta')(self.defaults)
  
          objective = self.make_objective(var=var)
  
@@ -173,8 +207,7 @@ class AdaptiveBacktracking(LineSearch):
          # scale step size so that decrease is expected at target_iters
          init = init * self.global_state['initial_scale']
  
-         step_size = backtracking_line_search(objective, d, init=init, beta=beta,
-             c=c,maxiter=maxiter, try_negative=try_negative)
+         step_size = backtracking_line_search(objective, d, init=init, beta=beta, c=c, condition=condition, maxiter=maxiter)
  
          # found an alpha that reduces loss
          if step_size is not None:
@@ -183,7 +216,12 @@ class AdaptiveBacktracking(LineSearch):
              # initial step size satisfied conditions, increase initial_scale by nplus
              if step_size == init and target_iters > 0:
                  self.global_state['initial_scale'] *= nplus ** target_iters
-                 self.global_state['initial_scale'] = min(self.global_state['initial_scale'], 1e32) # avoid overflow error
+
+                 # clip by maximum possibel value to avoid overflow exception
+                 self.global_state['initial_scale'] = min(
+                     self.global_state['initial_scale'],
+                     torch.finfo(var.params[0].dtype).max / 2,
+                 )
  
              else:
                  # otherwise make initial_scale such that target_iters iterations will satisfy armijo
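
The rewritten `backtracking_line_search` above delegates acceptance to `termination_condition`; with the default `condition='armijo'` this is the sufficient decrease test `f(a) <= f(0) + c * a * min(g_0, 0)` that the old code checked inline. A standalone sketch of one Armijo backtracking loop with illustrative values (not package code):

```python
import math

def armijo_backtrack(f, g_0, init=1.0, beta=0.5, c=1e-4, maxiter=10):
    """Plain Armijo backtracking: shrink the step by `beta` until
    f(a) <= f(0) + c * a * min(g_0, 0) holds, mirroring the old inline test."""
    f_0 = f(0.0)
    a = init
    for _ in range(maxiter):
        f_a = f(a)
        if math.isfinite(f_a) and f_a <= f_0 + c * a * min(g_0, 0):
            return a           # sufficient decrease reached
        a *= beta              # otherwise shrink the step
    return None                # line search failed

# 1-D slice along a descent direction: phi(a) = (1 - a)**2, so phi'(0) = -2
phi = lambda a: (1.0 - a) ** 2
print(armijo_backtrack(phi, g_0=-2.0, init=4.0))  # 1.0, accepted after two shrinks
```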