torchzero 0.3.11__py3-none-any.whl → 0.3.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. tests/test_opts.py +95 -76
  2. tests/test_tensorlist.py +8 -7
  3. torchzero/__init__.py +1 -1
  4. torchzero/core/__init__.py +2 -2
  5. torchzero/core/module.py +229 -72
  6. torchzero/core/reformulation.py +65 -0
  7. torchzero/core/transform.py +44 -24
  8. torchzero/modules/__init__.py +13 -5
  9. torchzero/modules/{optimizers → adaptive}/__init__.py +5 -2
  10. torchzero/modules/adaptive/adagrad.py +356 -0
  11. torchzero/modules/{optimizers → adaptive}/adahessian.py +53 -52
  12. torchzero/modules/{optimizers → adaptive}/adam.py +0 -3
  13. torchzero/modules/{optimizers → adaptive}/adan.py +26 -40
  14. torchzero/modules/{optimizers → adaptive}/adaptive_heavyball.py +3 -6
  15. torchzero/modules/adaptive/aegd.py +54 -0
  16. torchzero/modules/{optimizers → adaptive}/esgd.py +1 -1
  17. torchzero/modules/{optimizers/ladagrad.py → adaptive/lmadagrad.py} +42 -39
  18. torchzero/modules/{optimizers → adaptive}/mars.py +24 -36
  19. torchzero/modules/adaptive/matrix_momentum.py +146 -0
  20. torchzero/modules/{optimizers → adaptive}/msam.py +14 -12
  21. torchzero/modules/{optimizers → adaptive}/muon.py +19 -20
  22. torchzero/modules/adaptive/natural_gradient.py +175 -0
  23. torchzero/modules/{optimizers → adaptive}/rprop.py +0 -2
  24. torchzero/modules/{optimizers → adaptive}/sam.py +1 -1
  25. torchzero/modules/{optimizers → adaptive}/shampoo.py +8 -4
  26. torchzero/modules/{optimizers → adaptive}/soap.py +27 -50
  27. torchzero/modules/{optimizers → adaptive}/sophia_h.py +2 -3
  28. torchzero/modules/clipping/clipping.py +85 -92
  29. torchzero/modules/clipping/ema_clipping.py +5 -5
  30. torchzero/modules/conjugate_gradient/__init__.py +11 -0
  31. torchzero/modules/{quasi_newton → conjugate_gradient}/cg.py +355 -369
  32. torchzero/modules/experimental/__init__.py +9 -32
  33. torchzero/modules/experimental/dct.py +2 -2
  34. torchzero/modules/experimental/fft.py +2 -2
  35. torchzero/modules/experimental/gradmin.py +4 -3
  36. torchzero/modules/experimental/l_infinity.py +111 -0
  37. torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +3 -40
  38. torchzero/modules/experimental/newton_solver.py +79 -17
  39. torchzero/modules/experimental/newtonnewton.py +27 -14
  40. torchzero/modules/experimental/scipy_newton_cg.py +105 -0
  41. torchzero/modules/experimental/spsa1.py +93 -0
  42. torchzero/modules/experimental/structural_projections.py +1 -1
  43. torchzero/modules/functional.py +50 -14
  44. torchzero/modules/grad_approximation/__init__.py +1 -1
  45. torchzero/modules/grad_approximation/fdm.py +19 -20
  46. torchzero/modules/grad_approximation/forward_gradient.py +6 -7
  47. torchzero/modules/grad_approximation/grad_approximator.py +43 -47
  48. torchzero/modules/grad_approximation/rfdm.py +114 -175
  49. torchzero/modules/higher_order/__init__.py +1 -1
  50. torchzero/modules/higher_order/higher_order_newton.py +31 -23
  51. torchzero/modules/least_squares/__init__.py +1 -0
  52. torchzero/modules/least_squares/gn.py +161 -0
  53. torchzero/modules/line_search/__init__.py +2 -2
  54. torchzero/modules/line_search/_polyinterp.py +289 -0
  55. torchzero/modules/line_search/adaptive.py +69 -44
  56. torchzero/modules/line_search/backtracking.py +83 -70
  57. torchzero/modules/line_search/line_search.py +159 -68
  58. torchzero/modules/line_search/scipy.py +16 -4
  59. torchzero/modules/line_search/strong_wolfe.py +319 -220
  60. torchzero/modules/misc/__init__.py +8 -0
  61. torchzero/modules/misc/debug.py +4 -4
  62. torchzero/modules/misc/escape.py +9 -7
  63. torchzero/modules/misc/gradient_accumulation.py +88 -22
  64. torchzero/modules/misc/homotopy.py +59 -0
  65. torchzero/modules/misc/misc.py +82 -15
  66. torchzero/modules/misc/multistep.py +47 -11
  67. torchzero/modules/misc/regularization.py +5 -9
  68. torchzero/modules/misc/split.py +55 -35
  69. torchzero/modules/misc/switch.py +1 -1
  70. torchzero/modules/momentum/__init__.py +1 -5
  71. torchzero/modules/momentum/averaging.py +3 -3
  72. torchzero/modules/momentum/cautious.py +42 -47
  73. torchzero/modules/momentum/momentum.py +35 -1
  74. torchzero/modules/ops/__init__.py +9 -1
  75. torchzero/modules/ops/binary.py +9 -8
  76. torchzero/modules/{momentum/ema.py → ops/higher_level.py} +10 -33
  77. torchzero/modules/ops/multi.py +15 -15
  78. torchzero/modules/ops/reduce.py +1 -1
  79. torchzero/modules/ops/utility.py +12 -8
  80. torchzero/modules/projections/projection.py +4 -4
  81. torchzero/modules/quasi_newton/__init__.py +1 -16
  82. torchzero/modules/quasi_newton/damping.py +105 -0
  83. torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -163
  84. torchzero/modules/quasi_newton/lbfgs.py +256 -200
  85. torchzero/modules/quasi_newton/lsr1.py +167 -132
  86. torchzero/modules/quasi_newton/quasi_newton.py +346 -446
  87. torchzero/modules/restarts/__init__.py +7 -0
  88. torchzero/modules/restarts/restars.py +253 -0
  89. torchzero/modules/second_order/__init__.py +2 -1
  90. torchzero/modules/second_order/multipoint.py +238 -0
  91. torchzero/modules/second_order/newton.py +133 -88
  92. torchzero/modules/second_order/newton_cg.py +207 -170
  93. torchzero/modules/smoothing/__init__.py +1 -1
  94. torchzero/modules/smoothing/sampling.py +300 -0
  95. torchzero/modules/step_size/__init__.py +1 -1
  96. torchzero/modules/step_size/adaptive.py +312 -47
  97. torchzero/modules/termination/__init__.py +14 -0
  98. torchzero/modules/termination/termination.py +207 -0
  99. torchzero/modules/trust_region/__init__.py +5 -0
  100. torchzero/modules/trust_region/cubic_regularization.py +170 -0
  101. torchzero/modules/trust_region/dogleg.py +92 -0
  102. torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
  103. torchzero/modules/trust_region/trust_cg.py +99 -0
  104. torchzero/modules/trust_region/trust_region.py +350 -0
  105. torchzero/modules/variance_reduction/__init__.py +1 -0
  106. torchzero/modules/variance_reduction/svrg.py +208 -0
  107. torchzero/modules/weight_decay/weight_decay.py +65 -64
  108. torchzero/modules/zeroth_order/__init__.py +1 -0
  109. torchzero/modules/zeroth_order/cd.py +122 -0
  110. torchzero/optim/root.py +65 -0
  111. torchzero/optim/utility/split.py +8 -8
  112. torchzero/optim/wrappers/directsearch.py +0 -1
  113. torchzero/optim/wrappers/fcmaes.py +3 -2
  114. torchzero/optim/wrappers/nlopt.py +0 -2
  115. torchzero/optim/wrappers/optuna.py +2 -2
  116. torchzero/optim/wrappers/scipy.py +81 -22
  117. torchzero/utils/__init__.py +40 -4
  118. torchzero/utils/compile.py +1 -1
  119. torchzero/utils/derivatives.py +123 -111
  120. torchzero/utils/linalg/__init__.py +9 -2
  121. torchzero/utils/linalg/linear_operator.py +329 -0
  122. torchzero/utils/linalg/matrix_funcs.py +2 -2
  123. torchzero/utils/linalg/orthogonalize.py +2 -1
  124. torchzero/utils/linalg/qr.py +2 -2
  125. torchzero/utils/linalg/solve.py +226 -154
  126. torchzero/utils/metrics.py +83 -0
  127. torchzero/utils/optimizer.py +2 -2
  128. torchzero/utils/python_tools.py +7 -0
  129. torchzero/utils/tensorlist.py +105 -34
  130. torchzero/utils/torch_tools.py +9 -4
  131. torchzero-0.3.14.dist-info/METADATA +14 -0
  132. torchzero-0.3.14.dist-info/RECORD +167 -0
  133. {torchzero-0.3.11.dist-info → torchzero-0.3.14.dist-info}/top_level.txt +0 -1
  134. docs/source/conf.py +0 -59
  135. docs/source/docstring template.py +0 -46
  136. torchzero/modules/experimental/absoap.py +0 -253
  137. torchzero/modules/experimental/adadam.py +0 -118
  138. torchzero/modules/experimental/adamY.py +0 -131
  139. torchzero/modules/experimental/adam_lambertw.py +0 -149
  140. torchzero/modules/experimental/adaptive_step_size.py +0 -90
  141. torchzero/modules/experimental/adasoap.py +0 -177
  142. torchzero/modules/experimental/cosine.py +0 -214
  143. torchzero/modules/experimental/cubic_adam.py +0 -97
  144. torchzero/modules/experimental/eigendescent.py +0 -120
  145. torchzero/modules/experimental/etf.py +0 -195
  146. torchzero/modules/experimental/exp_adam.py +0 -113
  147. torchzero/modules/experimental/expanded_lbfgs.py +0 -141
  148. torchzero/modules/experimental/hnewton.py +0 -85
  149. torchzero/modules/experimental/modular_lbfgs.py +0 -265
  150. torchzero/modules/experimental/parabolic_search.py +0 -220
  151. torchzero/modules/experimental/subspace_preconditioners.py +0 -145
  152. torchzero/modules/experimental/tensor_adagrad.py +0 -42
  153. torchzero/modules/line_search/polynomial.py +0 -233
  154. torchzero/modules/momentum/matrix_momentum.py +0 -193
  155. torchzero/modules/optimizers/adagrad.py +0 -165
  156. torchzero/modules/quasi_newton/trust_region.py +0 -397
  157. torchzero/modules/smoothing/gaussian.py +0 -198
  158. torchzero-0.3.11.dist-info/METADATA +0 -404
  159. torchzero-0.3.11.dist-info/RECORD +0 -159
  160. torchzero-0.3.11.dist-info/licenses/LICENSE +0 -21
  161. /torchzero/modules/{optimizers → adaptive}/lion.py +0 -0
  162. /torchzero/modules/{optimizers → adaptive}/orthograd.py +0 -0
  163. /torchzero/modules/{optimizers → adaptive}/rmsprop.py +0 -0
  164. {torchzero-0.3.11.dist-info → torchzero-0.3.14.dist-info}/WHEEL +0 -0
torchzero/modules/least_squares/gn.py
@@ -0,0 +1,161 @@
+ import torch
+ from ...core import Module
+
+ from ...utils.derivatives import jacobian_wrt, flatten_jacobian
+ from ...utils import vec_to_tensors
+ from ...utils.linalg import linear_operator
+ class SumOfSquares(Module):
+     """Sets loss to be the sum of squares of values returned by the closure.
+
+     This is meant to be used to test least squares methods against ordinary minimization methods.
+
+     To use this, the closure should return a vector of values to minimize sum of squares of.
+     Please add the `backward` argument, it will always be False but it is required.
+     """
+     def __init__(self):
+         super().__init__()
+
+     @torch.no_grad
+     def step(self, var):
+         closure = var.closure
+
+         if closure is not None:
+             def sos_closure(backward=True):
+                 if backward:
+                     var.zero_grad()
+                     with torch.enable_grad():
+                         loss = closure(False)
+                         loss = loss.pow(2).sum()
+                         loss.backward()
+                     return loss
+
+                 loss = closure(False)
+                 return loss.pow(2).sum()
+
+             var.closure = sos_closure
+
+         if var.loss is not None:
+             var.loss = var.loss.pow(2).sum()
+
+         if var.loss_approx is not None:
+             var.loss_approx = var.loss_approx.pow(2).sum()
+
+         return var
+
+
+ class GaussNewton(Module):
+     """Gauss-newton method.
+
+     To use this, the closure should return a vector of values to minimize sum of squares of.
+     Please add the ``backward`` argument, it will always be False but it is required.
+     Gradients will be calculated via batched autograd within this module, you don't need to
+     implement the backward pass. Please see below for an example.
+
+     Note:
+         This method requires ``ndim^2`` memory, however, if it is used within ``tz.m.TrustCG`` trust region,
+         the memory requirement is ``ndim*m``, where ``m`` is number of values in the output.
+
+     Args:
+         reg (float, optional): regularization parameter. Defaults to 1e-8.
+         batched (bool, optional): whether to use vmapping. Defaults to True.
+
+     Examples:
+
+     minimizing the rosenbrock function:
+     ```python
+     def rosenbrock(X):
+         x1, x2 = X
+         return torch.stack([(1 - x1), 100 * (x2 - x1**2)])
+
+     X = torch.tensor([-1.1, 2.5], requires_grad=True)
+     opt = tz.Modular([X], tz.m.GaussNewton(), tz.m.Backtracking())
+
+     # define the closure for line search
+     def closure(backward=True):
+         return rosenbrock(X)
+
+     # minimize
+     for iter in range(10):
+         loss = opt.step(closure)
+         print(f'{loss = }')
+     ```
+
+     training a neural network with a matrix-free GN trust region:
+     ```python
+     X = torch.randn(64, 20)
+     y = torch.randn(64, 10)
+
+     model = nn.Sequential(nn.Linear(20, 64), nn.ELU(), nn.Linear(64, 10))
+     opt = tz.Modular(
+         model.parameters(),
+         tz.m.TrustCG(tz.m.GaussNewton()),
+     )
+
+     def closure(backward=True):
+         y_hat = model(X) # (64, 10)
+         return (y_hat - y).pow(2).mean(0) # (10, )
+
+     for i in range(100):
+         losses = opt.step(closure)
+         if i % 10 == 0:
+             print(f'{losses.mean() = }')
+     ```
+     """
+     def __init__(self, reg:float = 1e-8, batched:bool=True, ):
+         super().__init__(defaults=dict(batched=batched, reg=reg))
+
+     @torch.no_grad
+     def update(self, var):
+         params = var.params
+         batched = self.defaults['batched']
+
+         closure = var.closure
+         assert closure is not None
+
+         # gauss newton direction
+         with torch.enable_grad():
+             f = var.get_loss(backward=False) # n_out
+             assert isinstance(f, torch.Tensor)
+             G_list = jacobian_wrt([f.ravel()], params, batched=batched)
+
+         var.loss = f.pow(2).sum()
+
+         G = self.global_state["G"] = flatten_jacobian(G_list) # (n_out, ndim)
+         Gtf = G.T @ f.detach() # (ndim)
+         self.global_state["Gtf"] = Gtf
+         var.grad = vec_to_tensors(Gtf, var.params)
+
+         # set closure to calculate sum of squares for line searches etc
+         if var.closure is not None:
+             def sos_closure(backward=True):
+                 if backward:
+                     var.zero_grad()
+                     with torch.enable_grad():
+                         loss = closure(False).pow(2).sum()
+                         loss.backward()
+                     return loss
+
+                 loss = closure(False).pow(2).sum()
+                 return loss
+
+             var.closure = sos_closure
+
+     @torch.no_grad
+     def apply(self, var):
+         reg = self.defaults['reg']
+
+         G = self.global_state['G']
+         Gtf = self.global_state['Gtf']
+
+         GtG = G.T @ G # (ndim, ndim)
+         if reg != 0:
+             GtG.add_(torch.eye(GtG.size(0), device=GtG.device, dtype=GtG.dtype).mul_(reg))
+
+         v = torch.linalg.lstsq(GtG, Gtf).solution # pylint:disable=not-callable
+
+         var.update = vec_to_tensors(v, var.params)
+         return var
+
+     def get_H(self, var):
+         G = self.global_state['G']
+         return linear_operator.AtA(G)
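For reference, the `apply` step above solves the regularized Gauss-Newton normal equations `(GᵀG + reg·I) v = Gᵀf` built from the residual Jacobian. A minimal standalone sketch of the same linear algebra with plain torch and made-up toy values (not code from the package):

```python
import torch

# toy residual vector f and Jacobian G at the current parameters
G = torch.tensor([[1.0, 0.0],
                  [-20.0, 10.0]])          # (n_out, ndim)
f = torch.tensor([2.1, -5.0])              # (n_out,)

reg = 1e-8
GtG = G.T @ G + reg * torch.eye(2)         # same matrix GaussNewton.apply builds
Gtf = G.T @ f                              # same vector stored in global_state["Gtf"]
v = torch.linalg.lstsq(GtG, Gtf).solution  # Gauss-Newton step direction
```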
torchzero/modules/line_search/__init__.py
@@ -1,5 +1,5 @@
- from .adaptive import AdaptiveLineSearch
+ from .adaptive import AdaptiveTracking
  from .backtracking import AdaptiveBacktracking, Backtracking
  from .line_search import LineSearchBase
  from .scipy import ScipyMinimizeScalar
- from .strong_wolfe import StrongWolfe
+ from .strong_wolfe import StrongWolfe
torchzero/modules/line_search/_polyinterp.py
@@ -0,0 +1,289 @@
+ import numpy as np
+ import torch
+
+ from .line_search import LineSearchBase
+
+
+ # polynomial interpolation
+ # this code is from https://github.com/hjmshi/PyTorch-LBFGS/blob/master/functions/LBFGS.py
+ # PyTorch-LBFGS: A PyTorch Implementation of L-BFGS
+ def polyinterp(points, x_min_bound=None, x_max_bound=None, plot=False):
+     """
+     Gives the minimizer and minimum of the interpolating polynomial over given points
+     based on function and derivative information. Defaults to bisection if no critical
+     points are valid.
+
+     Based on polyinterp.m Matlab function in minFunc by Mark Schmidt with some slight
+     modifications.
+
+     Implemented by: Hao-Jun Michael Shi and Dheevatsa Mudigere
+     Last edited 12/6/18.
+
+     Inputs:
+         points (nparray): two-dimensional array with each point of form [x f g]
+         x_min_bound (float): minimum value that brackets minimum (default: minimum of points)
+         x_max_bound (float): maximum value that brackets minimum (default: maximum of points)
+         plot (bool): plot interpolating polynomial
+
+     Outputs:
+         x_sol (float): minimizer of interpolating polynomial
+         F_min (float): minimum of interpolating polynomial
+
+     Note:
+         . Set f or g to np.nan if they are unknown
+
+     """
+     no_points = points.shape[0]
+     order = np.sum(1 - np.isnan(points[:, 1:3]).astype('int')) - 1
+
+     x_min = np.min(points[:, 0])
+     x_max = np.max(points[:, 0])
+
+     # compute bounds of interpolation area
+     if x_min_bound is None:
+         x_min_bound = x_min
+     if x_max_bound is None:
+         x_max_bound = x_max
+
+     # explicit formula for quadratic interpolation
+     if no_points == 2 and order == 2 and plot is False:
+         # Solution to quadratic interpolation is given by:
+         # a = -(f1 - f2 - g1(x1 - x2))/(x1 - x2)^2
+         # x_min = x1 - g1/(2a)
+         # if x1 = 0, then is given by:
+         # x_min = - (g1*x2^2)/(2(f2 - f1 - g1*x2))
+
+         if points[0, 0] == 0:
+             x_sol = -points[0, 2] * points[1, 0] ** 2 / (2 * (points[1, 1] - points[0, 1] - points[0, 2] * points[1, 0]))
+         else:
+             a = -(points[0, 1] - points[1, 1] - points[0, 2] * (points[0, 0] - points[1, 0])) / (points[0, 0] - points[1, 0]) ** 2
+             x_sol = points[0, 0] - points[0, 2]/(2*a)
+
+         x_sol = np.minimum(np.maximum(x_min_bound, x_sol), x_max_bound)
+
+     # explicit formula for cubic interpolation
+     elif no_points == 2 and order == 3 and plot is False:
+         # Solution to cubic interpolation is given by:
+         # d1 = g1 + g2 - 3((f1 - f2)/(x1 - x2))
+         # d2 = sqrt(d1^2 - g1*g2)
+         # x_min = x2 - (x2 - x1)*((g2 + d2 - d1)/(g2 - g1 + 2*d2))
+         d1 = points[0, 2] + points[1, 2] - 3 * ((points[0, 1] - points[1, 1]) / (points[0, 0] - points[1, 0]))
+         value = d1 ** 2 - points[0, 2] * points[1, 2]
+         if value > 0:
+             d2 = np.sqrt(value)
+             x_sol = points[1, 0] - (points[1, 0] - points[0, 0]) * ((points[1, 2] + d2 - d1) / (points[1, 2] - points[0, 2] + 2 * d2))
+             x_sol = np.minimum(np.maximum(x_min_bound, x_sol), x_max_bound)
+         else:
+             x_sol = (x_max_bound + x_min_bound)/2
+
+     # solve linear system
+     else:
+         # define linear constraints
+         A = np.zeros((0, order + 1))
+         b = np.zeros((0, 1))
+
+         # add linear constraints on function values
+         for i in range(no_points):
+             if not np.isnan(points[i, 1]):
+                 constraint = np.zeros((1, order + 1))
+                 for j in range(order, -1, -1):
+                     constraint[0, order - j] = points[i, 0] ** j
+                 A = np.append(A, constraint, 0)
+                 b = np.append(b, points[i, 1])
+
+         # add linear constraints on gradient values
+         for i in range(no_points):
+             if not np.isnan(points[i, 2]):
+                 constraint = np.zeros((1, order + 1))
+                 for j in range(order):
+                     constraint[0, j] = (order - j) * points[i, 0] ** (order - j - 1)
+                 A = np.append(A, constraint, 0)
+                 b = np.append(b, points[i, 2])
+
+         # check if system is solvable
+         if A.shape[0] != A.shape[1] or np.linalg.matrix_rank(A) != A.shape[0]:
+             x_sol = (x_min_bound + x_max_bound)/2
+             f_min = np.inf
+         else:
+             # solve linear system for interpolating polynomial
+             coeff = np.linalg.solve(A, b)
+
+             # compute critical points
+             dcoeff = np.zeros(order)
+             for i in range(len(coeff) - 1):
+                 dcoeff[i] = coeff[i] * (order - i)
+
+             crit_pts = np.array([x_min_bound, x_max_bound])
+             crit_pts = np.append(crit_pts, points[:, 0])
+
+             if not np.isinf(dcoeff).any():
+                 roots = np.roots(dcoeff)
+                 crit_pts = np.append(crit_pts, roots)
+
+             # test critical points
+             f_min = np.inf
+             x_sol = (x_min_bound + x_max_bound) / 2 # defaults to bisection
+             for crit_pt in crit_pts:
+                 if np.isreal(crit_pt):
+                     if not np.isrealobj(crit_pt): crit_pt = crit_pt.real
+                     if crit_pt >= x_min_bound and crit_pt <= x_max_bound:
+                         F_cp = np.polyval(coeff, crit_pt)
+                         if np.isreal(F_cp) and F_cp < f_min:
+                             x_sol = np.real(crit_pt)
+                             f_min = np.real(F_cp)
+
+             if(plot):
+                 import matplotlib.pyplot as plt
+                 plt.figure()
+                 x = np.arange(x_min_bound, x_max_bound, (x_max_bound - x_min_bound)/10000)
+                 f = np.polyval(coeff, x)
+                 plt.plot(x, f)
+                 plt.plot(x_sol, f_min, 'x')
+
+     return x_sol
+
+
+ # polynomial interpolation
+ # this code is based on https://github.com/hjmshi/PyTorch-LBFGS/blob/master/functions/LBFGS.py
+ # PyTorch-LBFGS: A PyTorch Implementation of L-BFGS
+ # this one is modified where instead of clipping the solution by bounds, it tries a lower degree polynomial
+ # all the way to bisection
+ def _within_bounds(x, lb, ub):
+     if lb is not None and x < lb: return False
+     if ub is not None and x > ub: return False
+     return True
+
+ def _quad_interp(points):
+     assert points.shape[0] == 2, points.shape
+     if points[0, 0] == 0:
+         denom = 2 * (points[1, 1] - points[0, 1] - points[0, 2] * points[1, 0])
+         if abs(denom) > 1e-32:
+             return -points[0, 2] * points[1, 0] ** 2 / denom
+     else:
+         denom = (points[0, 0] - points[1, 0]) ** 2
+         if denom > 1e-32:
+             a = -(points[0, 1] - points[1, 1] - points[0, 2] * (points[0, 0] - points[1, 0])) / denom
+             if a > 1e-32:
+                 return points[0, 0] - points[0, 2]/(2*a)
+     return None
+
+ def _cubic_interp(points, lb, ub):
+     assert points.shape[0] == 2, points.shape
+     denom = points[0, 0] - points[1, 0]
+     if abs(denom) > 1e-32:
+         d1 = points[0, 2] + points[1, 2] - 3 * ((points[0, 1] - points[1, 1]) / denom)
+         value = d1 ** 2 - points[0, 2] * points[1, 2]
+         if value > 0:
+             d2 = np.sqrt(value)
+             denom = points[1, 2] - points[0, 2] + 2 * d2
+             if abs(denom) > 1e-32:
+                 x_sol = points[1, 0] - (points[1, 0] - points[0, 0]) * ((points[1, 2] + d2 - d1) / denom)
+                 if _within_bounds(x_sol, lb, ub): return x_sol
+
+     # try quadratic interpolations
+     x_sol = _quad_interp(points)
+     if x_sol is not None and _within_bounds(x_sol, lb, ub): return x_sol
+
+     return None
+
+ def _poly_interp(points, lb, ub):
+     no_points = points.shape[0]
+     assert no_points > 2, points.shape
+     order = np.sum(1 - np.isnan(points[:, 1:3]).astype('int')) - 1
+
+     # define linear constraints
+     A = np.zeros((0, order + 1))
+     b = np.zeros((0, 1))
+
+     # add linear constraints on function values
+     for i in range(no_points):
+         if not np.isnan(points[i, 1]):
+             constraint = np.zeros((1, order + 1))
+             for j in range(order, -1, -1):
+                 constraint[0, order - j] = points[i, 0] ** j
+             A = np.append(A, constraint, 0)
+             b = np.append(b, points[i, 1])
+
+     # add linear constraints on gradient values
+     for i in range(no_points):
+         if not np.isnan(points[i, 2]):
+             constraint = np.zeros((1, order + 1))
+             for j in range(order):
+                 constraint[0, j] = (order - j) * points[i, 0] ** (order - j - 1)
+             A = np.append(A, constraint, 0)
+             b = np.append(b, points[i, 2])
+
+     # check if system is solvable
+     if A.shape[0] != A.shape[1] or np.linalg.matrix_rank(A) != A.shape[0]:
+         return None
+
+     # solve linear system for interpolating polynomial
+     coeff = np.linalg.solve(A, b)
+
+     # compute critical points
+     dcoeff = np.zeros(order)
+     for i in range(len(coeff) - 1):
+         dcoeff[i] = coeff[i] * (order - i)
+
+     lower = np.min(points[:, 0]) if lb is None else lb
+     upper = np.max(points[:, 0]) if ub is None else ub
+
+     crit_pts = np.array([lower, upper])
+     crit_pts = np.append(crit_pts, points[:, 0])
+
+     if not np.isinf(dcoeff).any():
+         roots = np.roots(dcoeff)
+         crit_pts = np.append(crit_pts, roots)
+
+     # test critical points
+     f_min = np.inf
+     x_sol = None
+     for crit_pt in crit_pts:
+         if np.isreal(crit_pt):
+             if not np.isrealobj(crit_pt): crit_pt = crit_pt.real
+             if _within_bounds(crit_pt, lb, ub):
+                 F_cp = np.polyval(coeff, crit_pt)
+                 if np.isreal(F_cp) and F_cp < f_min:
+                     x_sol = np.real(crit_pt)
+                     f_min = np.real(F_cp)
+
+     return x_sol
+
+ def polyinterp2(points, lb, ub, unbounded: bool = False):
+     no_points = points.shape[0]
+     if no_points <= 1:
+         return (lb + ub)/2
+
+     order = np.sum(1 - np.isnan(points[:, 1:3]).astype('int')) - 1
+
+     x_min = np.min(points[:, 0])
+     x_max = np.max(points[:, 0])
+
+     # compute bounds of interpolation area
+     if not unbounded:
+         if lb is None:
+             lb = x_min
+         if ub is None:
+             ub = x_max
+
+     if no_points == 2 and order == 2:
+         x_sol = _quad_interp(points)
+         if x_sol is not None and _within_bounds(x_sol, lb, ub): return x_sol
+         return (lb + ub)/2
+
+     if no_points == 2 and order == 3:
+         x_sol = _cubic_interp(points, lb, ub) # includes fallback on _quad_interp
+         if x_sol is not None and _within_bounds(x_sol, lb, ub): return x_sol
+         return (lb + ub)/2
+
+     if no_points <= 2: # order < 2
+         return (lb + ub)/2
+
+     if no_points == 3:
+         for p in (points[:2], points[1:], points[::2]):
+             x_sol = _cubic_interp(p, lb, ub)
+             if x_sol is not None and _within_bounds(x_sol, lb, ub): return x_sol
+
+     x_sol = _poly_interp(points, lb, ub)
+     if x_sol is not None and _within_bounds(x_sol, lb, ub): return x_sol
+     return polyinterp2(points[1:], lb, ub)
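As a quick illustration of the helpers above (a hypothetical check, assuming the module is importable from the path listed in the file table): for f(x) = (x - 2)**2, two points with function values and derivatives are enough for the closed-form two-point interpolation to recover the exact minimizer.

```python
import numpy as np
from torchzero.modules.line_search._polyinterp import polyinterp2  # assumed import path

# rows are [x, f(x), f'(x)] for f(x) = (x - 2)**2
points = np.array([
    [0.0, 4.0, -4.0],
    [1.0, 1.0, -2.0],
])

x_sol = polyinterp2(points, lb=0.0, ub=3.0)
print(x_sol)  # 2.0 -- the exact minimizer of f
```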
torchzero/modules/line_search/adaptive.py
@@ -1,58 +1,73 @@
  import math
+ from bisect import insort
+ from collections import deque
  from collections.abc import Callable
  from operator import itemgetter

+ import numpy as np
  import torch

- from .line_search import LineSearchBase
-
+ from .line_search import LineSearchBase, TerminationCondition, termination_condition


  def adaptive_tracking(
      f,
-     x_0,
+     a_init,
      maxiter: int,
      nplus: float = 2,
      nminus: float = 0.5,
+     f_0 = None,
  ):
-     f_0 = f(0)
+     niter = 0
+     if f_0 is None: f_0 = f(0)

-     t = x_0
-     f_t = f(t)
+     a = a_init
+     f_a = f(a)

      # backtrack
-     if f_t > f_0:
-         while f_t > f_0:
+     a_prev = a
+     f_prev = math.inf
+     if (f_a > f_0) or (not math.isfinite(f_a)):
+         while (f_a < f_prev) or not math.isfinite(f_a):
+             a_prev, f_prev = a, f_a
              maxiter -= 1
-             if maxiter < 0: return 0, f_0
-             t = t*nminus
-             f_t = f(t)
-         return t, f_t
+             if maxiter < 0: break
+
+             a = a*nminus
+             f_a = f(a)
+             niter += 1
+
+         if f_prev < f_0: return a_prev, f_prev, niter
+         return 0, f_0, niter

      # forwardtrack
-     f_prev = f_t
-     t *= nplus
-     f_t = f(t)
-     if f_prev < f_t: return t / nplus, f_prev
-     while f_prev >= f_t:
+     a_prev = a
+     f_prev = math.inf
+     while (f_a <= f_prev) and math.isfinite(f_a):
+         a_prev, f_prev = a, f_a
          maxiter -= 1
-         if maxiter < 0: return t, f_t
-         f_prev = f_t
-         t *= nplus
-         f_t = f(t)
-     return t / nplus, f_prev
+         if maxiter < 0: break
+
+         a *= nplus
+         f_a = f(a)
+         niter+= 1
+
+     if f_prev < f_0: return a_prev, f_prev, niter
+     return 0, f_0, niter
+

- class AdaptiveLineSearch(LineSearchBase):
-     """Adaptive line search, similar to backtracking but also has forward tracking mode.
-     Currently doesn't check for weak curvature condition.
+ class AdaptiveTracking(LineSearchBase):
+     """A line search that evaluates previous step size, if value increased, backtracks until the value stops decreasing,
+     otherwise forward-tracks until value stops decreasing.

      Args:
          init (float, optional): initial step size. Defaults to 1.0.
-         beta (float, optional): multiplies each consecutive step size by this value. Defaults to 0.5.
-         maxiter (int, optional): Maximum line search function evaluations. Defaults to 10.
+         nplus (float, optional): multiplier to step size if initial step size is optimal. Defaults to 2.
+         nminus (float, optional): multiplier to step size if initial step size is too big. Defaults to 0.5.
+         maxiter (int, optional): maximum number of function evaluations per step. Defaults to 10.
          adaptive (bool, optional):
-             when enabled, if line search failed, beta size is reduced.
-             Otherwise it is reset to initial value. Defaults to True.
+             when enabled, if line search failed, step size will continue decreasing on the next step.
+             Otherwise it will restart the line search from ``init`` step size. Defaults to True.
      """
      def __init__(
          self,
@@ -62,38 +77,48 @@ class AdaptiveLineSearch(LineSearchBase):
          maxiter: int = 10,
          adaptive=True,
      ):
-         defaults=dict(init=init,nplus=nplus,nminus=nminus,maxiter=maxiter,adaptive=adaptive,)
+         defaults=dict(init=init,nplus=nplus,nminus=nminus,maxiter=maxiter,adaptive=adaptive)
          super().__init__(defaults=defaults)
-         self.global_state['beta_scale'] = 1.0

      def reset(self):
          super().reset()
-         self.global_state['beta_scale'] = 1.0

      @torch.no_grad
      def search(self, update, var):
          init, nplus, nminus, maxiter, adaptive = itemgetter(
-             'init', 'nplus', 'nminus', 'maxiter', 'adaptive')(self.settings[var.params[0]])
+             'init', 'nplus', 'nminus', 'maxiter', 'adaptive')(self.defaults)

          objective = self.make_objective(var=var)

-         # # directional derivative
-         # d = -sum(t.sum() for t in torch._foreach_mul(var.get_grad(), var.get_update()))
+         # scale a_prev
+         a_prev = self.global_state.get('a_prev', init)
+         if adaptive: a_prev = a_prev * self.global_state.get('init_scale', 1)

-         # scale beta (beta is multiplicative and i think may be better than scaling initial step size)
-         beta_scale = self.global_state.get('beta_scale', 1)
-         x_prev = self.global_state.get('prev_x', 1)
+         a_init = a_prev
+         if a_init < torch.finfo(var.params[0].dtype).tiny * 2:
+             a_init = torch.finfo(var.params[0].dtype).max / 2

-         if adaptive: nminus = nminus * beta_scale
-
-
-         step_size, f = adaptive_tracking(objective, x_prev, maxiter, nplus=nplus, nminus=nminus)
+         step_size, f, niter = adaptive_tracking(
+             objective,
+             a_init=a_init,
+             maxiter=maxiter,
+             nplus=nplus,
+             nminus=nminus,
+         )

          # found an alpha that reduces loss
          if step_size != 0:
-             self.global_state['beta_scale'] = min(1.0, self.global_state['beta_scale'] * math.sqrt(1.5))
+             assert (var.loss is None) or (math.isfinite(f) and f < var.loss)
+             self.global_state['init_scale'] = 1
+
+             # if niter == 1, forward tracking failed to decrease function value compared to f_a_prev
+             if niter == 1 and step_size >= a_init: step_size *= nminus
+
+             self.global_state['a_prev'] = step_size
              return step_size

          # on fail reduce beta scale value
-         self.global_state['beta_scale'] /= 1.5
+         self.global_state['init_scale'] = self.global_state.get('init_scale', 1) * nminus**maxiter
+         self.global_state['a_prev'] = init
          return 0
+
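To see the new tracking loop in isolation, a toy sketch (not from the package) that reuses the `adaptive_tracking` helper added above:

```python
def f(a):  # objective along the search direction, minimized near a = 0.3
    return (a - 0.3) ** 2

# f(1.0) > f(0), so the backtracking branch halves the step until the value stops improving
step, f_step, niter = adaptive_tracking(f, a_init=1.0, maxiter=10)
# step ~= 0.25, f_step ~= 0.0025, niter == 3; on success AdaptiveTracking stores `step`
# as `a_prev` and starts from it on the next optimizer step
```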