torchzero 0.3.13__py3-none-any.whl → 0.3.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_opts.py +4 -10
- torchzero/core/__init__.py +4 -1
- torchzero/core/chain.py +50 -0
- torchzero/core/functional.py +37 -0
- torchzero/core/modular.py +237 -0
- torchzero/core/module.py +12 -599
- torchzero/core/reformulation.py +3 -1
- torchzero/core/transform.py +7 -5
- torchzero/core/var.py +376 -0
- torchzero/modules/__init__.py +0 -1
- torchzero/modules/adaptive/adahessian.py +2 -2
- torchzero/modules/adaptive/esgd.py +2 -2
- torchzero/modules/adaptive/matrix_momentum.py +1 -1
- torchzero/modules/adaptive/sophia_h.py +2 -2
- torchzero/modules/conjugate_gradient/cg.py +16 -16
- torchzero/modules/experimental/__init__.py +1 -0
- torchzero/modules/experimental/newtonnewton.py +5 -5
- torchzero/modules/experimental/spsa1.py +93 -0
- torchzero/modules/functional.py +7 -0
- torchzero/modules/grad_approximation/__init__.py +1 -1
- torchzero/modules/grad_approximation/forward_gradient.py +2 -5
- torchzero/modules/grad_approximation/rfdm.py +27 -110
- torchzero/modules/line_search/__init__.py +1 -1
- torchzero/modules/line_search/_polyinterp.py +3 -1
- torchzero/modules/line_search/adaptive.py +3 -3
- torchzero/modules/line_search/backtracking.py +1 -1
- torchzero/modules/line_search/interpolation.py +160 -0
- torchzero/modules/line_search/line_search.py +11 -20
- torchzero/modules/line_search/scipy.py +15 -3
- torchzero/modules/line_search/strong_wolfe.py +3 -5
- torchzero/modules/misc/misc.py +2 -2
- torchzero/modules/misc/multistep.py +13 -13
- torchzero/modules/quasi_newton/__init__.py +2 -0
- torchzero/modules/quasi_newton/quasi_newton.py +15 -6
- torchzero/modules/quasi_newton/sg2.py +292 -0
- torchzero/modules/restarts/restars.py +5 -4
- torchzero/modules/second_order/__init__.py +6 -3
- torchzero/modules/second_order/ifn.py +89 -0
- torchzero/modules/second_order/inm.py +105 -0
- torchzero/modules/second_order/newton.py +103 -193
- torchzero/modules/second_order/newton_cg.py +86 -110
- torchzero/modules/second_order/nystrom.py +1 -1
- torchzero/modules/second_order/rsn.py +227 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +2 -2
- torchzero/modules/trust_region/trust_cg.py +6 -4
- torchzero/modules/wrappers/optim_wrapper.py +49 -42
- torchzero/modules/zeroth_order/__init__.py +1 -1
- torchzero/modules/zeroth_order/cd.py +1 -238
- torchzero/utils/derivatives.py +19 -19
- torchzero/utils/linalg/linear_operator.py +50 -2
- torchzero/utils/optimizer.py +2 -2
- torchzero/utils/python_tools.py +1 -0
- {torchzero-0.3.13.dist-info → torchzero-0.3.15.dist-info}/METADATA +1 -1
- {torchzero-0.3.13.dist-info → torchzero-0.3.15.dist-info}/RECORD +57 -48
- torchzero/modules/higher_order/__init__.py +0 -1
- /torchzero/modules/{higher_order → experimental}/higher_order_newton.py +0 -0
- {torchzero-0.3.13.dist-info → torchzero-0.3.15.dist-info}/WHEEL +0 -0
- {torchzero-0.3.13.dist-info → torchzero-0.3.15.dist-info}/top_level.txt +0 -0
torchzero/modules/experimental/spsa1.py
ADDED

@@ -0,0 +1,93 @@
+from collections.abc import Callable
+from typing import Any
+from functools import partial
+import torch
+
+from ...utils import TensorList, NumberList
+from ..grad_approximation.grad_approximator import GradApproximator, GradTarget
+
+class SPSA1(GradApproximator):
+    """One-measurement variant of SPSA. Unlike standard two-measurement SPSA, the estimated
+    gradient often won't be a descent direction, however the expectation is biased towards
+    the descent direction. Therefore this variant of SPSA is only recommended for a specific
+    class of problems where the objective function changes on each evaluation,
+    for example feedback control problems.
+
+    Args:
+        h (float, optional):
+            finite difference step size, recommended to set to same value as learning rate. Defaults to 1e-3.
+        n_samples (int, optional): number of random samples. Defaults to 1.
+        eps (float, optional): measurement noise estimate. Defaults to 1e-8.
+        seed (int | None | torch.Generator, optional): random seed. Defaults to None.
+        target (GradTarget, optional): what to set on closure. Defaults to "closure".
+
+    Reference:
+        [SPALL, JAMES C. "A One-measurement Form of Simultaneous Stochastic Approximation](https://www.jhuapl.edu/spsa/PDF-SPSA/automatica97_one_measSPSA.pdf)."
+    """
+
+    def __init__(
+        self,
+        h: float = 1e-3,
+        n_samples: int = 1,
+        eps: float = 1e-8, # measurement noise
+        pre_generate = False,
+        seed: int | None | torch.Generator = None,
+        target: GradTarget = "closure",
+    ):
+        defaults = dict(h=h, eps=eps, n_samples=n_samples, pre_generate=pre_generate, seed=seed)
+        super().__init__(defaults, target=target)
+
+
+    def pre_step(self, var):
+
+        if self.defaults['pre_generate']:
+
+            params = TensorList(var.params)
+            generator = self.get_generator(params[0].device, self.defaults['seed'])
+
+            n_samples = self.defaults['n_samples']
+            h = self.get_settings(var.params, 'h')
+
+            perturbations = [params.rademacher_like(generator=generator) for _ in range(n_samples)]
+            torch._foreach_mul_([p for l in perturbations for p in l], [v for vv in h for v in [vv]*n_samples])
+
+            for param, prt in zip(params, zip(*perturbations)):
+                self.state[param]['perturbations'] = prt
+
+    @torch.no_grad
+    def approximate(self, closure, params, loss):
+        generator = self.get_generator(params[0].device, self.defaults['seed'])
+
+        params = TensorList(params)
+        orig_params = params.clone() # store to avoid small changes due to float imprecision
+        loss_approx = None
+
+        h, eps = self.get_settings(params, "h", "eps", cls=NumberList)
+        n_samples = self.defaults['n_samples']
+
+        default = [None]*n_samples
+        # perturbations are pre-multiplied by h
+        perturbations = list(zip(*(self.state[p].get('perturbations', default) for p in params)))
+
+        grad = None
+        for i in range(n_samples):
+            prt = perturbations[i]
+
+            if prt[0] is None:
+                prt = params.rademacher_like(generator=generator).mul_(h)
+
+            else: prt = TensorList(prt)
+
+            params += prt
+            L = closure(False)
+            params.copy_(orig_params)
+
+            sample = prt * ((L + eps) / h)
+            if grad is None: grad = sample
+            else: grad += sample
+
+        assert grad is not None
+        if n_samples > 1: grad.div_(n_samples)
+
+        # mean if got per-sample values
+        return grad, loss, loss_approx
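Editor's note: the estimator added above perturbs all parameters once with a Rademacher direction scaled by `h`, makes a single objective measurement, and uses `prt * ((L + eps) / h)` as the gradient sample. A minimal self-contained sketch of that one-measurement estimate on a plain tensor, without torchzero's TensorList/state machinery (names here are illustrative, not part of the package):

```py
import torch

def spsa1_estimate(f, x, h=1e-3, eps=1e-8, n_samples=1, generator=None):
    """One-measurement SPSA sketch: a single objective evaluation per sample."""
    grad = torch.zeros_like(x)
    for _ in range(n_samples):
        # Rademacher direction scaled by the finite-difference step h
        delta = (torch.randint(0, 2, x.shape, generator=generator, device=x.device).to(x.dtype) * 2 - 1) * h
        L = f(x + delta)                 # the only measurement
        grad += delta * ((L + eps) / h)  # mirrors `prt * ((L + eps) / h)` in approximate()
    return grad / n_samples

# toy usage on a quadratic; the estimate is only a descent direction in expectation
x = torch.tensor([1.0, -2.0])
g = spsa1_estimate(lambda v: (v ** 2).sum(), x, h=1e-3, n_samples=4)
```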
torchzero/modules/functional.py
CHANGED

@@ -253,3 +253,10 @@ def safe_clip(x: torch.Tensor, min=None):
 
     if x.abs() < min: return x.new_full(x.size(), min).copysign(x)
     return x
+
+
+def clip_by_finfo(x, finfo: torch.finfo):
+    """clips by (dtype.max / 2, dtype.min / 2)"""
+    if x > finfo.max / 2: return finfo.max / 2
+    if x < finfo.min / 2: return finfo.min / 2
+    return x
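Editor's note: the new `clip_by_finfo` helper clamps a scalar to half of a dtype's representable range. A quick usage sketch (assuming it is importable from `torchzero.modules.functional`, as the path above suggests):

```py
import torch
from torchzero.modules.functional import clip_by_finfo

finfo = torch.finfo(torch.float32)
print(clip_by_finfo(1e39, finfo))   # clamped to finfo.max / 2 (about 1.7e38)
print(clip_by_finfo(-1e39, finfo))  # clamped to finfo.min / 2 (about -1.7e38)
print(clip_by_finfo(3.0, finfo))    # within range, returned unchanged
```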
torchzero/modules/grad_approximation/forward_gradient.py
CHANGED

@@ -23,8 +23,6 @@ class ForwardGradient(RandomizedFDM):
     Args:
         n_samples (int, optional): number of random gradient samples. Defaults to 1.
         distribution (Distributions, optional): distribution for random gradient samples. Defaults to "gaussian".
-        beta (float, optional):
-            If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
         pre_generate (bool, optional):
             whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
         jvp_method (str, optional):

@@ -40,14 +38,13 @@ class ForwardGradient(RandomizedFDM):
         self,
         n_samples: int = 1,
         distribution: Distributions = "gaussian",
-        beta: float = 0,
         pre_generate = True,
         jvp_method: Literal['autograd', 'forward', 'central'] = 'autograd',
         h: float = 1e-3,
         target: GradTarget = "closure",
         seed: int | None | torch.Generator = None,
     ):
-        super().__init__(h=h, n_samples=n_samples, distribution=distribution,
+        super().__init__(h=h, n_samples=n_samples, distribution=distribution, target=target, pre_generate=pre_generate, seed=seed)
         self.defaults['jvp_method'] = jvp_method

     @torch.no_grad

@@ -62,7 +59,7 @@ class ForwardGradient(RandomizedFDM):
         distribution = settings['distribution']
         default = [None]*n_samples
         perturbations = list(zip(*(self.state[p].get('perturbations', default) for p in params)))
-        generator = self.
+        generator = self.get_generator(params[0].device, self.defaults['seed'])

         grad = None
         for i in range(n_samples):
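Editor's note: after this change `ForwardGradient` drops the `beta` option, forwards `target`, `pre_generate` and `seed` to `RandomizedFDM`, and obtains its generator through `get_generator`. A hedged usage sketch in the style of the `tz.Modular` examples from the rfdm docstrings; it assumes `ForwardGradient` is exposed under `tz.m` like the other gradient approximators, and the toy model is purely illustrative:

```py
import torch
import torchzero as tz

model = torch.nn.Linear(4, 1)  # illustrative toy model

opt = tz.Modular(
    model.parameters(),
    # arguments taken from the constructor signature above
    tz.m.ForwardGradient(n_samples=4, distribution="gaussian", jvp_method="autograd", seed=0),
    tz.m.LR(1e-2),
)
```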
torchzero/modules/grad_approximation/rfdm.py
CHANGED

@@ -164,7 +164,6 @@ class RandomizedFDM(GradApproximator):
         formula (_FD_Formula, optional): finite difference formula. Defaults to 'central2'.
         distribution (Distributions, optional): distribution. Defaults to "rademacher".
             If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
-        beta (float, optional): optinal momentum for generated perturbations. Defaults to 1e-3.
         pre_generate (bool, optional):
             whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
         seed (int | None | torch.Generator, optional): Seed for random generator. Defaults to None.

@@ -173,7 +172,7 @@ class RandomizedFDM(GradApproximator):
     Examples:
         #### Simultaneous perturbation stochastic approximation (SPSA) method

-        SPSA is randomized
+        SPSA is randomized FDM with rademacher distribution and central formula.
         ```py
         spsa = tz.Modular(
             model.parameters(),

@@ -184,8 +183,7 @@ class RandomizedFDM(GradApproximator):

         #### Random-direction stochastic approximation (RDSA) method

-        RDSA is randomized
-
+        RDSA is randomized FDM with usually gaussian distribution and central formula.
         ```
         rdsa = tz.Modular(
             model.parameters(),

@@ -194,23 +192,9 @@ class RandomizedFDM(GradApproximator):
         )
         ```

-        #### RandomizedFDM with momentum
-
-        Momentum might help by reducing the variance of the estimated gradients.
-
-        ```
-        momentum_spsa = tz.Modular(
-            model.parameters(),
-            tz.m.RandomizedFDM(),
-            tz.m.HeavyBall(0.9),
-            tz.m.LR(1e-3)
-        )
-        ```
-
         #### Gaussian smoothing method

         GS uses many gaussian samples with possibly a larger finite difference step size.
-
         ```
         gs = tz.Modular(
             model.parameters(),

@@ -220,44 +204,15 @@ class RandomizedFDM(GradApproximator):
         )
         ```

-        ####
-
-        NewtonCG with hessian-vector product estimated via gradient difference
-        calls closure multiple times per step. If each closure call estimates gradients
-        with different perturbations, NewtonCG is unable to produce useful directions.
-
-        By setting pre_generate to True, perturbations are generated once before each step,
-        and each closure call estimates gradients using the same pre-generated perturbations.
-        This way closure-based algorithms are able to use gradients estimated in a consistent way.
+        #### RandomizedFDM with momentum

+        Momentum might help by reducing the variance of the estimated gradients.
         ```
-
+        momentum_spsa = tz.Modular(
             model.parameters(),
-            tz.m.RandomizedFDM(
-            tz.m.
-            tz.m.
-        )
-        ```
-
-        #### SPSA-LBFGS
-
-        LBFGS uses a memory of past parameter and gradient differences. If past gradients
-        were estimated with different perturbations, LBFGS directions will be useless.
-
-        To alleviate this momentum can be added to random perturbations to make sure they only
-        change by a little bit, and the history stays relevant. The momentum is determined by the :code:`beta` parameter.
-        The disadvantage is that the subspace the algorithm is able to explore changes slowly.
-
-        Additionally we will reset SPSA and LBFGS memory every 100 steps to remove influence from old gradient estimates.
-
-        ```
-        opt = tz.Modular(
-            bench.parameters(),
-            tz.m.ResetEvery(
-                [tz.m.RandomizedFDM(n_samples=10, pre_generate=True, beta=0.99), tz.m.LBFGS()],
-                steps = 100,
-            ),
-            tz.m.Backtracking()
+            tz.m.RandomizedFDM(),
+            tz.m.HeavyBall(0.9),
+            tz.m.LR(1e-3)
         )
         ```
         """

@@ -268,75 +223,46 @@ class RandomizedFDM(GradApproximator):
         n_samples: int = 1,
         formula: _FD_Formula = "central",
         distribution: Distributions = "rademacher",
-        beta: float = 0,
         pre_generate = True,
         seed: int | None | torch.Generator = None,
         target: GradTarget = "closure",
     ):
-        defaults = dict(h=h, formula=formula, n_samples=n_samples, distribution=distribution,
+        defaults = dict(h=h, formula=formula, n_samples=n_samples, distribution=distribution, pre_generate=pre_generate, seed=seed)
         super().__init__(defaults, target=target)

-    def reset(self):
-        self.state.clear()
-        generator = self.global_state.get('generator', None) # avoid resetting generator
-        self.global_state.clear()
-        if generator is not None: self.global_state['generator'] = generator
-        for c in self.children.values(): c.reset()
-
-    def _get_generator(self, seed: int | None | torch.Generator, params: list[torch.Tensor]):
-        if 'generator' not in self.global_state:
-            if isinstance(seed, torch.Generator): self.global_state['generator'] = seed
-            elif seed is not None: self.global_state['generator'] = torch.Generator(params[0].device).manual_seed(seed)
-            else: self.global_state['generator'] = None
-        return self.global_state['generator']

     def pre_step(self, var):
-        h
-
-        n_samples = self.defaults['n_samples']
-        distribution = self.defaults['distribution']
+        h = self.get_settings(var.params, 'h')
         pre_generate = self.defaults['pre_generate']

         if pre_generate:
+            n_samples = self.defaults['n_samples']
+            distribution = self.defaults['distribution']
+
             params = TensorList(var.params)
-            generator = self.
+            generator = self.get_generator(params[0].device, self.defaults['seed'])
             perturbations = [params.sample_like(distribution=distribution, variance=1, generator=generator) for _ in range(n_samples)]

+            # this is false for ForwardGradient where h isn't used and it subclasses this
             if self.PRE_MULTIPLY_BY_H:
                 torch._foreach_mul_([p for l in perturbations for p in l], [v for vv in h for v in [vv]*n_samples])

-
-
-            for param, prt in zip(params, zip(*perturbations)):
-                self.state[param]['perturbations'] = prt
-
-        else:
-            # lerp old and new perturbations. This makes the subspace change gradually
-            # which in theory might improve algorithms with history
-            for i,p in enumerate(params):
-                state = self.state[p]
-                if 'perturbations' not in state: state['perturbations'] = [p[i] for p in perturbations]
-
-            cur = [self.state[p]['perturbations'][:n_samples] for p in params]
-            cur_flat = [p for l in cur for p in l]
-            new_flat = [p for l in zip(*perturbations) for p in l]
-            betas = [1-v for b in beta for v in [b]*n_samples]
-            torch._foreach_lerp_(cur_flat, new_flat, betas)
+            for param, prt in zip(params, zip(*perturbations)):
+                self.state[param]['perturbations'] = prt

     @torch.no_grad
     def approximate(self, closure, params, loss):
         params = TensorList(params)
-        orig_params = params.clone() # store to avoid small changes due to float imprecision
         loss_approx = None

         h = NumberList(self.settings[p]['h'] for p in params)
-
-
-        fd_fn = _RFD_FUNCS[
+        n_samples = self.defaults['n_samples']
+        distribution = self.defaults['distribution']
+        fd_fn = _RFD_FUNCS[self.defaults['formula']]
+
         default = [None]*n_samples
         perturbations = list(zip(*(self.state[p].get('perturbations', default) for p in params)))
-
-        generator = self._get_generator(settings['seed'], params)
+        generator = self.get_generator(params[0].device, self.defaults['seed'])

         grad = None
         for i in range(n_samples):

@@ -356,7 +282,6 @@ class RandomizedFDM(GradApproximator):
             if grad is None: grad = prt * d
             else: grad += prt * d

-        params.set_(orig_params)
         assert grad is not None
         if n_samples > 1: grad.div_(n_samples)


@@ -384,8 +309,6 @@ class SPSA(RandomizedFDM):
         n_samples (int, optional): number of random gradient samples. Defaults to 1.
         formula (_FD_Formula, optional): finite difference formula. Defaults to 'central2'.
         distribution (Distributions, optional): distribution. Defaults to "rademacher".
-        beta (float, optional):
-            If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
         pre_generate (bool, optional):
             whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
         seed (int | None | torch.Generator, optional): Seed for random generator. Defaults to None.

@@ -408,8 +331,6 @@ class RDSA(RandomizedFDM):
         n_samples (int, optional): number of random gradient samples. Defaults to 1.
         formula (_FD_Formula, optional): finite difference formula. Defaults to 'central2'.
         distribution (Distributions, optional): distribution. Defaults to "gaussian".
-        beta (float, optional):
-            If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
         pre_generate (bool, optional):
             whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
         seed (int | None | torch.Generator, optional): Seed for random generator. Defaults to None.

@@ -425,12 +346,11 @@ class RDSA(RandomizedFDM):
         n_samples: int = 1,
         formula: _FD_Formula = "central2",
         distribution: Distributions = "gaussian",
-        beta: float = 0,
         pre_generate = True,
         target: GradTarget = "closure",
         seed: int | None | torch.Generator = None,
     ):
-        super().__init__(h=h, n_samples=n_samples,formula=formula,distribution=distribution,
+        super().__init__(h=h, n_samples=n_samples,formula=formula,distribution=distribution,pre_generate=pre_generate,target=target,seed=seed)

 class GaussianSmoothing(RandomizedFDM):
     """

@@ -445,8 +365,6 @@ class GaussianSmoothing(RandomizedFDM):
         n_samples (int, optional): number of random gradient samples. Defaults to 100.
         formula (_FD_Formula, optional): finite difference formula. Defaults to 'forward2'.
         distribution (Distributions, optional): distribution. Defaults to "gaussian".
-        beta (float, optional):
-            If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
         pre_generate (bool, optional):
             whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
         seed (int | None | torch.Generator, optional): Seed for random generator. Defaults to None.

@@ -462,12 +380,11 @@ class GaussianSmoothing(RandomizedFDM):
         n_samples: int = 100,
         formula: _FD_Formula = "forward2",
         distribution: Distributions = "gaussian",
-        beta: float = 0,
         pre_generate = True,
         target: GradTarget = "closure",
         seed: int | None | torch.Generator = None,
     ):
-        super().__init__(h=h, n_samples=n_samples,formula=formula,distribution=distribution,
+        super().__init__(h=h, n_samples=n_samples,formula=formula,distribution=distribution,pre_generate=pre_generate,target=target,seed=seed)

 class MeZO(GradApproximator):
     """Gradient approximation via memory-efficient zeroth order optimizer (MeZO) - https://arxiv.org/abs/2305.17333.

@@ -525,9 +442,9 @@ class MeZO(GradApproximator):
         loss_approx = None

         h = NumberList(self.settings[p]['h'] for p in params)
-
-
-
+        n_samples = self.defaults['n_samples']
+        fd_fn = _RFD_FUNCS[self.defaults['formula']]
+
         prt_fns = self.global_state['prt_fns']

         grad = None
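Editor's note: SPSA, RDSA and GaussianSmoothing above are all instances of the same randomized finite-difference estimate: sample a direction d, then use d · (f(x + h·d) − f(x − h·d)) / (2h). A standalone sketch of that central estimator on a plain tensor (illustrative only, not torchzero's internal code path, which works on per-parameter TensorLists and supports other formulas):

```py
import torch

def central_rfd_estimate(f, x, h=1e-3, n_samples=1, distribution="rademacher", generator=None):
    """Central randomized finite-difference gradient estimate (the SPSA/RDSA idea)."""
    grad = torch.zeros_like(x)
    for _ in range(n_samples):
        if distribution == "rademacher":
            d = torch.randint(0, 2, x.shape, generator=generator, device=x.device).to(x.dtype) * 2 - 1
        else:  # "gaussian"
            d = torch.randn(x.shape, generator=generator, device=x.device, dtype=x.dtype)
        # directional derivative along d via central differences, projected back onto d
        grad += d * (f(x + h * d) - f(x - h * d)) / (2 * h)
    return grad / n_samples

x = torch.tensor([1.0, -2.0, 0.5])
g = central_rfd_estimate(lambda v: (v ** 2).sum(), x, n_samples=8)
print(g)  # approaches the true gradient 2*x as n_samples grows
```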
torchzero/modules/line_search/_polyinterp.py
CHANGED

@@ -2,7 +2,7 @@ import numpy as np
 import torch

 from .line_search import LineSearchBase
-
+from ...utils import tofloat

 # polynomial interpolation
 # this code is from https://github.com/hjmshi/PyTorch-LBFGS/blob/master/functions/LBFGS.py

@@ -284,6 +284,8 @@ def polyinterp2(points, lb, ub, unbounded: bool = False):
         x_sol = _cubic_interp(p, lb, ub)
         if x_sol is not None and _within_bounds(x_sol, lb, ub): return x_sol

+    if lb is not None: lb = tofloat(lb)
+    if ub is not None: ub = tofloat(ub)
     x_sol = _poly_interp(points, lb, ub)
     if x_sol is not None and _within_bounds(x_sol, lb, ub): return x_sol
     return polyinterp2(points[1:], lb, ub)
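Editor's note: the two added lines coerce the bounds to plain floats before the numpy-based `_poly_interp` fallback runs, presumably so tensor-valued bounds do not leak into the numpy code. The real `tofloat` lives in `torchzero.utils`; the stand-in below is only a rough illustration of the kind of conversion involved:

```py
import torch

def tofloat_sketch(x):
    # illustrative stand-in for torchzero.utils.tofloat
    if isinstance(x, torch.Tensor):
        return float(x.item())
    return float(x)

lb = torch.tensor(0.0)   # bounds sometimes arrive as 0-dim tensors
ub = torch.tensor(2.5)
lb, ub = tofloat_sketch(lb), tofloat_sketch(ub)  # now plain Python floats
```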
torchzero/modules/line_search/adaptive.py
CHANGED

@@ -10,7 +10,7 @@ import torch
 from .line_search import LineSearchBase, TerminationCondition, termination_condition


-def
+def adaptive_bisection(
     f,
     a_init,
     maxiter: int,

@@ -56,7 +56,7 @@ def adaptive_tracking(
         return 0, f_0, niter


-class
+class AdaptiveBisection(LineSearchBase):
     """A line search that evaluates previous step size, if value increased, backtracks until the value stops decreasing,
     otherwise forward-tracks until value stops decreasing.


@@ -98,7 +98,7 @@ class AdaptiveTracking(LineSearchBase):
         if a_init < torch.finfo(var.params[0].dtype).tiny * 2:
             a_init = torch.finfo(var.params[0].dtype).max / 2

-        step_size, f, niter =
+        step_size, f, niter = adaptive_bisection(
             objective,
             a_init=a_init,
             maxiter=maxiter,
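Editor's note: the docstring above describes the strategy: re-evaluate the previous step size, backtrack if the value got worse, otherwise forward-track until the value stops improving. A compact sketch of that idea (illustrative only; parameter names, the growth/shrink factors and the exact loop are not torchzero's `adaptive_bisection`):

```py
def adaptive_tracking_sketch(f, a_init, f_0, maxiter=20, nplus=2.0, nminus=0.5):
    """f(a) evaluates the objective at step size a; f_0 is the value at a = 0."""
    a, fa = a_init, f(a_init)
    niter = 1
    if fa > f_0:
        # value increased: backtrack until it stops decreasing
        best_a, best_f = 0.0, f_0
        while niter < maxiter:
            a *= nminus
            fa = f(a); niter += 1
            if fa >= best_f: break
            best_a, best_f = a, fa
        return best_a, best_f, niter
    # value decreased: forward-track until it stops decreasing
    best_a, best_f = a, fa
    while niter < maxiter:
        a *= nplus
        fa = f(a); niter += 1
        if fa >= best_f: break
        best_a, best_f = a, fa
    return best_a, best_f, niter
```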
torchzero/modules/line_search/backtracking.py
CHANGED

@@ -136,7 +136,7 @@ class Backtracking(LineSearchBase):
         if adaptive:
             finfo = torch.finfo(var.params[0].dtype)
             if init_scale <= finfo.tiny * 2:
-                self.global_state["init_scale"] =
+                self.global_state["init_scale"] = init * 2
             else:
                 self.global_state['init_scale'] = init_scale * beta**maxiter
         return 0
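Editor's note: the fixed branch resets the adaptive initial scale to `init * 2` once it has shrunk below the dtype's tiny value, and otherwise keeps shrinking it by `beta**maxiter`. A small sketch of that update rule in isolation (illustrative, assuming `beta < 1`):

```py
import torch

def update_init_scale(init_scale, init, beta, maxiter, dtype=torch.float32):
    # sketch of the adaptive rescaling in Backtracking's fallback branch
    finfo = torch.finfo(dtype)
    if init_scale <= finfo.tiny * 2:
        return init * 2                    # underflowed: restart from twice the base step
    return init_scale * beta ** maxiter    # otherwise keep shrinking

print(update_init_scale(1.0, init=1.0, beta=0.5, maxiter=10))    # 1/1024
print(update_init_scale(1e-40, init=1.0, beta=0.5, maxiter=10))  # 2.0 (reset)
```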
torchzero/modules/line_search/interpolation.py
ADDED

@@ -0,0 +1,160 @@
+import math
+from bisect import insort
+
+import numpy as np
+from numpy.polynomial import Polynomial
+
+
+# we have a list of points in ascending order of their `y` value
+class Point:
+    __slots__ = ("x", "y", "d")
+    def __init__(self, x, y, d):
+        self.x = x
+        self.y = y
+        self.d = d
+
+    def __lt__(self, other):
+        return self.y < other.y
+
+def _get_dpoint(points: list[Point]):
+    """returns lowest point with derivative and list of other points"""
+    for i,p in enumerate(points):
+        if p.d is not None:
+            cpoints = points.copy()
+            del cpoints[i]
+            return p, cpoints
+    return None, points
+
+# -------------------------------- quadratic2 -------------------------------- #
+def _fitmin_quadratic2(x1, y1, d1, x2, y2):
+
+    a = (y2 - y1 - d1*(x2 - x1)) / (x2 - x1)**2
+    if a <= 0: return None
+
+    b = d1 - 2*a*x1
+    # c = y_1 - d_1*x_1 + a*x_1**2
+
+    return -b / (2*a)
+
+def quadratic2(points:list[Point]):
+    pd, points = _get_dpoint(points)
+    if pd is None: return None
+    if len(points) == 0: return None
+
+    pn = points[0]
+    return _fitmin_quadratic2(pd.x, pd.y, pd.d, pn.x, pn.y)
+
+# -------------------------------- quadratic3 -------------------------------- #
+def _fitmin_quadratic3(x1, y1, x2, y2, x3, y3):
+    quad = Polynomial.fit([x1,x2,x3], [y1,y2,y3], deg=2)
+    a,b,c = quad.coef
+    if a <= 0: return None
+    return -b / (2*a)
+
+def quadratic3(points:list[Point]):
+    if len(points) < 3: return None
+
+    p1,p2,p3 = points[:3]
+    return _fitmin_quadratic3(p1.x, p1.y, p2.x, p2.y, p3.x, p3.y)
+
+# ---------------------------------- cubic3 ---------------------------------- #
+def _minimize_polynomial(poly: Polynomial):
+    roots = poly.deriv().roots()
+    vals = poly(roots)
+    argmin = np.argmin(vals)
+    return roots[argmin], vals[argmin]
+
+
+def _fitmin_cubic3(x1,y1,x2,y2,x3,y3,x4,d4):
+    """x4 is allowed to be equal to x1"""
+
+    A = np.array([
+        [x1**3, x1**2, x1, 1],
+        [x2**3, x2**2, x2, 1],
+        [x3**3, x3**2, x3, 1],
+        [3*x4**2, 2*x4, 1, 0]
+    ])
+
+    B = np.array([y1, y2, y3, d4])
+
+    try:
+        coeffs = np.linalg.solve(A, B)
+    except np.linalg.LinAlgError:
+        return None
+
+    cubic = Polynomial(coeffs)
+    x_min, y_min = _minimize_polynomial(cubic)
+    if y_min < min(y1,y2,y3): return x_min
+    return None
+
+def cubic3(points: list[Point]):
+    pd, points = _get_dpoint(points)
+    if pd is None: return None
+    if len(points) < 2: return None
+    p1, p2 = points[:2]
+    return _fitmin_cubic3(pd.x, pd.y, p1.x, p1.y, p2.x, p2.y, pd.x, pd.d)
+
+# ---------------------------------- cubic4 ---------------------------------- #
+def _fitmin_cubic4(x1, y1, x2, y2, x3, y3, x4, y4):
+    cubic = Polynomial.fit([x1,x2,x3,x4], [y1,y2,y3,y4], deg=3)
+    x_min, y_min = _minimize_polynomial(cubic)
+    if y_min < min(y1,y2,y3,y4): return x_min
+    return None
+
+def cubic4(points:list[Point]):
+    if len(points) < 4: return None
+
+    p1,p2,p3,p4 = points[:4]
+    return _fitmin_cubic4(p1.x, p1.y, p2.x, p2.y, p3.x, p3.y, p4.x, p4.y)
+
+# ---------------------------------- linear3 --------------------------------- #
+def _linear_intersection(x1,y1,s1,x2,y2,s2):
+    if s1 == 0 or s2 == 0 or s1 == s2: return None
+    return (y1 - s1*x1 - y2 + s2*x2) / (s2 - s1)
+
+def _fitmin_linear3(x1, y1, d1, x2, y2, x3, y3):
+    # we have that
+    # s2 = (y2 - y3) / (x2 - x3) # slope origin in x2 y2
+    # f1(x) = y1 + d1 * (x - x1)
+    # f2(x) = y2 + s2 * (x - x2)
+    # y1 + d1 * (x - x1) = y2 + s2 * (x - x2)
+    # y1 + d1 x - d1 x1 - y2 - s2 x + s2 x2 = 0
+    # s2 x - d1 x = y1 - d1 x1 - y2 + s2 x2
+    # x = (y1 - d1 x1 - y2 + s2 x2) / (s2 - d1)
+
+    if x2 < x1 < x3 or x3 < x1 < x2: # point with derivative in between
+        return None
+
+    if d1 > 0:
+        if x2 > x1 or x3 > x1: return None # intersection is above to the right
+        if x2 > x3: x2,y2,x3,y3 = x3,y3,x2,y2
+    if d1 < 0:
+        if x2 < x1 or x3 < x1: return None # intersection is above to the left
+        if x2 < x3: x2,y2,x3,y3 = x3,y3,x2,y2
+
+    s2 = (y2 - y3) / (x2 - x3)
+    return _linear_intersection(x1,y1,d1,x2,y2,s2)
+
+def linear3(points:list[Point]):
+    pd, points = _get_dpoint(points)
+    if pd is None: return None
+    if len(points) < 2: return None
+    p1, p2 = points[:2]
+    return _fitmin_linear3(pd.x, pd.y, pd.d, p1.x, p1.y, p2.x, p2.y)
+
+# ---------------------------------- linear4 --------------------------------- #
+def _fitmin_linear4(x1, y1, x2, y2, x3, y3, x4, y4):
+    # sort by x
+    points = ((x1,y1), (x2,y2), (x3,y3), (x4,y4))
+    points = sorted(points, key=lambda x: x[0])
+
+    (x1,y1), (x2,y2), (x3,y3), (x4,y4) = points
+    s1 = (y1 - y2) / (x1 - x2)
+    s3 = (y3 - y4) / (x3 - x4)
+
+    return _linear_intersection(x1,y1,s1,x3,y3,s3)
+
+def linear4(points:list[Point]):
+    if len(points) < 4: return None
+    p1,p2,p3,p4 = points[:4]
+    return _fitmin_linear4(p1.x, p1.y, p2.x, p2.y, p3.x, p3.y, p4.x, p4.y)
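Editor's note: the new module fits low-order models (quadratic, cubic, piecewise-linear) to sampled (step, value, derivative) points and returns the model's minimiser, which a line search can then try as its next step size. A small usage sketch of the quadratic fit, importing directly from the module path shown in the file listing (these helpers look internal, so treat this as illustrative rather than a documented API):

```py
from torchzero.modules.line_search.interpolation import Point, quadratic2

# Points are kept in ascending order of value y (Point.__lt__ compares y);
# the point carrying a derivative `d` plays the role of the current iterate.
points = sorted([
    Point(x=0.0, y=5.0, d=-10.0),  # f(0) = 5, f'(0) = -10
    Point(x=1.0, y=2.0, d=None),   # a trial step with only a function value
])

a = quadratic2(points)  # minimiser of the quadratic through (x, y, d) and (x, y), or None
print(a)                # 5/7, roughly 0.714, for this data
```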