torchzero 0.3.13__py3-none-any.whl → 0.3.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tests/test_opts.py CHANGED
@@ -400,13 +400,6 @@ RandomizedFDM_4samples = Run(
     func='booth', steps=50, loss=1e-5, merge_invariant=True,
     sphere_steps=100, sphere_loss=400,
 )
-RandomizedFDM_4samples_lerp = Run(
-    func_opt=lambda p: tz.Modular(p, tz.m.RandomizedFDM(n_samples=4, beta=0.99, seed=0), tz.m.LR(0.1)),
-    sphere_opt=lambda p: tz.Modular(p, tz.m.RandomizedFDM(n_samples=4, beta=0.9, seed=0), tz.m.LR(0.001)),
-    needs_closure=True,
-    func='booth', steps=50, loss=1e-5, merge_invariant=True,
-    sphere_steps=100, sphere_loss=505,
-)
 RandomizedFDM_4samples_no_pre_generate = Run(
     func_opt=lambda p: tz.Modular(p, tz.m.RandomizedFDM(n_samples=4, pre_generate=False, seed=0), tz.m.LR(0.1)),
     sphere_opt=lambda p: tz.Modular(p, tz.m.RandomizedFDM(n_samples=4, pre_generate=False, seed=0), tz.m.LR(0.001)),
torchzero/core/module.py CHANGED
@@ -531,7 +531,11 @@ class Module(ABC):
     def reset(self):
         """Resets the internal state of the module (e.g. momentum) and all children. By default clears state and global state."""
         self.state.clear()
+
+        generator = self.global_state.get("generator", None)
         self.global_state.clear()
+        if generator is not None: self.global_state["generator"] = generator
+
         for c in self.children.values(): c.reset()

     def reset_for_online(self):
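With this change, reset() keeps the "generator" entry in global_state while clearing everything else, so seeded modules keep drawing from the same RNG stream after a reset. A minimal sketch of the pattern this enables, built from the module names used in the docstring examples elsewhere in this diff (the closure signature and Modular/step interface are assumptions):

```py
import torch
import torchzero as tz

model = torch.nn.Linear(4, 1)
opt = tz.Modular(
    model.parameters(),
    tz.m.ResetEvery([tz.m.RandomizedFDM(n_samples=2, seed=0)], steps=10),
    tz.m.LR(1e-2),
)

def closure(backward=True):
    loss = model(torch.ones(8, 4)).pow(2).mean()
    if backward:
        opt.zero_grad()
        loss.backward()
    return loss

for _ in range(25):
    opt.step(closure)  # resets at steps 10 and 20 no longer discard the seeded generator
```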
@@ -50,7 +50,7 @@ class ConguateGradientBase(Transform, ABC):
     ```

     """
-    def __init__(self, defaults = None, clip_beta: bool = False, restart_interval: int | None | Literal['auto'] = None, inner: Chainable | None = None):
+    def __init__(self, defaults, clip_beta: bool, restart_interval: int | None | Literal['auto'], inner: Chainable | None = None):
         if defaults is None: defaults = {}
         defaults['restart_interval'] = restart_interval
         defaults['clip_beta'] = clip_beta
@@ -140,8 +140,8 @@ class PolakRibiere(ConguateGradientBase):
     Note:
         This requires step size to be determined via a line search, so put a line search like ``tz.m.StrongWolfe(c2=0.1, a_init="first-order")`` after this.
     """
-    def __init__(self, clip_beta=True, restart_interval: int | None = None, inner: Chainable | None = None):
-        super().__init__(clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)
+    def __init__(self, clip_beta=True, restart_interval: int | None | Literal['auto'] = 'auto', inner: Chainable | None = None):
+        super().__init__({}, clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)

     def get_beta(self, p, g, prev_g, prev_d):
         return polak_ribiere_beta(g, prev_g)
@@ -158,7 +158,7 @@ class FletcherReeves(ConguateGradientBase):
         This requires step size to be determined via a line search, so put a line search like ``tz.m.StrongWolfe(c2=0.1, a_init="first-order")`` after this.
     """
     def __init__(self, restart_interval: int | None | Literal['auto'] = 'auto', clip_beta=False, inner: Chainable | None = None):
-        super().__init__(clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)
+        super().__init__({}, clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)

     def initialize(self, p, g):
         self.global_state['prev_gg'] = g.dot(g)
@@ -183,8 +183,8 @@ class HestenesStiefel(ConguateGradientBase):
     Note:
         This requires step size to be determined via a line search, so put a line search like ``tz.m.StrongWolfe(c2=0.1, a_init="first-order")`` after this.
     """
-    def __init__(self, restart_interval: int | None | Literal['auto'] = None, clip_beta=False, inner: Chainable | None = None):
-        super().__init__(clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)
+    def __init__(self, restart_interval: int | None | Literal['auto'] = 'auto', clip_beta=False, inner: Chainable | None = None):
+        super().__init__({}, clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)

     def get_beta(self, p, g, prev_g, prev_d):
         return hestenes_stiefel_beta(g, prev_d, prev_g)
@@ -202,8 +202,8 @@ class DaiYuan(ConguateGradientBase):
     Note:
         This requires step size to be determined via a line search, so put a line search like ``tz.m.StrongWolfe(c2=0.1)`` after this.
     """
-    def __init__(self, restart_interval: int | None | Literal['auto'] = None, clip_beta=False, inner: Chainable | None = None):
-        super().__init__(clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)
+    def __init__(self, restart_interval: int | None | Literal['auto'] = 'auto', clip_beta=False, inner: Chainable | None = None):
+        super().__init__({}, clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)

     def get_beta(self, p, g, prev_g, prev_d):
         return dai_yuan_beta(g, prev_d, prev_g)
@@ -221,8 +221,8 @@ class LiuStorey(ConguateGradientBase):
     Note:
         This requires step size to be determined via a line search, so put a line search like ``tz.m.StrongWolfe(c2=0.1, a_init="first-order")`` after this.
     """
-    def __init__(self, restart_interval: int | None | Literal['auto'] = None, clip_beta=False, inner: Chainable | None = None):
-        super().__init__(clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)
+    def __init__(self, restart_interval: int | None | Literal['auto'] = 'auto', clip_beta=False, inner: Chainable | None = None):
+        super().__init__({}, clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)

     def get_beta(self, p, g, prev_g, prev_d):
         return liu_storey_beta(g, prev_d, prev_g)
@@ -239,8 +239,8 @@ class ConjugateDescent(ConguateGradientBase):
     Note:
         This requires step size to be determined via a line search, so put a line search like ``tz.m.StrongWolfe(c2=0.1, a_init="first-order")`` after this.
     """
-    def __init__(self, restart_interval: int | None | Literal['auto'] = None, clip_beta=False, inner: Chainable | None = None):
-        super().__init__(clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)
+    def __init__(self, restart_interval: int | None | Literal['auto'] = 'auto', clip_beta=False, inner: Chainable | None = None):
+        super().__init__({}, clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)

     def get_beta(self, p, g, prev_g, prev_d):
         return conjugate_descent_beta(g, prev_d, prev_g)
@@ -264,8 +264,8 @@ class HagerZhang(ConguateGradientBase):
     Note:
         This requires step size to be determined via a line search, so put a line search like ``tz.m.StrongWolfe(c2=0.1, a_init="first-order")`` after this.
     """
-    def __init__(self, restart_interval: int | None | Literal['auto'] = None, clip_beta=False, inner: Chainable | None = None):
-        super().__init__(clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)
+    def __init__(self, restart_interval: int | None | Literal['auto'] = 'auto', clip_beta=False, inner: Chainable | None = None):
+        super().__init__({}, clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)

     def get_beta(self, p, g, prev_g, prev_d):
         return hager_zhang_beta(g, prev_d, prev_g)
@@ -291,8 +291,8 @@ class DYHS(ConguateGradientBase):
     Note:
         This requires step size to be determined via a line search, so put a line search like ``tz.m.StrongWolfe(c2=0.1, a_init="first-order")`` after this.
     """
-    def __init__(self, restart_interval: int | None | Literal['auto'] = None, clip_beta=False, inner: Chainable | None = None):
-        super().__init__(clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)
+    def __init__(self, restart_interval: int | None | Literal['auto'] = 'auto', clip_beta=False, inner: Chainable | None = None):
+        super().__init__({}, clip_beta=clip_beta, restart_interval=restart_interval, inner=inner)

     def get_beta(self, p, g, prev_g, prev_d):
         return dyhs_beta(g, prev_d, prev_g)
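All of these conjugate-gradient constructors now pass an explicit `{}` defaults dict to the base class and default `restart_interval` to `'auto'`. Their docstring Notes recommend pairing them with a strong-Wolfe line search; a hedged sketch of that pairing, assuming the classes are exposed under `tz.m` like the other modules in this diff:

```py
import torch
import torchzero as tz

model = torch.nn.Linear(4, 1)
opt = tz.Modular(
    model.parameters(),
    tz.m.PolakRibiere(),                             # clip_beta=True, restart_interval='auto' by default
    tz.m.StrongWolfe(c2=0.1, a_init="first-order"),  # line search determines the step size
)
```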
@@ -0,0 +1,93 @@
+from collections.abc import Callable
+from typing import Any
+from functools import partial
+import torch
+
+from ...utils import TensorList, NumberList
+from ..grad_approximation.grad_approximator import GradApproximator, GradTarget
+
+class SPSA1(GradApproximator):
+    """One-measurement variant of SPSA. Unlike standard two-measurement SPSA, the estimated
+    gradient often won't be a descent direction, however the expectation is biased towards
+    the descent direction. Therefore this variant of SPSA is only recommended for a specific
+    class of problems where the objective function changes on each evaluation,
+    for example feedback control problems.
+
+    Args:
+        h (float, optional):
+            finite difference step size, recommended to set to same value as learning rate. Defaults to 1e-3.
+        n_samples (int, optional): number of random samples. Defaults to 1.
+        eps (float, optional): measurement noise estimate. Defaults to 1e-8.
+        seed (int | None | torch.Generator, optional): random seed. Defaults to None.
+        target (GradTarget, optional): what to set on closure. Defaults to "closure".
+
+    Reference:
+        [SPALL, JAMES C. "A One-measurement Form of Simultaneous Stochastic Approximation](https://www.jhuapl.edu/spsa/PDF-SPSA/automatica97_one_measSPSA.pdf)."
+    """
+
+    def __init__(
+        self,
+        h: float = 1e-3,
+        n_samples: int = 1,
+        eps: float = 1e-8, # measurement noise
+        pre_generate = False,
+        seed: int | None | torch.Generator = None,
+        target: GradTarget = "closure",
+    ):
+        defaults = dict(h=h, eps=eps, n_samples=n_samples, pre_generate=pre_generate, seed=seed)
+        super().__init__(defaults, target=target)
+
+
+    def pre_step(self, var):
+
+        if self.defaults['pre_generate']:
+
+            params = TensorList(var.params)
+            generator = self.get_generator(params[0].device, self.defaults['seed'])
+
+            n_samples = self.defaults['n_samples']
+            h = self.get_settings(var.params, 'h')
+
+            perturbations = [params.sample_like(distribution='rademacher', generator=generator) for _ in range(n_samples)]
+            torch._foreach_mul_([p for l in perturbations for p in l], [v for vv in h for v in [vv]*n_samples])
+
+            for param, prt in zip(params, zip(*perturbations)):
+                self.state[param]['perturbations'] = prt
+
+    @torch.no_grad
+    def approximate(self, closure, params, loss):
+        generator = self.get_generator(params[0].device, self.defaults['seed'])
+
+        params = TensorList(params)
+        orig_params = params.clone() # store to avoid small changes due to float imprecision
+        loss_approx = None
+
+        h, eps = self.get_settings(params, "h", "eps", cls=NumberList)
+        n_samples = self.defaults['n_samples']
+
+        default = [None]*n_samples
+        # perturbations are pre-multiplied by h
+        perturbations = list(zip(*(self.state[p].get('perturbations', default) for p in params)))
+
+        grad = None
+        for i in range(n_samples):
+            prt = perturbations[i]
+
+            if prt[0] is None:
+                prt = params.sample_like('rademacher', generator=generator).mul_(h)
+
+            else: prt = TensorList(prt)
+
+            params += prt
+            L = closure(False)
+            params.copy_(orig_params)
+
+            sample = prt * ((L + eps) / h)
+            if grad is None: grad = sample
+            else: grad += sample
+
+        assert grad is not None
+        if n_samples > 1: grad.div_(n_samples)
+
+        # mean if got per-sample values
+        return grad, loss, loss_approx
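A hedged usage sketch for this new one-measurement SPSA module. Only the class itself appears in the diff, so the `tz.m.SPSA1` export path and the `closure(backward)` convention are assumptions:

```py
import torch
import torchzero as tz

model = torch.nn.Linear(8, 1)
opt = tz.Modular(model.parameters(), tz.m.SPSA1(h=1e-3), tz.m.LR(1e-3))

x, y = torch.randn(32, 8), torch.randn(32, 1)

def closure(backward=True):
    # SPSA1 targets the closure, so it only ever calls closure(False):
    # one perturbed loss evaluation per sample, no backward pass required.
    loss = (model(x) - y).pow(2).mean()
    if backward:
        opt.zero_grad()
        loss.backward()
    return loss

opt.step(closure)
```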
@@ -1,4 +1,4 @@
 from .grad_approximator import GradApproximator, GradTarget
 from .fdm import FDM
 from .rfdm import RandomizedFDM, MeZO, SPSA, RDSA, GaussianSmoothing
-from .forward_gradient import ForwardGradient
+from .forward_gradient import ForwardGradient
@@ -23,8 +23,6 @@ class ForwardGradient(RandomizedFDM):
     Args:
         n_samples (int, optional): number of random gradient samples. Defaults to 1.
         distribution (Distributions, optional): distribution for random gradient samples. Defaults to "gaussian".
-        beta (float, optional):
-            If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
         pre_generate (bool, optional):
             whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
         jvp_method (str, optional):
@@ -40,14 +38,13 @@ class ForwardGradient(RandomizedFDM):
         self,
         n_samples: int = 1,
         distribution: Distributions = "gaussian",
-        beta: float = 0,
         pre_generate = True,
         jvp_method: Literal['autograd', 'forward', 'central'] = 'autograd',
         h: float = 1e-3,
         target: GradTarget = "closure",
         seed: int | None | torch.Generator = None,
     ):
-        super().__init__(h=h, n_samples=n_samples, distribution=distribution, beta=beta, target=target, pre_generate=pre_generate, seed=seed)
+        super().__init__(h=h, n_samples=n_samples, distribution=distribution, target=target, pre_generate=pre_generate, seed=seed)
         self.defaults['jvp_method'] = jvp_method

     @torch.no_grad
@@ -62,7 +59,7 @@ class ForwardGradient(RandomizedFDM):
         distribution = settings['distribution']
         default = [None]*n_samples
         perturbations = list(zip(*(self.state[p].get('perturbations', default) for p in params)))
-        generator = self._get_generator(settings['seed'], params)
+        generator = self.get_generator(params[0].device, self.defaults['seed'])

         grad = None
         for i in range(n_samples):
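ForwardGradient now fetches its generator through get_generator and no longer accepts a beta momentum argument. A hedged construction sketch (the `tz.m.ForwardGradient` export path is an assumption):

```py
import torch
import torchzero as tz

model = torch.nn.Linear(4, 1)
opt = tz.Modular(
    model.parameters(),
    tz.m.ForwardGradient(n_samples=4, jvp_method="autograd"),  # gradient from JVPs along random directions
    tz.m.LR(1e-3),
)
```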
@@ -164,7 +164,6 @@ class RandomizedFDM(GradApproximator):
         formula (_FD_Formula, optional): finite difference formula. Defaults to 'central2'.
         distribution (Distributions, optional): distribution. Defaults to "rademacher".
             If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
-        beta (float, optional): optinal momentum for generated perturbations. Defaults to 1e-3.
         pre_generate (bool, optional):
             whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
         seed (int | None | torch.Generator, optional): Seed for random generator. Defaults to None.
@@ -173,7 +172,7 @@ class RandomizedFDM(GradApproximator):
     Examples:
         #### Simultaneous perturbation stochastic approximation (SPSA) method

-        SPSA is randomized finite differnce with rademacher distribution and central formula.
+        SPSA is randomized FDM with rademacher distribution and central formula.
         ```py
         spsa = tz.Modular(
             model.parameters(),
@@ -184,8 +183,7 @@ class RandomizedFDM(GradApproximator):

         #### Random-direction stochastic approximation (RDSA) method

-        RDSA is randomized finite differnce with usually gaussian distribution and central formula.
-
+        RDSA is randomized FDM with usually gaussian distribution and central formula.
         ```
         rdsa = tz.Modular(
             model.parameters(),
@@ -194,23 +192,9 @@ class RandomizedFDM(GradApproximator):
         )
         ```

-        #### RandomizedFDM with momentum
-
-        Momentum might help by reducing the variance of the estimated gradients.
-
-        ```
-        momentum_spsa = tz.Modular(
-            model.parameters(),
-            tz.m.RandomizedFDM(),
-            tz.m.HeavyBall(0.9),
-            tz.m.LR(1e-3)
-        )
-        ```
-
         #### Gaussian smoothing method

         GS uses many gaussian samples with possibly a larger finite difference step size.
-
         ```
         gs = tz.Modular(
             model.parameters(),
@@ -220,44 +204,15 @@ class RandomizedFDM(GradApproximator):
         )
         ```

-        #### SPSA-NewtonCG
-
-        NewtonCG with hessian-vector product estimated via gradient difference
-        calls closure multiple times per step. If each closure call estimates gradients
-        with different perturbations, NewtonCG is unable to produce useful directions.
-
-        By setting pre_generate to True, perturbations are generated once before each step,
-        and each closure call estimates gradients using the same pre-generated perturbations.
-        This way closure-based algorithms are able to use gradients estimated in a consistent way.
+        #### RandomizedFDM with momentum

+        Momentum might help by reducing the variance of the estimated gradients.
         ```
-        opt = tz.Modular(
+        momentum_spsa = tz.Modular(
             model.parameters(),
-            tz.m.RandomizedFDM(n_samples=10),
-            tz.m.NewtonCG(hvp_method="forward", pre_generate=True),
-            tz.m.Backtracking()
-        )
-        ```
-
-        #### SPSA-LBFGS
-
-        LBFGS uses a memory of past parameter and gradient differences. If past gradients
-        were estimated with different perturbations, LBFGS directions will be useless.
-
-        To alleviate this momentum can be added to random perturbations to make sure they only
-        change by a little bit, and the history stays relevant. The momentum is determined by the :code:`beta` parameter.
-        The disadvantage is that the subspace the algorithm is able to explore changes slowly.
-
-        Additionally we will reset SPSA and LBFGS memory every 100 steps to remove influence from old gradient estimates.
-
-        ```
-        opt = tz.Modular(
-            bench.parameters(),
-            tz.m.ResetEvery(
-                [tz.m.RandomizedFDM(n_samples=10, pre_generate=True, beta=0.99), tz.m.LBFGS()],
-                steps = 100,
-            ),
-            tz.m.Backtracking()
+            tz.m.RandomizedFDM(),
+            tz.m.HeavyBall(0.9),
+            tz.m.LR(1e-3)
         )
         ```
     """
@@ -268,75 +223,46 @@ class RandomizedFDM(GradApproximator):
         n_samples: int = 1,
         formula: _FD_Formula = "central",
         distribution: Distributions = "rademacher",
-        beta: float = 0,
         pre_generate = True,
         seed: int | None | torch.Generator = None,
         target: GradTarget = "closure",
     ):
-        defaults = dict(h=h, formula=formula, n_samples=n_samples, distribution=distribution, beta=beta, pre_generate=pre_generate, seed=seed)
+        defaults = dict(h=h, formula=formula, n_samples=n_samples, distribution=distribution, pre_generate=pre_generate, seed=seed)
         super().__init__(defaults, target=target)

-    def reset(self):
-        self.state.clear()
-        generator = self.global_state.get('generator', None) # avoid resetting generator
-        self.global_state.clear()
-        if generator is not None: self.global_state['generator'] = generator
-        for c in self.children.values(): c.reset()
-
-    def _get_generator(self, seed: int | None | torch.Generator, params: list[torch.Tensor]):
-        if 'generator' not in self.global_state:
-            if isinstance(seed, torch.Generator): self.global_state['generator'] = seed
-            elif seed is not None: self.global_state['generator'] = torch.Generator(params[0].device).manual_seed(seed)
-            else: self.global_state['generator'] = None
-        return self.global_state['generator']

     def pre_step(self, var):
-        h, beta = self.get_settings(var.params, 'h', 'beta')
-
-        n_samples = self.defaults['n_samples']
-        distribution = self.defaults['distribution']
+        h = self.get_settings(var.params, 'h')
         pre_generate = self.defaults['pre_generate']

         if pre_generate:
+            n_samples = self.defaults['n_samples']
+            distribution = self.defaults['distribution']
+
             params = TensorList(var.params)
-            generator = self._get_generator(self.defaults['seed'], var.params)
+            generator = self.get_generator(params[0].device, self.defaults['seed'])
             perturbations = [params.sample_like(distribution=distribution, variance=1, generator=generator) for _ in range(n_samples)]

+            # this is false for ForwardGradient where h isn't used and it subclasses this
             if self.PRE_MULTIPLY_BY_H:
                 torch._foreach_mul_([p for l in perturbations for p in l], [v for vv in h for v in [vv]*n_samples])

-            if all(i==0 for i in beta):
-                # just use pre-generated perturbations
-                for param, prt in zip(params, zip(*perturbations)):
-                    self.state[param]['perturbations'] = prt
-
-            else:
-                # lerp old and new perturbations. This makes the subspace change gradually
-                # which in theory might improve algorithms with history
-                for i,p in enumerate(params):
-                    state = self.state[p]
-                    if 'perturbations' not in state: state['perturbations'] = [p[i] for p in perturbations]
-
-                cur = [self.state[p]['perturbations'][:n_samples] for p in params]
-                cur_flat = [p for l in cur for p in l]
-                new_flat = [p for l in zip(*perturbations) for p in l]
-                betas = [1-v for b in beta for v in [b]*n_samples]
-                torch._foreach_lerp_(cur_flat, new_flat, betas)
+            for param, prt in zip(params, zip(*perturbations)):
+                self.state[param]['perturbations'] = prt

     @torch.no_grad
     def approximate(self, closure, params, loss):
         params = TensorList(params)
-        orig_params = params.clone() # store to avoid small changes due to float imprecision
         loss_approx = None

         h = NumberList(self.settings[p]['h'] for p in params)
-        settings = self.settings[params[0]]
-        n_samples = settings['n_samples']
-        fd_fn = _RFD_FUNCS[settings['formula']]
+        n_samples = self.defaults['n_samples']
+        distribution = self.defaults['distribution']
+        fd_fn = _RFD_FUNCS[self.defaults['formula']]
+
         default = [None]*n_samples
         perturbations = list(zip(*(self.state[p].get('perturbations', default) for p in params)))
-        distribution = settings['distribution']
-        generator = self._get_generator(settings['seed'], params)
+        generator = self.get_generator(params[0].device, self.defaults['seed'])

         grad = None
         for i in range(n_samples):
@@ -356,7 +282,6 @@ class RandomizedFDM(GradApproximator):
             if grad is None: grad = prt * d
             else: grad += prt * d

-        params.set_(orig_params)
         assert grad is not None
         if n_samples > 1: grad.div_(n_samples)
@@ -384,8 +309,6 @@ class SPSA(RandomizedFDM):
         n_samples (int, optional): number of random gradient samples. Defaults to 1.
         formula (_FD_Formula, optional): finite difference formula. Defaults to 'central2'.
         distribution (Distributions, optional): distribution. Defaults to "rademacher".
-        beta (float, optional):
-            If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
         pre_generate (bool, optional):
             whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
         seed (int | None | torch.Generator, optional): Seed for random generator. Defaults to None.
@@ -408,8 +331,6 @@ class RDSA(RandomizedFDM):
         n_samples (int, optional): number of random gradient samples. Defaults to 1.
         formula (_FD_Formula, optional): finite difference formula. Defaults to 'central2'.
         distribution (Distributions, optional): distribution. Defaults to "gaussian".
-        beta (float, optional):
-            If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
         pre_generate (bool, optional):
             whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
         seed (int | None | torch.Generator, optional): Seed for random generator. Defaults to None.
@@ -425,12 +346,11 @@ class RDSA(RandomizedFDM):
         n_samples: int = 1,
         formula: _FD_Formula = "central2",
         distribution: Distributions = "gaussian",
-        beta: float = 0,
         pre_generate = True,
         target: GradTarget = "closure",
         seed: int | None | torch.Generator = None,
     ):
-        super().__init__(h=h, n_samples=n_samples,formula=formula,distribution=distribution,beta=beta,pre_generate=pre_generate,target=target,seed=seed)
+        super().__init__(h=h, n_samples=n_samples,formula=formula,distribution=distribution,pre_generate=pre_generate,target=target,seed=seed)

 class GaussianSmoothing(RandomizedFDM):
     """
@@ -445,8 +365,6 @@ class GaussianSmoothing(RandomizedFDM):
         n_samples (int, optional): number of random gradient samples. Defaults to 100.
         formula (_FD_Formula, optional): finite difference formula. Defaults to 'forward2'.
         distribution (Distributions, optional): distribution. Defaults to "gaussian".
-        beta (float, optional):
-            If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
         pre_generate (bool, optional):
             whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
         seed (int | None | torch.Generator, optional): Seed for random generator. Defaults to None.
@@ -462,12 +380,11 @@ class GaussianSmoothing(RandomizedFDM):
         n_samples: int = 100,
         formula: _FD_Formula = "forward2",
         distribution: Distributions = "gaussian",
-        beta: float = 0,
         pre_generate = True,
         target: GradTarget = "closure",
         seed: int | None | torch.Generator = None,
     ):
-        super().__init__(h=h, n_samples=n_samples,formula=formula,distribution=distribution,beta=beta,pre_generate=pre_generate,target=target,seed=seed)
+        super().__init__(h=h, n_samples=n_samples,formula=formula,distribution=distribution,pre_generate=pre_generate,target=target,seed=seed)

 class MeZO(GradApproximator):
     """Gradient approximation via memory-efficient zeroth order optimizer (MeZO) - https://arxiv.org/abs/2305.17333.
@@ -525,9 +442,9 @@ class MeZO(GradApproximator):
         loss_approx = None

         h = NumberList(self.settings[p]['h'] for p in params)
-        settings = self.settings[params[0]]
-        n_samples = settings['n_samples']
-        fd_fn = _RFD_FUNCS[settings['formula']]
+        n_samples = self.defaults['n_samples']
+        fd_fn = _RFD_FUNCS[self.defaults['formula']]
+
         prt_fns = self.global_state['prt_fns']

         grad = None
@@ -1,3 +1,4 @@
+import math
 from collections.abc import Mapping
 from operator import itemgetter

@@ -17,6 +18,7 @@ class ScipyMinimizeScalar(LineSearchBase):
         bounds (Sequence | None, optional):
             For method ‘bounded’, bounds is mandatory and must have two finite items corresponding to the optimization bounds. Defaults to None.
         tol (float | None, optional): Tolerance for termination. Defaults to None.
+        prev_init (bool, optional): uses previous step size as initial guess for the line search.
         options (dict | None, optional): A dictionary of solver options. Defaults to None.

     For more details on methods and arguments refer to https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize_scalar.html
@@ -29,9 +31,10 @@ class ScipyMinimizeScalar(LineSearchBase):
         bracket=None,
         bounds=None,
         tol: float | None = None,
+        prev_init: bool = False,
         options=None,
     ):
-        defaults = dict(method=method,bracket=bracket,bounds=bounds,tol=tol,options=options,maxiter=maxiter)
+        defaults = dict(method=method,bracket=bracket,bounds=bounds,tol=tol,options=options,maxiter=maxiter, prev_init=prev_init)
         super().__init__(defaults)

         import scipy.optimize
@@ -48,5 +51,14 @@ class ScipyMinimizeScalar(LineSearchBase):
         options = dict(options) if isinstance(options, Mapping) else {}
         options['maxiter'] = maxiter

-        res = self.scopt.minimize_scalar(objective, method=method, bracket=bracket, bounds=bounds, tol=tol, options=options)
-        return res.x
+        if self.defaults["prev_init"] and "x_prev" in self.global_state:
+            if bracket is None: bracket = (0, 1)
+            bracket = (*bracket[:-1], self.global_state["x_prev"])
+
+        x = self.scopt.minimize_scalar(objective, method=method, bracket=bracket, bounds=bounds, tol=tol, options=options).x # pyright:ignore[reportAttributeAccessIssue]
+
+        max = torch.finfo(var.params[0].dtype).max / 2
+        if (not math.isfinite(x)) or abs(x) >= max: x = 0
+
+        self.global_state['x_prev'] = x
+        return x
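With prev_init=True, the previously accepted step size is reused as the last bracket point of the next scalar search, and non-finite or overflowing results now fall back to 0. A hedged sketch of enabling it (export paths under `tz.m` are assumptions):

```py
import torch
import torchzero as tz

model = torch.nn.Linear(4, 1)
opt = tz.Modular(
    model.parameters(),
    tz.m.FletcherReeves(),                                     # direction module
    tz.m.ScipyMinimizeScalar(method="brent", prev_init=True),  # warm-started scalar line search
)
```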
@@ -330,7 +330,6 @@ class StrongWolfe(LineSearchBase):
         if adaptive:
             a_init *= self.global_state.get('initial_scale', 1)

-
         strong_wolfe = _StrongWolfe(
             f=objective,
             f_0=f_0,
@@ -360,7 +359,6 @@ class StrongWolfe(LineSearchBase):
         if inverted: a = -a

         if a is not None and a != 0 and math.isfinite(a):
-            #self.global_state['initial_scale'] = min(1.0, self.global_state.get('initial_scale', 1) * math.sqrt(2))
             self.global_state['initial_scale'] = 1
             self.global_state['a_prev'] = a
             self.global_state['f_prev'] = f_0
@@ -60,18 +60,18 @@ class RestartStrategyBase(Module, ABC):


 class RestartOnStuck(RestartStrategyBase):
-    """Resets the state when update (difference in parameters) is close to zero for multiple steps in a row.
+    """Resets the state when update (difference in parameters) is zero for multiple steps in a row.

     Args:
         modules (Chainable | None):
             modules to reset. If None, resets all modules.
         tol (float, optional):
-            step is considered failed when maximum absolute parameter difference is smaller than this. Defaults to 1e-10.
+            step is considered failed when maximum absolute parameter difference is smaller than this. Defaults to None (uses twice the smallest respresentable number)
         n_tol (int, optional):
-            number of failed consequtive steps required to trigger a reset. Defaults to 4.
+            number of failed consequtive steps required to trigger a reset. Defaults to 10.

     """
-    def __init__(self, modules: Chainable | None, tol: float = 1e-10, n_tol: int = 4):
+    def __init__(self, modules: Chainable | None, tol: float | None = None, n_tol: int = 10):
         defaults = dict(tol=tol, n_tol=n_tol)
         super().__init__(defaults, modules)

@@ -82,6 +82,7 @@ class RestartOnStuck(RestartStrategyBase):

         params = TensorList(var.params)
         tol = self.defaults['tol']
+        if tol is None: tol = torch.finfo(params[0].dtype).tiny * 2
         n_tol = self.defaults['n_tol']
         n_bad = self.global_state.get('n_bad', 0)
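RestartOnStuck now treats tol=None as twice the smallest representable positive value for the parameter dtype and requires 10 consecutive stuck steps before resetting. A hedged sketch of wrapping a quasi-Newton chain with it, following the docstring examples elsewhere in this diff:

```py
import torch
import torchzero as tz

model = torch.nn.Linear(4, 1)
opt = tz.Modular(
    model.parameters(),
    tz.m.RestartOnStuck([tz.m.LBFGS()]),  # tol=None -> 2 * torch.finfo(dtype).tiny, n_tol=10
    tz.m.Backtracking(),
)
```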