torchzero 0.3.11__py3-none-any.whl → 0.3.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_opts.py +95 -76
- tests/test_tensorlist.py +8 -7
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +2 -2
- torchzero/core/module.py +229 -72
- torchzero/core/reformulation.py +65 -0
- torchzero/core/transform.py +44 -24
- torchzero/modules/__init__.py +13 -5
- torchzero/modules/{optimizers → adaptive}/__init__.py +5 -2
- torchzero/modules/adaptive/adagrad.py +356 -0
- torchzero/modules/{optimizers → adaptive}/adahessian.py +53 -52
- torchzero/modules/{optimizers → adaptive}/adam.py +0 -3
- torchzero/modules/{optimizers → adaptive}/adan.py +26 -40
- torchzero/modules/{optimizers → adaptive}/adaptive_heavyball.py +3 -6
- torchzero/modules/adaptive/aegd.py +54 -0
- torchzero/modules/{optimizers → adaptive}/esgd.py +1 -1
- torchzero/modules/{optimizers/ladagrad.py → adaptive/lmadagrad.py} +42 -39
- torchzero/modules/{optimizers → adaptive}/mars.py +24 -36
- torchzero/modules/adaptive/matrix_momentum.py +146 -0
- torchzero/modules/{optimizers → adaptive}/msam.py +14 -12
- torchzero/modules/{optimizers → adaptive}/muon.py +19 -20
- torchzero/modules/adaptive/natural_gradient.py +175 -0
- torchzero/modules/{optimizers → adaptive}/rprop.py +0 -2
- torchzero/modules/{optimizers → adaptive}/sam.py +1 -1
- torchzero/modules/{optimizers → adaptive}/shampoo.py +8 -4
- torchzero/modules/{optimizers → adaptive}/soap.py +27 -50
- torchzero/modules/{optimizers → adaptive}/sophia_h.py +2 -3
- torchzero/modules/clipping/clipping.py +85 -92
- torchzero/modules/clipping/ema_clipping.py +5 -5
- torchzero/modules/conjugate_gradient/__init__.py +11 -0
- torchzero/modules/{quasi_newton → conjugate_gradient}/cg.py +355 -369
- torchzero/modules/experimental/__init__.py +9 -32
- torchzero/modules/experimental/dct.py +2 -2
- torchzero/modules/experimental/fft.py +2 -2
- torchzero/modules/experimental/gradmin.py +4 -3
- torchzero/modules/experimental/l_infinity.py +111 -0
- torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +3 -40
- torchzero/modules/experimental/newton_solver.py +79 -17
- torchzero/modules/experimental/newtonnewton.py +27 -14
- torchzero/modules/experimental/scipy_newton_cg.py +105 -0
- torchzero/modules/experimental/spsa1.py +93 -0
- torchzero/modules/experimental/structural_projections.py +1 -1
- torchzero/modules/functional.py +50 -14
- torchzero/modules/grad_approximation/__init__.py +1 -1
- torchzero/modules/grad_approximation/fdm.py +19 -20
- torchzero/modules/grad_approximation/forward_gradient.py +6 -7
- torchzero/modules/grad_approximation/grad_approximator.py +43 -47
- torchzero/modules/grad_approximation/rfdm.py +114 -175
- torchzero/modules/higher_order/__init__.py +1 -1
- torchzero/modules/higher_order/higher_order_newton.py +31 -23
- torchzero/modules/least_squares/__init__.py +1 -0
- torchzero/modules/least_squares/gn.py +161 -0
- torchzero/modules/line_search/__init__.py +2 -2
- torchzero/modules/line_search/_polyinterp.py +289 -0
- torchzero/modules/line_search/adaptive.py +69 -44
- torchzero/modules/line_search/backtracking.py +83 -70
- torchzero/modules/line_search/line_search.py +159 -68
- torchzero/modules/line_search/scipy.py +16 -4
- torchzero/modules/line_search/strong_wolfe.py +319 -220
- torchzero/modules/misc/__init__.py +8 -0
- torchzero/modules/misc/debug.py +4 -4
- torchzero/modules/misc/escape.py +9 -7
- torchzero/modules/misc/gradient_accumulation.py +88 -22
- torchzero/modules/misc/homotopy.py +59 -0
- torchzero/modules/misc/misc.py +82 -15
- torchzero/modules/misc/multistep.py +47 -11
- torchzero/modules/misc/regularization.py +5 -9
- torchzero/modules/misc/split.py +55 -35
- torchzero/modules/misc/switch.py +1 -1
- torchzero/modules/momentum/__init__.py +1 -5
- torchzero/modules/momentum/averaging.py +3 -3
- torchzero/modules/momentum/cautious.py +42 -47
- torchzero/modules/momentum/momentum.py +35 -1
- torchzero/modules/ops/__init__.py +9 -1
- torchzero/modules/ops/binary.py +9 -8
- torchzero/modules/{momentum/ema.py → ops/higher_level.py} +10 -33
- torchzero/modules/ops/multi.py +15 -15
- torchzero/modules/ops/reduce.py +1 -1
- torchzero/modules/ops/utility.py +12 -8
- torchzero/modules/projections/projection.py +4 -4
- torchzero/modules/quasi_newton/__init__.py +1 -16
- torchzero/modules/quasi_newton/damping.py +105 -0
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -163
- torchzero/modules/quasi_newton/lbfgs.py +256 -200
- torchzero/modules/quasi_newton/lsr1.py +167 -132
- torchzero/modules/quasi_newton/quasi_newton.py +346 -446
- torchzero/modules/restarts/__init__.py +7 -0
- torchzero/modules/restarts/restars.py +253 -0
- torchzero/modules/second_order/__init__.py +2 -1
- torchzero/modules/second_order/multipoint.py +238 -0
- torchzero/modules/second_order/newton.py +133 -88
- torchzero/modules/second_order/newton_cg.py +207 -170
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/sampling.py +300 -0
- torchzero/modules/step_size/__init__.py +1 -1
- torchzero/modules/step_size/adaptive.py +312 -47
- torchzero/modules/termination/__init__.py +14 -0
- torchzero/modules/termination/termination.py +207 -0
- torchzero/modules/trust_region/__init__.py +5 -0
- torchzero/modules/trust_region/cubic_regularization.py +170 -0
- torchzero/modules/trust_region/dogleg.py +92 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
- torchzero/modules/trust_region/trust_cg.py +99 -0
- torchzero/modules/trust_region/trust_region.py +350 -0
- torchzero/modules/variance_reduction/__init__.py +1 -0
- torchzero/modules/variance_reduction/svrg.py +208 -0
- torchzero/modules/weight_decay/weight_decay.py +65 -64
- torchzero/modules/zeroth_order/__init__.py +1 -0
- torchzero/modules/zeroth_order/cd.py +122 -0
- torchzero/optim/root.py +65 -0
- torchzero/optim/utility/split.py +8 -8
- torchzero/optim/wrappers/directsearch.py +0 -1
- torchzero/optim/wrappers/fcmaes.py +3 -2
- torchzero/optim/wrappers/nlopt.py +0 -2
- torchzero/optim/wrappers/optuna.py +2 -2
- torchzero/optim/wrappers/scipy.py +81 -22
- torchzero/utils/__init__.py +40 -4
- torchzero/utils/compile.py +1 -1
- torchzero/utils/derivatives.py +123 -111
- torchzero/utils/linalg/__init__.py +9 -2
- torchzero/utils/linalg/linear_operator.py +329 -0
- torchzero/utils/linalg/matrix_funcs.py +2 -2
- torchzero/utils/linalg/orthogonalize.py +2 -1
- torchzero/utils/linalg/qr.py +2 -2
- torchzero/utils/linalg/solve.py +226 -154
- torchzero/utils/metrics.py +83 -0
- torchzero/utils/optimizer.py +2 -2
- torchzero/utils/python_tools.py +7 -0
- torchzero/utils/tensorlist.py +105 -34
- torchzero/utils/torch_tools.py +9 -4
- torchzero-0.3.14.dist-info/METADATA +14 -0
- torchzero-0.3.14.dist-info/RECORD +167 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.14.dist-info}/top_level.txt +0 -1
- docs/source/conf.py +0 -59
- docs/source/docstring template.py +0 -46
- torchzero/modules/experimental/absoap.py +0 -253
- torchzero/modules/experimental/adadam.py +0 -118
- torchzero/modules/experimental/adamY.py +0 -131
- torchzero/modules/experimental/adam_lambertw.py +0 -149
- torchzero/modules/experimental/adaptive_step_size.py +0 -90
- torchzero/modules/experimental/adasoap.py +0 -177
- torchzero/modules/experimental/cosine.py +0 -214
- torchzero/modules/experimental/cubic_adam.py +0 -97
- torchzero/modules/experimental/eigendescent.py +0 -120
- torchzero/modules/experimental/etf.py +0 -195
- torchzero/modules/experimental/exp_adam.py +0 -113
- torchzero/modules/experimental/expanded_lbfgs.py +0 -141
- torchzero/modules/experimental/hnewton.py +0 -85
- torchzero/modules/experimental/modular_lbfgs.py +0 -265
- torchzero/modules/experimental/parabolic_search.py +0 -220
- torchzero/modules/experimental/subspace_preconditioners.py +0 -145
- torchzero/modules/experimental/tensor_adagrad.py +0 -42
- torchzero/modules/line_search/polynomial.py +0 -233
- torchzero/modules/momentum/matrix_momentum.py +0 -193
- torchzero/modules/optimizers/adagrad.py +0 -165
- torchzero/modules/quasi_newton/trust_region.py +0 -397
- torchzero/modules/smoothing/gaussian.py +0 -198
- torchzero-0.3.11.dist-info/METADATA +0 -404
- torchzero-0.3.11.dist-info/RECORD +0 -159
- torchzero-0.3.11.dist-info/licenses/LICENSE +0 -21
- /torchzero/modules/{optimizers → adaptive}/lion.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/orthograd.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/rmsprop.py +0 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.14.dist-info}/WHEEL +0 -0
torchzero/modules/experimental/spsa1.py
ADDED
@@ -0,0 +1,93 @@
+from collections.abc import Callable
+from typing import Any
+from functools import partial
+import torch
+
+from ...utils import TensorList, NumberList
+from ..grad_approximation.grad_approximator import GradApproximator, GradTarget
+
+class SPSA1(GradApproximator):
+    """One-measurement variant of SPSA. Unlike standard two-measurement SPSA, the estimated
+    gradient often won't be a descent direction, however the expectation is biased towards
+    the descent direction. Therefore this variant of SPSA is only recommended for a specific
+    class of problems where the objective function changes on each evaluation,
+    for example feedback control problems.
+
+    Args:
+        h (float, optional):
+            finite difference step size, recommended to set to same value as learning rate. Defaults to 1e-3.
+        n_samples (int, optional): number of random samples. Defaults to 1.
+        eps (float, optional): measurement noise estimate. Defaults to 1e-8.
+        seed (int | None | torch.Generator, optional): random seed. Defaults to None.
+        target (GradTarget, optional): what to set on closure. Defaults to "closure".
+
+    Reference:
+        [SPALL, JAMES C. "A One-measurement Form of Simultaneous Stochastic Approximation](https://www.jhuapl.edu/spsa/PDF-SPSA/automatica97_one_measSPSA.pdf)."
+    """
+
+    def __init__(
+        self,
+        h: float = 1e-3,
+        n_samples: int = 1,
+        eps: float = 1e-8, # measurement noise
+        pre_generate = False,
+        seed: int | None | torch.Generator = None,
+        target: GradTarget = "closure",
+    ):
+        defaults = dict(h=h, eps=eps, n_samples=n_samples, pre_generate=pre_generate, seed=seed)
+        super().__init__(defaults, target=target)
+
+
+    def pre_step(self, var):
+
+        if self.defaults['pre_generate']:
+
+            params = TensorList(var.params)
+            generator = self.get_generator(params[0].device, self.defaults['seed'])
+
+            n_samples = self.defaults['n_samples']
+            h = self.get_settings(var.params, 'h')
+
+            perturbations = [params.sample_like(distribution='rademacher', generator=generator) for _ in range(n_samples)]
+            torch._foreach_mul_([p for l in perturbations for p in l], [v for vv in h for v in [vv]*n_samples])
+
+            for param, prt in zip(params, zip(*perturbations)):
+                self.state[param]['perturbations'] = prt
+
+    @torch.no_grad
+    def approximate(self, closure, params, loss):
+        generator = self.get_generator(params[0].device, self.defaults['seed'])
+
+        params = TensorList(params)
+        orig_params = params.clone() # store to avoid small changes due to float imprecision
+        loss_approx = None
+
+        h, eps = self.get_settings(params, "h", "eps", cls=NumberList)
+        n_samples = self.defaults['n_samples']
+
+        default = [None]*n_samples
+        # perturbations are pre-multiplied by h
+        perturbations = list(zip(*(self.state[p].get('perturbations', default) for p in params)))
+
+        grad = None
+        for i in range(n_samples):
+            prt = perturbations[i]
+
+            if prt[0] is None:
+                prt = params.sample_like('rademacher', generator=generator).mul_(h)
+
+            else: prt = TensorList(prt)
+
+            params += prt
+            L = closure(False)
+            params.copy_(orig_params)
+
+            sample = prt * ((L + eps) / h)
+            if grad is None: grad = sample
+            else: grad += sample
+
+        assert grad is not None
+        if n_samples > 1: grad.div_(n_samples)
+
+        # mean if got per-sample values
+        return grad, loss, loss_approx
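The new `SPSA1` module plugs into the same modular pipeline as the other gradient approximators. A minimal usage sketch, assuming the class is exported as `tz.m.SPSA1` like the other modules (mirroring the `FDM` examples further down in this diff); the model, data, and exact closure signature here are illustrative placeholders:

```python
import torch
import torchzero as tz

model = torch.nn.Linear(10, 1)

# one-measurement SPSA as the gradient source, followed by a fixed step size;
# the docstring recommends setting h to roughly the same value as the learning rate
opt = tz.Modular(
    model.parameters(),
    tz.m.SPSA1(h=1e-2, n_samples=4),
    tz.m.LR(1e-2),
)

def closure(backward=True):
    # SPSA1 never calls backward(); it only re-evaluates the loss at perturbed parameters
    return model(torch.randn(8, 10)).pow(2).mean()

opt.step(closure)
```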
torchzero/modules/functional.py
CHANGED
@@ -9,9 +9,17 @@ Additional functional variants are present in most module files, e.g. `adam_`, `
 """
 from collections.abc import Callable
 from typing import overload
+
 import torch
 
-from ..utils import
+from ..utils import (
+    NumberList,
+    TensorList,
+    generic_finfo_eps,
+    generic_max,
+    generic_sum,
+    tofloat,
+)
 
 inf = float('inf')
 
@@ -87,10 +95,10 @@ def root(tensors_:TensorList, p:float, inplace: bool):
         if p == 1: return tensors_.abs_()
         if p == 2: return tensors_.sqrt_()
         return tensors_.pow_(1/p)
-
-
-
-
+
+    if p == 1: return tensors_.abs()
+    if p == 2: return tensors_.sqrt()
+    return tensors_.pow(1/p)
 
 
 def ema_(
@@ -207,13 +215,41 @@ def sqrt_centered_ema_sq_(
         ema_sq_fn=lambda *a, **kw: centered_ema_sq_(*a, **kw, exp_avg_=exp_avg_)
     )
 
-
-
-
-
-
-
-
-
-
+def initial_step_size(tensors: torch.Tensor | TensorList, eps=None) -> float:
+    """initial scaling taken from pytorch L-BFGS to avoid requiring a lot of line search iterations,
+    this version is safer and makes sure largest value isn't smaller than epsilon."""
+    tensors_abs = tensors.abs()
+    tensors_sum = generic_sum(tensors_abs)
+    tensors_max = generic_max(tensors_abs)
+
+    feps = generic_finfo_eps(tensors)
+    if eps is None: eps = feps
+    else: eps = max(eps, feps)
+
+    # scale should not make largest value smaller than epsilon
+    min = eps / tensors_max
+    if min >= 1: return 1.0
+
+    scale = 1 / tensors_sum
+    scale = scale.clip(min=min.item(), max=1)
+    return scale.item()
+
+
+def epsilon_step_size(tensors: torch.Tensor | TensorList, alpha=1e-7) -> float:
+    """makes sure largest value isn't smaller than epsilon."""
+    tensors_abs = tensors.abs()
+    tensors_max = generic_max(tensors_abs)
+    if tensors_max < alpha: return 1.0
+
+    if tensors_max < 1: alpha = alpha / tensors_max
+    return tofloat(alpha)
+
+
+
+def safe_clip(x: torch.Tensor, min=None):
+    """makes sure absolute value of scalar tensor x is not smaller than min"""
+    assert x.numel() == 1, x.shape
+    if min is None: min = torch.finfo(x.dtype).tiny * 2
 
+    if x.abs() < min: return x.new_full(x.size(), min).copysign(x)
+    return x
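The new `initial_step_size` helper reproduces the PyTorch L-BFGS heuristic of scaling the very first step by `1 / sum(|g|)`, clamped so that the largest component of the scaled tensor cannot drop below machine epsilon. A standalone sketch of the same rule for a single tensor (the torchzero version above works on tensors or `TensorList`s through the `generic_*` helpers it imports):

```python
import torch

def lbfgs_style_initial_step(grad: torch.Tensor, eps: float | None = None) -> float:
    # step = clip(1 / sum(|g|), eps / max(|g|), 1), matching initial_step_size above
    g = grad.abs()
    feps = torch.finfo(grad.dtype).eps
    eps = feps if eps is None else max(eps, feps)

    lo = eps / g.max()  # scaling below this would push max(|g|) under eps
    if lo >= 1:
        return 1.0

    scale = (1.0 / g.sum()).clip(min=lo.item(), max=1.0)
    return scale.item()

print(lbfgs_style_initial_step(torch.tensor([0.5, -2.0, 1.5])))  # 1 / 4.0 = 0.25
```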
torchzero/modules/grad_approximation/fdm.py
CHANGED
@@ -93,7 +93,7 @@ _FD_FUNCS = {
 class FDM(GradApproximator):
     """Approximate gradients via finite difference method.
 
-
+    Note:
         This module is a gradient approximator. It modifies the closure to evaluate the estimated gradients,
         and further closure-based modules will use the modified closure. All modules after this will use estimated gradients.
 
@@ -103,24 +103,23 @@ class FDM(GradApproximator):
         target (GradTarget, optional): what to set on var. Defaults to 'closure'.
 
     Examples:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        )
+        plain FDM:
+
+        ```python
+        fdm = tz.Modular(model.parameters(), tz.m.FDM(), tz.m.LR(1e-2))
+        ```
+
+        Any gradient-based method can use FDM-estimated gradients.
+        ```python
+        fdm_ncg = tz.Modular(
+            model.parameters(),
+            tz.m.FDM(),
+            # set hvp_method to "forward" so that it
+            # uses gradient difference instead of autograd
+            tz.m.NewtonCG(hvp_method="forward"),
+            tz.m.Backtracking()
+        )
+        ```
     """
     def __init__(self, h: float=1e-3, formula: _FD_Formula = 'central', target: GradTarget = 'closure'):
         defaults = dict(h=h, formula=formula)
@@ -139,7 +138,7 @@ class FDM(GradApproximator):
             h = settings['h']
            fd_fn = _FD_FUNCS[settings['formula']]
 
-            p_flat = p.
+            p_flat = p.ravel(); g_flat = g.ravel()
             for i in range(len(p_flat)):
                 loss, loss_approx, d = fd_fn(closure=closure, param=p_flat, idx=i, h=h, v_0=loss)
                 g_flat[i] = d
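For reference, the default `'central'` formula that this loop dispatches to estimates each partial derivative from two perturbed evaluations per coordinate. A minimal sketch of that rule on a flattened parameter (the real `_FD_FUNCS` entries take the closure, the flattened parameter, an index, and the cached `v_0` loss, as the loop above shows; this standalone version is only illustrative):

```python
import torch

def central_difference(f, x: torch.Tensor, i: int, h: float = 1e-3) -> float:
    # df/dx_i ≈ (f(x + h*e_i) - f(x - h*e_i)) / (2h)
    orig = x[i].item()
    x[i] = orig + h
    f_plus = f(x)
    x[i] = orig - h
    f_minus = f(x)
    x[i] = orig  # restore the original value
    return (f_plus - f_minus) / (2 * h)

x = torch.tensor([1.0, 2.0])
g0 = central_difference(lambda t: (t ** 2).sum().item(), x, i=0)  # ≈ 2.0
```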
torchzero/modules/grad_approximation/forward_gradient.py
CHANGED
@@ -15,7 +15,7 @@ class ForwardGradient(RandomizedFDM):
 
     This method samples one or more directional derivatives evaluated via autograd jacobian-vector products. This is very similar to randomized finite difference.
 
-
+    Note:
         This module is a gradient approximator. It modifies the closure to evaluate the estimated gradients,
         and further closure-based modules will use the modified closure. All modules after this will use estimated gradients.
 
@@ -23,8 +23,6 @@ class ForwardGradient(RandomizedFDM):
     Args:
         n_samples (int, optional): number of random gradient samples. Defaults to 1.
         distribution (Distributions, optional): distribution for random gradient samples. Defaults to "gaussian".
-        beta (float, optional):
-            If this is set to a value higher than zero, instead of using directional derivatives in a new random direction on each step, the direction changes gradually with momentum based on this value. This may make it possible to use methods with memory. Defaults to 0.
         pre_generate (bool, optional):
             whether to pre-generate gradient samples before each step. If samples are not pre-generated, whenever a method performs multiple closure evaluations, the gradient will be evaluated in different directions each time. Defaults to True.
         jvp_method (str, optional):
@@ -40,14 +38,13 @@ class ForwardGradient(RandomizedFDM):
         self,
         n_samples: int = 1,
         distribution: Distributions = "gaussian",
-        beta: float = 0,
         pre_generate = True,
         jvp_method: Literal['autograd', 'forward', 'central'] = 'autograd',
         h: float = 1e-3,
         target: GradTarget = "closure",
         seed: int | None | torch.Generator = None,
     ):
-        super().__init__(h=h, n_samples=n_samples, distribution=distribution,
+        super().__init__(h=h, n_samples=n_samples, distribution=distribution, target=target, pre_generate=pre_generate, seed=seed)
         self.defaults['jvp_method'] = jvp_method
 
     @torch.no_grad
@@ -62,12 +59,14 @@ class ForwardGradient(RandomizedFDM):
         distribution = settings['distribution']
         default = [None]*n_samples
         perturbations = list(zip(*(self.state[p].get('perturbations', default) for p in params)))
-        generator = self.
+        generator = self.get_generator(params[0].device, self.defaults['seed'])
 
         grad = None
         for i in range(n_samples):
             prt = perturbations[i]
-            if prt[0] is None:
+            if prt[0] is None:
+                prt = params.sample_like(distribution=distribution, variance=1, generator=generator)
+
             else: prt = TensorList(prt)
 
             if jvp_method == 'autograd':
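With `jvp_method='autograd'`, the directional derivative is computed by a true forward-mode Jacobian-vector product instead of a finite difference. A minimal sketch of the underlying forward-gradient estimator using `torch.func.jvp` (the module itself works on whole parameter lists and supports `n_samples` averaging and pre-generated perturbations; this snippet only illustrates the single-tensor, single-sample case):

```python
import torch
from torch.func import jvp

def forward_gradient(f, x: torch.Tensor) -> torch.Tensor:
    # sample a random direction v and evaluate the directional derivative <grad f(x), v>
    v = torch.randn_like(x)
    _, dir_deriv = jvp(f, (x,), (v,))
    # v * <grad f(x), v> is an unbiased estimate of grad f(x) for standard gaussian v
    return v * dir_deriv

x = torch.tensor([1.0, 2.0, 3.0])
g_est = forward_gradient(lambda t: (t ** 2).sum(), x)  # noisy estimate of [2., 4., 6.]
```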
torchzero/modules/grad_approximation/grad_approximator.py
CHANGED
@@ -24,63 +24,59 @@ class GradApproximator(Module, ABC):
 
     Example:
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    """
+        Basic SPSA method implementation.
+        ```python
+        class SPSA(GradApproximator):
+            def __init__(self, h=1e-3):
+                defaults = dict(h=h)
+                super().__init__(defaults)
+
+            @torch.no_grad
+            def approximate(self, closure, params, loss):
+                perturbation = [rademacher_like(p) * self.settings[p]['h'] for p in params]
+
+                # evaluate params + perturbation
+                torch._foreach_add_(params, perturbation)
+                loss_plus = closure(False)
+
+                # evaluate params - perturbation
+                torch._foreach_sub_(params, perturbation)
+                torch._foreach_sub_(params, perturbation)
+                loss_minus = closure(False)
+
+                # restore original params
+                torch._foreach_add_(params, perturbation)
+
+                # calculate SPSA gradients
+                spsa_grads = []
+                for p, pert in zip(params, perturbation):
+                    settings = self.settings[p]
+                    h = settings['h']
+                    d = (loss_plus - loss_minus) / (2*(h**2))
+                    spsa_grads.append(pert * d)
+
+                # returns tuple: (grads, loss, loss_approx)
+                # loss must be with initial parameters
+                # since we only evaluated loss with perturbed parameters
+                # we only have loss_approx
+                return spsa_grads, None, loss_plus
+        ```
+    """
     def __init__(self, defaults: dict[str, Any] | None = None, target: GradTarget = 'closure'):
         super().__init__(defaults)
         self._target: GradTarget = target
 
     @abstractmethod
-    def approximate(self, closure: Callable, params: list[torch.Tensor], loss:
-        """Returns a tuple: (grad, loss, loss_approx)
+    def approximate(self, closure: Callable, params: list[torch.Tensor], loss: torch.Tensor | None) -> tuple[Iterable[torch.Tensor], torch.Tensor | None, torch.Tensor | None]:
+        """Returns a tuple: ``(grad, loss, loss_approx)``, make sure this resets parameters to their original values!"""
 
-    def pre_step(self, var: Var) ->
+    def pre_step(self, var: Var) -> None:
         """This runs once before each step, whereas `approximate` may run multiple times per step if further modules
         evaluate gradients at multiple points. This is useful for example to pre-generate new random perturbations."""
-        return var
 
     @torch.no_grad
     def step(self, var):
-
-        if isinstance(ret, Var): var = ret
+        self.pre_step(var)
 
         if var.closure is None: raise RuntimeError("Gradient approximation requires closure")
         params, closure, loss = var.params, var.closure, var.loss
@@ -108,4 +104,4 @@ class GradApproximator(Module, ABC):
         else: raise ValueError(self._target)
         return var
 
-_FD_Formula = Literal['forward', 'forward2', 'backward', 'backward2', 'central', 'central2', 'central3', 'forward3', 'backward3', 'central4', 'forward4', 'forward5', '
+_FD_Formula = Literal['forward', 'forward2', 'backward', 'backward2', 'central', 'central2', 'central3', 'forward3', 'backward3', 'central4', 'forward4', 'forward5', 'bspsa4']