torchzero 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchzero/__init__.py +4 -0
- torchzero/core/__init__.py +13 -0
- torchzero/core/module.py +471 -0
- torchzero/core/tensorlist_optimizer.py +219 -0
- torchzero/modules/__init__.py +21 -0
- torchzero/modules/adaptive/__init__.py +4 -0
- torchzero/modules/adaptive/adaptive.py +192 -0
- torchzero/modules/experimental/__init__.py +19 -0
- torchzero/modules/experimental/experimental.py +294 -0
- torchzero/modules/experimental/quad_interp.py +104 -0
- torchzero/modules/experimental/subspace.py +259 -0
- torchzero/modules/gradient_approximation/__init__.py +7 -0
- torchzero/modules/gradient_approximation/_fd_formulas.py +3 -0
- torchzero/modules/gradient_approximation/base_approximator.py +110 -0
- torchzero/modules/gradient_approximation/fdm.py +125 -0
- torchzero/modules/gradient_approximation/forward_gradient.py +163 -0
- torchzero/modules/gradient_approximation/newton_fdm.py +198 -0
- torchzero/modules/gradient_approximation/rfdm.py +125 -0
- torchzero/modules/line_search/__init__.py +30 -0
- torchzero/modules/line_search/armijo.py +56 -0
- torchzero/modules/line_search/base_ls.py +139 -0
- torchzero/modules/line_search/directional_newton.py +217 -0
- torchzero/modules/line_search/grid_ls.py +158 -0
- torchzero/modules/line_search/scipy_minimize_scalar.py +62 -0
- torchzero/modules/meta/__init__.py +12 -0
- torchzero/modules/meta/alternate.py +65 -0
- torchzero/modules/meta/grafting.py +195 -0
- torchzero/modules/meta/optimizer_wrapper.py +173 -0
- torchzero/modules/meta/return_overrides.py +46 -0
- torchzero/modules/misc/__init__.py +10 -0
- torchzero/modules/misc/accumulate.py +43 -0
- torchzero/modules/misc/basic.py +115 -0
- torchzero/modules/misc/lr.py +96 -0
- torchzero/modules/misc/multistep.py +51 -0
- torchzero/modules/misc/on_increase.py +53 -0
- torchzero/modules/momentum/__init__.py +4 -0
- torchzero/modules/momentum/momentum.py +106 -0
- torchzero/modules/operations/__init__.py +29 -0
- torchzero/modules/operations/multi.py +298 -0
- torchzero/modules/operations/reduction.py +134 -0
- torchzero/modules/operations/singular.py +113 -0
- torchzero/modules/optimizers/__init__.py +10 -0
- torchzero/modules/optimizers/adagrad.py +49 -0
- torchzero/modules/optimizers/adam.py +118 -0
- torchzero/modules/optimizers/lion.py +28 -0
- torchzero/modules/optimizers/rmsprop.py +51 -0
- torchzero/modules/optimizers/rprop.py +99 -0
- torchzero/modules/optimizers/sgd.py +54 -0
- torchzero/modules/orthogonalization/__init__.py +2 -0
- torchzero/modules/orthogonalization/newtonschulz.py +159 -0
- torchzero/modules/orthogonalization/svd.py +86 -0
- torchzero/modules/quasi_newton/__init__.py +4 -0
- torchzero/modules/regularization/__init__.py +22 -0
- torchzero/modules/regularization/dropout.py +34 -0
- torchzero/modules/regularization/noise.py +77 -0
- torchzero/modules/regularization/normalization.py +328 -0
- torchzero/modules/regularization/ortho_grad.py +78 -0
- torchzero/modules/regularization/weight_decay.py +92 -0
- torchzero/modules/scheduling/__init__.py +2 -0
- torchzero/modules/scheduling/lr_schedulers.py +131 -0
- torchzero/modules/scheduling/step_size.py +80 -0
- torchzero/modules/second_order/__init__.py +4 -0
- torchzero/modules/second_order/newton.py +165 -0
- torchzero/modules/smoothing/__init__.py +5 -0
- torchzero/modules/smoothing/gaussian_smoothing.py +90 -0
- torchzero/modules/smoothing/laplacian_smoothing.py +128 -0
- torchzero/modules/weight_averaging/__init__.py +2 -0
- torchzero/modules/weight_averaging/ema.py +72 -0
- torchzero/modules/weight_averaging/swa.py +171 -0
- torchzero/optim/__init__.py +10 -0
- torchzero/optim/experimental/__init__.py +20 -0
- torchzero/optim/experimental/experimental.py +343 -0
- torchzero/optim/experimental/ray_search.py +83 -0
- torchzero/optim/first_order/__init__.py +18 -0
- torchzero/optim/first_order/cautious.py +158 -0
- torchzero/optim/first_order/forward_gradient.py +70 -0
- torchzero/optim/first_order/optimizers.py +570 -0
- torchzero/optim/modular.py +132 -0
- torchzero/optim/quasi_newton/__init__.py +1 -0
- torchzero/optim/quasi_newton/directional_newton.py +58 -0
- torchzero/optim/second_order/__init__.py +1 -0
- torchzero/optim/second_order/newton.py +94 -0
- torchzero/optim/wrappers/__init__.py +0 -0
- torchzero/optim/wrappers/nevergrad.py +113 -0
- torchzero/optim/wrappers/nlopt.py +165 -0
- torchzero/optim/wrappers/scipy.py +439 -0
- torchzero/optim/zeroth_order/__init__.py +4 -0
- torchzero/optim/zeroth_order/fdm.py +87 -0
- torchzero/optim/zeroth_order/newton_fdm.py +146 -0
- torchzero/optim/zeroth_order/rfdm.py +217 -0
- torchzero/optim/zeroth_order/rs.py +85 -0
- torchzero/random/__init__.py +1 -0
- torchzero/random/random.py +46 -0
- torchzero/tensorlist.py +819 -0
- torchzero/utils/__init__.py +0 -0
- torchzero/utils/compile.py +39 -0
- torchzero/utils/derivatives.py +99 -0
- torchzero/utils/python_tools.py +25 -0
- torchzero/utils/torch_tools.py +92 -0
- torchzero-0.0.1.dist-info/LICENSE +21 -0
- torchzero-0.0.1.dist-info/METADATA +118 -0
- torchzero-0.0.1.dist-info/RECORD +104 -0
- torchzero-0.0.1.dist-info/WHEEL +5 -0
- torchzero-0.0.1.dist-info/top_level.txt +1 -0
torchzero/modules/line_search/grid_ls.py
@@ -0,0 +1,158 @@
+from typing import Any, Literal
+from collections.abc import Sequence
+
+import numpy as np
+import torch
+
+from ...tensorlist import TensorList
+from ...core import _ClosureType, OptimizationState
+from .base_ls import LineSearchBase
+
+class GridLS(LineSearchBase):
+    """Test all `lrs` and pick best.
+
+    Args:
+        lrs (Sequence[float] | np.ndarray | torch.Tensor): sequence of lrs to test.
+        stop_on_improvement (bool, optional): stops if loss improves compared to current loss. Defaults to False.
+        stop_on_worsened (bool, optional):
+            stops if next lr loss is worse than previous one.
+            this assumes that lrs are in ascending order. Defaults to False.
+        log_lrs (bool, optional):
+            saves lrs and losses with them into optimizer._lrs (for debugging).
+            Defaults to False.
+    """
+    def __init__(
+        self,
+        lrs: Sequence[float] | np.ndarray | torch.Tensor,
+        stop_on_improvement=False,
+        stop_on_worsened=False,
+        log_lrs = False,
+    ):
+        super().__init__({}, maxiter=None, log_lrs=log_lrs)
+        self.lrs = lrs
+        self.stop_on_improvement = stop_on_improvement
+        self.stop_on_worsened = stop_on_worsened
+
+    @torch.no_grad
+    def _find_best_lr(self, state: OptimizationState, params: TensorList) -> float:
+        if state.closure is None: raise ValueError("closure is not set")
+        if state.ascent is None: raise ValueError("ascent_direction is not set")
+
+        if self.stop_on_improvement:
+            if state.fx0 is None: state.fx0 = state.closure(False)
+            self._lowest_loss = state.fx0
+
+        for lr in self.lrs:
+            loss = self._evaluate_lr_(float(lr), state.closure, state.ascent, params)
+
+            # if worsened
+            if self.stop_on_worsened and loss != self._lowest_loss:
+                break
+
+            # if improved
+            if self.stop_on_improvement and loss == self._lowest_loss:
+                break
+
+        return float(self._best_lr)
+
+
+
+class MultiplicativeLS(GridLS):
+    """Starts with `init` lr, then keeps multiplying it by `mul` until loss stops decreasing.
+
+    Args:
+        init (float, optional): initial lr. Defaults to 0.001.
+        mul (float, optional): lr multiplier. Defaults to 2.
+        num (int, optional): maximum number of multiplication steps. Defaults to 10.
+        stop_on_improvement (bool, optional): stops if loss improves compared to current loss. Defaults to False.
+        stop_on_worsened (bool, optional):
+            stops if next lr loss is worse than previous one.
+            this assumes that lrs are in ascending order. Defaults to False.
+        log_lrs (bool, optional):
+            saves lrs and losses with them into optimizer._lrs (for debugging).
+            Defaults to False.
+    """
+    def __init__(
+        self,
+        init: float = 0.001,
+        mul: float = 2,
+        num=10,
+        stop_on_improvement=False,
+        stop_on_worsened=True,
+    ):
+        super().__init__(
+            [init * mul**i for i in range(num)],
+            stop_on_improvement=stop_on_improvement,
+            stop_on_worsened=stop_on_worsened,
+        )
+
+class BacktrackingLS(GridLS):
+    """tests `init` lr, and keeps multiplying it by `mul` until loss becomes better than initial loss.
+
+    note: this doesn't include Armijo–Goldstein condition.
+
+    Args:
+        init (float, optional): initial lr. Defaults to 1.
+        mul (float, optional): lr multiplier. Defaults to 0.5.
+        num (int, optional): maximum number of multiplication steps. Defaults to 10.
+        stop_on_improvement (bool, optional): stops if loss improves compared to current loss. Defaults to False.
+        stop_on_worsened (bool, optional):
+            stops if next lr loss is worse than previous one.
+            this assumes that lrs are in ascending order. Defaults to False.
+        log_lrs (bool, optional):
+            saves lrs and losses with them into optimizer._lrs (for debugging).
+            Defaults to False.
+
+    """
+    def __init__(
+        self,
+        init: float = 1,
+        mul: float = 0.5,
+        num=10,
+        stop_on_improvement=True,
+        stop_on_worsened=False,
+        log_lrs = False,
+    ):
+        super().__init__(
+            [init * mul**i for i in range(num)],
+            stop_on_improvement=stop_on_improvement,
+            stop_on_worsened=stop_on_worsened,
+            log_lrs = log_lrs,
+        )
+
+class LinspaceLS(GridLS):
+    """Test all learning rates from a linspace and pick best."""
+    def __init__(
+        self,
+        start: float = 0.001,
+        end: float = 2,
+        steps=10,
+        stop_on_improvement=False,
+        stop_on_worsened=False,
+        log_lrs = False,
+    ):
+        super().__init__(
+            torch.linspace(start, end, steps),
+            stop_on_improvement=stop_on_improvement,
+            stop_on_worsened=stop_on_worsened,
+            log_lrs = log_lrs,
+        )
+
+class ArangeLS(GridLS):
+    """Test all learning rates from a linspace and pick best."""
+    def __init__(
+        self,
+        start: float = 0.001,
+        end: float = 2,
+        step=0.1,
+        stop_on_improvement=False,
+        stop_on_worsened=False,
+        log_lrs = False,
+
+    ):
+        super().__init__(
+            torch.arange(start, end, step),
+            stop_on_improvement=stop_on_improvement,
+            stop_on_worsened=stop_on_worsened,
+            log_lrs = log_lrs,
+        )
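
Note: every class in this hunk reduces to the same loop: try a fixed sequence of candidate step sizes along the current ascent direction, keep the best one, and optionally stop early. Below is a minimal standalone sketch of that strategy; the `closure`, `params` and `direction` handling (and the subtract-the-ascent convention) are simplified assumptions for illustration, not the package's `TensorList`/`_evaluate_lr_` machinery.

    import torch

    def grid_line_search(closure, params, direction, lrs, stop_on_improvement=False):
        # Evaluate each candidate lr by stepping, measuring the loss, and undoing the step.
        best_lr, best_loss = 0.0, closure()
        for lr in lrs:
            with torch.no_grad():
                for p, d in zip(params, direction):
                    p.sub_(d, alpha=float(lr))      # tentative step along the ascent direction
            loss = closure()
            with torch.no_grad():
                for p, d in zip(params, direction):
                    p.add_(d, alpha=float(lr))      # undo the step
            if loss < best_loss:
                best_lr, best_loss = float(lr), loss
                if stop_on_improvement:             # BacktrackingLS-style early exit
                    break
        return best_lr

    # Toy quadratic: of the BacktrackingLS-style candidates 1, 0.5, 0.25, 0.125 the best is lr=1.
    x = torch.tensor([3.0, -2.0])
    closure = lambda: (0.5 * x.pow(2).sum()).item()
    print(grid_line_search(closure, [x], [x.clone()], [1 * 0.5**i for i in range(4)],
                           stop_on_improvement=True))   # 1.0
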
torchzero/modules/line_search/scipy_minimize_scalar.py
@@ -0,0 +1,62 @@
+import typing
+
+import torch
+try:
+    import scipy.optimize as scopt
+except ModuleNotFoundError:
+    scopt = typing.cast(typing.Any, None)
+
+from ...tensorlist import TensorList
+from ...core import OptimizationState
+
+from .base_ls import LineSearchBase, MaxIterReached
+
+if typing.TYPE_CHECKING:
+    import scipy.optimize as scopt
+
+class ScipyMinimizeScalarLS(LineSearchBase):
+    """Line search via `scipy.optimize.minimize_scalar`. All args except maxiter are the same as for it.
+
+    Args:
+        method (Optional[str], optional): 'brent', 'golden' or 'bounded'. Defaults to None.
+        maxiter (Optional[int], optional): hard limit on maximum number of function evaluations. Defaults to None.
+        bracket (optional): bracket. Defaults to None.
+        bounds (optional): bounds. Defaults to None.
+        tol (Optional[float], optional): some kind of tolerance. Defaults to None.
+        options (optional): options for method. Defaults to None.
+        log_lrs (bool, optional): logs lrs and values into `_lrs`. Defaults to False.
+    """
+    def __init__(
+        self,
+        method: str | None = None,
+        maxiter: int | None = None,
+        bracket = None,
+        bounds = None,
+        tol: float | None = None,
+        options = None,
+        log_lrs = False,
+    ):
+        if scopt is None: raise ModuleNotFoundError("scipy is not installed")
+        super().__init__({}, maxiter=maxiter, log_lrs=log_lrs)
+        self.method = method
+        self.tol = tol
+        self.bracket = bracket
+        self.bounds = bounds
+        self.options = options
+
+    @torch.no_grad
+    def _find_best_lr(self, state: OptimizationState, params: TensorList) -> float:
+        try:
+            res = scopt.minimize_scalar(
+                self._evaluate_lr_ensure_float,
+                args = (state.closure, state.ascent, params),
+                method = self.method,
+                tol = self.tol,
+                bracket = self.bracket,
+                bounds = self.bounds,
+                options = self.options,
+            ) # type:ignore
+        except MaxIterReached:
+            pass
+
+        return float(self._best_lr)
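
Note: ScipyMinimizeScalarLS hands scipy a one-dimensional objective, the loss as a function of the step size along a fixed direction. A NumPy sketch of that objective is below; the toy loss and hand-rolled `phi` stand in for the package's `_evaluate_lr_ensure_float`, which is not shown in this hunk.

    import numpy as np
    from scipy.optimize import minimize_scalar

    f = lambda v: 0.5 * float(np.dot(v, v))   # toy loss
    x0 = np.array([3.0, -2.0])                # current parameters
    d = np.array([3.0, -2.0])                 # ascent direction (here the gradient of f at x0)

    phi = lambda lr: f(x0 - lr * d)           # loss as a function of the step size
    res = minimize_scalar(phi, method="brent")
    print(round(res.x, 4))                    # ~1.0, the exact minimizer along this direction
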
torchzero/modules/meta/__init__.py
@@ -0,0 +1,12 @@
+"""Modules that use other modules."""
+# from .chain import Chain, ChainReturn
+import sys
+
+from .alternate import Alternate
+from .grafting import Graft, IntermoduleCautious, SignGrafting
+from .return_overrides import ReturnAscent, ReturnClosure, SetGrad
+
+# if sys.version_info[1] < 12:
+from .optimizer_wrapper import Wrap, WrapClosure
+# else:
+# from .optimizer_wrapper import Wrap, WrapClosure
torchzero/modules/meta/alternate.py
@@ -0,0 +1,65 @@
+import random
+from collections.abc import Iterable
+from typing import Any, Literal
+
+from ...core import OptimizerModule, _Chainable
+
+
+class Alternate(OptimizerModule):
+    """Alternates stepping with multiple modules.
+
+    Args:
+        modules (Iterable[OptimizerModule | Iterable[OptimizerModule]]): modules to alternate between.
+        mode (int | list[int] | tuple[int] | "random"], optional):
+            can be integer - number of repeats for all modules;
+            list or tuple of integers per each module with number of repeats;
+            "random" to pick module randomly each time. Defaults to 1.
+        seed (int | None, optional): seed for "random" mode. Defaults to None.
+    """
+    def __init__(
+        self,
+        modules: Iterable[_Chainable],
+        mode: int | list[int] | tuple[int] | Literal["random"] = 1,
+        seed: int | None = None
+    ):
+        super().__init__({})
+        modules = list(modules)
+
+        for i,m in enumerate(modules):
+            self._set_child_(i, m)
+
+        self.random = random.Random(seed)
+
+        if isinstance(mode, int): mode = [mode for _ in modules]
+        self.mode: list[int] | tuple[int] | Literal['random'] = mode
+
+        self.cur = 0
+        if self.mode == 'random': self.remaining = 0
+        else:
+            self.remaining = self.mode[0]
+            if len(self.mode) != len(self.children):
+                raise ValueError(f"got {len(self.children)} modules but {len(mode)} repeats, they should be the same")
+
+    def step(self, state):
+        if self.mode == 'random':
+            module = self.random.choice(list(self.children.values()))
+
+        else:
+            if self.remaining == 0:
+                self.cur += 1
+
+                if self.cur >= len(self.mode):
+                    self.cur = 0
+
+            if self.remaining == 0: self.remaining = self.mode[self.cur]
+
+            module = self.children[self.cur]
+
+            self.remaining -= 1
+
+        if self.next_module is None:
+            return module.step(state)
+
+        state.ascent = module.return_ascent(state)
+        return self._update_params_or_step_with_next(state)
+
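
Note: for non-random modes, Alternate is a repeat-count round-robin over its children. A counter-only sketch of the schedule it produces (no optimizer modules involved, just the `cur`/`remaining` bookkeeping mirrored from the step above):

    # mode=[2, 1] repeats child 0 twice, then child 1 once, then wraps around.
    def alternation_order(mode, steps):
        cur, remaining, order = 0, mode[0], []
        for _ in range(steps):
            if remaining == 0:                 # current child exhausted its repeats
                cur = (cur + 1) % len(mode)    # advance, with wrap-around
                remaining = mode[cur]
            order.append(cur)
            remaining -= 1
        return order

    print(alternation_order([2, 1], 6))        # [0, 0, 1, 0, 0, 1]
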
torchzero/modules/meta/grafting.py
@@ -0,0 +1,195 @@
+from collections.abc import Iterable
+from typing import Literal
+import torch
+
+from ...core import OptimizerModule
+from ...tensorlist import TensorList
+
+
+class Graft(OptimizerModule):
+    """
+    Optimizer grafting (magnitude#direction).
+    Takes update of one optimizer and makes its norm same as update of another optimizer.
+    Can be applied to all weights or layerwise.
+
+    Args:
+        magnitude (OptimizerModule | Iterable[OptimizerModule]):
+            module to use magnitude from.
+            If sequence of modules is provided, they will be chained.
+        direction (OptimizerModule | Iterable[OptimizerModule]):
+            module/modules to use direction from.
+            If sequence of modules is provided, they will be chained.
+        ord (int, optional): norm type. Defaults to 2.
+        eps (float, optional): epsilon for numerical stability. Defaults to 1e-8.
+        layerwise (bool, optional): whether to apply grafting layerwise. Defaults to False.
+
+    reference
+        *Agarwal, N., Anil, R., Hazan, E., Koren, T., & Zhang, C.
+        Learning Rate Grafting: Transferability of Optimizer Tuning.*
+    """
+    def __init__(
+        self,
+        magnitude: OptimizerModule | Iterable[OptimizerModule],
+        direction: OptimizerModule | Iterable[OptimizerModule],
+        ord: float = 2,
+        eps: float = 1e-8,
+        layerwise: bool = False,
+        # TODO: channelwise
+    ):
+        super().__init__({})
+        self._set_child_('magnitude', magnitude)
+        self._set_child_('direction', direction)
+        self.ord = ord
+        self.eps = eps
+        self.layerwise = layerwise
+
+
+    @torch.no_grad
+    def step(self, state):
+        state_copy = state.copy(clone_ascent=True)
+        magnitude = self.children['magnitude'].return_ascent(state_copy)
+
+        if state_copy.grad is not None: state.grad = state_copy.grad
+        if state_copy.fx0 is not None: state.fx0 = state_copy.fx0
+        if state_copy.fx0_approx is not None: state.fx0_approx = state_copy.fx0_approx
+
+        direction = self.children['direction'].return_ascent(state)
+
+        if self.layerwise:
+            M = magnitude.norm(self.ord)
+            D = direction.norm(self.ord)
+            D.select_set_(D == 0, M)
+
+        else:
+            M = magnitude.total_vector_norm(self.ord)
+            D = direction.total_vector_norm(self.ord)
+            if D == 0: D = M
+
+        state.ascent = direction.mul_(M / (D + self.eps))
+        return self._update_params_or_step_with_next(state)
+
+
+
+class SignGrafting(OptimizerModule):
+    """Weight-wise grafting-like operation where sign of the ascent is taken from first module
+    and magnitude from second module.
+
+    Args:
+        magnitude (OptimizerModule | Iterable[OptimizerModule]):
+            module to take magnitude from.
+            If sequence of modules is provided, they will be chained.
+        sign (OptimizerModule | Iterable[OptimizerModule]):
+            module to take sign from.
+            If sequence of modules is provided, they will be chained.
+    """
+    def __init__(
+        self,
+        magnitude: OptimizerModule | Iterable[OptimizerModule],
+        sign: OptimizerModule | Iterable[OptimizerModule],
+    ):
+        super().__init__({})
+
+        self._set_child_('magnitude', magnitude)
+        self._set_child_('sign', sign)
+
+
+    @torch.no_grad
+    def step(self, state):
+        state_copy = state.copy(clone_ascent=True)
+        magnitude = self.children['magnitude'].return_ascent(state_copy)
+
+        # make sure to store grad and fx0 if it was calculated
+        state.update_attrs_(state_copy)
+
+        sign = self.children['sign'].return_ascent(state)
+
+        state.ascent = magnitude.copysign_(sign)
+        return self._update_params_or_step_with_next(state)
+
+
+class IntermoduleCautious(OptimizerModule):
+    """Negates update for parameters where updates of two modules or module chains have inconsistent sign.
+    Optionally normalizes the update by the number of parameters that are not masked.
+
+    Args:
+        main_module (OptimizerModule | Iterable[OptimizerModule]):
+            main module or sequence of modules to chain, which update will be used with a consistency mask applied.
+        compare_module (OptimizerModule | Iterable[OptimizerModule]):
+            module or sequence of modules to chain, which update will be used to compute a consistency mask.
+            Can also be set to `ascent` to compare to update that is passed `main_module`, or `grad` to compare
+            to gradients.
+        normalize (bool, optional):
+            renormalize update after masking.
+            only has effect when mode is 'zero'. Defaults to False.
+        eps (float, optional): epsilon for normalization. Defaults to 1e-6.
+        mode (str, optional):
+            what to do with updates with inconsistent signs.
+
+            "zero" - set them to zero (as in paper)
+
+            "grad" - set them to the gradient
+
+            "compare_module" - set them to `compare_module`'s update
+
+            "negate" - negate them (same as using update magnitude and gradient sign)
+    """
+    def __init__(
+        self,
+        main_module: OptimizerModule | Iterable[OptimizerModule],
+        compare_module: OptimizerModule | Iterable[OptimizerModule] | Literal['ascent', 'grad'],
+        normalize=False,
+        eps=1e-6,
+        mode: Literal["zero", "grad", "backtrack", "compare_module"] = "zero",
+    ):
+        super().__init__({})
+
+        self._set_child_('main',main_module)
+        if isinstance(compare_module, str): self.compare_mode = compare_module
+        else:
+            self._set_child_('compare', compare_module)
+            self.compare_mode = 'module'
+        self.eps = eps
+        self.normalize = normalize
+        self.mode: Literal["zero", "grad", "backtrack", "compare_module"] = mode
+
+    @torch.no_grad
+    def step(self, state):
+        params = None
+        state_copy = state.copy(clone_ascent=True)
+        ascent = self.children['main'].return_ascent(state_copy)
+        state.update_attrs_(state_copy)
+
+        if self.compare_mode == 'module': compare = self.children['compare'].return_ascent(state)
+        else:
+            params = self.get_params()
+            if self.compare_mode == 'ascent': compare: TensorList = state.maybe_use_grad_(params)
+            elif self.compare_mode == 'grad': compare: TensorList = state.maybe_compute_grad_(params)
+            else: raise ValueError(f'Invalid compare_module: {self.compare_mode}')
+
+        # mask will be > 0 for parameters where both signs are the same
+        mask = (ascent * compare) > 0
+
+        if self.mode == 'backtrack':
+            ascent -= ascent.mul(2).mul_(mask.logical_not_())
+
+        else:
+            # normalize if mode is `zero`
+            if self.normalize and self.mode == 'zero':
+                fmask = mask.to(ascent[0].dtype)
+                fmask /= fmask.total_mean() + self.eps
+            else:
+                fmask = mask
+
+            # apply the mask
+            ascent *= fmask
+
+            if self.mode == 'grad':
+                params = self.get_params()
+                ascent += state.maybe_compute_grad_(params) * mask.logical_not_()
+
+            elif self.mode == 'compare_module':
+                ascent += compare * mask.logical_not_()
+
+        state.ascent = ascent
+        return self._update_params_or_step_with_next(state, params)
+
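
Note: in the non-layerwise branch, Graft.step rescales one update to the norm of another: grafted = direction * ||magnitude|| / (||direction|| + eps). A plain-tensor sketch of that formula with toy values (no TensorList, single tensor instead of a parameter list):

    import torch

    eps = 1e-8
    magnitude_update = torch.tensor([0.3, -0.4])    # update whose norm is kept (||.|| = 0.5)
    direction_update = torch.tensor([10.0, 0.0])    # update whose direction is kept

    M = magnitude_update.norm(2)                    # plays the role of total_vector_norm
    D = direction_update.norm(2)
    grafted = direction_update * (M / (D + eps))

    print(grafted, grafted.norm(2))                 # tensor([0.5, 0.]) with norm 0.5 == M
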
torchzero/modules/meta/optimizer_wrapper.py
@@ -0,0 +1,173 @@
+from collections.abc import Callable, Sequence
+from typing import Any, overload
+
+import torch
+from typing_extensions import Concatenate, ParamSpec
+
+from ...core import OptimizerModule
+from .return_overrides import SetGrad
+
+K = ParamSpec('K')
+
+class Wrap(OptimizerModule):
+    """
+    Wraps any torch.optim.Optimizer.
+
+    Sets .grad attribute to the current update and steps with the `optimizer`.
+
+    Additionally, if this is not the last module, this takes the update of `optimizer`,
+    undoes it and passes to the next module instead. That means you can chain multiple
+    optimizers together.
+
+    Args:
+        optimizer (torch.optim.Optimizer): optimizer to wrap,
+            or a callable (class) that constructs the optimizer.
+        kwargs:
+            if class is passed, kwargs are passed to the constructor.
+            parameters are passed separately and automatically
+            which is the point of passing a constructor
+            instead of an optimizer directly.
+
+    This can be constructed in two ways.
+    .. code-block:: python
+        wrapper = OptimizerWrapper(torch.optim.SGD(model.parameters(), lr = 0.1))
+        # or
+        wrapper = OptimizerWrapper(torch.optim.SGD, lr = 0.1)
+    """
+
+    @overload
+    def __init__(self, optimizer: torch.optim.Optimizer): ...
+    @overload
+    # def __init__[**K](
+    def __init__(
+        self,
+        optimizer: Callable[Concatenate[Any, K], torch.optim.Optimizer],
+        *args: K.args,
+        **kwargs: K.kwargs,
+        # optimizer: abc.Callable[..., torch.optim.Optimizer],
+        # *args,
+        # **kwargs,
+    ): ...
+    def __init__(self, optimizer, *args, **kwargs):
+
+        super().__init__({})
+        self._optimizer_cls: torch.optim.Optimizer | Callable[..., torch.optim.Optimizer] = optimizer
+        self._args = args
+        self._kwargs = kwargs
+
+    def _initialize_(self, params, set_passed_params):
+        """Initializes this optimizer and all children with the given parameters."""
+        super()._initialize_(params, set_passed_params=set_passed_params)
+        if isinstance(self._optimizer_cls, torch.optim.Optimizer) or not callable(self._optimizer_cls):
+            self.optimizer = self._optimizer_cls
+        else:
+            self.optimizer = self._optimizer_cls(params, *self._args, **self._kwargs)
+
+    @torch.no_grad
+    def step(self, state):
+        # check attrs
+        # if self.pass_closure:
+        #     if state.closure is None: raise ValueError('ClosureOptimizerWrapper requires closure.')
+        #     if state.ascent is not None:
+        #         raise ValueError('pass_closure = True, means ascent must be None (not sure though)')
+
+        params = self.get_params()
+
+        if self.next_module is None:
+            # set grad to ascent and make a step with the optimizer
+            g = state.maybe_use_grad_(params)
+            params.set_grad_(g)
+            state.fx0 = self.optimizer.step()
+            return state.get_loss()
+
+
+        params_before_step = params.clone()
+
+        g = state.maybe_use_grad_(params)
+        params.set_grad_(g)
+        state.fx0 = self.optimizer.step()
+
+        # calculate update as difference in params
+        state.ascent = params_before_step - params
+        params.set_(params_before_step)
+        return self.next_module.step(state)
+
+
+class WrapClosure(OptimizerModule):
+    """
+    Wraps any torch.optim.Optimizer. This only works with modules with :code:`target = "Closure"` argument.
+    The modified closure will be passed to the optimizer.
+
+    Alternative any module can be turned into a closure module by using :any:`MakeClosure` module,
+    in that case this should be placed after MakeClosure.
+
+    Args:
+        optimizer (torch.optim.Optimizer): optimizer to wrap,
+            or a callable (class) that constructs the optimizer.
+        kwargs:
+            if class is passed, kwargs are passed to the constructor.
+            parameters are passed separately and automatically
+            which is the point of passing a constructor
+            instead of an optimizer directly.
+
+    This can be constructed in two ways.
+
+    .. code-block:: python
+
+        wrapper = OptimizerWrapper(torch.optim.SGD(model.parameters(), lr = 0.1))
+        # or
+        wrapper = OptimizerWrapper(torch.optim.SGD, lr = 0.1)
+
+    """
+
+    @overload
+    def __init__(self, optimizer: torch.optim.Optimizer,): ...
+    @overload
+    def __init__(
+        self,
+        optimizer: Callable[Concatenate[Any, K], torch.optim.Optimizer],
+        *args: K.args,
+        **kwargs: K.kwargs,
+        # optimizer: abc.Callable[..., torch.optim.Optimizer],
+        # *args,
+        # **kwargs,
+    ): ...
+    def __init__(self, optimizer, *args, **kwargs):
+
+        super().__init__({})
+        self._optimizer_cls: torch.optim.Optimizer | Callable[..., torch.optim.Optimizer] = optimizer
+        self._args = args
+        self._kwargs = kwargs
+
+    def _initialize_(self, params, set_passed_params):
+        """Initializes this optimizer and all children with the given parameters."""
+        super()._initialize_(params, set_passed_params=set_passed_params)
+        if isinstance(self._optimizer_cls, torch.optim.Optimizer) or not callable(self._optimizer_cls):
+            self.optimizer = self._optimizer_cls
+        else:
+            self.optimizer = self._optimizer_cls(params, *self._args, **self._kwargs)
+
+    @torch.no_grad
+    def step(self, state):
+        # check attrs
+        # if self.pass_closure:
+        #     if state.closure is None: raise ValueError('ClosureOptimizerWrapper requires closure.')
+        #     if state.ascent is not None:
+        #         raise ValueError('pass_closure = True, means ascent must be None (not sure though)')
+
+        params = self.get_params()
+
+        if self.next_module is None:
+            # set grad to ascent and make a step with the optimizer
+            state.fx0 = self.optimizer.step(state.closure) # type:ignore
+            return state.get_loss()
+
+
+        params_before_step = params.clone()
+        state.fx0 = self.optimizer.step(state.closure) # type:ignore
+
+        # calculate update as difference in params
+        state.ascent = params_before_step - params
+        params.set_(params_before_step)
+        return self.next_module.step(state)
+
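
Note: when Wrap or WrapClosure is not the last module in a chain, it recovers the wrapped optimizer's update as a parameter difference and then restores the parameters, so the update can be handed to the next module. A sketch of that trick with a plain torch.optim.SGD; the module/state plumbing of the package is omitted.

    import torch

    p = torch.nn.Parameter(torch.tensor([1.0, 2.0]))
    opt = torch.optim.SGD([p], lr=0.1, momentum=0.9)

    p.grad = torch.tensor([0.5, -0.5])        # stand-in for the incoming update set via .grad
    before = p.detach().clone()
    opt.step()                                 # let the wrapped optimizer move the parameters
    update = before - p.detach()               # recover its effective update as a difference
    with torch.no_grad():
        p.copy_(before)                        # undo the step; `update` goes to the next module
    print(update)                              # tensor([0.0500, -0.0500])
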