torchzero 0.1.8__py3-none-any.whl → 0.3.2__py3-none-any.whl
- docs/source/conf.py +57 -0
- tests/test_identical.py +230 -0
- tests/test_module.py +50 -0
- tests/test_opts.py +884 -0
- tests/test_tensorlist.py +1787 -0
- tests/test_utils_optimizer.py +170 -0
- tests/test_vars.py +184 -0
- torchzero/__init__.py +4 -4
- torchzero/core/__init__.py +3 -13
- torchzero/core/module.py +629 -510
- torchzero/core/preconditioner.py +137 -0
- torchzero/core/transform.py +252 -0
- torchzero/modules/__init__.py +13 -21
- torchzero/modules/clipping/__init__.py +3 -0
- torchzero/modules/clipping/clipping.py +320 -0
- torchzero/modules/clipping/ema_clipping.py +135 -0
- torchzero/modules/clipping/growth_clipping.py +187 -0
- torchzero/modules/experimental/__init__.py +13 -18
- torchzero/modules/experimental/absoap.py +350 -0
- torchzero/modules/experimental/adadam.py +111 -0
- torchzero/modules/experimental/adamY.py +135 -0
- torchzero/modules/experimental/adasoap.py +282 -0
- torchzero/modules/experimental/algebraic_newton.py +145 -0
- torchzero/modules/experimental/curveball.py +89 -0
- torchzero/modules/experimental/dsoap.py +290 -0
- torchzero/modules/experimental/gradmin.py +85 -0
- torchzero/modules/experimental/reduce_outward_lr.py +35 -0
- torchzero/modules/experimental/spectral.py +286 -0
- torchzero/modules/experimental/subspace_preconditioners.py +128 -0
- torchzero/modules/experimental/tropical_newton.py +136 -0
- torchzero/modules/functional.py +209 -0
- torchzero/modules/grad_approximation/__init__.py +4 -0
- torchzero/modules/grad_approximation/fdm.py +120 -0
- torchzero/modules/grad_approximation/forward_gradient.py +81 -0
- torchzero/modules/grad_approximation/grad_approximator.py +66 -0
- torchzero/modules/grad_approximation/rfdm.py +259 -0
- torchzero/modules/line_search/__init__.py +5 -30
- torchzero/modules/line_search/backtracking.py +186 -0
- torchzero/modules/line_search/line_search.py +181 -0
- torchzero/modules/line_search/scipy.py +37 -0
- torchzero/modules/line_search/strong_wolfe.py +260 -0
- torchzero/modules/line_search/trust_region.py +61 -0
- torchzero/modules/lr/__init__.py +2 -0
- torchzero/modules/lr/lr.py +59 -0
- torchzero/modules/lr/step_size.py +97 -0
- torchzero/modules/momentum/__init__.py +14 -4
- torchzero/modules/momentum/averaging.py +78 -0
- torchzero/modules/momentum/cautious.py +181 -0
- torchzero/modules/momentum/ema.py +173 -0
- torchzero/modules/momentum/experimental.py +189 -0
- torchzero/modules/momentum/matrix_momentum.py +124 -0
- torchzero/modules/momentum/momentum.py +43 -106
- torchzero/modules/ops/__init__.py +103 -0
- torchzero/modules/ops/accumulate.py +65 -0
- torchzero/modules/ops/binary.py +240 -0
- torchzero/modules/ops/debug.py +25 -0
- torchzero/modules/ops/misc.py +419 -0
- torchzero/modules/ops/multi.py +137 -0
- torchzero/modules/ops/reduce.py +149 -0
- torchzero/modules/ops/split.py +75 -0
- torchzero/modules/ops/switch.py +68 -0
- torchzero/modules/ops/unary.py +115 -0
- torchzero/modules/ops/utility.py +112 -0
- torchzero/modules/optimizers/__init__.py +18 -10
- torchzero/modules/optimizers/adagrad.py +146 -49
- torchzero/modules/optimizers/adam.py +112 -118
- torchzero/modules/optimizers/lion.py +18 -11
- torchzero/modules/optimizers/muon.py +222 -0
- torchzero/modules/optimizers/orthograd.py +55 -0
- torchzero/modules/optimizers/rmsprop.py +103 -51
- torchzero/modules/optimizers/rprop.py +342 -99
- torchzero/modules/optimizers/shampoo.py +197 -0
- torchzero/modules/optimizers/soap.py +286 -0
- torchzero/modules/optimizers/sophia_h.py +129 -0
- torchzero/modules/projections/__init__.py +5 -0
- torchzero/modules/projections/dct.py +73 -0
- torchzero/modules/projections/fft.py +73 -0
- torchzero/modules/projections/galore.py +10 -0
- torchzero/modules/projections/projection.py +218 -0
- torchzero/modules/projections/structural.py +151 -0
- torchzero/modules/quasi_newton/__init__.py +7 -4
- torchzero/modules/quasi_newton/cg.py +218 -0
- torchzero/modules/quasi_newton/experimental/__init__.py +1 -0
- torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +265 -0
- torchzero/modules/quasi_newton/lbfgs.py +228 -0
- torchzero/modules/quasi_newton/lsr1.py +170 -0
- torchzero/modules/quasi_newton/olbfgs.py +196 -0
- torchzero/modules/quasi_newton/quasi_newton.py +475 -0
- torchzero/modules/second_order/__init__.py +3 -4
- torchzero/modules/second_order/newton.py +142 -165
- torchzero/modules/second_order/newton_cg.py +84 -0
- torchzero/modules/second_order/nystrom.py +168 -0
- torchzero/modules/smoothing/__init__.py +2 -5
- torchzero/modules/smoothing/gaussian.py +164 -0
- torchzero/modules/smoothing/{laplacian_smoothing.py → laplacian.py} +115 -128
- torchzero/modules/weight_decay/__init__.py +1 -0
- torchzero/modules/weight_decay/weight_decay.py +52 -0
- torchzero/modules/wrappers/__init__.py +1 -0
- torchzero/modules/wrappers/optim_wrapper.py +91 -0
- torchzero/optim/__init__.py +2 -10
- torchzero/optim/utility/__init__.py +1 -0
- torchzero/optim/utility/split.py +45 -0
- torchzero/optim/wrappers/nevergrad.py +2 -28
- torchzero/optim/wrappers/nlopt.py +31 -16
- torchzero/optim/wrappers/scipy.py +79 -156
- torchzero/utils/__init__.py +27 -0
- torchzero/utils/compile.py +175 -37
- torchzero/utils/derivatives.py +513 -99
- torchzero/utils/linalg/__init__.py +5 -0
- torchzero/utils/linalg/matrix_funcs.py +87 -0
- torchzero/utils/linalg/orthogonalize.py +11 -0
- torchzero/utils/linalg/qr.py +71 -0
- torchzero/utils/linalg/solve.py +168 -0
- torchzero/utils/linalg/svd.py +20 -0
- torchzero/utils/numberlist.py +132 -0
- torchzero/utils/ops.py +10 -0
- torchzero/utils/optimizer.py +284 -0
- torchzero/utils/optuna_tools.py +40 -0
- torchzero/utils/params.py +149 -0
- torchzero/utils/python_tools.py +40 -25
- torchzero/utils/tensorlist.py +1081 -0
- torchzero/utils/torch_tools.py +48 -12
- torchzero-0.3.2.dist-info/METADATA +379 -0
- torchzero-0.3.2.dist-info/RECORD +128 -0
- {torchzero-0.1.8.dist-info → torchzero-0.3.2.dist-info}/WHEEL +1 -1
- {torchzero-0.1.8.dist-info → torchzero-0.3.2.dist-info/licenses}/LICENSE +0 -0
- torchzero-0.3.2.dist-info/top_level.txt +3 -0
- torchzero/core/tensorlist_optimizer.py +0 -219
- torchzero/modules/adaptive/__init__.py +0 -4
- torchzero/modules/adaptive/adaptive.py +0 -192
- torchzero/modules/experimental/experimental.py +0 -294
- torchzero/modules/experimental/quad_interp.py +0 -104
- torchzero/modules/experimental/subspace.py +0 -259
- torchzero/modules/gradient_approximation/__init__.py +0 -7
- torchzero/modules/gradient_approximation/_fd_formulas.py +0 -3
- torchzero/modules/gradient_approximation/base_approximator.py +0 -105
- torchzero/modules/gradient_approximation/fdm.py +0 -125
- torchzero/modules/gradient_approximation/forward_gradient.py +0 -163
- torchzero/modules/gradient_approximation/newton_fdm.py +0 -198
- torchzero/modules/gradient_approximation/rfdm.py +0 -125
- torchzero/modules/line_search/armijo.py +0 -56
- torchzero/modules/line_search/base_ls.py +0 -139
- torchzero/modules/line_search/directional_newton.py +0 -217
- torchzero/modules/line_search/grid_ls.py +0 -158
- torchzero/modules/line_search/scipy_minimize_scalar.py +0 -62
- torchzero/modules/meta/__init__.py +0 -12
- torchzero/modules/meta/alternate.py +0 -65
- torchzero/modules/meta/grafting.py +0 -195
- torchzero/modules/meta/optimizer_wrapper.py +0 -173
- torchzero/modules/meta/return_overrides.py +0 -46
- torchzero/modules/misc/__init__.py +0 -10
- torchzero/modules/misc/accumulate.py +0 -43
- torchzero/modules/misc/basic.py +0 -115
- torchzero/modules/misc/lr.py +0 -96
- torchzero/modules/misc/multistep.py +0 -51
- torchzero/modules/misc/on_increase.py +0 -53
- torchzero/modules/operations/__init__.py +0 -29
- torchzero/modules/operations/multi.py +0 -298
- torchzero/modules/operations/reduction.py +0 -134
- torchzero/modules/operations/singular.py +0 -113
- torchzero/modules/optimizers/sgd.py +0 -54
- torchzero/modules/orthogonalization/__init__.py +0 -2
- torchzero/modules/orthogonalization/newtonschulz.py +0 -159
- torchzero/modules/orthogonalization/svd.py +0 -86
- torchzero/modules/regularization/__init__.py +0 -22
- torchzero/modules/regularization/dropout.py +0 -34
- torchzero/modules/regularization/noise.py +0 -77
- torchzero/modules/regularization/normalization.py +0 -328
- torchzero/modules/regularization/ortho_grad.py +0 -78
- torchzero/modules/regularization/weight_decay.py +0 -92
- torchzero/modules/scheduling/__init__.py +0 -2
- torchzero/modules/scheduling/lr_schedulers.py +0 -131
- torchzero/modules/scheduling/step_size.py +0 -80
- torchzero/modules/smoothing/gaussian_smoothing.py +0 -90
- torchzero/modules/weight_averaging/__init__.py +0 -2
- torchzero/modules/weight_averaging/ema.py +0 -72
- torchzero/modules/weight_averaging/swa.py +0 -171
- torchzero/optim/experimental/__init__.py +0 -20
- torchzero/optim/experimental/experimental.py +0 -343
- torchzero/optim/experimental/ray_search.py +0 -83
- torchzero/optim/first_order/__init__.py +0 -18
- torchzero/optim/first_order/cautious.py +0 -158
- torchzero/optim/first_order/forward_gradient.py +0 -70
- torchzero/optim/first_order/optimizers.py +0 -570
- torchzero/optim/modular.py +0 -148
- torchzero/optim/quasi_newton/__init__.py +0 -1
- torchzero/optim/quasi_newton/directional_newton.py +0 -58
- torchzero/optim/second_order/__init__.py +0 -1
- torchzero/optim/second_order/newton.py +0 -94
- torchzero/optim/zeroth_order/__init__.py +0 -4
- torchzero/optim/zeroth_order/fdm.py +0 -87
- torchzero/optim/zeroth_order/newton_fdm.py +0 -146
- torchzero/optim/zeroth_order/rfdm.py +0 -217
- torchzero/optim/zeroth_order/rs.py +0 -85
- torchzero/random/__init__.py +0 -1
- torchzero/random/random.py +0 -46
- torchzero/tensorlist.py +0 -826
- torchzero-0.1.8.dist-info/METADATA +0 -130
- torchzero-0.1.8.dist-info/RECORD +0 -104
- torchzero-0.1.8.dist-info/top_level.txt +0 -1
torchzero/core/module.py
CHANGED
@@ -1,510 +1,629 @@
(The 510 removed lines of the previous implementation are truncated in the diff view and cannot be recovered; the 629 added lines, i.e. the new contents of torchzero/core/module.py, follow.)
import warnings
from abc import ABC, abstractmethod
from collections import ChainMap, defaultdict
from collections.abc import Callable, Iterable, MutableMapping, Sequence
from operator import itemgetter
from typing import Any, final, overload

import torch

from ..utils import (
    Init,
    ListLike,
    Params,
    _make_param_groups,
    get_state_vals,
)
from ..utils.python_tools import flatten


def _closure_backward(closure, params, retain_graph, create_graph):
    with torch.enable_grad():
        if not (retain_graph or create_graph):
            return closure()

        for p in params: p.grad = None
        loss = closure(False)
        grad = torch.autograd.grad(loss, params, retain_graph=retain_graph, create_graph=create_graph)
        for p,g in zip(params,grad): p.grad = g
        return loss

# region Vars
# ----------------------------------- vars ----------------------------------- #
class Vars:
    """
    Holds the state and context passed between optimizer modules during a step.

    This class acts as a mutable container for information relevant to the current
    optimization step, such as parameters, gradients, loss, and the computed update.
    Modules read from and write to this object to coordinate their actions.
    """
    def __init__(
        self,
        params: list[torch.Tensor],
        closure: Callable | None,
        model: torch.nn.Module | None,
        current_step: int,
    ):
        self.params: list[torch.Tensor] = params
        """List of all parameters with requires_grad = True."""

        self.closure = closure
        """A closure that reevaluates the model and returns the loss, None if it wasn't specified"""

        self.model = model
        """torch.nn.Module object of the model, None if it wasn't specified."""

        self.current_step: int = current_step
        """global current step, starts at 0"""

        self.update: list[torch.Tensor] | None = None
        """
        current update, at the end this is subtracted from model parameters unless it is None.

        If closure is None, this is initially set to cloned gradient. Otherwise this is set to None.
        """

        self.grad: list[torch.Tensor] | None = None
        """gradient with current parameters. If closure is not None, this is set to None and can be calculated if needed."""

        self.loss: torch.Tensor | Any | None = None
        """loss with current parameters."""

        self.loss_approx: torch.Tensor | Any | None = None
        """loss at a point near current point. This can be useful as some modules only calculate loss at perturbed points,
        whereas some other modules require loss strictly at current point."""

        self.post_step_hooks: list[Callable[[Modular, Vars]]] = []
        """list of functions to be called after optimizer step.
        The signature is:

        .. code:: py

            def hook(optimizer: Modular, vars: Vars): ...

        """

        self.is_last: bool = False
        """
        Indicates that current module is either last or next-to-last before a learning rate module.
        This is always False if current module has children or is a child.
        """

        self.nested_is_last: bool = False
        """
        Indicates that current module is either last or next-to-last before a learning rate module, for modules
        that have children.
        """

        self.last_module_lrs: list[float] | None = None
        """
        List of per-parameter learning rates if current module is next-to-last before a
        learning rate module, otherwise this is set to None. Ignore this unless you are manually applying
        update to parameters.
        """

        self.stop: bool = False
        """if True, all following modules will be skipped."""

        self.skip_update: bool = False
        """if True, the parameters will not be updated"""

    def get_loss(self, backward: bool, retain_graph = None, create_graph: bool = False) -> torch.Tensor | float:
        """Returns the loss at current parameters, computing it if it hasn't been computed already and assigning :code:`vars.loss`.
        Do not call this at perturbed parameters. Backward always zeroes grads before recomputing."""

        if self.loss is None:
            if self.closure is None: raise RuntimeError("closure is None")
            if backward:
                with torch.enable_grad():
                    self.loss = self.loss_approx = _closure_backward(
                        closure=self.closure, params=self.params, retain_graph=retain_graph, create_graph=create_graph
                    )

                # initializing to zeros_like is equivalent to using zero_grad with set_to_none = False.
                # it is technically a more correct approach for when some parameters conditionally receive gradients
                # and in this case it shouldn't be slower.
                self.grad = [p.grad if p.grad is not None else torch.zeros_like(p) for p in self.params]
            else:
                self.loss = self.loss_approx = self.closure(False)

        # if self.loss was not None, above branch wasn't executed because loss has already been evaluated, but without backward since self.grad is None.
        # and now it is requested to be evaluated with backward.
        if backward and self.grad is None:
            warnings.warn('get_loss was called with backward=False, and then with backward=True so it had to be re-evaluated, so the closure was evaluated twice where it could have been evaluated once.')
            if self.closure is None: raise RuntimeError("closure is None")

            with torch.enable_grad():
                self.loss = self.loss_approx = _closure_backward(
                    closure=self.closure, params=self.params, retain_graph=retain_graph, create_graph=create_graph
                )
            self.grad = [p.grad if p.grad is not None else torch.zeros_like(p) for p in self.params]
        return self.loss # type:ignore

    def get_grad(self, retain_graph: bool | None = None, create_graph: bool = False) -> list[torch.Tensor]:
        """Returns the gradient at initial parameters, computing it if it hasn't been computed already and assigning
        :code:`vars.grad` and potentially :code:`vars.loss`. Do not call this at perturbed parameters."""
        if self.grad is None:
            if self.closure is None: raise RuntimeError("closure is None")
            self.get_loss(backward=True, retain_graph=retain_graph, create_graph=create_graph) # evaluate and set self.loss and self.grad

        assert self.grad is not None
        return self.grad

    def get_update(self) -> list[torch.Tensor]:
        """Returns the update. If update is None, it is initialized by cloning the gradients and assigning to :code:`vars.update`.
        Computing the gradients may assign :code:`vars.grad` and :code:`vars.loss` if they haven't been computed.
        Do not call this at perturbed parameters."""
        if self.update is None: self.update = [g.clone() for g in self.get_grad()]
        return self.update

    def clone(self, clone_update: bool):
        """Creates a shallow copy of the Vars object, update can optionally be deep-copied (via :code:`torch.clone`)."""
        copy = Vars(params = self.params, closure=self.closure, model=self.model, current_step=self.current_step)

        if clone_update and self.update is not None:
            copy.update = [u.clone() for u in self.update]
        else:
            copy.update = self.update

        copy.grad = self.grad
        copy.loss = self.loss
        copy.loss_approx = self.loss_approx
        copy.post_step_hooks = self.post_step_hooks
        copy.stop = self.stop
        copy.skip_update = self.skip_update

        return copy

    def update_attrs_from_clone_(self, vars: "Vars"):
        """Updates attributes of this `Vars` instance from a cloned instance.
        Typically called after a child module has processed a cloned `Vars`
        object. This propagates any newly computed loss or gradient values
        from the child's context back to the parent `Vars` if the parent
        didn't have them computed already.
        """
        if self.loss is None: self.loss = vars.loss
        if self.loss_approx is None: self.loss_approx = vars.loss_approx
        if self.grad is None: self.grad = vars.grad

    def zero_grad(self, set_to_none=True):
        if set_to_none:
            for p in self.params: p.grad = None
        else:
            grads = [p.grad for p in self.params if p.grad is not None]
            if len(grads) != 0: torch._foreach_zero_(grads)

# endregion

# region Module
# ---------------------------------- module ---------------------------------- #
class Module(ABC):
    """Abstract base class for optimizer modules.

    Modules represent distinct steps or transformations within the optimization
    process (e.g., momentum, line search, gradient accumulation).

    A module does not store parameters, but it maintains per-parameter state and per-parameter settings
    where tensors are used as keys (same as torch.optim.Optimizer state).

    Args:
        defaults (dict[str, Any] | None):
            a dict containing default values of optimization options (used when a parameter group doesn't specify them).
    """
    def __init__(self, defaults: dict[str, Any] | None = None):
        if defaults is None: defaults = {}
        self.defaults: dict[str, Any] = defaults

        # settings are stored like state in per-tensor defaultdict, with per-parameter overrides possible
        # 0 - this module specific per-parameter setting overrides set via `set_param_groups` - highest priority
        # 1 - global per-parameter setting overrides in param_groups passed to Modular - medium priority
        # 2 - `defaults` - lowest priority
        self.settings: defaultdict[torch.Tensor, ChainMap[str, Any]] = defaultdict(lambda: ChainMap({}, {}, self.defaults))
        """per-parameter settings."""

        self.state: defaultdict[torch.Tensor, dict[str, Any]] = defaultdict(dict)
        """Per-parameter state (e.g., momentum buffers)."""

        self.global_state: dict[str, Any] = {}
        """Global state for things that are not per-parameter."""

        self.children: dict[str, Module] = {}
        """A dictionary of child modules."""

        self._overridden_keys = set()
        """tracks keys overridden with `set_param_groups`, only used to not give a warning"""


    def set_param_groups(self, param_groups: Params):
        """Set custom parameter groups with per-parameter settings that this module will use."""
        param_groups = _make_param_groups(param_groups, differentiable=False)
        for group in param_groups:
            settings = group.copy()
            params = settings.pop('params')
            if not settings: continue
            self._overridden_keys.update(*settings.keys())

            for param in params:
                self.settings[param].maps[0].update(settings) # set module-specific per-parameter settings
        return self

    def set_child(self, key: str, module: "Module | Sequence[Module]"):
        self.children[key] = maybe_chain(module)

    def set_children_sequence(self, modules: "Iterable[Module | Sequence[Module]]", prefix = 'module_'):
        modules = list(modules)
        for i, m in enumerate(modules):
            self.set_child(f'{prefix}{i}', maybe_chain(m))

    def get_children_sequence(self, prefix = 'module_'):
        return [self.children[f'{prefix}{i}'] for i in range(len(self.children)) if f'{prefix}{i}' in self.children]

    def __repr__(self):
        s = self.__class__.__name__
        if self.children:
            s = f'{s}('
            for k,v in self.children.items():
                s = f'{s}{k}={v}, '
            s = f'{s[:-2]})'
        return s

    @overload
    def get_settings(self, key: str, *,
                     params: Sequence[torch.Tensor], cls: type[ListLike] = list) -> ListLike: ...
    @overload
    def get_settings(self, key: list[str] | tuple[str,...], *,
                     params: Sequence[torch.Tensor], cls: type[ListLike] = list) -> list[ListLike]: ...
    @overload
    def get_settings(self, key: str, key2: str, *keys: str,
                     params: Sequence[torch.Tensor], cls: type[ListLike] = list) -> list[ListLike]: ...

    def get_settings(self, key: str | list[str] | tuple[str,...], key2: str | None = None, *keys: str,
                     params: Sequence[torch.Tensor], cls: type[ListLike] = list) -> ListLike | list[ListLike]:
        # if isinstance(params, Vars): params = params.params
        return get_state_vals(self.settings, params, key, key2, *keys, must_exist=True, cls=cls) # pyright:ignore[reportArgumentType]


    @overload
    def get_state(self, key: str, *,
                  params: Sequence[torch.Tensor], must_exist: bool = False, init: Init = torch.zeros_like,
                  cls: type[ListLike] = list) -> ListLike: ...
    @overload
    def get_state(self, key: list[str] | tuple[str,...], *,
                  params: Sequence[torch.Tensor], must_exist: bool = False, init: Init | Sequence[Init] = torch.zeros_like,
                  cls: type[ListLike] = list) -> list[ListLike]: ...
    @overload
    def get_state(self, key: str, key2: str, *keys: str,
                  params: Sequence[torch.Tensor], must_exist: bool = False, init: Init | Sequence[Init] = torch.zeros_like,
                  cls: type[ListLike] = list) -> list[ListLike]: ...

    def get_state(self, key: str | list[str] | tuple[str,...], key2: str | None = None, *keys: str,
                  params: Sequence[torch.Tensor], must_exist: bool = False, init: Init | Sequence[Init] = torch.zeros_like,
                  cls: type[ListLike] = list) -> ListLike | list[ListLike]:
        """Returns values of per-parameter state for a given key.
        If key doesn't exist, create it with inits.

        This functions like `operator.itemgetter`, returning a single value if called with a single key,
        or a tuple if called with multiple keys.

        If you want to force it to return a tuple even with a single key, pass a list/tuple of 1 or more keys.

        .. code:: py

            exp_avg = self.state_vals("exp_avg")
            # returns cls (by default TensorList)

            exp_avg, exp_avg_sq = self.state_vals("exp_avg", "exp_avg_sq")
            # returns list of cls

            exp_avg = self.state_vals(["exp_avg"])
            # always returns a list of cls, even if got a single key


        Args:
            *keys (str):
                the keys to look for in each parameters state.
                if a single key is specified, this returns a single value or cls,
                otherwise this returns a list of values or cls per each key.
            params (Iterable[torch.Tensor]): parameters to return the states for.
            must_exist (bool, optional):
                If a key doesn't exist in state, if True, raises a KeyError, if False, creates the value
                using `init` argument (default = False).
            init (Init | Sequence[Init], optional):
                how to initialize a key if it doesn't exist.

                can be
                - Callable like torch.zeros_like
                - string - "param" or "grad" to use cloned params or cloned grads.
                - anything else other than list/tuples will be used as-is, tensors will be cloned.
                - list/tuple of values per each parameter, only if got a single key.
                - list/tuple of values per each key, only if got multiple keys.

                if multiple `keys` are specified, inits is per-key!

                Defaults to torch.zeros_like.
            cls (type[ListLike], optional):
                MutableSequence class to return, this only has effect when state_keys is a list/tuple. Defaults to list.

        Returns:
            - if state_keys has a single key and keys has a single key, return a single value.
            - if state_keys has a single key and keys has multiple keys, return a list of values.
            - if state_keys has multiple keys and keys has a single key, return cls.
            - if state_keys has multiple keys and keys has multiple keys, return list of cls.
        """
        # if isinstance(params, Vars): params = params.params
        return get_state_vals(self.state, params, key, key2, *keys, must_exist=must_exist, init=init, cls=cls) # pyright:ignore[reportArgumentType]

    # def first_setting(self, *keys:str, params:Sequence[torch.Tensor]):
    #     # if isinstance(params, Vars): params = params.params
    #     return itemgetter(*keys)(self.settings[params[0]])

    def state_dict(self):
        """state dict"""
        packed_state = {id(k):v for k,v in self.state.items()}
        packed_settings = {id(k):v for k,v in self.settings.items()}

        state_dict = {
            "state": packed_state,
            "settings":
                {
                    "local": {k:v.maps[0] for k,v in packed_settings.items()},
                    "global": {k:v.maps[1] for k,v in packed_settings.items()},
                    "defaults": {k:v.maps[2] for k,v in packed_settings.items()},
                },
            "global_state": self.global_state,
            "extra": self._extra_pack(),
            "children": {k: v.state_dict() for k, v in self.children.items()}
        }
        return state_dict

    def load_state_dict(self, state_dict: dict[str, Any], id_to_tensor: dict[int, torch.Tensor]):
        # load state
        state = state_dict['state']
        self.state.clear()
        self.state.update({id_to_tensor[k]:v for k,v in state.items()})

        # load settings
        settings = state_dict['settings']
        self.settings.clear()
        for k, v in settings['local'].items(): self.settings[id_to_tensor[k]].maps[0].update(v)
        for k, v in settings['global'].items(): self.settings[id_to_tensor[k]].maps[1].update(v)
        for k, v in settings['defaults'].items(): self.settings[id_to_tensor[k]].maps[2].update(v)

        # load global state
        self.global_state.clear()
        self.global_state.update(state_dict['global_state'])

        # children
        for k, v in state_dict['children']:
            if k in self.children: self.children[k].load_state_dict(v, id_to_tensor)
            else: warnings.warn(f'State dict for {self} has child {k}, which is missing in {self}')

        # extra info
        self._extra_unpack(state_dict['extra'])

    # ---------------------------- OVERRIDABLE METHODS --------------------------- #
    @abstractmethod
    def step(self, vars: Vars) -> Vars:
        """performs a step, returns new vars but may update them in-place."""

    def reset(self):
        """Resets the internal state of the module (e.g. momentum)."""
        # no complex logic is allowed there because this is overridden by many modules
        # where super().reset() shouldn't be called
        self.state.clear()
        self.global_state.clear()

    def _extra_pack(self):
        return {}

    def _extra_unpack(self, x):
        pass

# endregion

Chainable = Module | Sequence[Module]


def unroll_modules(*modules: Chainable) -> list[Module]:
    unrolled = []

    for m in modules:
        if isinstance(m, Module):
            unrolled.append(m)
            unrolled.extend(unroll_modules(list(m.children.values())))
        else:
            unrolled.extend(unroll_modules(*m))

    return unrolled


# region Modular
# ---------------------------------- Modular --------------------------------- #
# have to inherit from torch.optim.Optimizer to support lr schedulers
# although Accelerate doesn't work due to converting param_groups to a dict
class Modular(torch.optim.Optimizer):
    """Chains multiple modules into an optimizer.

    Args:
        params (Params | torch.nn.Module): An iterable of parameters to optimize
            (typically `model.parameters()`), an iterable of parameter group dicts,
            or a `torch.nn.Module` instance.
        *modules (Module): A sequence of `Module` instances that define the
            optimization algorithm steps.
    """
    # this is specifically for lr schedulers
    param_groups: list[ChainMap[str, Any]] # pyright:ignore[reportIncompatibleVariableOverride]

    def __init__(self, params: Params | torch.nn.Module, *modules: Module):
        self.model: torch.nn.Module | None = None
        """The model whose parameters are being optimized, if a model instance was passed to `__init__`."""
        if isinstance(params, torch.nn.Module):
            self.model = params
            params = params.parameters()

        self.modules = modules
        """Top-level modules provided during initialization."""

        self.unrolled_modules = unroll_modules(self.modules)
        """A flattened list of all modules including all children."""

        param_groups = _make_param_groups(params, differentiable=False)
        self._per_parameter_global_settings: dict[torch.Tensor, list[MutableMapping[str, Any]]] = {}

        # make sure there is no more than a single learning rate module
        lr_modules = [m for m in self.unrolled_modules if 'lr' in m.defaults]
        if len(lr_modules) > 1:
            warnings.warn(f'multiple learning rate modules detected: {lr_modules}. This may lead to compounding of learning rate multiplication with per-parameter learning rates and schedulers.')

        # iterate over all per-parameter settings overrides and check if they are applied at most once
        for group in param_groups:
            for k in group:
                if k in ('params', 'lr'): continue
                modules_with_k = [m for m in self.unrolled_modules if k in m.defaults and k not in m._overridden_keys]
                if len(modules_with_k) > 1:
                    warnings.warn(f'`params` has a `{k}` key, and multiple modules have that key: {modules_with_k}. If you intended to only set `{k}` to one of them, use `module.set_param_groups(params)`')

        # defaults for schedulers
        defaults = {}
        for m in self.unrolled_modules: defaults.update(m.defaults)
        super().__init__(param_groups, defaults=defaults)

        # note - this is what super init does:

        # self.defaults = defaults
        # for param_group in param_groups:
        #     self.add_param_group(param_group)

        self.current_step = 0
        """The global step counter for the optimizer."""

    def add_param_group(self, param_group: dict[str, Any]):
        proc_param_group = _make_param_groups([param_group], differentiable=False)[0]
        self.param_groups.append(ChainMap(proc_param_group, self.defaults))

        for p in proc_param_group['params']:
            # updates global per-parameter setting overrides (medium priority)
            self._per_parameter_global_settings[p] = [m.settings[p].maps[1] for m in self.unrolled_modules]

    def state_dict(self):
        all_params = [p for g in self.param_groups for p in g['params']]
        id_to_idx = {id(p): i for i,p in enumerate(all_params)}

        groups = []
        for g in self.param_groups:
            g = g.copy()
            g['params'] = [id_to_idx[id(p)] for p in g['params']]
            groups.append(g)

        state_dict = {
            "idx_to_id": {v:k for k,v in id_to_idx.items()},
            "params": all_params,
            "groups": groups,
            "defaults": self.defaults,
            "modules": {i: m.state_dict() for i, m in enumerate(self.unrolled_modules)}
        }
        return state_dict

    def load_state_dict(self, state_dict: dict):
        self.defaults.clear()
        self.defaults.update(state_dict['defaults'])

        idx_to_param = dict(enumerate(state_dict['params']))
        groups = []
        for g in state_dict['groups']:
            g = g.copy()
            g['params'] = [idx_to_param[p] for p in g['params']]
            groups.append(g)

        self.param_groups.clear()
        for group in groups:
            self.add_param_group(group)

        id_to_tensor = {state_dict['idx_to_id'][i]: p for i,p in enumerate(state_dict['params'])}
        for m, sd in zip(self.unrolled_modules, state_dict['modules'].values()):
            m.load_state_dict(sd, id_to_tensor)


    def step(self, closure=None): # pyright: ignore[reportIncompatibleMethodOverride]
        # propagate global per-parameter setting overrides
        for g in self.param_groups:
            settings = dict(g.maps[0]) # ignore defaults
            params = settings.pop('params')
            if not settings: continue

            for p in params:
                if not p.requires_grad: continue
                for map in self._per_parameter_global_settings[p]: map.update(settings)

        # create vars
        params = [p for g in self.param_groups for p in g['params'] if p.requires_grad]
        vars = Vars(params=params, closure=closure, model=self.model, current_step=self.current_step)

        # if closure is None, assume backward has been called and gather grads
        if closure is None:
            vars.grad = [p.grad if p.grad is not None else torch.zeros_like(p) for p in params]

        last_module = self.modules[-1]
        last_lr = last_module.defaults.get('lr', None)
        n_modules = len(self.modules)

        # step
        for i, module in enumerate(self.modules):
            if i!=0: vars = vars.clone(clone_update=False)

            # last module, or next to last module before lr
            if (i == n_modules - 1) or ((i == n_modules - 2) and (last_lr is not None)):
                if module.children: vars.nested_is_last = True
                else: vars.is_last = True
                if last_lr is not None: vars.last_module_lrs = last_module.get_settings('lr', params=vars.params)

            vars = module.step(vars)
            if vars.stop: break

        # apply update
        if not vars.skip_update:
            with torch.no_grad():
                torch._foreach_sub_(params, vars.get_update())

        for hook in vars.post_step_hooks:
            hook(self, vars)

        self.current_step += 1
        return vars.loss if vars.loss is not None else vars.loss_approx

    def __repr__(self):
        return f'Modular({", ".join(str(m) for m in self.modules)})'
# endregion

# region Chain
# ----------------------------------- Chain ---------------------------------- #
class Chain(Module):
    """Chain of modules, mostly used internally"""
    def __init__(self, *modules: Module | Iterable[Module]):
        super().__init__()
        flat_modules: list[Module] = flatten(modules)
        for i, module in enumerate(flat_modules):
            self.set_child(f'module_{i}', module)

    def step(self, vars):
        for i in range(len(self.children)):
            vars = self.children[f'module_{i}'].step(vars)
            if vars.stop: break
        return vars

    def __repr__(self):
        s = self.__class__.__name__
        if self.children:
            if s == 'Chain': s = 'C' # to shorten it
            s = f'{s}({", ".join(str(m) for m in self.children.values())}'
        return s

def maybe_chain(*modules: Chainable) -> Module:
    """Returns a single module directly if only one is provided, otherwise wraps them in a :code:`Chain`."""
    flat_modules: list[Module] = flatten(modules)
    if len(flat_modules) == 1:
        return flat_modules[0]
    return Chain(*flat_modules)
# endregion
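To make the pieces above concrete, here is a minimal usage sketch. It is not part of the package: the `HeavyBall` module below is hypothetical, built on the `Module`/`Vars` API defined in this file and driven through `Modular`, and it assumes `get_state` and `get_settings` behave as their docstrings describe (per-parameter lists, zero-initialized state) and that the imports resolve from this file's location.

import torch
from torchzero.core.module import Modular, Module, Vars

class HeavyBall(Module):
    """Hypothetical example module: heavy-ball momentum scaled by `lr`."""
    def __init__(self, lr: float = 0.01, momentum: float = 0.9):
        super().__init__(defaults=dict(lr=lr, momentum=momentum))

    def step(self, vars: Vars) -> Vars:
        update = vars.get_update()  # per-parameter update tensors, cloned from grads if unset
        lrs = self.get_settings('lr', params=vars.params)
        momenta = self.get_settings('momentum', params=vars.params)
        velocities = self.get_state('velocity', params=vars.params)  # zeros_like on first use

        new_update = []
        for u, v, lr, m in zip(update, velocities, lrs, momenta):
            v.mul_(m).add_(u)           # v = m * v + u
            new_update.append(v * lr)   # Modular subtracts this from the parameters
        vars.update = new_update
        return vars

model = torch.nn.Linear(4, 1)
opt = Modular(model, HeavyBall(lr=0.01))
inputs = torch.randn(8, 4)

def closure(backward=True):
    # standard torch.optim-style closure; passing backward=False lets modules re-evaluate the loss only
    loss = model(inputs).pow(2).mean()
    if backward:
        model.zero_grad()
        loss.backward()
    return loss

for _ in range(10):
    opt.step(closure)

Because `Modular.step` subtracts `vars.get_update()` from the parameters at the end, a module only needs to transform the update list; it does not write to the parameters directly unless it sets `vars.skip_update` and applies its own change.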