torchzero 0.3.11__py3-none-any.whl → 0.3.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_opts.py +95 -76
- tests/test_tensorlist.py +8 -7
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +2 -2
- torchzero/core/module.py +229 -72
- torchzero/core/reformulation.py +65 -0
- torchzero/core/transform.py +44 -24
- torchzero/modules/__init__.py +13 -5
- torchzero/modules/{optimizers → adaptive}/__init__.py +5 -2
- torchzero/modules/adaptive/adagrad.py +356 -0
- torchzero/modules/{optimizers → adaptive}/adahessian.py +53 -52
- torchzero/modules/{optimizers → adaptive}/adam.py +0 -3
- torchzero/modules/{optimizers → adaptive}/adan.py +26 -40
- torchzero/modules/{optimizers → adaptive}/adaptive_heavyball.py +3 -6
- torchzero/modules/adaptive/aegd.py +54 -0
- torchzero/modules/{optimizers → adaptive}/esgd.py +1 -1
- torchzero/modules/{optimizers/ladagrad.py → adaptive/lmadagrad.py} +42 -39
- torchzero/modules/{optimizers → adaptive}/mars.py +24 -36
- torchzero/modules/adaptive/matrix_momentum.py +146 -0
- torchzero/modules/{optimizers → adaptive}/msam.py +14 -12
- torchzero/modules/{optimizers → adaptive}/muon.py +19 -20
- torchzero/modules/adaptive/natural_gradient.py +175 -0
- torchzero/modules/{optimizers → adaptive}/rprop.py +0 -2
- torchzero/modules/{optimizers → adaptive}/sam.py +1 -1
- torchzero/modules/{optimizers → adaptive}/shampoo.py +8 -4
- torchzero/modules/{optimizers → adaptive}/soap.py +27 -50
- torchzero/modules/{optimizers → adaptive}/sophia_h.py +2 -3
- torchzero/modules/clipping/clipping.py +85 -92
- torchzero/modules/clipping/ema_clipping.py +5 -5
- torchzero/modules/conjugate_gradient/__init__.py +11 -0
- torchzero/modules/{quasi_newton → conjugate_gradient}/cg.py +355 -369
- torchzero/modules/experimental/__init__.py +9 -32
- torchzero/modules/experimental/dct.py +2 -2
- torchzero/modules/experimental/fft.py +2 -2
- torchzero/modules/experimental/gradmin.py +4 -3
- torchzero/modules/experimental/l_infinity.py +111 -0
- torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +3 -40
- torchzero/modules/experimental/newton_solver.py +79 -17
- torchzero/modules/experimental/newtonnewton.py +27 -14
- torchzero/modules/experimental/scipy_newton_cg.py +105 -0
- torchzero/modules/experimental/spsa1.py +93 -0
- torchzero/modules/experimental/structural_projections.py +1 -1
- torchzero/modules/functional.py +50 -14
- torchzero/modules/grad_approximation/__init__.py +1 -1
- torchzero/modules/grad_approximation/fdm.py +19 -20
- torchzero/modules/grad_approximation/forward_gradient.py +6 -7
- torchzero/modules/grad_approximation/grad_approximator.py +43 -47
- torchzero/modules/grad_approximation/rfdm.py +114 -175
- torchzero/modules/higher_order/__init__.py +1 -1
- torchzero/modules/higher_order/higher_order_newton.py +31 -23
- torchzero/modules/least_squares/__init__.py +1 -0
- torchzero/modules/least_squares/gn.py +161 -0
- torchzero/modules/line_search/__init__.py +2 -2
- torchzero/modules/line_search/_polyinterp.py +289 -0
- torchzero/modules/line_search/adaptive.py +69 -44
- torchzero/modules/line_search/backtracking.py +83 -70
- torchzero/modules/line_search/line_search.py +159 -68
- torchzero/modules/line_search/scipy.py +16 -4
- torchzero/modules/line_search/strong_wolfe.py +319 -220
- torchzero/modules/misc/__init__.py +8 -0
- torchzero/modules/misc/debug.py +4 -4
- torchzero/modules/misc/escape.py +9 -7
- torchzero/modules/misc/gradient_accumulation.py +88 -22
- torchzero/modules/misc/homotopy.py +59 -0
- torchzero/modules/misc/misc.py +82 -15
- torchzero/modules/misc/multistep.py +47 -11
- torchzero/modules/misc/regularization.py +5 -9
- torchzero/modules/misc/split.py +55 -35
- torchzero/modules/misc/switch.py +1 -1
- torchzero/modules/momentum/__init__.py +1 -5
- torchzero/modules/momentum/averaging.py +3 -3
- torchzero/modules/momentum/cautious.py +42 -47
- torchzero/modules/momentum/momentum.py +35 -1
- torchzero/modules/ops/__init__.py +9 -1
- torchzero/modules/ops/binary.py +9 -8
- torchzero/modules/{momentum/ema.py → ops/higher_level.py} +10 -33
- torchzero/modules/ops/multi.py +15 -15
- torchzero/modules/ops/reduce.py +1 -1
- torchzero/modules/ops/utility.py +12 -8
- torchzero/modules/projections/projection.py +4 -4
- torchzero/modules/quasi_newton/__init__.py +1 -16
- torchzero/modules/quasi_newton/damping.py +105 -0
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -163
- torchzero/modules/quasi_newton/lbfgs.py +256 -200
- torchzero/modules/quasi_newton/lsr1.py +167 -132
- torchzero/modules/quasi_newton/quasi_newton.py +346 -446
- torchzero/modules/restarts/__init__.py +7 -0
- torchzero/modules/restarts/restars.py +253 -0
- torchzero/modules/second_order/__init__.py +2 -1
- torchzero/modules/second_order/multipoint.py +238 -0
- torchzero/modules/second_order/newton.py +133 -88
- torchzero/modules/second_order/newton_cg.py +207 -170
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/sampling.py +300 -0
- torchzero/modules/step_size/__init__.py +1 -1
- torchzero/modules/step_size/adaptive.py +312 -47
- torchzero/modules/termination/__init__.py +14 -0
- torchzero/modules/termination/termination.py +207 -0
- torchzero/modules/trust_region/__init__.py +5 -0
- torchzero/modules/trust_region/cubic_regularization.py +170 -0
- torchzero/modules/trust_region/dogleg.py +92 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
- torchzero/modules/trust_region/trust_cg.py +99 -0
- torchzero/modules/trust_region/trust_region.py +350 -0
- torchzero/modules/variance_reduction/__init__.py +1 -0
- torchzero/modules/variance_reduction/svrg.py +208 -0
- torchzero/modules/weight_decay/weight_decay.py +65 -64
- torchzero/modules/zeroth_order/__init__.py +1 -0
- torchzero/modules/zeroth_order/cd.py +122 -0
- torchzero/optim/root.py +65 -0
- torchzero/optim/utility/split.py +8 -8
- torchzero/optim/wrappers/directsearch.py +0 -1
- torchzero/optim/wrappers/fcmaes.py +3 -2
- torchzero/optim/wrappers/nlopt.py +0 -2
- torchzero/optim/wrappers/optuna.py +2 -2
- torchzero/optim/wrappers/scipy.py +81 -22
- torchzero/utils/__init__.py +40 -4
- torchzero/utils/compile.py +1 -1
- torchzero/utils/derivatives.py +123 -111
- torchzero/utils/linalg/__init__.py +9 -2
- torchzero/utils/linalg/linear_operator.py +329 -0
- torchzero/utils/linalg/matrix_funcs.py +2 -2
- torchzero/utils/linalg/orthogonalize.py +2 -1
- torchzero/utils/linalg/qr.py +2 -2
- torchzero/utils/linalg/solve.py +226 -154
- torchzero/utils/metrics.py +83 -0
- torchzero/utils/optimizer.py +2 -2
- torchzero/utils/python_tools.py +7 -0
- torchzero/utils/tensorlist.py +105 -34
- torchzero/utils/torch_tools.py +9 -4
- torchzero-0.3.14.dist-info/METADATA +14 -0
- torchzero-0.3.14.dist-info/RECORD +167 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.14.dist-info}/top_level.txt +0 -1
- docs/source/conf.py +0 -59
- docs/source/docstring template.py +0 -46
- torchzero/modules/experimental/absoap.py +0 -253
- torchzero/modules/experimental/adadam.py +0 -118
- torchzero/modules/experimental/adamY.py +0 -131
- torchzero/modules/experimental/adam_lambertw.py +0 -149
- torchzero/modules/experimental/adaptive_step_size.py +0 -90
- torchzero/modules/experimental/adasoap.py +0 -177
- torchzero/modules/experimental/cosine.py +0 -214
- torchzero/modules/experimental/cubic_adam.py +0 -97
- torchzero/modules/experimental/eigendescent.py +0 -120
- torchzero/modules/experimental/etf.py +0 -195
- torchzero/modules/experimental/exp_adam.py +0 -113
- torchzero/modules/experimental/expanded_lbfgs.py +0 -141
- torchzero/modules/experimental/hnewton.py +0 -85
- torchzero/modules/experimental/modular_lbfgs.py +0 -265
- torchzero/modules/experimental/parabolic_search.py +0 -220
- torchzero/modules/experimental/subspace_preconditioners.py +0 -145
- torchzero/modules/experimental/tensor_adagrad.py +0 -42
- torchzero/modules/line_search/polynomial.py +0 -233
- torchzero/modules/momentum/matrix_momentum.py +0 -193
- torchzero/modules/optimizers/adagrad.py +0 -165
- torchzero/modules/quasi_newton/trust_region.py +0 -397
- torchzero/modules/smoothing/gaussian.py +0 -198
- torchzero-0.3.11.dist-info/METADATA +0 -404
- torchzero-0.3.11.dist-info/RECORD +0 -159
- torchzero-0.3.11.dist-info/licenses/LICENSE +0 -21
- /torchzero/modules/{optimizers → adaptive}/lion.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/orthograd.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/rmsprop.py +0 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.14.dist-info}/WHEEL +0 -0
torchzero/core/module.py
CHANGED
@@ -3,7 +3,7 @@ from abc import ABC, abstractmethod
 from collections import ChainMap, defaultdict
 from collections.abc import Callable, Iterable, MutableMapping, Sequence
 from operator import itemgetter
-from typing import Any, final, overload, Literal
+from typing import Any, final, overload, Literal, cast
 
 import torch
 
@@ -16,6 +16,7 @@ from ..utils import (
 )
 from ..utils.derivatives import hvp, hvp_fd_central, hvp_fd_forward
 from ..utils.python_tools import flatten
+from ..utils.linalg.linear_operator import LinearOperator
 
 
 def _closure_backward(closure, params, retain_graph, create_graph):
@@ -33,11 +34,9 @@ def _closure_backward(closure, params, retain_graph, create_graph):
 # ----------------------------------- var ----------------------------------- #
 class Var:
     """
-    Holds
+    Holds parameters, gradient, update, objective function (closure) if supplied, loss, and some other info.
+    Modules take in a ``Var`` object, modify and it is passed to the next module.
 
-    This class acts as a mutable container for information relevant to the current
-    optimization step, such as parameters, gradients, loss, and the computed update.
-    Modules read from and write to this object to coordinate their actions.
     """
     def __init__(
         self,
@@ -45,6 +44,10 @@ class Var:
         closure: Callable | None,
         model: torch.nn.Module | None,
         current_step: int,
+        parent: "Var | None" = None,
+        modular: "Modular | None" = None,
+        loss: torch.Tensor | None = None,
+        storage: dict | None = None,
     ):
         self.params: list[torch.Tensor] = params
         """List of all parameters with requires_grad = True."""
@@ -56,19 +59,31 @@ class Var:
         """torch.nn.Module object of the model, None if it wasn't specified."""
 
         self.current_step: int = current_step
-        """global current step, starts at 0
+        """global current step, starts at 0. This may not correspond to module current step,
+        for example a module may step every 10 global steps."""
+
+        self.parent: "Var | None" = parent
+        """parent ``Var`` object. When ``self.get_grad()`` is called, it will also set ``parent.grad``.
+        Same with ``self.get_loss()``. This is useful when ``self.params`` are different from ``parent.params``,
+        e.g. when projecting."""
+
+        self.modular: "Modular" = cast(Modular, modular)
+        """Modular optimizer object that created this ``Var``."""
 
         self.update: list[torch.Tensor] | None = None
         """
-        current update
+        current update. Update is assumed to be a transformed gradient, therefore it is subtracted.
 
         If closure is None, this is initially set to cloned gradient. Otherwise this is set to None.
+
+        At the end ``var.get_update()`` is subtracted from parameters. Therefore if ``var.update`` is ``None``,
+        gradient will be used and calculated if needed.
         """
 
         self.grad: list[torch.Tensor] | None = None
-        """gradient with current parameters. If closure is not None
+        """gradient with current parameters. If closure is not ``None``, this is set to ``None`` and can be calculated if needed."""
 
-        self.loss: torch.Tensor | Any | None =
+        self.loss: torch.Tensor | Any | None = loss
         """loss with current parameters."""
 
         self.loss_approx: torch.Tensor | Any | None = None
@@ -77,24 +92,28 @@ class Var:
 
         self.post_step_hooks: list[Callable[[Modular, Var]]] = []
         """list of functions to be called after optimizer step.
-        The signature is:
 
-
+        This attribute should always be modified in-place (using ``append`` or ``extend``).
 
-
+        The signature is:
 
+        ```python
+        def hook(optimizer: Modular, var: Vars): ...
+        ```
         """
 
         self.is_last: bool = False
         """
         Indicates that current module is either last or next-to-last before a learning rate module.
         This is always False if current module has children or is a child.
+        This is because otherwise the ``is_last`` would be passed to child modules, even though they aren't last.
         """
 
         self.nested_is_last: bool = False
         """
         Indicates that current module is either last or next-to-last before a learning rate module, for modules
-        that have children.
+        that have children. This will be passed to the children unless ``var.clone()`` is used, therefore
+        a child of a last module may also receive ``var.nested_is_last=True``.
         """
 
         self.last_module_lrs: list[float] | None = None
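The hook signature documented above can be used as follows; this is an illustrative sketch only (the `ReportStep` module is hypothetical, while `Module`, `Modular` and `Var` are the classes from this file):

```python
from torchzero.core.module import Modular, Module, Var

class ReportStep(Module):
    """Hypothetical example module that registers a post-step hook."""
    def __init__(self):
        super().__init__(None)  # assuming Module.__init__ takes the defaults dict (or None)

    def step(self, var: Var) -> Var:
        def hook(optimizer: Modular, v: Var):
            # runs after the parameters have been updated
            print(f"step {optimizer.current_step} done, "
                  f"{optimizer.num_evaluations} closure evaluations so far")

        # per the docstring above, post_step_hooks must be modified in-place (append/extend)
        var.post_step_hooks.append(hook)
        return var
```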
@@ -105,19 +124,30 @@ class Var:
         """
 
         self.stop: bool = False
-        """if True, all following modules will be skipped.
+        """if True, all following modules will be skipped.
+        If this module is a child, it only affects modules at the same level (in the same Chain)."""
 
         self.skip_update: bool = False
-        """if True, the parameters will not be updated"""
+        """if True, the parameters will not be updated."""
 
-        self.storage: dict = {}
-        """Storage for any other data, such as hessian estimates, etc"""
+        # self.storage: dict = {}
+        # """Storage for any other data, such as hessian estimates, etc."""
 
-
-        """
-
+        self.attrs: dict = {}
+        """attributes, Modular.attrs is updated with this after each step. This attribute should always be modified in-place"""
+
+        if storage is None: storage = {}
+        self.storage: dict = storage
+        """additional kwargs passed to closure will end up in this dict. This attribute should always be modified in-place"""
+
+        self.should_terminate: bool | None = None
+        """termination criteria, Modular.should_terminate is set to this after each step if not None"""
 
+    def get_loss(self, backward: bool, retain_graph = None, create_graph: bool = False) -> torch.Tensor | float:
+        """Returns the loss at current parameters, computing it if it hasn't been computed already and assigning ``var.loss``.
+        Do not call this at perturbed parameters. Backward always sets grads to None before recomputing."""
         if self.loss is None:
+
             if self.closure is None: raise RuntimeError("closure is None")
             if backward:
                 with torch.enable_grad():
@@ -128,7 +158,10 @@ class Var:
                 # initializing to zeros_like is equivalent to using zero_grad with set_to_none = False.
                 # it is technically a more correct approach for when some parameters conditionally receive gradients
                 # and in this case it shouldn't be slower.
-
+
+                # next time closure() is called, it will set grad to None.
+                # zero_grad(set_to_none=False) shouldn't be used (I should add a warning)
+                self.grad = [p.grad if p.grad is not None else torch.zeros_like(p) for p in self.params]
             else:
                 self.loss = self.loss_approx = self.closure(False)
@@ -143,11 +176,24 @@ class Var:
                 closure=self.closure, params=self.params, retain_graph=retain_graph, create_graph=create_graph
             )
             self.grad = [p.grad if p.grad is not None else torch.zeros_like(p) for p in self.params]
+
+        # set parent grad
+        if self.parent is not None:
+            # the way projections/split work, they make a new closure which evaluates original
+            # closure and projects the gradient, and set it as their var.closure.
+            # then on `get_loss(backward=True)` it is called, so it also sets original parameters gradient.
+            # and we set it to parent var here.
+            if self.parent.loss is None: self.parent.loss = self.loss
+            if self.parent.grad is None and backward:
+                if all(p.grad is None for p in self.parent.params):
+                    warnings.warn("Parent grad is None after backward.")
+                self.parent.grad = [p.grad if p.grad is not None else torch.zeros_like(p) for p in self.parent.params]
+
         return self.loss # type:ignore
 
     def get_grad(self, retain_graph: bool | None = None, create_graph: bool = False) -> list[torch.Tensor]:
         """Returns the gradient at initial parameters, computing it if it hasn't been computed already and assigning
-
+        ``var.grad`` and potentially ``var.loss``. Do not call this at perturbed parameters."""
         if self.grad is None:
             if self.closure is None: raise RuntimeError("closure is None")
             self.get_loss(backward=True, retain_graph=retain_graph, create_graph=create_graph) # evaluate and set self.loss and self.grad
@@ -156,15 +202,21 @@ class Var:
         return self.grad
 
     def get_update(self) -> list[torch.Tensor]:
-        """Returns the update. If update is None, it is initialized by cloning the gradients and assigning to
-        Computing the gradients may assign
+        """Returns the update. If update is None, it is initialized by cloning the gradients and assigning to ``var.update``.
+        Computing the gradients may assign ``var.grad`` and ``var.loss`` if they haven't been computed.
         Do not call this at perturbed parameters."""
         if self.update is None: self.update = [g.clone() for g in self.get_grad()]
         return self.update
 
-    def clone(self, clone_update: bool):
-        """Creates a shallow copy of the Vars object, update can optionally be deep-copied (via
-
+    def clone(self, clone_update: bool, parent: "Var | None" = None):
+        """Creates a shallow copy of the Vars object, update can optionally be deep-copied (via ``torch.clone``).
+
+        Doesn't copy ``is_last``, ``nested_is_last`` and ``last_module_lrs``. They will always be ``False``/``None``.
+
+        Setting ``parent`` is only if clone's parameters are something different,
+        while clone's closure referes to the same objective but with a "view" on parameters.
+        """
+        copy = Var(params = self.params, closure=self.closure, model=self.model, current_step=self.current_step, parent=parent)
 
         if clone_update and self.update is not None:
             copy.update = [u.clone() for u in self.update]
@@ -174,10 +226,16 @@ class Var:
         copy.grad = self.grad
         copy.loss = self.loss
         copy.loss_approx = self.loss_approx
+        copy.closure = self.closure
         copy.post_step_hooks = self.post_step_hooks
         copy.stop = self.stop
         copy.skip_update = self.skip_update
 
+        copy.modular = self.modular
+        copy.attrs = self.attrs
+        copy.storage = self.storage
+        copy.should_terminate = self.should_terminate
+
         return copy
 
     def update_attrs_from_clone_(self, var: "Var"):
@@ -186,11 +244,15 @@ class Var:
         object. This propagates any newly computed loss or gradient values
         from the child's context back to the parent `Vars` if the parent
         didn't have them computed already.
+
+        Also, as long as ``post_step_hooks`` and ``attrs`` are modified in-place,
+        if the child updates them, the update will affect the parent too.
         """
         if self.loss is None: self.loss = var.loss
         if self.loss_approx is None: self.loss_approx = var.loss_approx
         if self.grad is None: self.grad = var.grad
-
+
+        if var.should_terminate is not None: self.should_terminate = var.should_terminate
 
     def zero_grad(self, set_to_none=True):
         if set_to_none:
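To make the `get_grad`/`get_update` contract concrete, here is a minimal illustrative module (hypothetical, not part of the package); whatever is left in `var.update` at the end of the chain is what `Modular` subtracts from the parameters:

```python
from torchzero.core.module import Module, Var

class ClipUpdateValue(Module):
    """Hypothetical example: clamps every element of the update to [-value, value]."""
    def __init__(self, value: float = 1.0):
        super().__init__({"value": value})  # assuming a defaults dict, as in Reformulation.__init__

    def step(self, var: Var) -> Var:
        value = self.defaults["value"]
        # get_update() lazily clones the gradient into var.update on first access,
        # computing the gradient (and possibly the loss) only if needed.
        for u in var.get_update():
            u.clamp_(-value, value)
        return var
```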
@@ -201,6 +263,7 @@ class Var:
 
 # endregion
 
+
 # region Module
 # ---------------------------------- module ---------------------------------- #
 class Module(ABC):
@@ -313,17 +376,16 @@ class Module(ABC):
 
         If you want to force it to return a tuple even with a single key, pass a list/tuple of 1 or more keys.
 
-
-
-
-        # returns cls (by default TensorList)
-
-        exp_avg, exp_avg_sq = self.state_vals("exp_avg", "exp_avg_sq")
-        # returns list of cls
+        ```python
+        exp_avg = self.state_vals("exp_avg")
+        # returns cls (by default TensorList)
 
-
-
+        exp_avg, exp_avg_sq = self.state_vals("exp_avg", "exp_avg_sq")
+        # returns list of cls
 
+        exp_avg = self.state_vals(["exp_avg"])
+        # always returns a list of cls, even if got a single key
+        ```
 
         Args:
             *keys (str):
@@ -402,7 +464,8 @@ class Module(ABC):
         }
         return state_dict
 
-    def
+    def _load_state_dict(self, state_dict: dict[str, Any], id_to_tensor: dict[int, torch.Tensor]):
+        """loads state_dict, ``id_to_tensor`` is passed by ``Modular``"""
         # load state
         state = state_dict['state']
         self.state.clear()
@@ -421,7 +484,7 @@ class Module(ABC):
 
         # children
         for k, v in state_dict['children']:
-            if k in self.children: self.children[k].
+            if k in self.children: self.children[k]._load_state_dict(v, id_to_tensor)
             else: warnings.warn(f'State dict for {self} has child {k}, which is missing in {self}')
 
         # extra info
@@ -429,37 +492,72 @@ class Module(ABC):
 
     # ---------------------------- OVERRIDABLE METHODS --------------------------- #
     def step(self, var: Var) -> Var:
-        """performs a step, returns new var but may update it in-place."""
+        """performs a step, returns new ``var`` but may update it in-place."""
        self.update(var)
         return self.apply(var)
 
     def update(self, var:Var) -> Any:
-        """Updates the internal state of this module. This should not modify
+        """Updates the internal state of this module. This should not modify ``var.update``.
 
         Specifying ``update`` and ``apply`` methods is optional and allows certain meta-modules to be used,
-        such as
+        such as ``tz.m.Online`` or trust regions. Alternatively, simply override the ``step`` method.
         """
 
     def apply(self, var: Var) -> Var:
-        """Applies this module to ``var.get_update()``.
-
+        """Applies this module to ``var.get_update()``.
+        This should not modify the internal state of this module if possible.
+
+        Specifying ``update`` and ``apply`` methods is optional and allows certain meta-modules to be used,
+        such as ``tz.m.Online`` or trust regions. Alternatively, simply override the ``step`` method.
+        """
+        return self.step(var)
+
+    def get_H(self, var: Var) -> LinearOperator | None:
+        """returns a ``LinearOperator`` corresponding to hessian or hessian approximation.
+        The hessian approximation is assumed to be for all parameters concatenated to a vector."""
+        # if this method is not defined it searches in children
+        # this should be overwritten to return None if child params are different from this modules params
+        H = None
+        for k,v in self.children.items():
+            H_v = v.get_H(var)
+
+            if (H is not None) and (H_v is not None):
+                raise RuntimeError(f"Two children of {self} have a hessian, second one is {k}={v}")
+
+            if H_v is not None: H = H_v
+
+        return H
 
     def reset(self):
-        """Resets the internal state of the module (e.g. momentum). By default clears state and global state."""
-        # no complex logic is allowed there because this is overridden by many modules
-        # where super().reset() shouldn't be called
+        """Resets the internal state of the module (e.g. momentum) and all children. By default clears state and global state."""
         self.state.clear()
+
+        generator = self.global_state.get("generator", None)
         self.global_state.clear()
+        if generator is not None: self.global_state["generator"] = generator
+
+        for c in self.children.values(): c.reset()
 
     def reset_for_online(self):
-        """
+        """Resets buffers that depend on previous evaluation, such as previous gradient and loss,
+        which may become inaccurate due to mini-batching.
+
+        ``Online`` module calls ``reset_for_online``,
+        then it calls ``update`` with previous parameters,
+        then it calls ``update`` with current parameters,
+        and then ``apply``.
+        """
         for c in self.children.values(): c.reset_for_online()
 
     def _extra_pack(self):
+        """extra information to store in state_dict of this optimizer.
+        Will be passed to ``_extra_unpack`` when loading the state_dict."""
         return {}
 
     def _extra_unpack(self, x):
-
+        """``_extra_pack`` return will be passed to this method when loading state_dict.
+        This method is called after loading the rest of the state dict"""
+
 
 
     # ------------------------------ HELPER METHODS ------------------------------ #
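A sketch of how the `update`/`apply` split is meant to be used (the module below is hypothetical): `update` refreshes internal state, `apply` only reads it, which is what lets meta-modules such as `tz.m.Online` re-run `update` without re-applying.

```python
import torch
from torchzero.core.module import Module, Var

class EMAOfGradients(Module):
    """Hypothetical example: maintains an exponential moving average of the gradient."""
    def __init__(self, beta: float = 0.9):
        super().__init__({"beta": beta})

    def update(self, var: Var):
        beta = self.defaults["beta"]
        grad = var.get_grad()
        # buffers live in global_state so reset() clears them
        ema = self.global_state.setdefault("ema", [torch.zeros_like(g) for g in grad])
        torch._foreach_mul_(ema, beta)
        torch._foreach_add_(ema, grad, alpha=1 - beta)

    def apply(self, var: Var) -> Var:
        # reads state only, as the `apply` docstring recommends
        var.update = [e.clone() for e in self.global_state["ema"]]
        return var
```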
@@ -474,30 +572,33 @@ class Module(ABC):
         h: float,
         normalize: bool,
         retain_grad: bool,
-    ):
+    ) -> tuple[Sequence[torch.Tensor], Sequence[torch.Tensor] | None]:
         """
-        Returns ``(Hvp, rgrad)
+        Returns ``(Hvp, rgrad)``, where ``rgrad`` is gradient at current parameters,
+        possibly with ``create_graph=True``, or it may be None with ``hvp_method="central"``.
+        Gradient is set to vars automatically if ``at_x0``, you can always access it with ``vars.get_grad()``
 
         Single sample example:
 
-
-
-
+        ```python
+        Hvp, _ = self.hvp(v, at_x0=True, rgrad=None, ..., retain_graph=False)
+        ```
 
         Multiple samples example:
 
-
+        ```python
+        D = None
+        rgrad = None
+        for i in range(n_samples):
+            v = [torch.randn_like(p) for p in params]
+            Hvp, rgrad = self.hvp(v, at_x0=True, rgrad=rgrad, ..., retain_graph=i < n_samples-1)
 
-        D =
-
-        for i in range(n_samples):
-            v = [torch.randn_like(p) for p in params]
-            Hvp, rgrad = self.hvp(v, at_x0=True, rgrad=rgrad, ..., retain_graph=i < n_samples-1)
+            if D is None: D = Hvp
+            else: torch._foreach_add_(D, Hvp)
 
-
-
+        if n_samples > 1: torch._foreach_div_(D, n_samples)
+        ```
 
-        if n_samples > 1: torch._foreach_div_(D, n_samples)
         Args:
             v (Sequence[torch.Tensor]): vector in hessian-vector product
             at_x0 (bool): whether this is being called at original or perturbed parameters.
@@ -533,6 +634,14 @@ class Module(ABC):
 
         return Hvp, rgrad
 
+    def get_generator(self, device: torch.types.Device, seed: int | None):
+        if seed is None: return None
+
+        if 'generator' not in self.global_state:
+            self.global_state['generator'] = torch.Generator(device).manual_seed(seed)
+
+        return self.global_state['generator']
+
 # endregion
 
 Chainable = Module | Sequence[Module]
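The new `get_generator` helper caches a seeded `torch.Generator` in `global_state` (which `reset` above now preserves). An illustrative, hypothetical usage:

```python
import torch
from torchzero.core.module import Module, Var

class SeededNoise(Module):
    """Hypothetical example: adds reproducible Gaussian noise to the update."""
    def __init__(self, std: float = 1e-3, seed: int | None = 0):
        super().__init__({"std": std, "seed": seed})

    def step(self, var: Var) -> Var:
        std, seed = self.defaults["std"], self.defaults["seed"]
        # returns None when seed is None, otherwise a cached torch.Generator
        generator = self.get_generator(var.params[0].device, seed)
        for u in var.get_update():
            noise = torch.randn(u.shape, dtype=u.dtype, device=u.device, generator=generator)
            u.add_(noise, alpha=std)
        return var
```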
@@ -555,7 +664,7 @@ def unroll_modules(*modules: Chainable) -> list[Module]:
 # ---------------------------------- Modular --------------------------------- #
 
 class _EvalCounterClosure:
-    """keeps track of how many times closure has been evaluated"""
+    """keeps track of how many times closure has been evaluated, and sets closure return"""
     __slots__ = ("modular", "closure")
     def __init__(self, modular: "Modular", closure):
         self.modular = modular
@@ -565,8 +674,14 @@ class _EvalCounterClosure:
         if self.closure is None:
             raise RuntimeError("One of the modules requires closure to be passed to the step method")
 
+        v = self.closure(*args, **kwargs)
+
+        # set closure return on 1st evaluation
+        if self.modular._closure_return is None:
+            self.modular._closure_return = v
+
         self.modular.num_evaluations += 1
-        return
+        return v
 
 # have to inherit from Modular to support lr schedulers
 # although Accelerate doesn't work due to converting param_groups to a dict
@@ -584,6 +699,7 @@ class Modular(torch.optim.Optimizer):
     param_groups: list[ChainMap[str, Any]] # pyright:ignore[reportIncompatibleVariableOverride]
 
     def __init__(self, params: Params | torch.nn.Module, *modules: Module):
+        if len(modules) == 0: raise RuntimeError("Empty list of modules passed to `Modular`")
         self.model: torch.nn.Module | None = None
         """The model whose parameters are being optimized, if a model instance was passed to `__init__`."""
         if isinstance(params, torch.nn.Module):
@@ -617,18 +733,34 @@ class Modular(torch.optim.Optimizer):
         for m in self.unrolled_modules: defaults.update(m.defaults)
         super().__init__(param_groups, defaults=defaults)
 
-        # note - this is what super
+        # note - this is what super().__init__(param_groups, defaults=defaults) does:
 
         # self.defaults = defaults
         # for param_group in param_groups:
         # self.add_param_group(param_group)
 
+        # add_param_group adds a ChainMap where defaults are lowest priority,
+        # and entries specifed in param_groups or scheduler are higher priority.
+        # pytorch schedulers do group["lr"] = new_lr, which sets higher priority key.
+        # in each module, settings passed to that module by calling set_param_groups are highest priority
+
         self.current_step = 0
         """global step counter for the optimizer."""
 
         self.num_evaluations = 0
         """number of times the objective has been evaluated (number of closure calls or number of steps if closure is None)."""
 
+        # reformulations will change the closure to return a different loss (e.g. a sqrt homotopy, gaussian homotopy)
+        # we want to return original loss so this attribute is used
+        self._closure_return = None
+        """on each step, first time a closure is evaluated, this attribute is set to the returned value. `step` method returns this."""
+
+        self.attrs = {}
+        """custom attributes that can be set by modules, for example EMA of weights or best so far"""
+
+        self.should_terminate = False
+        """is set to True by termination criteria modules."""
+
     def add_param_group(self, param_group: dict[str, Any]):
         proc_param_group = _make_param_groups([param_group], differentiable=False)[0]
         self.param_groups.append(ChainMap(proc_param_group, self.defaults))
@@ -673,10 +805,13 @@ class Modular(torch.optim.Optimizer):
 
         id_to_tensor = {state_dict['idx_to_id'][i]: p for i,p in enumerate(state_dict['params'])}
         for m, sd in zip(self.unrolled_modules, state_dict['modules'].values()):
-            m.
+            m._load_state_dict(sd, id_to_tensor)
 
 
-    def step(self, closure=None): # pyright: ignore[reportIncompatibleMethodOverride]
+    def step(self, closure=None, loss=None, **kwargs): # pyright: ignore[reportIncompatibleMethodOverride]
+        # clear closure return from previous step
+        self._closure_return = None
+
         # propagate global per-parameter setting overrides
         for g in self.param_groups:
             settings = dict(g.maps[0]) # ignore defaults
@@ -689,16 +824,17 @@ class Modular(torch.optim.Optimizer):
 
         # create var
         params = [p for g in self.param_groups for p in g['params'] if p.requires_grad]
-        var = Var(params=params, closure=_EvalCounterClosure(self, closure), model=self.model, current_step=self.current_step)
+        var = Var(params=params, closure=_EvalCounterClosure(self, closure), model=self.model, current_step=self.current_step, modular=self, loss=loss, storage=kwargs)
 
         # if closure is None, assume backward has been called and gather grads
         if closure is None:
             var.grad = [p.grad if p.grad is not None else torch.zeros_like(p) for p in params]
             self.num_evaluations += 1
 
+        n_modules = len(self.modules)
+        if n_modules == 0: raise RuntimeError("There are no modules in this `Modular` optimizer")
         last_module = self.modules[-1]
         last_lr = last_module.defaults.get('lr', None)
-        n_modules = len(self.modules)
 
         # step
         for i, module in enumerate(self.modules):
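`Modular.step` now accepts an optional precomputed `loss` and arbitrary keyword arguments (which land in `var.storage`), and it returns the value of the first closure evaluation of the step. A usage sketch; the `tz.m.Adam`/`tz.m.LR` module names are assumptions for illustration:

```python
import torch
import torchzero as tz

model = torch.nn.Linear(4, 1)
X, y = torch.randn(64, 4), torch.randn(64, 1)
opt = tz.Modular(model.parameters(), tz.m.Adam(), tz.m.LR(1e-2))  # module names assumed

def closure(backward=True):
    loss = torch.nn.functional.mse_loss(model(X), y)
    if backward:
        opt.zero_grad()
        loss.backward()
    return loss

loss = opt.step(closure)   # returns the first closure evaluation of this step
# extra keyword arguments go to var.storage, e.g. opt.step(closure, loss=precomputed_loss)
print(loss.item(), opt.should_terminate, opt.attrs)
```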
@@ -718,11 +854,17 @@ class Modular(torch.optim.Optimizer):
         with torch.no_grad():
             torch._foreach_sub_(params, var.get_update())
 
+        # update attributes
+        self.attrs.update(var.attrs)
+        if var.should_terminate is not None: self.should_terminate = var.should_terminate
+
+        # hooks
         for hook in var.post_step_hooks:
             hook(self, var)
 
         self.current_step += 1
-        return var.loss if var.loss is not None else var.loss_approx
+        #return var.loss if var.loss is not None else var.loss_approx
+        return self._closure_return
 
     def __repr__(self):
         return f'Modular({", ".join(str(m) for m in self.modules)})'
@@ -738,6 +880,21 @@ class Chain(Module):
         for i, module in enumerate(flat_modules):
             self.set_child(f'module_{i}', module)
 
+    def update(self, var):
+        # note here that `update` and `apply` shouldn't be used directly
+        # as it will update all modules, and then apply all modules
+        # it is used in specific cases like Chain as trust region hessian module
+        for i in range(len(self.children)):
+            self.children[f'module_{i}'].update(var)
+            if var.stop: break
+        return var
+
+    def apply(self, var):
+        for i in range(len(self.children)):
+            var = self.children[f'module_{i}'].apply(var)
+            if var.stop: break
+        return var
+
     def step(self, var):
         for i in range(len(self.children)):
             var = self.children[f'module_{i}'].step(var)
@@ -748,7 +905,7 @@ class Chain(Module):
         s = self.__class__.__name__
         if self.children:
             if s == 'Chain': s = 'C' # to shorten it
-            s = f'{s}({", ".join(str(m) for m in self.children.values())}'
+            s = f'{s}({", ".join(str(m) for m in self.children.values())})'
         return s
 
 def maybe_chain(*modules: Chainable) -> Module:
torchzero/core/reformulation.py
ADDED
@@ -0,0 +1,65 @@
+from abc import ABC, abstractmethod
+from collections.abc import Callable, Sequence
+
+import torch
+
+from .module import Chainable, Modular, Module, Var
+
+
+class Reformulation(Module, ABC):
+    def __init__(self, defaults: dict | None, modules: Chainable | None):
+        super().__init__(defaults)
+
+        if modules is not None:
+            self.set_child("modules", modules)
+
+    @abstractmethod
+    def closure(self, backward: bool, closure: Callable, params:list[torch.Tensor], var: Var) -> tuple[float | torch.Tensor, Sequence[torch.Tensor] | None]:
+        """
+        returns (loss, gradient), if backward is False then gradient can be None.
+
+        If evaluating original loss/gradient at x_0, set them to ``var``.
+        """
+
+    def pre_step(self, var: Var) -> Var | None:
+        """This runs once before each step, whereas `closure` may run multiple times per step if further modules
+        evaluate gradients at multiple points. This is useful for example to pre-generate new random perturbations."""
+
+    def step(self, var):
+        ret = self.pre_step(var) # pylint:disable = assignment-from-no-return
+        if isinstance(ret, Var): var = ret
+
+        if var.closure is None: raise RuntimeError("Reformulation requires closure")
+        params, closure = var.params, var.closure
+
+        # step with children
+        if 'modules' in self.children:
+
+            # make a reformulated closure
+            def modified_closure(backward=True):
+                loss, grad = self.closure(backward, closure, params, var)
+
+                if grad is not None:
+                    for p,g in zip(params, grad):
+                        p.grad = g
+
+                return loss
+
+            # set it to a new Var object
+            modified_var = var.clone(clone_update=False)
+            modified_var.closure = modified_closure
+
+            # step with child
+            modules = self.children['modules']
+            modified_var = modules.step(modified_var)
+
+            # modified_var.loss and grad refers to loss and grad of a modified objective
+            # so we only take the update
+            var.update = modified_var.update
+
+        # or just evaluate new closure and set to update
+        else:
+            loss, grad = self.closure(backward=True, closure=closure, params=params, var=var)
+            if grad is not None: var.update = list(grad)
+
+        return var