torchzero 0.3.11__py3-none-any.whl → 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_opts.py +95 -69
- tests/test_tensorlist.py +8 -7
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +2 -2
- torchzero/core/module.py +225 -72
- torchzero/core/reformulation.py +65 -0
- torchzero/core/transform.py +44 -24
- torchzero/modules/__init__.py +13 -5
- torchzero/modules/{optimizers → adaptive}/__init__.py +5 -2
- torchzero/modules/adaptive/adagrad.py +356 -0
- torchzero/modules/{optimizers → adaptive}/adahessian.py +53 -52
- torchzero/modules/{optimizers → adaptive}/adam.py +0 -3
- torchzero/modules/{optimizers → adaptive}/adan.py +26 -40
- torchzero/modules/{optimizers → adaptive}/adaptive_heavyball.py +3 -6
- torchzero/modules/adaptive/aegd.py +54 -0
- torchzero/modules/{optimizers → adaptive}/esgd.py +1 -1
- torchzero/modules/{optimizers/ladagrad.py → adaptive/lmadagrad.py} +42 -39
- torchzero/modules/{optimizers → adaptive}/mars.py +24 -36
- torchzero/modules/adaptive/matrix_momentum.py +146 -0
- torchzero/modules/{optimizers → adaptive}/msam.py +14 -12
- torchzero/modules/{optimizers → adaptive}/muon.py +19 -20
- torchzero/modules/adaptive/natural_gradient.py +175 -0
- torchzero/modules/{optimizers → adaptive}/rprop.py +0 -2
- torchzero/modules/{optimizers → adaptive}/sam.py +1 -1
- torchzero/modules/{optimizers → adaptive}/shampoo.py +8 -4
- torchzero/modules/{optimizers → adaptive}/soap.py +27 -50
- torchzero/modules/{optimizers → adaptive}/sophia_h.py +2 -3
- torchzero/modules/clipping/clipping.py +85 -92
- torchzero/modules/clipping/ema_clipping.py +5 -5
- torchzero/modules/conjugate_gradient/__init__.py +11 -0
- torchzero/modules/{quasi_newton → conjugate_gradient}/cg.py +355 -369
- torchzero/modules/experimental/__init__.py +9 -32
- torchzero/modules/experimental/dct.py +2 -2
- torchzero/modules/experimental/fft.py +2 -2
- torchzero/modules/experimental/gradmin.py +4 -3
- torchzero/modules/experimental/l_infinity.py +111 -0
- torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +3 -40
- torchzero/modules/experimental/newton_solver.py +79 -17
- torchzero/modules/experimental/newtonnewton.py +27 -14
- torchzero/modules/experimental/scipy_newton_cg.py +105 -0
- torchzero/modules/experimental/structural_projections.py +1 -1
- torchzero/modules/functional.py +50 -14
- torchzero/modules/grad_approximation/fdm.py +19 -20
- torchzero/modules/grad_approximation/forward_gradient.py +4 -2
- torchzero/modules/grad_approximation/grad_approximator.py +43 -47
- torchzero/modules/grad_approximation/rfdm.py +144 -122
- torchzero/modules/higher_order/__init__.py +1 -1
- torchzero/modules/higher_order/higher_order_newton.py +31 -23
- torchzero/modules/least_squares/__init__.py +1 -0
- torchzero/modules/least_squares/gn.py +161 -0
- torchzero/modules/line_search/__init__.py +2 -2
- torchzero/modules/line_search/_polyinterp.py +289 -0
- torchzero/modules/line_search/adaptive.py +69 -44
- torchzero/modules/line_search/backtracking.py +83 -70
- torchzero/modules/line_search/line_search.py +159 -68
- torchzero/modules/line_search/scipy.py +1 -1
- torchzero/modules/line_search/strong_wolfe.py +319 -218
- torchzero/modules/misc/__init__.py +8 -0
- torchzero/modules/misc/debug.py +4 -4
- torchzero/modules/misc/escape.py +9 -7
- torchzero/modules/misc/gradient_accumulation.py +88 -22
- torchzero/modules/misc/homotopy.py +59 -0
- torchzero/modules/misc/misc.py +82 -15
- torchzero/modules/misc/multistep.py +47 -11
- torchzero/modules/misc/regularization.py +5 -9
- torchzero/modules/misc/split.py +55 -35
- torchzero/modules/misc/switch.py +1 -1
- torchzero/modules/momentum/__init__.py +1 -5
- torchzero/modules/momentum/averaging.py +3 -3
- torchzero/modules/momentum/cautious.py +42 -47
- torchzero/modules/momentum/momentum.py +35 -1
- torchzero/modules/ops/__init__.py +9 -1
- torchzero/modules/ops/binary.py +9 -8
- torchzero/modules/{momentum/ema.py → ops/higher_level.py} +10 -33
- torchzero/modules/ops/multi.py +15 -15
- torchzero/modules/ops/reduce.py +1 -1
- torchzero/modules/ops/utility.py +12 -8
- torchzero/modules/projections/projection.py +4 -4
- torchzero/modules/quasi_newton/__init__.py +1 -16
- torchzero/modules/quasi_newton/damping.py +105 -0
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -163
- torchzero/modules/quasi_newton/lbfgs.py +256 -200
- torchzero/modules/quasi_newton/lsr1.py +167 -132
- torchzero/modules/quasi_newton/quasi_newton.py +346 -446
- torchzero/modules/restarts/__init__.py +7 -0
- torchzero/modules/restarts/restars.py +252 -0
- torchzero/modules/second_order/__init__.py +2 -1
- torchzero/modules/second_order/multipoint.py +238 -0
- torchzero/modules/second_order/newton.py +133 -88
- torchzero/modules/second_order/newton_cg.py +141 -80
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/sampling.py +300 -0
- torchzero/modules/step_size/__init__.py +1 -1
- torchzero/modules/step_size/adaptive.py +312 -47
- torchzero/modules/termination/__init__.py +14 -0
- torchzero/modules/termination/termination.py +207 -0
- torchzero/modules/trust_region/__init__.py +5 -0
- torchzero/modules/trust_region/cubic_regularization.py +170 -0
- torchzero/modules/trust_region/dogleg.py +92 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
- torchzero/modules/trust_region/trust_cg.py +97 -0
- torchzero/modules/trust_region/trust_region.py +350 -0
- torchzero/modules/variance_reduction/__init__.py +1 -0
- torchzero/modules/variance_reduction/svrg.py +208 -0
- torchzero/modules/weight_decay/weight_decay.py +65 -64
- torchzero/modules/zeroth_order/__init__.py +1 -0
- torchzero/modules/zeroth_order/cd.py +359 -0
- torchzero/optim/root.py +65 -0
- torchzero/optim/utility/split.py +8 -8
- torchzero/optim/wrappers/directsearch.py +0 -1
- torchzero/optim/wrappers/fcmaes.py +3 -2
- torchzero/optim/wrappers/nlopt.py +0 -2
- torchzero/optim/wrappers/optuna.py +2 -2
- torchzero/optim/wrappers/scipy.py +81 -22
- torchzero/utils/__init__.py +40 -4
- torchzero/utils/compile.py +1 -1
- torchzero/utils/derivatives.py +123 -111
- torchzero/utils/linalg/__init__.py +9 -2
- torchzero/utils/linalg/linear_operator.py +329 -0
- torchzero/utils/linalg/matrix_funcs.py +2 -2
- torchzero/utils/linalg/orthogonalize.py +2 -1
- torchzero/utils/linalg/qr.py +2 -2
- torchzero/utils/linalg/solve.py +226 -154
- torchzero/utils/metrics.py +83 -0
- torchzero/utils/python_tools.py +6 -0
- torchzero/utils/tensorlist.py +105 -34
- torchzero/utils/torch_tools.py +9 -4
- torchzero-0.3.13.dist-info/METADATA +14 -0
- torchzero-0.3.13.dist-info/RECORD +166 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
- docs/source/conf.py +0 -59
- docs/source/docstring template.py +0 -46
- torchzero/modules/experimental/absoap.py +0 -253
- torchzero/modules/experimental/adadam.py +0 -118
- torchzero/modules/experimental/adamY.py +0 -131
- torchzero/modules/experimental/adam_lambertw.py +0 -149
- torchzero/modules/experimental/adaptive_step_size.py +0 -90
- torchzero/modules/experimental/adasoap.py +0 -177
- torchzero/modules/experimental/cosine.py +0 -214
- torchzero/modules/experimental/cubic_adam.py +0 -97
- torchzero/modules/experimental/eigendescent.py +0 -120
- torchzero/modules/experimental/etf.py +0 -195
- torchzero/modules/experimental/exp_adam.py +0 -113
- torchzero/modules/experimental/expanded_lbfgs.py +0 -141
- torchzero/modules/experimental/hnewton.py +0 -85
- torchzero/modules/experimental/modular_lbfgs.py +0 -265
- torchzero/modules/experimental/parabolic_search.py +0 -220
- torchzero/modules/experimental/subspace_preconditioners.py +0 -145
- torchzero/modules/experimental/tensor_adagrad.py +0 -42
- torchzero/modules/line_search/polynomial.py +0 -233
- torchzero/modules/momentum/matrix_momentum.py +0 -193
- torchzero/modules/optimizers/adagrad.py +0 -165
- torchzero/modules/quasi_newton/trust_region.py +0 -397
- torchzero/modules/smoothing/gaussian.py +0 -198
- torchzero-0.3.11.dist-info/METADATA +0 -404
- torchzero-0.3.11.dist-info/RECORD +0 -159
- torchzero-0.3.11.dist-info/licenses/LICENSE +0 -21
- /torchzero/modules/{optimizers → adaptive}/lion.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/orthograd.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/rmsprop.py +0 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
torchzero/modules/misc/escape.py
CHANGED
@@ -1,7 +1,10 @@
+import math
+
+from typing import Literal
 import torch
 
-from ...core import Module
-from ...utils import
+from ...core import Modular, Module, Var, Chainable
+from ...utils import NumberList, TensorList
 
 
 class EscapeAnnealing(Module):
@@ -42,19 +45,18 @@ class EscapeAnnealing(Module):
         if n_bad >= n_tol:
             for i in range(1, max_iter+1):
                 alpha = max_region * (i / max_iter)
-                pert = params.
+                pert = params.sphere_like(radius=alpha)
 
                 params.add_(pert)
                 f_star = closure(False)
 
-                if f_star < f_0-1e-
+                if math.isfinite(f_star) and f_star < f_0-1e-12:
                     var.update = None
                     var.stop = True
                     var.skip_update = True
                     return var
 
-
-                params.sub_(pert)
+                params.sub_(pert)
 
         self.global_state['n_bad'] = 0
-        return var
+        return var
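The new loop perturbs the parameters on spheres of growing radius and keeps a perturbation only when the loss finitely and strictly improves. A minimal standalone sketch of that logic with plain tensors (illustration only, using a hypothetical `try_escape` helper rather than the module's API):

```python
# Minimal sketch of the annealed-escape loop shown above
# (illustration only, not torchzero's EscapeAnnealing implementation).
import torch

def try_escape(params, closure, f_0, max_region=1.0, max_iter=10):
    """Perturb params on spheres of growing radius; keep the first perturbation that lowers the loss."""
    for i in range(1, max_iter + 1):
        alpha = max_region * (i / max_iter)
        # random direction rescaled to radius alpha (analogue of params.sphere_like(radius=alpha))
        pert = [torch.randn_like(p) for p in params]
        norm = torch.sqrt(sum(q.pow(2).sum() for q in pert))
        pert = [alpha * q / norm for q in pert]

        with torch.no_grad():
            for p, q in zip(params, pert):
                p.add_(q)
            f_star = closure()
            if torch.isfinite(torch.as_tensor(f_star)) and f_star < f_0 - 1e-12:
                return True   # keep the perturbed parameters
            for p, q in zip(params, pert):
                p.sub_(q)     # revert and try a larger radius
    return False
```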
torchzero/modules/misc/gradient_accumulation.py
CHANGED

@@ -3,46 +3,112 @@ import torch
 from ...core import Chainable, Module
 
 
+# class GradientAccumulation(Module):
+#     """Uses :code:`n` steps to accumulate gradients, after :code:`n` gradients have been accumulated, they are passed to :code:`modules` and parameters are updates.
+
+#     Accumulating gradients for :code:`n` steps is equivalent to increasing batch size by :code:`n`. Increasing the batch size
+#     is more computationally efficient, but sometimes it is not feasible due to memory constraints.
+
+#     .. note::
+#         Technically this can accumulate any inputs, including updates generated by previous modules. As long as this module is first, it will accumulate the gradients.
+
+#     Args:
+#         modules (Chainable): modules that perform a step every :code:`n` steps using the accumulated gradients.
+#         n (int): number of gradients to accumulate.
+#         mean (bool, optional): if True, uses mean of accumulated gradients, otherwise uses sum. Defaults to True.
+#         stop (bool, optional):
+#             this module prevents next modules from stepping unless :code:`n` gradients have been accumulate. Setting this argument to False disables that. Defaults to True.
+
+#     Examples:
+#         Adam with gradients accumulated for 16 batches.
+
+#         .. code-block:: python
+
+#             opt = tz.Modular(
+#                 model.parameters(),
+#                 tz.m.GradientAccumulation(
+#                     [tz.m.Adam(), tz.m.LR(1e-2)],
+#                     n=16
+#                 )
+#             )
+
+#     """
+#     def __init__(self, modules: Chainable, n: int, mean=True, stop=True):
+#         defaults = dict(n=n, mean=mean, stop=stop)
+#         super().__init__(defaults)
+#         self.set_child('modules', modules)
+
+
+#     @torch.no_grad
+#     def step(self, var):
+#         accumulator = self.get_state(var.params, 'accumulator')
+#         settings = self.defaults
+#         n = settings['n']; mean = settings['mean']; stop = settings['stop']
+#         step = self.global_state['step'] = self.global_state.get('step', 0) + 1
+
+#         # add update to accumulator
+#         torch._foreach_add_(accumulator, var.get_update())
+
+#         # step with accumulated updates
+#         if step % n == 0:
+#             if mean:
+#                 torch._foreach_div_(accumulator, n)
+
+#             var.update = [a.clone() for a in accumulator]
+#             var = self.children['modules'].step(var)
+
+#             # zero accumulator
+#             torch._foreach_zero_(accumulator)
+
+#         else:
+#             # prevent update
+#             if stop:
+#                 var.update = None
+#                 var.stop=True
+#                 var.skip_update=True
+
+#         return var
+
+
+
+
 class GradientAccumulation(Module):
-    """Uses
+    """Uses ``n`` steps to accumulate gradients, after ``n`` gradients have been accumulated, they are passed to :code:`modules` and parameters are updates.
 
-    Accumulating gradients for
+    Accumulating gradients for ``n`` steps is equivalent to increasing batch size by ``n``. Increasing the batch size
     is more computationally efficient, but sometimes it is not feasible due to memory constraints.
 
-
+    Note:
         Technically this can accumulate any inputs, including updates generated by previous modules. As long as this module is first, it will accumulate the gradients.
 
     Args:
-        modules (Chainable): modules that perform a step every :code:`n` steps using the accumulated gradients.
         n (int): number of gradients to accumulate.
         mean (bool, optional): if True, uses mean of accumulated gradients, otherwise uses sum. Defaults to True.
         stop (bool, optional):
-            this module prevents next modules from stepping unless
-
-    Examples:
-        Adam with gradients accumulated for 16 batches.
+            this module prevents next modules from stepping unless ``n`` gradients have been accumulate. Setting this argument to False disables that. Defaults to True.
 
-
+    ## Examples:
 
-
-            model.parameters(),
-            tz.m.GradientAccumulation(
-                modules=[tz.m.Adam(), tz.m.LR(1e-2)],
-                n=16
-            )
-        )
+    Adam with gradients accumulated for 16 batches.
 
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.GradientAccumulation(),
+        tz.m.Adam(),
+        tz.m.LR(1e-2),
+    )
+    ```
     """
-    def __init__(self,
+    def __init__(self, n: int, mean=True, stop=True):
         defaults = dict(n=n, mean=mean, stop=stop)
         super().__init__(defaults)
-        self.set_child('modules', modules)
 
 
     @torch.no_grad
     def step(self, var):
         accumulator = self.get_state(var.params, 'accumulator')
-        settings = self.
+        settings = self.defaults
         n = settings['n']; mean = settings['mean']; stop = settings['stop']
         step = self.global_state['step'] = self.global_state.get('step', 0) + 1
 
@@ -54,15 +120,15 @@ class GradientAccumulation(Module):
             if mean:
                 torch._foreach_div_(accumulator, n)
 
-            var.update =
-            var = self.children['modules'].step(var)
+            var.update = accumulator
 
             # zero accumulator
-
+            self.clear_state_keys('accumulator')
 
         else:
             # prevent update
             if stop:
+                var.update = None
                 var.stop=True
                 var.skip_update=True
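The rewritten module no longer wraps child modules; it accumulates whatever update flows into it and releases the (mean) accumulated value every ``n``-th step, blocking the rest of the chain in between. The same accumulate-then-step pattern in an ordinary PyTorch loop, shown here as an assumed standalone sketch rather than torchzero code:

```python
# Accumulate-then-step pattern that GradientAccumulation implements inside a torchzero chain,
# written as a plain PyTorch loop (assumed model/loss_fn/data_loader; not torchzero's code).
import torch

def train(model, loss_fn, data_loader, n: int = 16, lr: float = 1e-2):
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    opt.zero_grad()
    for step, (x, y) in enumerate(data_loader, start=1):
        loss = loss_fn(model(x), y) / n   # divide so the accumulated gradient is a mean over n micro-batches
        loss.backward()                   # .grad accumulates across calls until zero_grad()
        if step % n == 0:
            opt.step()                    # one parameter update per n accumulated gradients
            opt.zero_grad()
```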
torchzero/modules/misc/homotopy.py
ADDED

@@ -0,0 +1,59 @@
+from collections.abc import Callable
+from abc import ABC, abstractmethod
+import torch
+from ...core import Module
+from ...core import Chainable
+
+class HomotopyBase(Module):
+    def __init__(self, defaults: dict | None = None):
+        super().__init__(defaults)
+
+    @abstractmethod
+    def loss_transform(self, loss: torch.Tensor) -> torch.Tensor:
+        """transform the loss"""
+
+    @torch.no_grad
+    def step(self, var):
+        if var.loss is not None:
+            var.loss = self.loss_transform(var.loss)
+
+        closure = var.closure
+        if closure is None: raise RuntimeError("SquareHomotopy requires closure")
+
+        def homotopy_closure(backward=True):
+            if backward:
+                with torch.enable_grad():
+                    loss = self.loss_transform(closure(False))
+                    grad = torch.autograd.grad(loss, var.params, allow_unused=True)
+                    for p,g in zip(var.params, grad):
+                        p.grad = g
+            else:
+                loss = self.loss_transform(closure(False))
+
+            return loss
+
+        var.closure = homotopy_closure
+        return var
+
+class SquareHomotopy(HomotopyBase):
+    def __init__(self): super().__init__()
+    def loss_transform(self, loss): return loss.square().copysign(loss)
+
+class SqrtHomotopy(HomotopyBase):
+    def __init__(self): super().__init__()
+    def loss_transform(self, loss): return (loss+1e-12).sqrt()
+
+class ExpHomotopy(HomotopyBase):
+    def __init__(self): super().__init__()
+    def loss_transform(self, loss): return loss.exp()
+
+class LogHomotopy(HomotopyBase):
+    def __init__(self): super().__init__()
+    def loss_transform(self, loss): return (loss+1e-12).log()
+
+class LambdaHomotopy(HomotopyBase):
+    def __init__(self, fn: Callable[[torch.Tensor], torch.Tensor]):
+        defaults = dict(fn=fn)
+        super().__init__(defaults)
+
+    def loss_transform(self, loss): return self.defaults['fn'](loss)
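These modules wrap the closure so every loss evaluation, and therefore every gradient, goes through ``loss_transform``; by the chain rule the gradient of ``g(f)`` is ``g'(f) * grad f``, so for example ``SqrtHomotopy`` rescales gradients by ``1/(2*sqrt(f))``. A quick check with plain autograd, independent of torchzero:

```python
# Chain-rule effect of a loss transform on gradients (plain autograd check).
import torch

x = torch.tensor([3.0], requires_grad=True)

f = (x ** 2).sum()                                                   # f = 9, df/dx = 2x = 6
g_sqrt = torch.autograd.grad((f + 1e-12).sqrt(), x)[0]               # 6 / (2*sqrt(9)) = 1
g_log = torch.autograd.grad((x ** 2).sum().add(1e-12).log(), x)[0]   # 6 / 9 ≈ 0.667

print(g_sqrt, g_log)  # tensor([1.]) tensor([0.6667])
```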
torchzero/modules/misc/misc.py
CHANGED
@@ -1,12 +1,22 @@
 from collections import deque
-from collections.abc import Iterable
+from collections.abc import Iterable, Sequence
+from functools import partial
 from operator import itemgetter
 from typing import Literal
 
 import torch
 
 from ...core import Chainable, Module, Target, TensorwiseTransform, Transform, Var
-from ...utils import
+from ...utils import (
+    Distributions,
+    Metrics,
+    NumberList,
+    TensorList,
+    set_storage_,
+    tofloat,
+    unpack_dicts,
+    unpack_states,
+)
 
 
 class Previous(TensorwiseTransform):
@@ -139,7 +149,7 @@ class UpdateSign(Transform):
 
 class GraftToGrad(Transform):
     """Grafts update to the gradient, that is update is rescaled to have the same norm as the gradient."""
-    def __init__(self, tensorwise:bool=False, ord:
+    def __init__(self, tensorwise:bool=False, ord:Metrics=2, eps:float = 1e-6, target: Target = 'update'):
         defaults = dict(tensorwise=tensorwise, ord=ord, eps=eps)
         super().__init__(defaults, uses_grad=True, target=target)
 
@@ -151,7 +161,7 @@ class GraftToGrad(Transform):
 
 class GraftGradToUpdate(Transform):
     """Outputs gradient grafted to update, that is gradient rescaled to have the same norm as the update."""
-    def __init__(self, tensorwise:bool=False, ord:
+    def __init__(self, tensorwise:bool=False, ord:Metrics=2, eps:float = 1e-6, target: Target = 'update'):
         defaults = dict(tensorwise=tensorwise, ord=ord, eps=eps)
         super().__init__(defaults, uses_grad=True, target=target)
 
@@ -164,7 +174,7 @@ class GraftGradToUpdate(Transform):
 
 class GraftToParams(Transform):
     """Grafts update to the parameters, that is update is rescaled to have the same norm as the parameters, but no smaller than :code:`eps`."""
-    def __init__(self, tensorwise:bool=False, ord:
+    def __init__(self, tensorwise:bool=False, ord:Metrics=2, eps:float = 1e-4, target: Target = 'update'):
         defaults = dict(tensorwise=tensorwise, ord=ord, eps=eps)
         super().__init__(defaults, uses_grad=False, target=target)
 
@@ -194,7 +204,7 @@ class FillLoss(Module):
     @torch.no_grad
     def step(self, var):
         alpha = self.get_settings(var.params, 'alpha')
-        loss = var.get_loss(backward=self.
+        loss = var.get_loss(backward=self.defaults['backward'])
         var.update = [torch.full_like(p, loss*a) for p,a in zip(var.params, alpha)]
         return var
 
@@ -207,7 +217,7 @@ class MulByLoss(Module):
     @torch.no_grad
     def step(self, var):
         alpha, min_value = self.get_settings(var.params, 'alpha', 'min_value')
-        loss = var.get_loss(backward=self.
+        loss = var.get_loss(backward=self.defaults['backward'])
         mul = [max(loss*a, mv) for a,mv in zip(alpha, min_value)]
         torch._foreach_mul_(var.update, mul)
         return var
@@ -221,7 +231,7 @@ class DivByLoss(Module):
     @torch.no_grad
     def step(self, var):
         alpha, min_value = self.get_settings(var.params, 'alpha', 'min_value')
-        loss = var.get_loss(backward=self.
+        loss = var.get_loss(backward=self.defaults['backward'])
         mul = [max(loss*a, mv) for a,mv in zip(alpha, min_value)]
         torch._foreach_div_(var.update, mul)
         return var
@@ -229,15 +239,14 @@ class DivByLoss(Module):
 
 class NoiseSign(Transform):
     """Outputs random tensors with sign copied from the update."""
-    def __init__(self, distribution:Distributions = 'normal',
-        defaults = dict(distribution=distribution,
+    def __init__(self, distribution:Distributions = 'normal', variance:float | None = None):
+        defaults = dict(distribution=distribution, variance=variance)
         super().__init__(defaults, uses_grad=False)
 
     @torch.no_grad
     def apply_tensors(self, tensors, params, grads, loss, states, settings):
-
-
-        return TensorList(tensors).sample_like(alpha, distribution).copysign_(tensors)
+        variance = unpack_dicts(settings, 'variance')
+        return TensorList(tensors).sample_like(settings[0]['distribution'], variance=variance).copysign_(tensors)
 
 class HpuEstimate(Transform):
     """returns ``y/||s||``, where ``y`` is difference between current and previous update (gradient), ``s`` is difference between current and previous parameters. The returned tensors are a finite difference approximation to hessian times previous update."""
@@ -257,7 +266,7 @@ class HpuEstimate(Transform):
         for p, c in zip(prev_params, params): p.copy_(c)
         for p, c in zip(prev_update, tensors): p.copy_(c)
         torch._foreach_div_(y, torch.linalg.norm(torch.cat([t.ravel() for t in s])).clip(min=1e-8)) # pylint:disable=not-callable
-        self.store(params,
+        self.store(params, 'y', y)
 
     @torch.no_grad
     def apply_tensors(self, tensors, params, grads, loss, states, settings):
@@ -295,7 +304,7 @@ class RandomHvp(Module):
 
         rgrad = None
         for i in range(n_samples):
-            u = params.sample_like(distribution=distribution)
+            u = params.sample_like(distribution=distribution, variance=1)
 
             Hvp, rgrad = self.Hvp(u, at_x0=True, var=var, rgrad=rgrad, hvp_method=hvp_method,
                                   h=h, normalize=True, retain_grad=i < n_samples-1)
@@ -314,3 +323,61 @@ class RandomHvp(Module):
 
         var.update = list(D)
         return var
+
+@torch.no_grad
+def _load_best_parameters(params: Sequence[torch.Tensor], best_params: Sequence[torch.Tensor]):
+    for p_cur, p_best in zip(params, best_params):
+        set_storage_(p_cur, p_best)
+
+class SaveBest(Module):
+    """Saves best parameters found so far, ones that have lowest loss. Put this as the last module.
+
+    Adds the following attrs:
+
+    - ``best_params`` - a list of tensors with best parameters.
+    - ``best_loss`` - loss value with ``best_params``.
+    - ``load_best_parameters`` - a function that sets parameters to the best parameters./
+
+    ## Examples
+    ```python
+    def rosenbrock(x, y):
+        return (1 - x)**2 + (100 * (y - x**2))**2
+
+    xy = torch.tensor((-1.1, 2.5), requires_grad=True)
+    opt = tz.Modular(
+        [xy],
+        tz.m.NAG(0.999),
+        tz.m.LR(1e-6),
+        tz.m.SaveBest()
+    )
+
+    # optimize for 1000 steps
+    for i in range(1000):
+        loss = rosenbrock(*xy)
+        opt.zero_grad()
+        loss.backward()
+        opt.step(loss=loss) # SaveBest needs closure or loss
+
+    # NAG overshot, but we saved the best params
+    print(f'{rosenbrock(*xy) = }') # >> 3.6583
+    print(f"{opt.attrs['best_loss'] = }") # >> 0.000627
+
+    # load best parameters
+    opt.attrs['load_best_params']()
+    print(f'{rosenbrock(*xy) = }') # >> 0.000627
+    """
+    def __init__(self):
+        super().__init__()
+
+    @torch.no_grad
+    def step(self, var):
+        loss = tofloat(var.get_loss(False))
+        lowest_loss = self.global_state.get('lowest_loss', float("inf"))
+
+        if loss < lowest_loss:
+            self.global_state['lowest_loss'] = loss
+            best_params = var.attrs['best_params'] = [p.clone() for p in var.params]
+            var.attrs['best_loss'] = loss
+            var.attrs['load_best_params'] = partial(_load_best_parameters, params=var.params, best_params=best_params)
+
+        return var
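The graft transforms above keep one direction but give it another vector's length. In the global (non-tensorwise) case with the Euclidean norm the rescaling reduces to a single ratio; a hypothetical standalone helper (illustration, not the Transform API) looks like:

```python
# Grafting rescale in its simplest form: keep `update`'s direction, take `grad`'s norm
# (hypothetical helper for illustration, not torchzero's Transform implementation).
import torch

def graft(update: torch.Tensor, grad: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    return update * (grad.norm() / (update.norm() + eps))
```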
torchzero/modules/misc/multistep.py
CHANGED

@@ -97,7 +97,7 @@ class NegateOnLossIncrease(Module):
     def step(self, var):
         closure = var.closure
         if closure is None: raise RuntimeError('NegateOnLossIncrease requires closure')
-        backtrack = self.
+        backtrack = self.defaults['backtrack']
 
         update = var.get_update()
         f_0 = var.get_loss(backward=False)
@@ -123,36 +123,72 @@ class NegateOnLossIncrease(Module):
 
 
 class Online(Module):
-    """Allows certain modules to be used for mini-batch optimization.
-
+    """Allows certain modules to be used for mini-batch optimization.
+
+    Examples:
+
+    Online L-BFGS with Backtracking line search
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.Online(tz.m.LBFGS()),
+        tz.m.Backtracking()
+    )
+    ```
+
+    Online L-BFGS trust region
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.TrustCG(tz.m.Online(tz.m.LBFGS()))
+    )
+    ```
+
+    """
+    def __init__(self, *modules: Module,):
         super().__init__()
 
-        self.set_child('module',
+        self.set_child('module', modules)
 
     @torch.no_grad
-    def
+    def update(self, var):
         closure = var.closure
         if closure is None: raise ValueError("Closure must be passed for Online")
+
         step = self.global_state.get('step', 0) + 1
         self.global_state['step'] = step
+
         params = TensorList(var.params)
         p_cur = params.clone()
         p_prev = self.get_state(params, 'p_prev', cls=TensorList)
+
         module = self.children['module']
+        var_c = var.clone(clone_update=False)
 
+        # on 1st step just step and store previous params
        if step == 1:
-            var = module.step(var.clone(clone_update=False))
-
             p_prev.copy_(params)
-            return var
 
-
+            module.update(var_c)
+            var.update_attrs_from_clone_(var_c)
+            return
+
+        # restore previous params and update
         var_prev = Var(params=params, closure=closure, model=var.model, current_step=var.current_step)
         params.set_(p_prev)
         module.reset_for_online()
         module.update(var_prev)
 
-        # restore current params
+        # restore current params and update
         params.set_(p_cur)
         p_prev.copy_(params)
-
+        module.update(var_c)
+        var.update_attrs_from_clone_(var_c)
+
+    @torch.no_grad
+    def apply(self, var):
+        module = self.children['module']
+        return module.apply(var.clone(clone_update=False))
+
+    def get_H(self, var):
+        return self.children['module'].get_H(var)
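``NegateOnLossIncrease`` acts as a safeguard around a proposed step; the sketch below is a plain-tensor reading of what the name and the visible ``f_0``/``backtrack`` logic suggest (an assumption for illustration, not the module's exact rule):

```python
# Rough standalone sketch of a negate-on-loss-increase safeguard
# (assumed semantics for illustration; the module's exact rule lives in multistep.py).
import torch

@torch.no_grad()
def negate_on_increase(params, update, closure, f_0, backtrack=True):
    for p, u in zip(params, update):
        p.sub_(u)                       # try the proposed step x - u
    f_1 = closure()
    if f_1 > f_0:                       # the step made the loss worse
        if backtrack:
            for p, u in zip(params, update):
                p.add_(u)               # undo it
        else:
            for p, u in zip(params, update):
                p.add_(2 * u)           # flip it: end up at x + u
```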
torchzero/modules/misc/regularization.py
CHANGED

@@ -1,12 +1,8 @@
-from collections import deque
-from collections.abc import Iterable
-from operator import itemgetter
-from typing import Literal
-
 import torch
 
-from ...core import Chainable, Module, Target,
-from ...
+from ...core import Chainable, Module, Target, Transform
+from ...core.reformulation import Reformulation
+from ...utils import Distributions, NumberList, TensorList
 
 
 class Dropout(Transform):
@@ -121,8 +117,8 @@ class PerturbWeights(Module):
     Args:
         alpha (float, optional): multiplier for perturbation magnitude. Defaults to 0.1.
         relative (bool, optional): whether to multiply perturbation by mean absolute value of the parameter. Defaults to True.
-
-
+        distribution (bool, optional):
+            distribution of the random perturbation. Defaults to False.
     """
     def __init__(self, alpha: float = 0.1, relative:bool=True, distribution:Distributions = 'normal'):
         defaults = dict(alpha=alpha, relative=relative, distribution=distribution, perturb=True)
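The PerturbWeights docstring above describes the rule: noise scaled by ``alpha``, optionally multiplied by each parameter's mean absolute value. A standalone sketch of that rule for the normal distribution (illustration only, not the module's code):

```python
# Perturbation rule described in PerturbWeights' docstring, as a standalone helper
# (normal distribution only; illustration, not the module's implementation).
import torch

@torch.no_grad()
def perturb_weights(params, alpha: float = 0.1, relative: bool = True):
    for p in params:
        scale = alpha * p.abs().mean() if relative else alpha
        p.add_(torch.randn_like(p) * scale)
```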
torchzero/modules/misc/split.py
CHANGED
@@ -1,4 +1,5 @@
-
+import warnings
+from collections.abc import Callable, Sequence, Iterable
 from typing import cast
 
 import torch
@@ -22,59 +23,78 @@ def _split(
     if var.update is not None:
         split_update = [u for i,u in enumerate(var.update) if i in idxs]
 
-    split_var = var.clone(clone_update=False)
+    split_var = var.clone(clone_update=False, parent=var)
     split_var.params = split_params
     split_var.grad = split_grad
     split_var.update = split_update
 
     split_var = module.step(split_var)
 
-
-
+    # those should be set due to var being parent
+    if split_var.grad is not None:
+        assert var.grad is not None
+
+    if split_var.loss is not None:
+        assert var.loss is not None
 
     if split_var.update is not None:
 
+        # make sure update is set, it will be filled with ``true`` and ``false`` tensors
         if var.update is None:
             if var.grad is None: var.update = [cast(torch.Tensor, None) for _ in var.params]
             else: var.update = [g.clone() for g in var.grad]
 
+        # set all tensors from this split
         for idx, u in zip(idxs, split_var.update):
             var.update[idx] = u
 
-    var.update_attrs_from_clone_(split_var)
     return var
 
-
-
+_SingleFilter = Callable[[torch.Tensor], bool] | torch.Tensor | Iterable[torch.Tensor] | torch.nn.Module | Iterable[torch.nn.Module]
+Filter = _SingleFilter | Iterable[_SingleFilter]
 
-
-
-
-
-
-
-    standard Muon with Adam fallback
-
-    .. code-block:: python
-
-        opt = tz.Modular(
-            model.head.parameters(),
-            tz.m.Split(
-                # apply muon only to 2D+ parameters
-                filter = lambda t: t.ndim >= 2,
-                true = [
-                    tz.m.HeavyBall(),
-                    tz.m.Orthogonalize(),
-                    tz.m.LR(1e-2),
-                ],
-                false = tz.m.Adam()
-            ),
-            tz.m.LR(1e-2)
-        )
+def _make_filter(filter: Filter):
+    if callable(filter): return filter
+    if isinstance(filter, torch.Tensor):
+        return lambda x: x is filter
+    if isinstance(filter, torch.nn.Module):
+        return _make_filter(filter.parameters())
 
+    # iterable
+    filters = [_make_filter(f) for f in filter]
+    return lambda x: any(f(x) for f in filters)
 
+class Split(Module):
+    """Apply ``true`` modules to all parameters filtered by ``filter``, apply ``false`` modules to all other parameters.
+
+    Args:
+        filter (Filter, bool]):
+            a filter that selects tensors to be optimized by ``true``.
+            - tensor or iterable of tensors (e.g. ``encoder.parameters()``).
+            - function that takes in tensor and outputs a bool (e.g. ``lambda x: x.ndim >= 2``).
+            - a sequence of above (acts as "or", so returns true if any of them is true).
+
+        true (Chainable | None): modules that are applied to tensors where ``filter`` is ``True``.
+        false (Chainable | None): modules that are applied to tensors where ``filter`` is ``False``.
+
+    ### Examples:
+
+    Muon with Adam fallback using same hyperparams as https://github.com/KellerJordan/Muon
+
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.NAG(0.95),
+        tz.m.Split(
+            lambda p: p.ndim >= 2,
+            true = tz.m.Orthogonalize(),
+            false = [tz.m.Adam(0.9, 0.95), tz.m.Mul(1/66)],
+        ),
+        tz.m.LR(1e-2),
+    )
+    ```
     """
-    def __init__(self, filter:
+    def __init__(self, filter: Filter, true: Chainable | None, false: Chainable | None):
         defaults = dict(filter=filter)
         super().__init__(defaults)
 
@@ -84,7 +104,7 @@ class Split(Module):
     def step(self, var):
 
         params = var.params
-        filter = self.settings[params[0]]['filter']
+        filter = _make_filter(self.settings[params[0]]['filter'])
 
         true_idxs = []
         false_idxs = []
@@ -92,11 +112,11 @@ class Split(Module):
             if filter(p): true_idxs.append(i)
            else: false_idxs.append(i)
 
-        if 'true' in self.children:
+        if 'true' in self.children and len(true_idxs) > 0:
            true = self.children['true']
            var = _split(true, idxs=true_idxs, params=params, var=var)
 
-        if 'false' in self.children:
+        if 'false' in self.children and len(false_idxs) > 0:
            false = self.children['false']
            var = _split(false, idxs=false_idxs, params=params, var=var)
 
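The new ``filter`` argument accepts a predicate, a tensor collection, an ``nn.Module``, or a list mixing them (treated as "or"). A small self-contained usage sketch along the lines of the docstring example (the toy model and the exact module combination are illustrative, not prescribed):

```python
# Using Split's widened filter argument: combine an nn.Module's parameters with a predicate.
# Toy model and module choice are illustrative; the tz.m.* modules appear in the examples above.
import torch
import torchzero as tz

model = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.ReLU(), torch.nn.Linear(8, 2))

opt = tz.Modular(
    model.parameters(),
    tz.m.Split(
        # "or" of two filter forms: every parameter of the first Linear, plus any 2D+ tensor
        filter=[model[0], lambda p: p.ndim >= 2],
        true=tz.m.Orthogonalize(),
        false=tz.m.Adam(),
    ),
    tz.m.LR(1e-2),
)
```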