torchzero 0.3.10__py3-none-any.whl → 0.3.13__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Files changed (182)
  1. tests/test_identical.py +2 -3
  2. tests/test_opts.py +140 -100
  3. tests/test_tensorlist.py +8 -7
  4. tests/test_vars.py +1 -0
  5. torchzero/__init__.py +1 -1
  6. torchzero/core/__init__.py +2 -2
  7. torchzero/core/module.py +335 -50
  8. torchzero/core/reformulation.py +65 -0
  9. torchzero/core/transform.py +197 -70
  10. torchzero/modules/__init__.py +13 -4
  11. torchzero/modules/adaptive/__init__.py +30 -0
  12. torchzero/modules/adaptive/adagrad.py +356 -0
  13. torchzero/modules/adaptive/adahessian.py +224 -0
  14. torchzero/modules/{optimizers → adaptive}/adam.py +6 -8
  15. torchzero/modules/adaptive/adan.py +96 -0
  16. torchzero/modules/adaptive/adaptive_heavyball.py +54 -0
  17. torchzero/modules/adaptive/aegd.py +54 -0
  18. torchzero/modules/adaptive/esgd.py +171 -0
  19. torchzero/modules/{optimizers → adaptive}/lion.py +1 -1
  20. torchzero/modules/{experimental/spectral.py → adaptive/lmadagrad.py} +94 -71
  21. torchzero/modules/adaptive/mars.py +79 -0
  22. torchzero/modules/adaptive/matrix_momentum.py +146 -0
  23. torchzero/modules/adaptive/msam.py +188 -0
  24. torchzero/modules/{optimizers → adaptive}/muon.py +29 -5
  25. torchzero/modules/adaptive/natural_gradient.py +175 -0
  26. torchzero/modules/{optimizers → adaptive}/orthograd.py +1 -1
  27. torchzero/modules/{optimizers → adaptive}/rmsprop.py +7 -4
  28. torchzero/modules/{optimizers → adaptive}/rprop.py +42 -10
  29. torchzero/modules/adaptive/sam.py +163 -0
  30. torchzero/modules/{optimizers → adaptive}/shampoo.py +47 -9
  31. torchzero/modules/{optimizers → adaptive}/soap.py +52 -65
  32. torchzero/modules/adaptive/sophia_h.py +185 -0
  33. torchzero/modules/clipping/clipping.py +115 -25
  34. torchzero/modules/clipping/ema_clipping.py +31 -17
  35. torchzero/modules/clipping/growth_clipping.py +8 -7
  36. torchzero/modules/conjugate_gradient/__init__.py +11 -0
  37. torchzero/modules/conjugate_gradient/cg.py +355 -0
  38. torchzero/modules/experimental/__init__.py +13 -19
  39. torchzero/modules/{projections → experimental}/dct.py +11 -11
  40. torchzero/modules/{projections → experimental}/fft.py +10 -10
  41. torchzero/modules/experimental/gradmin.py +4 -3
  42. torchzero/modules/experimental/l_infinity.py +111 -0
  43. torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +5 -42
  44. torchzero/modules/experimental/newton_solver.py +79 -17
  45. torchzero/modules/experimental/newtonnewton.py +32 -15
  46. torchzero/modules/experimental/reduce_outward_lr.py +4 -4
  47. torchzero/modules/experimental/scipy_newton_cg.py +105 -0
  48. torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +13 -55
  49. torchzero/modules/functional.py +52 -6
  50. torchzero/modules/grad_approximation/fdm.py +30 -4
  51. torchzero/modules/grad_approximation/forward_gradient.py +16 -4
  52. torchzero/modules/grad_approximation/grad_approximator.py +51 -10
  53. torchzero/modules/grad_approximation/rfdm.py +321 -52
  54. torchzero/modules/higher_order/__init__.py +1 -1
  55. torchzero/modules/higher_order/higher_order_newton.py +164 -93
  56. torchzero/modules/least_squares/__init__.py +1 -0
  57. torchzero/modules/least_squares/gn.py +161 -0
  58. torchzero/modules/line_search/__init__.py +4 -4
  59. torchzero/modules/line_search/_polyinterp.py +289 -0
  60. torchzero/modules/line_search/adaptive.py +124 -0
  61. torchzero/modules/line_search/backtracking.py +95 -57
  62. torchzero/modules/line_search/line_search.py +171 -22
  63. torchzero/modules/line_search/scipy.py +3 -3
  64. torchzero/modules/line_search/strong_wolfe.py +327 -199
  65. torchzero/modules/misc/__init__.py +35 -0
  66. torchzero/modules/misc/debug.py +48 -0
  67. torchzero/modules/misc/escape.py +62 -0
  68. torchzero/modules/misc/gradient_accumulation.py +136 -0
  69. torchzero/modules/misc/homotopy.py +59 -0
  70. torchzero/modules/misc/misc.py +383 -0
  71. torchzero/modules/misc/multistep.py +194 -0
  72. torchzero/modules/misc/regularization.py +167 -0
  73. torchzero/modules/misc/split.py +123 -0
  74. torchzero/modules/{ops → misc}/switch.py +45 -4
  75. torchzero/modules/momentum/__init__.py +1 -5
  76. torchzero/modules/momentum/averaging.py +9 -9
  77. torchzero/modules/momentum/cautious.py +51 -19
  78. torchzero/modules/momentum/momentum.py +37 -2
  79. torchzero/modules/ops/__init__.py +11 -31
  80. torchzero/modules/ops/accumulate.py +6 -10
  81. torchzero/modules/ops/binary.py +81 -34
  82. torchzero/modules/{momentum/ema.py → ops/higher_level.py} +16 -39
  83. torchzero/modules/ops/multi.py +82 -21
  84. torchzero/modules/ops/reduce.py +16 -8
  85. torchzero/modules/ops/unary.py +29 -13
  86. torchzero/modules/ops/utility.py +30 -18
  87. torchzero/modules/projections/__init__.py +2 -4
  88. torchzero/modules/projections/cast.py +51 -0
  89. torchzero/modules/projections/galore.py +3 -1
  90. torchzero/modules/projections/projection.py +190 -96
  91. torchzero/modules/quasi_newton/__init__.py +9 -14
  92. torchzero/modules/quasi_newton/damping.py +105 -0
  93. torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -0
  94. torchzero/modules/quasi_newton/lbfgs.py +286 -173
  95. torchzero/modules/quasi_newton/lsr1.py +185 -106
  96. torchzero/modules/quasi_newton/quasi_newton.py +816 -268
  97. torchzero/modules/restarts/__init__.py +7 -0
  98. torchzero/modules/restarts/restars.py +252 -0
  99. torchzero/modules/second_order/__init__.py +3 -2
  100. torchzero/modules/second_order/multipoint.py +238 -0
  101. torchzero/modules/second_order/newton.py +292 -68
  102. torchzero/modules/second_order/newton_cg.py +365 -15
  103. torchzero/modules/second_order/nystrom.py +104 -1
  104. torchzero/modules/smoothing/__init__.py +1 -1
  105. torchzero/modules/smoothing/laplacian.py +14 -4
  106. torchzero/modules/smoothing/sampling.py +300 -0
  107. torchzero/modules/step_size/__init__.py +2 -0
  108. torchzero/modules/step_size/adaptive.py +387 -0
  109. torchzero/modules/step_size/lr.py +154 -0
  110. torchzero/modules/termination/__init__.py +14 -0
  111. torchzero/modules/termination/termination.py +207 -0
  112. torchzero/modules/trust_region/__init__.py +5 -0
  113. torchzero/modules/trust_region/cubic_regularization.py +170 -0
  114. torchzero/modules/trust_region/dogleg.py +92 -0
  115. torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
  116. torchzero/modules/trust_region/trust_cg.py +97 -0
  117. torchzero/modules/trust_region/trust_region.py +350 -0
  118. torchzero/modules/variance_reduction/__init__.py +1 -0
  119. torchzero/modules/variance_reduction/svrg.py +208 -0
  120. torchzero/modules/weight_decay/__init__.py +1 -1
  121. torchzero/modules/weight_decay/weight_decay.py +94 -11
  122. torchzero/modules/wrappers/optim_wrapper.py +29 -1
  123. torchzero/modules/zeroth_order/__init__.py +1 -0
  124. torchzero/modules/zeroth_order/cd.py +359 -0
  125. torchzero/optim/root.py +65 -0
  126. torchzero/optim/utility/split.py +8 -8
  127. torchzero/optim/wrappers/directsearch.py +39 -3
  128. torchzero/optim/wrappers/fcmaes.py +24 -15
  129. torchzero/optim/wrappers/mads.py +5 -6
  130. torchzero/optim/wrappers/nevergrad.py +16 -1
  131. torchzero/optim/wrappers/nlopt.py +0 -2
  132. torchzero/optim/wrappers/optuna.py +3 -3
  133. torchzero/optim/wrappers/scipy.py +86 -25
  134. torchzero/utils/__init__.py +40 -4
  135. torchzero/utils/compile.py +1 -1
  136. torchzero/utils/derivatives.py +126 -114
  137. torchzero/utils/linalg/__init__.py +9 -2
  138. torchzero/utils/linalg/linear_operator.py +329 -0
  139. torchzero/utils/linalg/matrix_funcs.py +2 -2
  140. torchzero/utils/linalg/orthogonalize.py +2 -1
  141. torchzero/utils/linalg/qr.py +2 -2
  142. torchzero/utils/linalg/solve.py +369 -58
  143. torchzero/utils/metrics.py +83 -0
  144. torchzero/utils/numberlist.py +2 -0
  145. torchzero/utils/python_tools.py +16 -0
  146. torchzero/utils/tensorlist.py +134 -51
  147. torchzero/utils/torch_tools.py +9 -4
  148. torchzero-0.3.13.dist-info/METADATA +14 -0
  149. torchzero-0.3.13.dist-info/RECORD +166 -0
  150. {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
  151. docs/source/conf.py +0 -57
  152. torchzero/modules/experimental/absoap.py +0 -250
  153. torchzero/modules/experimental/adadam.py +0 -112
  154. torchzero/modules/experimental/adamY.py +0 -125
  155. torchzero/modules/experimental/adasoap.py +0 -172
  156. torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
  157. torchzero/modules/experimental/eigendescent.py +0 -117
  158. torchzero/modules/experimental/etf.py +0 -172
  159. torchzero/modules/experimental/soapy.py +0 -163
  160. torchzero/modules/experimental/structured_newton.py +0 -111
  161. torchzero/modules/experimental/subspace_preconditioners.py +0 -138
  162. torchzero/modules/experimental/tada.py +0 -38
  163. torchzero/modules/line_search/trust_region.py +0 -73
  164. torchzero/modules/lr/__init__.py +0 -2
  165. torchzero/modules/lr/adaptive.py +0 -93
  166. torchzero/modules/lr/lr.py +0 -63
  167. torchzero/modules/momentum/matrix_momentum.py +0 -166
  168. torchzero/modules/ops/debug.py +0 -25
  169. torchzero/modules/ops/misc.py +0 -418
  170. torchzero/modules/ops/split.py +0 -75
  171. torchzero/modules/optimizers/__init__.py +0 -18
  172. torchzero/modules/optimizers/adagrad.py +0 -155
  173. torchzero/modules/optimizers/sophia_h.py +0 -129
  174. torchzero/modules/quasi_newton/cg.py +0 -268
  175. torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
  176. torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +0 -266
  177. torchzero/modules/quasi_newton/olbfgs.py +0 -196
  178. torchzero/modules/smoothing/gaussian.py +0 -164
  179. torchzero-0.3.10.dist-info/METADATA +0 -379
  180. torchzero-0.3.10.dist-info/RECORD +0 -139
  181. torchzero-0.3.10.dist-info/licenses/LICENSE +0 -21
  182. {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0

torchzero/modules/adaptive/sophia_h.py (new file)
@@ -0,0 +1,185 @@
+ from typing import Literal
+ from collections.abc import Callable
+ import torch
+
+ from ...core import Module, Target, Transform, Chainable, apply_transform
+ from ...utils import NumberList, TensorList, as_tensorlist
+ def sophia_H(
+     tensors: TensorList,
+     h: TensorList | None,
+     exp_avg_: TensorList,
+     h_exp_avg_: TensorList,
+     beta1: float | NumberList,
+     beta2: float | NumberList,
+     update_freq: int,
+     precond_scale: float | NumberList,
+     clip: float | NumberList,
+     eps: float | NumberList,
+     step: int
+ ):
+     # momentum
+     exp_avg_.lerp_(tensors, 1-beta1)
+
+     # update preconditioner
+     if step % update_freq == 0:
+         assert h is not None
+         h_exp_avg_.lerp_(h, 1-beta2)
+
+     else:
+         assert h is None
+
+     denom = (h_exp_avg_ * precond_scale).clip_(min=eps)
+     return (exp_avg_ / denom).clip_(-clip, clip)
+
+
+ class SophiaH(Module):
+     """SophiaH optimizer from https://arxiv.org/abs/2305.14342
+
+     This is similar to Adam, but the second momentum is replaced by an exponential moving average of randomized hessian diagonal estimates, and the update is aggressively clipped.
+
+     .. note::
+         In most cases SophiaH should be the first module in the chain because it relies on autograd. Use the :code:`inner` argument if you wish to apply SophiaH preconditioning to another module's output.
+
+     .. note::
+         If you are using gradient estimators or reformulations, set :code:`hvp_method` to "forward" or "central".
+
+     .. note::
+         This module requires a closure to be passed to the optimizer step,
+         as it needs to re-evaluate the loss and gradients for calculating HVPs.
+         The closure must accept a ``backward`` argument (refer to documentation).
+
+     Args:
+         beta1 (float, optional): first momentum. Defaults to 0.96.
+         beta2 (float, optional): momentum for hessian diagonal estimate. Defaults to 0.99.
+         update_freq (int, optional):
+             frequency of updating hessian diagonal estimate via a hessian-vector product. Defaults to 10.
+         precond_scale (float, optional):
+             scale of the preconditioner. Defaults to 1.
+         clip (float, optional):
+             clips update to (-clip, clip). Defaults to 1.
+         eps (float, optional):
+             clips hessian diagonal estimate to be no less than this value. Defaults to 1e-12.
+         hvp_method (str, optional):
+             Determines how Hessian-vector products are evaluated.
+
+             - ``"autograd"``: Use PyTorch's autograd to calculate exact HVPs.
+               This requires creating a graph for the gradient.
+             - ``"forward"``: Use a forward finite difference formula to
+               approximate the HVP. This requires one extra gradient evaluation.
+             - ``"central"``: Use a central finite difference formula for a
+               more accurate HVP approximation. This requires two extra
+               gradient evaluations.
+             Defaults to "autograd".
+         fd_h (float, optional): finite difference step size if :code:`hvp_method` is "forward" or "central". Defaults to 1e-3.
+         n_samples (int, optional):
+             number of hessian-vector products with random vectors to evaluate each time when updating
+             the preconditioner. Larger values may lead to better hessian diagonal estimate. Defaults to 1.
+         seed (int | None, optional): seed for random vectors. Defaults to None.
+         inner (Chainable | None, optional): preconditioning is applied to the output of this module. Defaults to None.
+
+     Examples:
+         Using SophiaH:
+
+         .. code-block:: python
+
+             opt = tz.Modular(
+                 model.parameters(),
+                 tz.m.SophiaH(),
+                 tz.m.LR(0.1)
+             )
+
+         SophiaH preconditioner can be applied to any other module by passing it to the :code:`inner` argument.
+         Turn off SophiaH's first momentum to get just the preconditioning. Here is an example of applying
+         SophiaH preconditioning to nesterov momentum (:code:`tz.m.NAG`):
+
+         .. code-block:: python
+
+             opt = tz.Modular(
+                 model.parameters(),
+                 tz.m.SophiaH(beta1=0, inner=tz.m.NAG(0.96)),
+                 tz.m.LR(0.1)
+             )
+
+     """
+     def __init__(
+         self,
+         beta1: float = 0.96,
+         beta2: float = 0.99,
+         update_freq: int = 10,
+         precond_scale: float = 1,
+         clip: float = 1,
+         eps: float = 1e-12,
+         hvp_method: Literal['autograd', 'forward', 'central'] = 'autograd',
+         fd_h: float = 1e-3,
+         n_samples = 1,
+         seed: int | None = None,
+         inner: Chainable | None = None
+     ):
+         defaults = dict(beta1=beta1, beta2=beta2, update_freq=update_freq, precond_scale=precond_scale, clip=clip, eps=eps, hvp_method=hvp_method, n_samples=n_samples, fd_h=fd_h, seed=seed)
+         super().__init__(defaults)
+
+         if inner is not None:
+             self.set_child('inner', inner)
+
+     @torch.no_grad
+     def step(self, var):
+         params = var.params
+         settings = self.settings[params[0]]
+         hvp_method = settings['hvp_method']
+         fd_h = settings['fd_h']
+         update_freq = settings['update_freq']
+         n_samples = settings['n_samples']
+
+         seed = settings['seed']
+         generator = None
+         if seed is not None:
+             if 'generator' not in self.global_state:
+                 self.global_state['generator'] = torch.Generator(params[0].device).manual_seed(seed)
+             generator = self.global_state['generator']
+
+         beta1, beta2, precond_scale, clip, eps = self.get_settings(params,
+             'beta1', 'beta2', 'precond_scale', 'clip', 'eps', cls=NumberList)
+
+         exp_avg, h_exp_avg = self.get_state(params, 'exp_avg', 'h_exp_avg', cls=TensorList)
+
+         step = self.global_state.get('step', 0)
+         self.global_state['step'] = step + 1
+
+         closure = var.closure
+         assert closure is not None
+
+         h = None
+         if step % update_freq == 0:
+
+             rgrad=None
+             for i in range(n_samples):
+                 u = [torch.randn(p.shape, device=p.device, dtype=p.dtype, generator=generator) for p in params]
+
+                 Hvp, rgrad = self.Hvp(u, at_x0=True, var=var, rgrad=rgrad, hvp_method=hvp_method,
+                     h=fd_h, normalize=True, retain_grad=i < n_samples-1)
+                 Hvp = tuple(Hvp)
+
+                 if h is None: h = Hvp
+                 else: torch._foreach_add_(h, Hvp)
+
+             assert h is not None
+             if n_samples > 1: torch._foreach_div_(h, n_samples)
+
+         update = var.get_update()
+         if 'inner' in self.children:
+             update = apply_transform(self.children['inner'], tensors=update, params=params, grads=var.grad, var=var)
+
+         var.update = sophia_H(
+             tensors=TensorList(update),
+             h=TensorList(h) if h is not None else None,
+             exp_avg_=exp_avg,
+             h_exp_avg_=h_exp_avg,
+             beta1=beta1,
+             beta2=beta2,
+             update_freq=update_freq,
+             precond_scale=precond_scale,
+             clip=clip,
+             eps=eps,
+             step=step,
+         )
+         return var
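
A note on the closure requirement documented above: SophiaH re-evaluates the loss and gradients for its Hessian-vector products, so the optimizer must be stepped with a closure that accepts a `backward` argument. A minimal sketch of such a closure, assuming the conventional `import torchzero as tz` alias and that `tz.Modular` exposes the usual `torch.optim`-style `zero_grad`/`step` interface; `model`, `loss_fn`, `inputs` and `targets` are placeholders:

```python
import torchzero as tz  # assumed import alias

opt = tz.Modular(
    model.parameters(),
    tz.m.SophiaH(),
    tz.m.LR(0.1),
)

def closure(backward=True):
    # the module re-evaluates this closure when computing Hessian-vector products
    loss = loss_fn(model(inputs), targets)
    if backward:
        opt.zero_grad()
        loss.backward()
    return loss

opt.step(closure)
```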

torchzero/modules/clipping/clipping.py
@@ -1,11 +1,13 @@
+ import math
+ from collections.abc import Iterable, Sequence
  from operator import itemgetter
  from typing import Literal
- from collections.abc import Iterable, Sequence
- import math
+
  import torch

  from ...core import Module, Target, Transform
- from ...utils import NumberList, TensorList, generic_eq
+ from ...utils import Metrics, NumberList, TensorList
+ from ...utils.metrics import _METRICS


  def clip_grad_value_(params: Iterable[torch.Tensor], value: float):
@@ -24,7 +26,7 @@ def _clip_norm_(
  min: float | NumberList | None,
  max: float | NumberList | None,
  norm_value: float | NumberList | None,
- ord: float,
+ ord: Metrics,
  dim: int | Sequence[int] | Literal["global"] | None,
  inverse_dims: bool,
  min_size: int,
@@ -35,7 +37,7 @@ def _clip_norm_(
  raise ValueError(f'if norm_value is given then min and max must be None got {min = }; {max = }')

  # if dim is None: return tensors_.mul_(norm_value / tensors_.norm(ord=ord))
- if dim == 'global': return tensors_.mul_(norm_value / tensors_.global_vector_norm(ord=ord))
+ if dim == 'global': return tensors_.mul_(norm_value / tensors_.global_metric(ord))

  # if dim is None: return tensors_.clip_norm_(min,max,tensorwise=True,ord=ord)
  if dim == 'global': return tensors_.clip_norm_(min,max,tensorwise=False,ord=ord)
@@ -54,9 +56,13 @@ def _clip_norm_(
  size = math.prod(tensor.size(d) for d in real_dim)
  if size < min_size: continue

- norm: torch.Tensor = torch.linalg.vector_norm(tensor, ord=ord, dim=real_dim, keepdim=True) # pylint:disable=not-callable
+ if isinstance(ord, str):
+     norm = _METRICS[ord].evaluate_tensor(tensor, dim=real_dim, keepdim=True)
+ else:
+     norm: torch.Tensor = torch.linalg.vector_norm(tensor, ord=ord, dim=real_dim, keepdim=True) # pylint:disable=not-callable
+
  if norm.numel() == 1 and norm == 0: continue
- norm = torch.where(norm == 0, 1, norm)
+ norm = torch.where(norm <= 1e-12, 1, norm)

  # normalize = True, perform normalization
  norm_v = norm_value[i] if isinstance(norm_value, (list,tuple)) else norm_value
@@ -90,7 +96,7 @@ def _clip_norm_(
  def clip_grad_norm_(
  params: Iterable[torch.Tensor],
  max_norm: float | None,
- ord: float = 2,
+ ord: Metrics = 2,
  dim: int | Sequence[int] | Literal["global"] | None = None,
  inverse_dims: bool = False,
  min_size: int = 2,
@@ -101,7 +107,7 @@ def clip_grad_norm_(

  Args:
  params (Iterable[torch.Tensor]): parameters with gradients to clip.
- value (float): value to clip norm to.
+ max_norm (float): value to clip norm to.
  ord (float, optional): norm order. Defaults to 2.
  dim (int | Sequence[int] | str | None, optional):
  calculates norm along those dimensions.
@@ -118,7 +124,7 @@ def clip_grad_norm_(
  def normalize_grads_(
  params: Iterable[torch.Tensor],
  norm_value: float,
- ord: float = 2,
+ ord: Metrics = 2,
  dim: int | Sequence[int] | Literal["global"] | None = None,
  inverse_dims: bool = False,
  min_size: int = 1,
@@ -145,13 +151,41 @@ def normalize_grads_(


  class ClipValue(Transform):
- """Clips update magnitude to be within `(-value, value)` range."""
+ """Clips update magnitude to be within ``(-value, value)`` range.
+
+ Args:
+ value (float): value to clip to.
+ target (str): refer to ``target argument`` in documentation.
+
+ Examples:
+
+ Gradient clipping:
+ ```python
+ opt = tz.Modular(
+     model.parameters(),
+     tz.m.ClipValue(1),
+     tz.m.Adam(),
+     tz.m.LR(1e-2),
+ )
+ ```
+
+ Update clipping:
+ ```python
+ opt = tz.Modular(
+     model.parameters(),
+     tz.m.Adam(),
+     tz.m.ClipValue(1),
+     tz.m.LR(1e-2),
+ )
+ ```
+
+ """
  def __init__(self, value: float, target: Target = 'update'):
  defaults = dict(value=value)
- super().__init__(defaults, uses_grad=False, target=target)
+ super().__init__(defaults, target=target)

  @torch.no_grad
- def apply(self, tensors, params, grads, loss, states, settings):
+ def apply_tensors(self, tensors, params, grads, loss, states, settings):
  value = [s['value'] for s in settings]
  return TensorList(tensors).clip_([-v for v in value], value)

@@ -159,7 +193,7 @@ class ClipNorm(Transform):
  """Clips update norm to be no larger than `value`.

  Args:
- value (float): value to clip norm to.
+ max_norm (float): value to clip norm to.
  ord (float, optional): norm order. Defaults to 2.
  dim (int | Sequence[int] | str | None, optional):
  calculates norm along those dimensions.
@@ -172,21 +206,43 @@ class ClipNorm(Transform):
  minimal number of elements in a parameter or slice to clip norm. Defaults to 1.
  target (str, optional):
  what this affects.
+
+ Examples:
+
+ Gradient norm clipping:
+ ```python
+ opt = tz.Modular(
+     model.parameters(),
+     tz.m.ClipNorm(1),
+     tz.m.Adam(),
+     tz.m.LR(1e-2),
+ )
+ ```
+
+ Update norm clipping:
+ ```python
+ opt = tz.Modular(
+     model.parameters(),
+     tz.m.Adam(),
+     tz.m.ClipNorm(1),
+     tz.m.LR(1e-2),
+ )
+ ```
  """
  def __init__(
  self,
  max_norm: float,
- ord: float = 2,
+ ord: Metrics = 2,
  dim: int | Sequence[int] | Literal["global"] | None = None,
  inverse_dims: bool = False,
  min_size: int = 1,
  target: Target = "update",
  ):
  defaults = dict(max_norm=max_norm,ord=ord,dim=dim,min_size=min_size,inverse_dims=inverse_dims)
- super().__init__(defaults, uses_grad=False, target=target)
+ super().__init__(defaults, target=target)

  @torch.no_grad
- def apply(self, tensors, params, grads, loss, states, settings):
+ def apply_tensors(self, tensors, params, grads, loss, states, settings):
  max_norm = NumberList(s['max_norm'] for s in settings)
  ord, dim, min_size, inverse_dims = itemgetter('ord', 'dim', 'min_size', 'inverse_dims')(settings[0])
  _clip_norm_(
@@ -205,7 +261,7 @@ class Normalize(Transform):
  """Normalizes the update.

  Args:
- value (float): desired norm value.
+ norm_value (float): desired norm value.
  ord (float, optional): norm order. Defaults to 2.
  dim (int | Sequence[int] | str | None, optional):
  calculates norm along those dimensions.
@@ -218,21 +274,43 @@ class Normalize(Transform):
  minimal size of a dimension to normalize along it. Defaults to 1.
  target (str, optional):
  what this affects.
+
+ Examples:
+ Gradient normalization:
+ ```python
+ opt = tz.Modular(
+     model.parameters(),
+     tz.m.Normalize(1),
+     tz.m.Adam(),
+     tz.m.LR(1e-2),
+ )
+ ```
+
+ Update normalization:
+
+ ```python
+ opt = tz.Modular(
+     model.parameters(),
+     tz.m.Adam(),
+     tz.m.Normalize(1),
+     tz.m.LR(1e-2),
+ )
+ ```
  """
  def __init__(
  self,
  norm_value: float = 1,
- ord: float = 2,
+ ord: Metrics = 2,
  dim: int | Sequence[int] | Literal["global"] | None = None,
  inverse_dims: bool = False,
  min_size: int = 1,
  target: Target = "update",
  ):
  defaults = dict(norm_value=norm_value,ord=ord,dim=dim,min_size=min_size, inverse_dims=inverse_dims)
- super().__init__(defaults, uses_grad=False, target=target)
+ super().__init__(defaults, target=target)

  @torch.no_grad
- def apply(self, tensors, params, grads, loss, states, settings):
+ def apply_tensors(self, tensors, params, grads, loss, states, settings):
  norm_value = NumberList(s['norm_value'] for s in settings)
  ord, dim, min_size, inverse_dims = itemgetter('ord', 'dim', 'min_size', 'inverse_dims')(settings[0])

@@ -288,8 +366,6 @@ class Centralize(Transform):
  """Centralizes the update.

  Args:
- value (float): desired norm value.
- ord (float, optional): norm order. Defaults to 2.
  dim (int | Sequence[int] | str | None, optional):
  calculates norm along those dimensions.
  If list/tuple, tensors are centralized along all dimensions in `dim` that they have.
@@ -299,6 +375,20 @@ class Centralize(Transform):
  if True, the `dims` argument is inverted, and all other dimensions are centralized.
  min_size (int, optional):
  minimal size of a dimension to normalize along it. Defaults to 1.
+
+ Examples:
+
+ Standard gradient centralization:
+ ```python
+ opt = tz.Modular(
+     model.parameters(),
+     tz.m.Centralize(dim=0),
+     tz.m.LR(1e-2),
+ )
+ ```
+
+ References:
+ - Yong, H., Huang, J., Hua, X., & Zhang, L. (2020). Gradient centralization: A new optimization technique for deep neural networks. In Computer Vision–ECCV 2020: 16th European Conference, Glasgow, UK, August 23–28, 2020, Proceedings, Part I 16 (pp. 635-652). Springer International Publishing. https://arxiv.org/abs/2004.01461
  """
  def __init__(
  self,
@@ -308,10 +398,10 @@ class Centralize(Transform):
  target: Target = "update",
  ):
  defaults = dict(dim=dim,min_size=min_size,inverse_dims=inverse_dims)
- super().__init__(defaults, uses_grad=False, target=target)
+ super().__init__(defaults, target=target)

  @torch.no_grad
- def apply(self, tensors, params, grads, loss, states, settings):
+ def apply_tensors(self, tensors, params, grads, loss, states, settings):
  dim, min_size, inverse_dims = itemgetter('dim', 'min_size', 'inverse_dims')(settings[0])

  _centralize_(tensors_ = TensorList(tensors), dim=dim, inverse_dims=inverse_dims, min_size=min_size)
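
Across these clipping hunks the transforms switch from overriding `apply` to the new `apply_tensors` hook and stop passing `uses_grad` to `super().__init__`. Below is a minimal sketch of a custom transform written against that 0.3.13-style hook, mirroring the `ClipValue` pattern above. The `ScaleByConstant` class is hypothetical (not part of the package), the absolute import paths are assumed from the relative imports shown in the diff, and it assumes `TensorList.mul_` accepts a per-parameter list of scalars the way `clip_` does:

```python
import torch
from torchzero.core import Transform, Target  # assumed absolute import paths
from torchzero.utils import TensorList

class ScaleByConstant(Transform):
    """Hypothetical example: scales the update by a constant factor."""
    def __init__(self, scale: float, target: Target = 'update'):
        defaults = dict(scale=scale)
        super().__init__(defaults, target=target)  # no `uses_grad` in the 0.3.13 API

    @torch.no_grad
    def apply_tensors(self, tensors, params, grads, loss, states, settings):
        # `settings` holds one dict per parameter, as in ClipValue above
        scale = [s['scale'] for s in settings]
        return TensorList(tensors).mul_(scale)
```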

torchzero/modules/clipping/ema_clipping.py
@@ -5,7 +5,7 @@ from collections.abc import Iterable, Sequence
  import torch

  from ...core import Module, Target, Transform, apply_transform, Chainable
- from ...utils import NumberList, TensorList, generic_eq, unpack_dicts, unpack_states
+ from ...utils import NumberList, TensorList, unpack_dicts, unpack_states, Metrics

  class ClipNormByEMA(Transform):
  """Clips norm to be no larger than the norm of an exponential moving average of past updates.
@@ -14,9 +14,10 @@ class ClipNormByEMA(Transform):
  beta (float, optional): beta for the exponential moving average. Defaults to 0.99.
  ord (float, optional): order of the norm. Defaults to 2.
  eps (float, optional): epsilon for division. Defaults to 1e-6.
- tensorwise (bool, optional): whether to calculate norm separately for each layer, or global norm for all layers. Defaults to True.
+ tensorwise (bool, optional):
+ if True, norms are calculated parameter-wise, otherwise treats all parameters as single vector. Defaults to True.
  max_ema_growth (float | None, optional):
- if specified, exponential moving average norm can grow but at most this value per step. Defaults to 1.5.
+ if specified, restricts how quickly exponential moving average norm can grow. The norm is allowed to grow by at most this value per step. Defaults to 1.5.
  ema_init (str, optional):
  How to initialize exponential moving average on first step, "update" to use the first update or "zeros". Defaults to 'zeros'.
  """
@@ -24,17 +25,18 @@ class ClipNormByEMA(Transform):
  def __init__(
  self,
  beta=0.99,
- ord: float = 2,
+ ord: Metrics = 2,
  eps=1e-6,
  tensorwise:bool=True,
  max_ema_growth: float | None = 1.5,
  ema_init: Literal['zeros', 'update'] = 'zeros',
+ inner: Chainable | None = None,
  ):
  defaults = dict(beta=beta, ord=ord, tensorwise=tensorwise, ema_init=ema_init, eps=eps, max_ema_growth=max_ema_growth)
- super().__init__(defaults, uses_grad=False)
+ super().__init__(defaults, inner=inner)

  @torch.no_grad
- def apply(self, tensors, params, grads, loss, states, settings):
+ def update_tensors(self, tensors, params, grads, loss, states, settings):
  tensors = TensorList(tensors)
  ord, tensorwise, ema_init, max_ema_growth = itemgetter('ord', 'tensorwise', 'ema_init', 'max_ema_growth')(settings[0])
@@ -45,7 +47,7 @@ class ClipNormByEMA(Transform):
  ema.lerp_(tensors, 1-beta)

  if tensorwise:
- ema_norm = ema.norm(ord)
+ ema_norm = ema.metric(ord)

  # clip ema norm growth
  if max_ema_growth is not None:
@@ -62,7 +64,7 @@ class ClipNormByEMA(Transform):
  else: denom.clip_(min=1)

  else:
- ema_norm = ema.global_vector_norm(ord)
+ ema_norm = ema.global_metric(ord)

  # clip ema norm growth
  if max_ema_growth is not None:
@@ -73,12 +75,17 @@ class ClipNormByEMA(Transform):
  ema_norm = allowed_norm
  prev_ema_norm.set_(ema_norm)

- tensors_norm = tensors.global_vector_norm(ord)
+ tensors_norm = tensors.global_metric(ord)
  denom = tensors_norm / ema_norm.clip(min=eps[0])
  if self.NORMALIZE: denom.clip_(min=eps[0])
  else: denom.clip_(min=1)

- tensors.div_(denom)
+ self.global_state['denom'] = denom
+
+ @torch.no_grad
+ def apply_tensors(self, tensors, params, grads, loss, states, settings):
+     denom = self.global_state.pop('denom')
+     torch._foreach_div_(tensors, denom)
  return tensors

  class NormalizeByEMA(ClipNormByEMA):
@@ -88,9 +95,10 @@ class NormalizeByEMA(ClipNormByEMA):
  beta (float, optional): beta for the exponential moving average. Defaults to 0.99.
  ord (float, optional): order of the norm. Defaults to 2.
  eps (float, optional): epsilon for division. Defaults to 1e-6.
- tensorwise (bool, optional): whether to calculate norm separately for each layer, or global norm for all layers. Defaults to True.
+ tensorwise (bool, optional):
+ if True, norms are calculated parameter-wise, otherwise treats all parameters as single vector. Defaults to True.
  max_ema_growth (float | None, optional):
- if specified, exponential moving average norm can grow but at most this value per step. Defaults to 1.5.
+ if specified, restricts how quickly exponential moving average norm can grow. The norm is allowed to grow by at most this value per step. Defaults to 1.5.
  ema_init (str, optional):
  How to initialize exponential moving average on first step, "update" to use the first update or "zeros". Defaults to 'zeros'.
  """
@@ -99,28 +107,30 @@ class NormalizeByEMA(ClipNormByEMA):

  # TODO Centralize by EMA?

  class ClipValueByEMA(Transform):
- """Clips magnitude of update to be no larger than magnitude of an exponential moving average of past (unclipped) updates.
+ """Clips magnitude of update to be no larger than magnitude of exponential moving average of past (unclipped) updates.

  Args:
  beta (float, optional): beta for the exponential moving average. Defaults to 0.99.
  ema_init (str, optional):
  How to initialize exponential moving average on first step, "update" to use the first update or "zeros". Defaults to 'zeros'.
- ema_tfm (Chainable | None, optional): optional modules applied to exponential moving average before clipping by it. Defaults to None.
+ ema_tfm (Chainable | None, optional):
+ optional modules applied to exponential moving average before clipping by it. Defaults to None.
  """
  def __init__(
  self,
  beta=0.99,
  ema_init: Literal['zeros', 'update'] = 'zeros',
  ema_tfm:Chainable | None=None,
+ inner: Chainable | None = None,
  ):
  defaults = dict(beta=beta, ema_init=ema_init)
- super().__init__(defaults, uses_grad=False)
+ super().__init__(defaults, inner=inner)

  if ema_tfm is not None:
  self.set_child('ema_tfm', ema_tfm)

  @torch.no_grad
- def apply(self, tensors, params, grads, loss, states, settings):
+ def update_tensors(self, tensors, params, grads, loss, states, settings):
  ema_init = itemgetter('ema_init')(settings[0])

  beta = unpack_dicts(settings, 'beta', cls=NumberList)
@@ -129,8 +139,12 @@ class ClipValueByEMA(Transform):
  ema = unpack_states(states, tensors, 'ema', init = (torch.zeros_like if ema_init=='zeros' else lambda t: t.abs()), cls=TensorList)
  ema.lerp_(tensors.abs(), 1-beta)

+ def apply_tensors(self, tensors, params, grads, loss, states, settings):
+     tensors = TensorList(tensors)
+     ema = unpack_states(states, tensors, 'ema', cls=TensorList)
+
  if 'ema_tfm' in self.children:
- ema = TensorList(apply_transform(self.children['ema_tfm'], ema, params, grads, loss))
+ ema = TensorList(apply_transform(self.children['ema_tfm'], ema.clone(), params, grads, loss))

  tensors.clip_(-ema, ema)
  return tensors
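
These hunks split the old `apply` hook into `update_tensors` (maintain the EMA statistics) and `apply_tensors` (apply the clip), and add an `inner` argument to both EMA clippers. A usage sketch, assuming `ClipNormByEMA` is exposed under `tz.m` like the other clipping modules and the conventional `import torchzero as tz` alias:

```python
import torchzero as tz  # assumed import alias

opt = tz.Modular(
    model.parameters(),
    # clip each step's norm against an EMA of past updates (documented defaults shown)
    tz.m.ClipNormByEMA(beta=0.99, max_ema_growth=1.5),
    tz.m.LR(1e-2),
)
```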

torchzero/modules/clipping/growth_clipping.py
@@ -19,7 +19,7 @@ class ClipValueGrowth(TensorwiseTransform):
  bounds the tracked multiplicative clipping decay to prevent collapse to 0.
  Next update is at most :code:`max(previous update * mul, max_decay)`.
  Defaults to 2.
- target (Target, optional): what to set on var.. Defaults to "update".
+ target (Target, optional): what to set on var. Defaults to "update".
  """
  def __init__(
  self,
@@ -30,11 +30,11 @@ class ClipValueGrowth(TensorwiseTransform):
  target: Target = "update",
  ):
  defaults = dict(add=add, mul=mul, min_value=min_value, max_decay=max_decay)
- super().__init__(defaults, uses_grad=False, target=target)
+ super().__init__(defaults, target=target)


- def apply_tensor(self, tensor, param, grad, loss, state, settings):
- add, mul, min_value, max_decay = itemgetter('add','mul','min_value','max_decay')(settings)
+ def apply_tensor(self, tensor, param, grad, loss, state, setting):
+ add, mul, min_value, max_decay = itemgetter('add','mul','min_value','max_decay')(setting)
  add: float | None
  if add is None and mul is None:
@@ -120,7 +120,8 @@ class ClipNormGrowth(Transform):

  Args:
  add (float | None, optional): additive clipping, next update norm is at most `previous norm + add`. Defaults to None.
- mul (float | None, optional): multiplicative clipping, next update norm is at most `previous norm * mul`. Defaults to 1.5.
+ mul (float | None, optional):
+ multiplicative clipping, next update norm is at most `previous norm * mul`. Defaults to 1.5.
  min_value (float | None, optional):
  minimum value for multiplicative clipping to prevent collapse to 0.
  Next norm is at most :code:`max(prev_norm, min_value) * mul`. Defaults to 1e-4.
@@ -144,11 +145,11 @@ class ClipNormGrowth(Transform):
  target: Target = "update",
  ):
  defaults = dict(add=add, mul=mul, min_value=min_value, max_decay=max_decay, ord=ord, parameterwise=parameterwise)
- super().__init__(defaults, uses_grad=False, target=target)
+ super().__init__(defaults, target=target)



- def apply(self, tensors, params, grads, loss, states, settings):
+ def apply_tensors(self, tensors, params, grads, loss, states, settings):
  parameterwise = settings[0]['parameterwise']
  tensors = TensorList(tensors)

torchzero/modules/conjugate_gradient/__init__.py (new file)
@@ -0,0 +1,11 @@
+ from .cg import (
+     DYHS,
+     ConjugateDescent,
+     DaiYuan,
+     FletcherReeves,
+     HagerZhang,
+     HestenesStiefel,
+     LiuStorey,
+     PolakRibiere,
+     ProjectedGradientMethod,
+ )
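
The new `conjugate_gradient` subpackage collects the nonlinear conjugate gradient update rules that previously lived in `quasi_newton/cg.py`. Nonlinear CG is normally paired with a line search; a sketch, assuming these classes are exposed under `tz.m` and that the line search in `line_search/strong_wolfe.py` is exported as `tz.m.StrongWolfe` (both names are inferred, not confirmed by this diff):

```python
import torchzero as tz  # assumed import alias

opt = tz.Modular(
    model.parameters(),
    tz.m.PolakRibiere(),  # Polak-Ribiere direction rule, listed in the import block above
    tz.m.StrongWolfe(),   # assumed name for the strong Wolfe line search
)
```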