torchzero 0.3.14__py3-none-any.whl → 0.4.0__py3-none-any.whl
- tests/test_identical.py +2 -2
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +47 -36
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +8 -2
- torchzero/core/chain.py +47 -0
- torchzero/core/functional.py +103 -0
- torchzero/core/modular.py +233 -0
- torchzero/core/module.py +132 -643
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +56 -23
- torchzero/core/transform.py +261 -365
- torchzero/linalg/__init__.py +10 -0
- torchzero/linalg/eigh.py +34 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +132 -34
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +95 -0
- torchzero/{utils/linalg → linalg}/qr.py +4 -2
- torchzero/{utils/linalg → linalg}/solve.py +76 -88
- torchzero/linalg/svd.py +20 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/__init__.py +0 -1
- torchzero/modules/adaptive/__init__.py +1 -1
- torchzero/modules/adaptive/adagrad.py +163 -213
- torchzero/modules/adaptive/adahessian.py +74 -103
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +49 -30
- torchzero/modules/adaptive/adaptive_heavyball.py +11 -6
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/lion.py +5 -10
- torchzero/modules/adaptive/lmadagrad.py +87 -32
- torchzero/modules/adaptive/mars.py +5 -5
- torchzero/modules/adaptive/matrix_momentum.py +47 -51
- torchzero/modules/adaptive/msam.py +70 -52
- torchzero/modules/adaptive/muon.py +59 -124
- torchzero/modules/adaptive/natural_gradient.py +33 -28
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +123 -129
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +15 -18
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +26 -37
- torchzero/modules/experimental/__init__.py +3 -6
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/{higher_order → experimental}/higher_order_newton.py +14 -40
- torchzero/modules/experimental/newton_solver.py +22 -53
- torchzero/modules/experimental/newtonnewton.py +20 -17
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +5 -5
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/functional.py +8 -1
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +20 -17
- torchzero/modules/least_squares/gn.py +90 -42
- torchzero/modules/line_search/__init__.py +1 -1
- torchzero/modules/line_search/_polyinterp.py +3 -1
- torchzero/modules/line_search/adaptive.py +3 -3
- torchzero/modules/line_search/backtracking.py +3 -3
- torchzero/modules/line_search/interpolation.py +160 -0
- torchzero/modules/line_search/line_search.py +42 -51
- torchzero/modules/line_search/strong_wolfe.py +5 -5
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +10 -78
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +120 -122
- torchzero/modules/misc/multistep.py +63 -61
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +30 -28
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +34 -28
- torchzero/modules/momentum/momentum.py +11 -11
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +19 -19
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +43 -43
- torchzero/modules/quasi_newton/__init__.py +2 -0
- torchzero/modules/quasi_newton/damping.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +7 -7
- torchzero/modules/quasi_newton/lsr1.py +7 -7
- torchzero/modules/quasi_newton/quasi_newton.py +25 -16
- torchzero/modules/quasi_newton/sg2.py +292 -0
- torchzero/modules/restarts/restars.py +26 -24
- torchzero/modules/second_order/__init__.py +6 -3
- torchzero/modules/second_order/ifn.py +58 -0
- torchzero/modules/second_order/inm.py +101 -0
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +105 -228
- torchzero/modules/second_order/newton_cg.py +102 -154
- torchzero/modules/second_order/nystrom.py +158 -178
- torchzero/modules/second_order/rsn.py +237 -0
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +11 -10
- torchzero/modules/step_size/adaptive.py +23 -23
- torchzero/modules/step_size/lr.py +15 -15
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +2 -2
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +1 -1
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +21 -18
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +12 -13
- torchzero/modules/wrappers/optim_wrapper.py +57 -50
- torchzero/modules/zeroth_order/cd.py +9 -6
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -4
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +6 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +112 -88
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/METADATA +1 -1
- torchzero-0.4.0.dist-info/RECORD +191 -0
- tests/test_vars.py +0 -185
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/modules/higher_order/__init__.py +0 -1
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.14.dist-info/RECORD +0 -167
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/WHEEL +0 -0
- {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/top_level.txt +0 -0
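The rename entries above move the linear-algebra helpers out of `torchzero.utils.linalg` into a new top-level `torchzero.linalg` package. A sketch of the corresponding import change, with submodule names taken from the renamed files (whether `torchzero.linalg` also re-exports their contents is not shown in this diff):

```python
# 0.3.14 (old location)
# from torchzero.utils.linalg import qr, solve, linear_operator

# 0.4.0 - the same submodules now live one level up
from torchzero.linalg import qr, solve, linear_operator
```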
--- a/torchzero/modules/adaptive/sophia_h.py
+++ b/torchzero/modules/adaptive/sophia_h.py
@@ -1,52 +1,19 @@
-from typing import Literal
-from collections.abc import Callable
 import torch

-from ...core import
-from ...utils import NumberList, TensorList,
-
-
-
-
-    h_exp_avg_: TensorList,
-    beta1: float | NumberList,
-    beta2: float | NumberList,
-    update_freq: int,
-    precond_scale: float | NumberList,
-    clip: float | NumberList,
-    eps: float | NumberList,
-    step: int
-):
-    # momentum
-    exp_avg_.lerp_(tensors, 1-beta1)
-
-    # update preconditioner
-    if step % update_freq == 0:
-        assert h is not None
-        h_exp_avg_.lerp_(h, 1-beta2)
-
-    else:
-        assert h is None
-
-    denom = (h_exp_avg_ * precond_scale).clip_(min=eps)
-    return (exp_avg_ / denom).clip_(-clip, clip)
-
-
-class SophiaH(Module):
+from ...core import Chainable, Transform, HVPMethod
+from ...utils import Distributions, NumberList, TensorList, unpack_dicts, unpack_states
+
+
+
+class SophiaH(Transform):
     """SophiaH optimizer from https://arxiv.org/abs/2305.14342

     This is similar to Adam, but the second momentum is replaced by an exponential moving average of randomized hessian diagonal estimates, and the update is agressively clipped.

-
-    In most cases SophiaH should be the first module in the chain because it relies on autograd. Use the
+    Notes:
+        - In most cases SophiaH should be the first module in the chain because it relies on autograd. Use the ``inner`` argument if you wish to apply SophiaH preconditioning to another module's output.

-
-    If you are using gradient estimators or reformulations, set :code:`hvp_method` to "forward" or "central".
-
-    .. note::
-        This module requires the a closure passed to the optimizer step,
-        as it needs to re-evaluate the loss and gradients for calculating HVPs.
-        The closure must accept a ``backward`` argument (refer to documentation).
+        - This module requires the a closure passed to the optimizer step, as it needs to re-evaluate the loss and gradients for calculating HVPs. The closure must accept a ``backward`` argument (refer to documentation).

     Args:
         beta1 (float, optional): first momentum. Defaults to 0.96.
@@ -60,46 +27,48 @@ class SophiaH(Module):
         eps (float, optional):
            clips hessian diagonal esimate to be no less than this value. Defaults to 1e-12.
         hvp_method (str, optional):
-            Determines how Hessian-vector products are
-
-            - ``"
-
-            - ``"
-
-
-
-
-
-
+            Determines how Hessian-vector products are computed.
+
+            - ``"batched_autograd"`` - uses autograd with batched hessian-vector products. If a single hessian-vector is evaluated, equivalent to ``"autograd"``. Faster than ``"autograd"`` but uses more memory.
+            - ``"autograd"`` - uses autograd hessian-vector products. If multiple hessian-vector products are evaluated, uses a for-loop. Slower than ``"batched_autograd"`` but uses less memory.
+            - ``"fd_forward"`` - uses gradient finite difference approximation with a less accurate forward formula which requires one extra gradient evaluation per hessian-vector product.
+            - ``"fd_central"`` - uses gradient finite difference approximation with a more accurate central formula which requires two gradient evaluations per hessian-vector product.
+
+            Defaults to ``"autograd"``.
+        h (float, optional):
+            The step size for finite difference if ``hvp_method`` is
+            ``"fd_forward"`` or ``"fd_central"``. Defaults to 1e-3.
         n_samples (int, optional):
            number of hessian-vector products with random vectors to evaluate each time when updating
            the preconditioner. Larger values may lead to better hessian diagonal estimate. Defaults to 1.
         seed (int | None, optional): seed for random vectors. Defaults to None.
         inner (Chainable | None, optional): preconditioning is applied to the output of this module. Defaults to None.

-    Examples:
-        Using SophiaH:
+    ### Examples:

-
+    Using SophiaH:

-
-            model.parameters(),
-            tz.m.SophiaH(),
-            tz.m.LR(0.1)
-        )
+    ```python

-
-
-
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.SophiaH(),
+        tz.m.LR(0.1)
+    )
+    ```

-
+    SophiaH preconditioner can be applied to any other module by passing it to the ``inner`` argument.
+    Turn off SophiaH's first momentum to get just the preconditioning. Here is an example of applying
+    SophiaH preconditioning to nesterov momentum (``tz.m.NAG``):

-
-            model.parameters(),
-            tz.m.SophiaH(beta1=0, inner=tz.m.NAG(0.96)),
-            tz.m.LR(0.1)
-        )
+    ```python

+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.SophiaH(beta1=0, inner=tz.m.NAG(0.96)),
+        tz.m.LR(0.1)
+    )
+    ```
     """
     def __init__(
         self,
@@ -109,77 +78,84 @@ class SophiaH(Module):
         precond_scale: float = 1,
         clip: float = 1,
         eps: float = 1e-12,
-        hvp_method:
-
+        hvp_method: HVPMethod = 'autograd',
+        distribution: Distributions = 'gaussian',
+        h: float = 1e-3,
         n_samples = 1,
+        zHz: bool = True,
+        debias: bool = False,
         seed: int | None = None,
-
+
+        exp_avg_tfm: Chainable | None = None,
+        D_exp_avg_tfm: Chainable | None = None,
     ):
-        defaults =
+        defaults = locals().copy()
+        del defaults['self'], defaults['exp_avg_tfm'], defaults["D_exp_avg_tfm"]
         super().__init__(defaults)

-
-
+        self.set_child('exp_avg', exp_avg_tfm)
+        self.set_child('D_exp_avg', D_exp_avg_tfm)

     @torch.no_grad
-    def
-        params =
-        settings = self.settings[params[0]]
-        hvp_method = settings['hvp_method']
-        fd_h = settings['fd_h']
-        update_freq = settings['update_freq']
-        n_samples = settings['n_samples']
+    def update_states(self, objective, states, settings):
+        params = objective.params

-
-        generator = None
-        if seed is not None:
-            if 'generator' not in self.global_state:
-                self.global_state['generator'] = torch.Generator(params[0].device).manual_seed(seed)
-            generator = self.global_state['generator']
+        beta1, beta2 = unpack_dicts(settings, 'beta1', 'beta2', cls=NumberList)

-
-            'beta1', 'beta2', 'precond_scale', 'clip', 'eps', cls=NumberList)
+        exp_avg, D_exp_avg = unpack_states(states, params, 'exp_avg', 'D_exp_avg', cls=TensorList)

-
+        step = self.increment_counter("step", start=0) # 0 on 1st update

-
-
+        # ---------------------------- hutchinson hessian ---------------------------- #
+        fs = settings[0]
+        update_freq = fs['update_freq']

-        closure = var.closure
-        assert closure is not None
-
-        h = None
         if step % update_freq == 0:
+            self.increment_counter("num_Ds", start=1)
+
+            D, _ = objective.hutchinson_hessian(
+                rgrad = None,
+                at_x0 = True,
+                n_samples = fs['n_samples'],
+                distribution = fs['distribution'],
+                hvp_method = fs['hvp_method'],
+                h = fs['h'],
+                zHz = fs["zHz"],
+                generator = self.get_generator(params[0].device, fs["seed"]),
+            )
+
+            D_exp_avg.lerp_(D, weight=1-beta2)
+
+        # --------------------------------- momentum --------------------------------- #
+        tensors = objective.get_updates() # do this after hutchinson to not disturb autograd
+        exp_avg.lerp_(tensors, 1-beta1)
+
+
+    @torch.no_grad
+    def apply_states(self, objective, states, settings):
+        params = objective.params
+
+        beta1, beta2, eps, precond_scale, clip = unpack_dicts(
+            settings, 'beta1', 'beta2', 'eps', 'precond_scale', 'clip', cls=NumberList)
+
+        exp_avg, D_exp_avg = unpack_states(states, params, 'exp_avg', 'D_exp_avg')
+
+        # ---------------------------------- debias ---------------------------------- #
+        if settings[0]["debias"]:
+            bias_correction1 = 1.0 - (beta1 ** (self.global_state["step"] + 1))
+            bias_correction2 = 1.0 - (beta2 ** self.global_state["num_Ds"])
+
+            exp_avg = exp_avg / bias_correction1
+            D_exp_avg = D_exp_avg / bias_correction2
+
+        # -------------------------------- transforms -------------------------------- #
+        exp_avg = TensorList(self.inner_step_tensors(
+            "exp_avg", tensors=exp_avg, clone=True, objective=objective, must_exist=False))
+
+        D_exp_avg = TensorList(self.inner_step_tensors(
+            "D_exp_avg", tensors=D_exp_avg, clone=True, objective=objective, must_exist=False))

-
-
-
-
-            Hvp, rgrad = self.Hvp(u, at_x0=True, var=var, rgrad=rgrad, hvp_method=hvp_method,
-                h=fd_h, normalize=True, retain_grad=i < n_samples-1)
-            Hvp = tuple(Hvp)
-
-            if h is None: h = Hvp
-            else: torch._foreach_add_(h, Hvp)
-
-        assert h is not None
-        if n_samples > 1: torch._foreach_div_(h, n_samples)
-
-        update = var.get_update()
-        if 'inner' in self.children:
-            update = apply_transform(self.children['inner'], tensors=update, params=params, grads=var.grad, var=var)
-
-        var.update = sophia_H(
-            tensors=TensorList(update),
-            h=TensorList(h) if h is not None else None,
-            exp_avg_=exp_avg,
-            h_exp_avg_=h_exp_avg,
-            beta1=beta1,
-            beta2=beta2,
-            update_freq=update_freq,
-            precond_scale=precond_scale,
-            clip=clip,
-            eps=eps,
-            step=step,
-        )
-        return var
+        # ------------------------------ compute update ------------------------------ #
+        denom = D_exp_avg.lazy_mul(precond_scale).clip(min=eps)
+        objective.updates = (exp_avg / denom).clip_(-clip, clip)
+        return objective
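For context on the new `update_states`/`apply_states` split above, here is a minimal usage sketch assembled from the docstring's own example. The model, data, and loss are placeholders, and it assumes `tz.Modular` exposes the usual `zero_grad`/`step(closure)` optimizer interface; the closure follows the ``backward``-argument convention the docstring says SophiaH needs for its hessian-vector products.

```python
import torch
import torchzero as tz

model = torch.nn.Linear(10, 1)
X, y = torch.randn(64, 10), torch.randn(64, 1)

# the chain from the docstring example: SophiaH preconditioning followed by a fixed LR
opt = tz.Modular(model.parameters(), tz.m.SophiaH(), tz.m.LR(0.1))

def closure(backward=True):
    # SophiaH re-evaluates the loss for HVPs, so the closure must accept `backward`
    loss = torch.nn.functional.mse_loss(model(X), y)
    if backward:
        opt.zero_grad()
        loss.backward()
    return loss

for _ in range(100):
    opt.step(closure)
```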
--- a/torchzero/modules/clipping/clipping.py
+++ b/torchzero/modules/clipping/clipping.py
@@ -5,7 +5,7 @@ from typing import Literal

 import torch

-from ...core import Module,
+from ...core import Module, TensorTransform
 from ...utils import Metrics, NumberList, TensorList
 from ...utils.metrics import _METRICS

@@ -150,7 +150,7 @@ def normalize_grads_(
     _clip_norm_(grads, min=None, max=None, norm_value=norm_value, ord=ord, dim=dim, inverse_dims=inverse_dims, min_size=min_size)


-class ClipValue(
+class ClipValue(TensorTransform):
     """Clips update magnitude to be within ``(-value, value)`` range.

     Args:
@@ -180,17 +180,17 @@ class ClipValue(Transform):
     ```

     """
-    def __init__(self, value: float
+    def __init__(self, value: float):
         defaults = dict(value=value)
-        super().__init__(defaults
+        super().__init__(defaults)

     @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         value = [s['value'] for s in settings]
         return TensorList(tensors).clip_([-v for v in value], value)

-class ClipNorm(
-    """Clips update norm to be no larger than
+class ClipNorm(TensorTransform):
+    """Clips update norm to be no larger than ``value``.

     Args:
         max_norm (float): value to clip norm to.
@@ -236,13 +236,12 @@ class ClipNorm(Transform):
         dim: int | Sequence[int] | Literal["global"] | None = None,
         inverse_dims: bool = False,
         min_size: int = 1,
-        target: Target = "update",
     ):
         defaults = dict(max_norm=max_norm,ord=ord,dim=dim,min_size=min_size,inverse_dims=inverse_dims)
-        super().__init__(defaults
+        super().__init__(defaults)

     @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         max_norm = NumberList(s['max_norm'] for s in settings)
         ord, dim, min_size, inverse_dims = itemgetter('ord', 'dim', 'min_size', 'inverse_dims')(settings[0])
         _clip_norm_(
@@ -257,7 +256,7 @@ class ClipNorm(Transform):
         )
         return tensors

-class Normalize(
+class Normalize(TensorTransform):
     """Normalizes the update.

     Args:
@@ -304,13 +303,12 @@ class Normalize(Transform):
         dim: int | Sequence[int] | Literal["global"] | None = None,
         inverse_dims: bool = False,
         min_size: int = 1,
-        target: Target = "update",
     ):
         defaults = dict(norm_value=norm_value,ord=ord,dim=dim,min_size=min_size, inverse_dims=inverse_dims)
-        super().__init__(defaults
+        super().__init__(defaults)

     @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         norm_value = NumberList(s['norm_value'] for s in settings)
         ord, dim, min_size, inverse_dims = itemgetter('ord', 'dim', 'min_size', 'inverse_dims')(settings[0])

@@ -362,7 +360,7 @@ def _centralize_(
     return tensors_


-class Centralize(
+class Centralize(TensorTransform):
     """Centralizes the update.

     Args:
@@ -395,13 +393,12 @@ class Centralize(Transform):
         dim: int | Sequence[int] | Literal["global"] | None = None,
         inverse_dims: bool = False,
         min_size: int = 2,
-        target: Target = "update",
     ):
         defaults = dict(dim=dim,min_size=min_size,inverse_dims=inverse_dims)
-        super().__init__(defaults
+        super().__init__(defaults)

     @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         dim, min_size, inverse_dims = itemgetter('dim', 'min_size', 'inverse_dims')(settings[0])

         _centralize_(tensors_ = TensorList(tensors), dim=dim, inverse_dims=inverse_dims, min_size=min_size)
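The hunks above show the 0.4.0 hook that the clipping modules now implement: `multi_tensor_apply(self, tensors, params, grads, loss, states, settings)` on a `TensorTransform` subclass, returning the transformed tensors. Below is a hypothetical custom transform following that same pattern; the class and its behaviour are illustrative, not part of the released API.

```python
import torch
from torchzero.core import TensorTransform


class ScaleUpdate(TensorTransform):
    """Toy transform mirroring the ClipValue pattern above: scales the update."""

    def __init__(self, scale: float = 0.5):
        defaults = dict(scale=scale)
        super().__init__(defaults)

    @torch.no_grad
    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
        scale = [s['scale'] for s in settings]  # one settings dict per parameter
        torch._foreach_mul_(tensors, scale)     # in-place, like the built-in clipping modules
        return tensors
```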
--- a/torchzero/modules/clipping/ema_clipping.py
+++ b/torchzero/modules/clipping/ema_clipping.py
@@ -1,13 +1,14 @@
+from collections.abc import Iterable, Sequence
 from operator import itemgetter
 from typing import Literal
-from collections.abc import Iterable, Sequence

 import torch

-from ...core import
-from ...utils import NumberList, TensorList, unpack_dicts, unpack_states
+from ...core import Chainable, TensorTransform, step
+from ...utils import Metrics, NumberList, TensorList, unpack_dicts, unpack_states
+

-class ClipNormByEMA(
+class ClipNormByEMA(TensorTransform):
     """Clips norm to be no larger than the norm of an exponential moving average of past updates.

     Args:
@@ -36,7 +37,7 @@ class ClipNormByEMA(Transform):
         super().__init__(defaults, inner=inner)

     @torch.no_grad
-    def
+    def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
         tensors = TensorList(tensors)
         ord, tensorwise, ema_init, max_ema_growth = itemgetter('ord', 'tensorwise', 'ema_init', 'max_ema_growth')(settings[0])

@@ -83,7 +84,7 @@ class ClipNormByEMA(Transform):
         self.global_state['denom'] = denom

     @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         denom = self.global_state.pop('denom')
         torch._foreach_div_(tensors, denom)
         return tensors
@@ -106,45 +107,50 @@ class NormalizeByEMA(ClipNormByEMA):

 # TODO Centralize by EMA?

-class ClipValueByEMA(
+class ClipValueByEMA(TensorTransform):
     """Clips magnitude of update to be no larger than magnitude of exponential moving average of past (unclipped) updates.

     Args:
         beta (float, optional): beta for the exponential moving average. Defaults to 0.99.
         ema_init (str, optional):
-            How to initialize exponential moving average on first step,
-
+            How to initialize exponential moving average on first step,
+            "update" to use the first update or "zeros". Defaults to 'zeros'.
+        exp_avg_tfm (Chainable | None, optional):
             optional modules applied to exponential moving average before clipping by it. Defaults to None.
     """
     def __init__(
         self,
         beta=0.99,
-
-
+        init: Literal['zeros', 'update'] = 'zeros',
+
         inner: Chainable | None = None,
+        exp_avg_tfm:Chainable | None=None,
     ):
-        defaults = dict(beta=beta,
+        defaults = dict(beta=beta, init=init)
         super().__init__(defaults, inner=inner)

-
-        self.set_child('ema_tfm', ema_tfm)
+        self.set_child('exp_avg', exp_avg_tfm)

-
-
-
+    def single_tensor_initialize(self, tensor, param, grad, loss, state, setting):
+        if setting["init"] == "zeros":
+            state["exp_avg"] = torch.zeros_like(tensor)
+        else:
+            state["exp_avg"] = tensor.abs()

-
+    @torch.no_grad
+    def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
         tensors = TensorList(tensors)
+        beta = unpack_dicts(settings, 'beta', cls=NumberList)

-
-
+        exp_avg = unpack_states(states, tensors, 'exp_avg', must_exist=True, cls=TensorList)
+        exp_avg.lerp_(tensors.abs(), 1-beta)

-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         tensors = TensorList(tensors)
-
+        exp_avg = unpack_states(states, tensors, 'exp_avg')

-
-
+        exp_avg = TensorList(
+            self.inner_step_tensors("exp_avg", exp_avg, clone=True, params=params, grads=grads, loss=loss, must_exist=False))

-        tensors.clip_(-
+        tensors.clip_(-exp_avg, exp_avg)
         return tensors
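The `ClipValueByEMA` changes above boil down to: keep an exponential moving average of the update magnitudes, then clamp the update elementwise to that average. A plain-PyTorch sketch of that logic for a single tensor (the real module works on `TensorList`s and supports child transforms, which this omits):

```python
import torch

def clip_value_by_ema(update: torch.Tensor, exp_avg: torch.Tensor, beta: float = 0.99) -> torch.Tensor:
    # EMA of magnitudes: exp_avg <- beta * exp_avg + (1 - beta) * |update|
    exp_avg.lerp_(update.abs(), 1 - beta)
    # clamp the update elementwise into [-exp_avg, exp_avg]
    return update.clamp(-exp_avg, exp_avg)

u = torch.randn(5) * 3
ema = torch.zeros_like(u)          # "zeros" init, the default in 0.4.0
clipped = clip_value_by_ema(u, ema)
```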
--- a/torchzero/modules/clipping/growth_clipping.py
+++ b/torchzero/modules/clipping/growth_clipping.py
@@ -2,11 +2,11 @@ from operator import itemgetter

 import torch

-from ...core import
-from ...utils import TensorList
+from ...core import TensorTransform
+from ...utils import TensorList


-class ClipValueGrowth(
+class ClipValueGrowth(TensorTransform):
     """Clips update value magnitude growth.

     Args:
@@ -27,13 +27,12 @@ class ClipValueGrowth(TensorwiseTransform):
         mul: float | None = 1.5,
         min_value: float | None = 1e-4,
         max_decay: float | None = 2,
-        target: Target = "update",
     ):
         defaults = dict(add=add, mul=mul, min_value=min_value, max_decay=max_decay)
-        super().__init__(defaults
+        super().__init__(defaults)


-    def
+    def single_tensor_apply(self, tensor, param, grad, loss, state, setting):
         add, mul, min_value, max_decay = itemgetter('add','mul','min_value','max_decay')(setting)
         add: float | None

@@ -115,7 +114,7 @@ def norm_growth_clip_(
     return tensor_.div_(denom), new_prev_norm, denom


-class ClipNormGrowth(
+class ClipNormGrowth(TensorTransform):
     """Clips update norm growth.

     Args:
@@ -130,7 +129,7 @@ class ClipNormGrowth(Transform):
            Next norm is at most :code:`max(previous norm * mul, max_decay)`.
            Defaults to 2.
         ord (float, optional): norm order. Defaults to 2.
-
+        tensorwise (bool, optional):
            if True, norms are calculated parameter-wise, otherwise treats all parameters as single vector. Defaults to True.
         target (Target, optional): what to set on var. Defaults to "update".
     """
@@ -141,19 +140,17 @@ class ClipNormGrowth(Transform):
         min_value: float | None = 1e-4,
         max_decay: float | None = 2,
         ord: float = 2,
-
-        target: Target = "update",
+        tensorwise=True,
     ):
-        defaults = dict(add=add, mul=mul, min_value=min_value, max_decay=max_decay, ord=ord,
-        super().__init__(defaults
+        defaults = dict(add=add, mul=mul, min_value=min_value, max_decay=max_decay, ord=ord, tensorwise=tensorwise)
+        super().__init__(defaults)


-
-
-        parameterwise = settings[0]['parameterwise']
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
+        tensorwise = settings[0]['tensorwise']
         tensors = TensorList(tensors)

-        if
+        if tensorwise:
            ts = tensors
            stts = states
            stns = settings
@@ -180,7 +177,7 @@ class ClipNormGrowth(Transform):
                ord = setting['ord'],
            )

-        if not
+        if not tensorwise:
            tensors.from_vec_(ts[0])

        return tensors
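`ClipNormGrowth` above limits how fast the update norm may grow between steps, via additive `add` and multiplicative `mul` caps with a `min_value` floor. A plain-PyTorch sketch of that idea for a single tensor; the exact interplay of `min_value` and `max_decay` in torchzero may differ, so treat this as an illustration rather than the library's implementation.

```python
import torch

def clip_norm_growth(update, prev_norm, add=None, mul=1.5, min_value=1e-4, ord=2):
    """Scale `update` so its norm grows at most additively/multiplicatively vs. `prev_norm`."""
    norm = torch.linalg.vector_norm(update, ord=ord).item()
    if prev_norm is None or norm == 0:
        return update, max(norm, min_value)

    allowed = norm
    if mul is not None:
        allowed = min(allowed, max(prev_norm * mul, min_value))   # multiplicative cap
    if add is not None:
        allowed = min(allowed, prev_norm + add)                   # additive cap

    if allowed < norm:
        update = update * (allowed / norm)
    return update, max(allowed, min_value)

u, prev = torch.randn(10) * 100.0, 1.0
u, prev = clip_norm_growth(u, prev)   # norm capped at max(prev * 1.5, 1e-4)
```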