torchzero 0.3.10__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140)
  1. docs/source/conf.py +6 -4
  2. docs/source/docstring template.py +46 -0
  3. tests/test_identical.py +2 -3
  4. tests/test_opts.py +64 -50
  5. tests/test_vars.py +1 -0
  6. torchzero/core/module.py +138 -6
  7. torchzero/core/transform.py +158 -51
  8. torchzero/modules/__init__.py +3 -2
  9. torchzero/modules/clipping/clipping.py +114 -17
  10. torchzero/modules/clipping/ema_clipping.py +27 -13
  11. torchzero/modules/clipping/growth_clipping.py +8 -7
  12. torchzero/modules/experimental/__init__.py +22 -5
  13. torchzero/modules/experimental/absoap.py +5 -2
  14. torchzero/modules/experimental/adadam.py +8 -2
  15. torchzero/modules/experimental/adamY.py +8 -2
  16. torchzero/modules/experimental/adam_lambertw.py +149 -0
  17. torchzero/modules/{line_search/trust_region.py → experimental/adaptive_step_size.py} +21 -4
  18. torchzero/modules/experimental/adasoap.py +7 -2
  19. torchzero/modules/experimental/cosine.py +214 -0
  20. torchzero/modules/experimental/cubic_adam.py +97 -0
  21. torchzero/modules/{projections → experimental}/dct.py +11 -11
  22. torchzero/modules/experimental/eigendescent.py +4 -1
  23. torchzero/modules/experimental/etf.py +32 -9
  24. torchzero/modules/experimental/exp_adam.py +113 -0
  25. torchzero/modules/experimental/expanded_lbfgs.py +141 -0
  26. torchzero/modules/{projections → experimental}/fft.py +10 -10
  27. torchzero/modules/experimental/hnewton.py +85 -0
  28. torchzero/modules/{quasi_newton/experimental → experimental}/modular_lbfgs.py +27 -28
  29. torchzero/modules/experimental/newtonnewton.py +7 -3
  30. torchzero/modules/experimental/parabolic_search.py +220 -0
  31. torchzero/modules/experimental/reduce_outward_lr.py +4 -4
  32. torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +12 -54
  33. torchzero/modules/experimental/subspace_preconditioners.py +11 -4
  34. torchzero/modules/experimental/{tada.py → tensor_adagrad.py} +10 -6
  35. torchzero/modules/functional.py +12 -2
  36. torchzero/modules/grad_approximation/fdm.py +30 -3
  37. torchzero/modules/grad_approximation/forward_gradient.py +13 -3
  38. torchzero/modules/grad_approximation/grad_approximator.py +51 -6
  39. torchzero/modules/grad_approximation/rfdm.py +285 -38
  40. torchzero/modules/higher_order/higher_order_newton.py +152 -89
  41. torchzero/modules/line_search/__init__.py +4 -4
  42. torchzero/modules/line_search/adaptive.py +99 -0
  43. torchzero/modules/line_search/backtracking.py +34 -9
  44. torchzero/modules/line_search/line_search.py +70 -12
  45. torchzero/modules/line_search/polynomial.py +233 -0
  46. torchzero/modules/line_search/scipy.py +2 -2
  47. torchzero/modules/line_search/strong_wolfe.py +34 -7
  48. torchzero/modules/misc/__init__.py +27 -0
  49. torchzero/modules/{ops → misc}/debug.py +24 -1
  50. torchzero/modules/misc/escape.py +60 -0
  51. torchzero/modules/misc/gradient_accumulation.py +70 -0
  52. torchzero/modules/misc/misc.py +316 -0
  53. torchzero/modules/misc/multistep.py +158 -0
  54. torchzero/modules/misc/regularization.py +171 -0
  55. torchzero/modules/{ops → misc}/split.py +29 -1
  56. torchzero/modules/{ops → misc}/switch.py +44 -3
  57. torchzero/modules/momentum/__init__.py +1 -1
  58. torchzero/modules/momentum/averaging.py +6 -6
  59. torchzero/modules/momentum/cautious.py +45 -8
  60. torchzero/modules/momentum/ema.py +7 -7
  61. torchzero/modules/momentum/experimental.py +2 -2
  62. torchzero/modules/momentum/matrix_momentum.py +90 -63
  63. torchzero/modules/momentum/momentum.py +2 -1
  64. torchzero/modules/ops/__init__.py +3 -31
  65. torchzero/modules/ops/accumulate.py +6 -10
  66. torchzero/modules/ops/binary.py +72 -26
  67. torchzero/modules/ops/multi.py +77 -16
  68. torchzero/modules/ops/reduce.py +15 -7
  69. torchzero/modules/ops/unary.py +29 -13
  70. torchzero/modules/ops/utility.py +20 -12
  71. torchzero/modules/optimizers/__init__.py +12 -3
  72. torchzero/modules/optimizers/adagrad.py +23 -13
  73. torchzero/modules/optimizers/adahessian.py +223 -0
  74. torchzero/modules/optimizers/adam.py +7 -6
  75. torchzero/modules/optimizers/adan.py +110 -0
  76. torchzero/modules/optimizers/adaptive_heavyball.py +57 -0
  77. torchzero/modules/optimizers/esgd.py +171 -0
  78. torchzero/modules/{experimental/spectral.py → optimizers/ladagrad.py} +91 -71
  79. torchzero/modules/optimizers/lion.py +1 -1
  80. torchzero/modules/optimizers/mars.py +91 -0
  81. torchzero/modules/optimizers/msam.py +186 -0
  82. torchzero/modules/optimizers/muon.py +30 -5
  83. torchzero/modules/optimizers/orthograd.py +1 -1
  84. torchzero/modules/optimizers/rmsprop.py +7 -4
  85. torchzero/modules/optimizers/rprop.py +42 -8
  86. torchzero/modules/optimizers/sam.py +163 -0
  87. torchzero/modules/optimizers/shampoo.py +39 -5
  88. torchzero/modules/optimizers/soap.py +29 -19
  89. torchzero/modules/optimizers/sophia_h.py +71 -14
  90. torchzero/modules/projections/__init__.py +2 -4
  91. torchzero/modules/projections/cast.py +51 -0
  92. torchzero/modules/projections/galore.py +3 -1
  93. torchzero/modules/projections/projection.py +188 -94
  94. torchzero/modules/quasi_newton/__init__.py +12 -2
  95. torchzero/modules/quasi_newton/cg.py +160 -59
  96. torchzero/modules/quasi_newton/diagonal_quasi_newton.py +163 -0
  97. torchzero/modules/quasi_newton/lbfgs.py +154 -97
  98. torchzero/modules/quasi_newton/lsr1.py +101 -57
  99. torchzero/modules/quasi_newton/quasi_newton.py +863 -215
  100. torchzero/modules/quasi_newton/trust_region.py +397 -0
  101. torchzero/modules/second_order/__init__.py +2 -2
  102. torchzero/modules/second_order/newton.py +220 -41
  103. torchzero/modules/second_order/newton_cg.py +300 -11
  104. torchzero/modules/second_order/nystrom.py +104 -1
  105. torchzero/modules/smoothing/gaussian.py +34 -0
  106. torchzero/modules/smoothing/laplacian.py +14 -4
  107. torchzero/modules/step_size/__init__.py +2 -0
  108. torchzero/modules/step_size/adaptive.py +122 -0
  109. torchzero/modules/step_size/lr.py +154 -0
  110. torchzero/modules/weight_decay/__init__.py +1 -1
  111. torchzero/modules/weight_decay/weight_decay.py +89 -7
  112. torchzero/modules/wrappers/optim_wrapper.py +29 -1
  113. torchzero/optim/wrappers/directsearch.py +39 -2
  114. torchzero/optim/wrappers/fcmaes.py +21 -13
  115. torchzero/optim/wrappers/mads.py +5 -6
  116. torchzero/optim/wrappers/nevergrad.py +16 -1
  117. torchzero/optim/wrappers/optuna.py +1 -1
  118. torchzero/optim/wrappers/scipy.py +5 -3
  119. torchzero/utils/__init__.py +2 -2
  120. torchzero/utils/derivatives.py +3 -3
  121. torchzero/utils/linalg/__init__.py +1 -1
  122. torchzero/utils/linalg/solve.py +251 -12
  123. torchzero/utils/numberlist.py +2 -0
  124. torchzero/utils/python_tools.py +10 -0
  125. torchzero/utils/tensorlist.py +40 -28
  126. {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/METADATA +65 -40
  127. torchzero-0.3.11.dist-info/RECORD +159 -0
  128. torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
  129. torchzero/modules/experimental/soapy.py +0 -163
  130. torchzero/modules/experimental/structured_newton.py +0 -111
  131. torchzero/modules/lr/__init__.py +0 -2
  132. torchzero/modules/lr/adaptive.py +0 -93
  133. torchzero/modules/lr/lr.py +0 -63
  134. torchzero/modules/ops/misc.py +0 -418
  135. torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
  136. torchzero/modules/quasi_newton/olbfgs.py +0 -196
  137. torchzero-0.3.10.dist-info/RECORD +0 -139
  138. {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/WHEEL +0 -0
  139. {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/licenses/LICENSE +0 -0
  140. {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/top_level.txt +0 -0
@@ -10,12 +10,60 @@ from ...core import Chainable, apply_transform, Module
  from ...utils.linalg.solve import nystrom_sketch_and_solve, nystrom_pcg

  class NystromSketchAndSolve(Module):
+ """Newton's method with a Nyström sketch-and-solve solver.
+
+ .. note::
+ This module requires a closure passed to the optimizer step,
+ as it needs to re-evaluate the loss and gradients for calculating HVPs.
+ The closure must accept a ``backward`` argument (refer to documentation).
+
+ .. note::
+ In most cases NystromSketchAndSolve should be the first module in the chain because it relies on autograd. Use the :code:`inner` argument if you wish to apply Newton preconditioning to another module's output.
+
+ .. note::
+ If this is unstable, increase the :code:`reg` parameter and tune the rank.
+
+ .. note::
+ :code:`tz.m.NystromPCG` usually outperforms this.
+
+ Args:
+ rank (int): size of the sketch, this many hessian-vector products will be evaluated per step.
+ reg (float, optional): regularization parameter. Defaults to 1e-3.
+ hvp_method (str, optional):
+ Determines how Hessian-vector products are evaluated.
+
+ - ``"autograd"``: Use PyTorch's autograd to calculate exact HVPs.
+ This requires creating a graph for the gradient.
+ - ``"forward"``: Use a forward finite difference formula to
+ approximate the HVP. This requires one extra gradient evaluation.
+ - ``"central"``: Use a central finite difference formula for a
+ more accurate HVP approximation. This requires two extra
+ gradient evaluations.
+ Defaults to "autograd".
+ h (float, optional): finite difference step size if :code:`hvp_method` is "forward" or "central". Defaults to 1e-3.
+ inner (Chainable | None, optional): modules to apply hessian preconditioner to. Defaults to None.
+ seed (int | None, optional): seed for random generator. Defaults to None.
+
+ Examples:
+ NystromSketchAndSolve with backtracking line search
+
+ .. code-block:: python
+
+ opt = tz.Modular(
+ model.parameters(),
+ tz.m.NystromSketchAndSolve(10),
+ tz.m.Backtracking()
+ )
+
+ Reference:
+ Frangella, Z., Tropp, J. A., & Udell, M. (2023). Randomized Nyström preconditioning. SIAM Journal on Matrix Analysis and Applications, 44(2), 718-752. https://arxiv.org/abs/2110.02820
+ """
  def __init__(
  self,
  rank: int,
  reg: float = 1e-3,
  hvp_method: Literal["forward", "central", "autograd"] = "autograd",
- h=1e-3,
+ h: float = 1e-3,
  inner: Chainable | None = None,
  seed: int | None = None,
  ):
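The three ``hvp_method`` options documented above correspond to standard Hessian-vector-product strategies. A minimal standalone sketch of what they compute (illustrative helper, not this package's internal implementation):

    import torch

    def hvp(f, x, v, method="autograd", h=1e-3):
        # exact HVP via double backward: needs a graph for the gradient
        if method == "autograd":
            x = x.clone().requires_grad_(True)
            (g,) = torch.autograd.grad(f(x), x, create_graph=True)
            (Hv,) = torch.autograd.grad(g, x, grad_outputs=v)
            return Hv

        def grad_at(p):
            p = p.clone().requires_grad_(True)
            return torch.autograd.grad(f(p), p)[0]

        if method == "forward":   # one extra gradient evaluation on top of the gradient at x
            return (grad_at(x + h * v) - grad_at(x)) / h
        if method == "central":   # two extra gradient evaluations, O(h^2) accurate
            return (grad_at(x + h * v) - grad_at(x - h * v)) / (2 * h)
        raise ValueError(method)

    f = lambda x: (x ** 4).sum()       # Hessian is diag(12 * x**2)
    x = torch.tensor([1.0, 2.0])
    v = torch.tensor([1.0, 0.0])
    print(hvp(f, x, v, "central"))     # ≈ tensor([12., 0.])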
@@ -86,6 +134,61 @@ class NystromSketchAndSolve(Module):


  class NystromPCG(Module):
+ """Newton's method with a Nyström-preconditioned conjugate gradient solver.
+ This tends to outperform NewtonCG but requires tuning the sketch size.
+ An adaptive version exists in https://arxiv.org/abs/2110.02820; I might implement it at some point.
+
+ .. note::
+ This module requires a closure passed to the optimizer step,
+ as it needs to re-evaluate the loss and gradients for calculating HVPs.
+ The closure must accept a ``backward`` argument (refer to documentation).
+
+ .. note::
+ In most cases NystromPCG should be the first module in the chain because it relies on autograd. Use the :code:`inner` argument if you wish to apply Newton preconditioning to another module's output.
+
+ Args:
+ sketch_size (int):
+ size of the sketch for preconditioning, this many hessian-vector products will be evaluated before
+ running the conjugate gradient solver. A larger value improves the preconditioning and speeds up
+ conjugate gradient.
+ maxiter (int | None, optional):
+ maximum number of iterations. By default this is set to the number of dimensions
+ in the objective function, which is supposed to be enough for conjugate gradient
+ to have guaranteed convergence. Setting this to a small value can still generate good enough directions.
+ Defaults to None.
+ tol (float, optional): relative tolerance for conjugate gradient solver. Defaults to 1e-4.
+ reg (float, optional): regularization parameter. Defaults to 1e-8.
+ hvp_method (str, optional):
+ Determines how Hessian-vector products are evaluated.
+
+ - ``"autograd"``: Use PyTorch's autograd to calculate exact HVPs.
+ This requires creating a graph for the gradient.
+ - ``"forward"``: Use a forward finite difference formula to
+ approximate the HVP. This requires one extra gradient evaluation.
+ - ``"central"``: Use a central finite difference formula for a
+ more accurate HVP approximation. This requires two extra
+ gradient evaluations.
+ Defaults to "autograd".
+ h (float, optional): finite difference step size if :code:`hvp_method` is "forward" or "central". Defaults to 1e-3.
+ inner (Chainable | None, optional): modules to apply hessian preconditioner to. Defaults to None.
+ seed (int | None, optional): seed for random generator. Defaults to None.
+
+ Examples:
+
+ NystromPCG with backtracking line search
+
+ .. code-block:: python
+
+ opt = tz.Modular(
+ model.parameters(),
+ tz.m.NystromPCG(10),
+ tz.m.Backtracking()
+ )
+
+ Reference:
+ Frangella, Z., Tropp, J. A., & Udell, M. (2023). Randomized Nyström preconditioning. SIAM Journal on Matrix Analysis and Applications, 44(2), 718-752. https://arxiv.org/abs/2110.02820
+
+ """
  def __init__(
  self,
  sketch_size: int,
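For orientation, both modules build on the randomized Nyström approximation of the (PSD, regularized) Hessian from the cited paper. Given a random test matrix Ω with r columns (r being ``rank`` or ``sketch_size``), each column of AΩ costs one Hessian-vector product, which is why r HVPs are evaluated per step. A sketch of the approximation in standard notation (not code from this package):

    \hat{A} \;=\; (A\Omega)\,\bigl(\Omega^{\top} A \Omega\bigr)^{\dagger}\,(A\Omega)^{\top},
    \qquad A \in \mathbb{R}^{n \times n}\ \text{PSD}, \quad \Omega \in \mathbb{R}^{n \times r}.

As the solver names suggest, NystromSketchAndSolve solves the Newton system against the regularized approximation directly, while NystromPCG uses it as a preconditioner for conjugate gradient on the true system.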
@@ -64,6 +64,40 @@ def _clear_state_hook(optimizer: Modular, var: Var, self: Module):
  m.reset()

  class GaussianHomotopy(Reformulation):
+ """Approximately smooths the function with a Gaussian kernel by sampling it at randomly perturbed points around the current point. Both function values and gradients are averaged over all samples. The perturbed points are generated before each
+ step and remain the same throughout the step.
+
+ .. note::
+ This module reformulates the objective: it modifies the closure to evaluate the value and gradients of a smoothed function. All modules after this one will operate on the modified objective.
+
+ .. note::
+ This module requires a closure passed to the optimizer step,
+ as it needs to re-evaluate the loss and gradients at perturbed points.
+
+ Args:
+ n_samples (int): number of points to sample, larger values lead to more accurate smoothing.
+ init_sigma (float): initial scale of perturbations.
+ tol (float | None, optional):
+ if the maximal parameter change is smaller than this, sigma is reduced by :code:`decay`. Defaults to 1e-4.
+ decay (float, optional): multiplier applied to sigma when converged on the smoothed function. Defaults to 0.5.
+ max_steps (int | None, optional): maximum number of steps before decaying sigma. Defaults to None.
+ clear_state (bool, optional):
+ whether to clear all other module states when sigma is decayed, because the objective function changes. Defaults to True.
+ seed (int | None, optional): seed for random perturbations. Defaults to None.
+
+ Examples:
+ Gaussian-smoothed NewtonCG
+
+ .. code-block:: python
+
+ opt = tz.Modular(
+ model.parameters(),
+ tz.m.GaussianHomotopy(100),
+ tz.m.NewtonCG(maxiter=20),
+ tz.m.AdaptiveBacktracking(),
+ )
+
+ """
  def __init__(
  self,
  n_samples: int,
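The smoothing GaussianHomotopy describes amounts to a Monte-Carlo estimate of a Gaussian-blurred objective. A standalone sketch of the idea for a plain tensor-to-scalar function (illustrative only; the module itself works through the closure and keeps the perturbations fixed within a step):

    import torch

    def gaussian_smoothed(f, x, n_samples=100, sigma=1.0):
        # average value and gradient of f over fixed perturbations x + sigma * e_i
        perturbations = [sigma * torch.randn_like(x) for _ in range(n_samples)]
        total_loss, total_grad = 0.0, torch.zeros_like(x)
        for e in perturbations:
            xe = (x + e).detach().requires_grad_(True)
            loss = f(xe)
            total_grad += torch.autograd.grad(loss, xe)[0]
            total_loss += float(loss)
        return total_loss / n_samples, total_grad / n_samples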
@@ -56,7 +56,7 @@ def _precompute_denominator(tensor: torch.Tensor, sigma) -> torch.Tensor:
  return 1 - sigma * torch.fft.fft(v) # pylint: disable = not-callable

  class LaplacianSmoothing(Transform):
- """Applies laplacian smoothing via a fast Fourier transform solver.
+ """Applies Laplacian smoothing via a fast Fourier transform solver, which can improve generalization.

  Args:
  sigma (float, optional): controls the amount of smoothing. Defaults to 1.
@@ -69,9 +69,19 @@ class LaplacianSmoothing(Transform):
  target (str, optional):
  what to set on var.

+ Examples:
+ Laplacian Smoothing Gradient Descent optimizer as in the paper
+
+ .. code-block:: python
+
+ opt = tz.Modular(
+ model.parameters(),
+ tz.m.LaplacianSmoothing(),
+ tz.m.LR(1e-2),
+ )
+
  Reference:
- *Osher, S., Wang, B., Yin, P., Luo, X., Barekat, F., Pham, M., & Lin, A. (2022).
- Laplacian smoothing gradient descent. Research in the Mathematical Sciences, 9(3), 55.*
+ Osher, S., Wang, B., Yin, P., Luo, X., Barekat, F., Pham, M., & Lin, A. (2022). Laplacian smoothing gradient descent. Research in the Mathematical Sciences, 9(3), 55.

  """
  def __init__(self, sigma:float = 1, layerwise=True, min_numel = 4, target: Target = 'update'):
@@ -82,7 +92,7 @@ class LaplacianSmoothing(Transform):


  @torch.no_grad
- def apply(self, tensors, params, grads, loss, states, settings):
+ def apply_tensors(self, tensors, params, grads, loss, states, settings):
  layerwise = settings[0]['layerwise']

  # layerwise laplacian smoothing
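The ``1 - sigma * torch.fft.fft(v)`` denominator above is the circulant solve from the cited paper: the smoothed gradient g_s satisfies (I - sigma * L) g_s = g, with L a periodic 1-D discrete Laplacian. A sketch for a single flattened gradient, assuming the standard [1, -2, 1] stencil (layerwise handling and ``min_numel`` are omitted; not this module's exact code):

    import torch

    def laplacian_smooth(g: torch.Tensor, sigma: float = 1.0) -> torch.Tensor:
        # solve (I - sigma * L) g_s = g via FFT, L = periodic [1, -2, 1] Laplacian
        flat = g.flatten()
        v = torch.zeros_like(flat)
        v[0], v[1], v[-1] = -2.0, 1.0, 1.0
        denom = 1 - sigma * torch.fft.fft(v)          # same form as _precompute_denominator
        g_s = torch.fft.ifft(torch.fft.fft(flat) / denom).real
        return g_s.view_as(g)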
@@ -0,0 +1,2 @@
+ from .lr import LR, StepSize, Warmup, WarmupNormClip, RandomStepSize
+ from .adaptive import PolyakStepSize, BarzilaiBorwein
@@ -0,0 +1,122 @@
+ """Various step size strategies"""
+ from typing import Any, Literal
+ from operator import itemgetter
+ import torch
+
+ from ...core import Transform, Chainable
+ from ...utils import TensorList, unpack_dicts, unpack_states, NumberList
+
+
+ class PolyakStepSize(Transform):
+ """Polyak's subgradient method.
+
+ Args:
+ f_star (int, optional):
+ (estimated) minimal possible value of the objective function (lowest possible loss). Defaults to 0.
+ max (float | None, optional): maximum possible step size. Defaults to None.
+ use_grad (bool, optional):
+ if True, uses dot product of update and gradient to compute the step size.
+ Otherwise, dot product of update with itself is used, which has no geometric meaning so it probably won't work well.
+ Defaults to False.
+ alpha (float, optional): multiplier to Polyak step-size. Defaults to 1.
+ """
+ def __init__(self, f_star: float = 0, max: float | None = None, use_grad=False, alpha: float = 1, inner: Chainable | None = None):
+
+ defaults = dict(alpha=alpha, max=max, f_star=f_star, use_grad=use_grad)
+ super().__init__(defaults, uses_grad=use_grad, uses_loss=True, inner=inner)
+
+ def update_tensors(self, tensors, params, grads, loss, states, settings):
+ assert grads is not None and loss is not None
+ tensors = TensorList(tensors)
+ grads = TensorList(grads)
+
+ use_grad, max, f_star = itemgetter('use_grad', 'max', 'f_star')(settings[0])
+
+ if use_grad: gg = tensors.dot(grads)
+ else: gg = tensors.dot(tensors)
+
+ if gg.abs() <= torch.finfo(gg.dtype).eps: step_size = 0 # converged
+ else: step_size = (loss - f_star) / gg
+
+ if max is not None:
+ if step_size > max: step_size = max
+
+ self.global_state['step_size'] = step_size
+
+ @torch.no_grad
+ def apply_tensors(self, tensors, params, grads, loss, states, settings):
+ step_size = self.global_state.get('step_size', 1)
+ torch._foreach_mul_(tensors, step_size * unpack_dicts(settings, 'alpha', cls=NumberList))
+ return tensors
+
+
+
+ def _bb_short(s: TensorList, y: TensorList, sy, eps, fallback):
+ yy = y.dot(y)
+ if yy < eps:
+ if sy < eps: return fallback # try to fallback on long
+ ss = s.dot(s)
+ return ss/sy
+ return sy/yy
+
+ def _bb_long(s: TensorList, y: TensorList, sy, eps, fallback):
+ ss = s.dot(s)
+ if sy < eps:
+ yy = y.dot(y) # try to fallback on short
+ if yy < eps: return fallback
+ return sy/yy
+ return ss/sy
+
+ def _bb_geom(s: TensorList, y: TensorList, sy, eps, fallback):
+ short = _bb_short(s, y, sy, eps, fallback)
+ long = _bb_long(s, y, sy, eps, fallback)
+ return (short * long) ** 0.5
+
+ class BarzilaiBorwein(Transform):
+ """Barzilai-Borwein method.
+
+ Args:
+ type (str, optional):
+ one of "short" with formula sᵀy/yᵀy, "long" with formula sᵀs/sᵀy, or "geom" to use geometric mean of short and long.
+ Defaults to 'geom'.
+ scale_first (bool, optional):
+ whether to make first step very small when previous gradient is not available. Defaults to True.
+ fallback (float, optional): step size when denominator is less than 0 (will happen on negative curvature). Defaults to 1e-3.
+ inner (Chainable | None, optional):
+ step size will be applied to outputs of this module. Defaults to None.
+
+ """
+ def __init__(self, type: Literal['long', 'short', 'geom'] = 'geom', scale_first:bool=True, fallback:float=1e-3, inner:Chainable|None = None):
+ defaults = dict(type=type, fallback=fallback)
+ super().__init__(defaults, uses_grad=False, scale_first=scale_first, inner=inner)
+
+ def reset_for_online(self):
+ super().reset_for_online()
+ self.clear_state_keys('prev_p', 'prev_g')
+
+ @torch.no_grad
+ def update_tensors(self, tensors, params, grads, loss, states, settings):
+ prev_p, prev_g = unpack_states(states, tensors, 'prev_p', 'prev_g', cls=TensorList)
+ fallback = unpack_dicts(settings, 'fallback', cls=NumberList)
+ type = settings[0]['type']
+
+ s = params-prev_p
+ y = tensors-prev_g
+ sy = s.dot(y)
+ eps = torch.finfo(sy.dtype).eps
+
+ if type == 'short': step_size = _bb_short(s, y, sy, eps, fallback)
+ elif type == 'long': step_size = _bb_long(s, y, sy, eps, fallback)
+ elif type == 'geom': step_size = _bb_geom(s, y, sy, eps, fallback)
+ else: raise ValueError(type)
+
+ self.global_state['step_size'] = step_size
+
+ prev_p.copy_(params)
+ prev_g.copy_(tensors)
+
+ def apply_tensors(self, tensors, params, grads, loss, states, settings):
+ step_size = self.global_state.get('step_size', 1)
+ torch._foreach_mul_(tensors, step_size)
+ return tensors
+
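Unlike most other docstrings in this release, the two new step-size transforms above ship without usage examples. A minimal sketch in the same style (the surrounding chain is an assumption; only the module names and arguments come from the code above):

    # Polyak step size: eta = (f(x) - f_star) / <g, g>, here applied to the raw gradient
    opt = tz.Modular(
        model.parameters(),
        tz.m.PolyakStepSize(f_star=0, max=1.0),
    )

    # Barzilai-Borwein with the geometric-mean rule sqrt((s'y / y'y) * (s's / s'y))
    opt = tz.Modular(
        model.parameters(),
        tz.m.BarzilaiBorwein(type="geom"),
    )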
@@ -0,0 +1,154 @@
+ """Learning rate"""
+ import torch
+ import random
+
+ from ...core import Transform
+ from ...utils import NumberList, TensorList, generic_ne, unpack_dicts
+
+ def lazy_lr(tensors: TensorList, lr: float | list, inplace:bool):
+ """multiplies by lr if lr is not 1"""
+ if generic_ne(lr, 1):
+ if inplace: return tensors.mul_(lr)
+ return tensors * lr
+ return tensors
+
+ class LR(Transform):
+ """Learning rate. Adding this module also adds support for LR schedulers."""
+ def __init__(self, lr: float):
+ defaults=dict(lr=lr)
+ super().__init__(defaults, uses_grad=False)
+
+ @torch.no_grad
+ def apply_tensors(self, tensors, params, grads, loss, states, settings):
+ return lazy_lr(TensorList(tensors), lr=[s['lr'] for s in settings], inplace=True)
+
+ class StepSize(Transform):
+ """This is exactly the same as LR, except the `lr` parameter can be renamed to any other name to avoid clashes."""
+ def __init__(self, step_size: float, key = 'step_size'):
+ defaults={"key": key, key: step_size}
+ super().__init__(defaults, uses_grad=False)
+
+ @torch.no_grad
+ def apply_tensors(self, tensors, params, grads, loss, states, settings):
+ return lazy_lr(TensorList(tensors), lr=[s[s['key']] for s in settings], inplace=True)
+
+
+ def _warmup_lr(step: int, start_lr: float | NumberList, end_lr: float | NumberList, steps: float):
+ """returns warm up lr scalar"""
+ if step > steps: return end_lr
+ return start_lr + (end_lr - start_lr) * (step / steps)
+
+ class Warmup(Transform):
+ """Learning rate warmup: linearly increases the learning rate multiplier from :code:`start_lr` to :code:`end_lr` over :code:`steps` steps.
+
+ Args:
+ steps (int, optional): number of steps to perform warmup for. Defaults to 100.
+ start_lr (float, optional): initial learning rate multiplier on the first step. Defaults to 1e-5.
+ end_lr (float, optional): learning rate multiplier at the end of and after warmup. Defaults to 1.
+
+ Example:
+ Adam with 1000 steps warmup
+
+ .. code-block:: python
+
+ opt = tz.Modular(
+ model.parameters(),
+ tz.m.Adam(),
+ tz.m.LR(1e-2),
+ tz.m.Warmup(steps=1000)
+ )
+
+ """
+ def __init__(self, steps = 100, start_lr = 1e-5, end_lr:float = 1):
+ defaults = dict(start_lr=start_lr,end_lr=end_lr, steps=steps)
+ super().__init__(defaults, uses_grad=False)
+
+ @torch.no_grad
+ def apply_tensors(self, tensors, params, grads, loss, states, settings):
+ start_lr, end_lr = unpack_dicts(settings, 'start_lr', 'end_lr', cls = NumberList)
+ num_steps = settings[0]['steps']
+ step = self.global_state.get('step', 0)
+
+ tensors = lazy_lr(
+ TensorList(tensors),
+ lr=_warmup_lr(step=step, start_lr=start_lr, end_lr=end_lr, steps=num_steps),
+ inplace=True
+ )
+ self.global_state['step'] = step + 1
+ return tensors
+
+ class WarmupNormClip(Transform):
+ """Warmup via clipping of the update norm.
+
+ Args:
+ start_norm (float, optional): maximal norm on the first step. Defaults to 1e-5.
+ end_norm (float, optional): maximal norm on the last step. After that, norm clipping is disabled. Defaults to 1.
+ steps (int, optional): number of steps to perform warmup for. Defaults to 100.
+
+ Example:
+ Adam with 1000 steps norm clip warmup
+
+ .. code-block:: python
+
+ opt = tz.Modular(
+ model.parameters(),
+ tz.m.Adam(),
+ tz.m.WarmupNormClip(steps=1000),
+ tz.m.LR(1e-2),
+ )
+ """
+ def __init__(self, steps = 100, start_norm = 1e-5, end_norm:float = 1):
+ defaults = dict(start_norm=start_norm,end_norm=end_norm, steps=steps)
+ super().__init__(defaults, uses_grad=False)
+
+ @torch.no_grad
+ def apply_tensors(self, tensors, params, grads, loss, states, settings):
+ start_norm, end_norm = unpack_dicts(settings, 'start_norm', 'end_norm', cls = NumberList)
+ num_steps = settings[0]['steps']
+ step = self.global_state.get('step', 0)
+ if step > num_steps: return tensors
+
+ tensors = TensorList(tensors)
+ norm = tensors.global_vector_norm()
+ current_max_norm = _warmup_lr(step, start_norm[0], end_norm[0], num_steps)
+ if norm > current_max_norm:
+ tensors.mul_(current_max_norm / norm)
+
+ self.global_state['step'] = step + 1
+ return tensors
+
+
+ class RandomStepSize(Transform):
+ """Uses a random global or layer-wise step size from `low` to `high`.
+
+ Args:
+ low (float, optional): minimum learning rate. Defaults to 0.
+ high (float, optional): maximum learning rate. Defaults to 1.
+ parameterwise (bool, optional):
+ if True, generate a random step size for each parameter separately,
+ if False generate one global random step size. Defaults to False.
+ """
+ def __init__(self, low: float = 0, high: float = 1, parameterwise=False, seed:int|None=None):
+ defaults = dict(low=low, high=high, parameterwise=parameterwise,seed=seed)
+ super().__init__(defaults, uses_grad=False)
+
+ @torch.no_grad
+ def apply_tensors(self, tensors, params, grads, loss, states, settings):
+ s = settings[0]
+ parameterwise = s['parameterwise']
+
+ seed = s['seed']
+ if 'generator' not in self.global_state:
+ self.global_state['generator'] = random.Random(seed)
+ generator: random.Random = self.global_state['generator']
+
+ if parameterwise:
+ low, high = unpack_dicts(settings, 'low', 'high')
+ lr = [generator.uniform(l, h) for l, h in zip(low, high)]
+ else:
+ low = s['low']
+ high = s['high']
+ lr = generator.uniform(low, high)
+
+ torch._foreach_mul_(tensors, lr)
+ return tensors
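The `key` argument of the new `StepSize` transform is easiest to see with a small example; a hedged sketch (the key name is made up, only `StepSize` and `LR` come from the code above):

    opt = tz.Modular(
        model.parameters(),
        tz.m.StepSize(0.5, key="trust_radius"),  # stored under "trust_radius", so it does not clash with "lr"
        tz.m.LR(1e-2),
    )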
@@ -1 +1 @@
- from .weight_decay import WeightDecay, DirectWeightDecay, decay_weights_, NormalizedWeightDecay
+ from .weight_decay import WeightDecay, DirectWeightDecay, decay_weights_, RelativeWeightDecay
@@ -22,22 +22,99 @@ def weight_decay_(


  class WeightDecay(Transform):
+ """Weight decay.
+
+ Args:
+ weight_decay (float): weight decay scale.
+ ord (int, optional): order of the penalty, e.g. 1 for L1 and 2 for L2. Defaults to 2.
+ target (Target, optional): what to set on var. Defaults to 'update'.
+
+ Examples:
+ Adam with non-decoupled weight decay
+
+ .. code-block:: python
+
+ opt = tz.Modular(
+ model.parameters(),
+ tz.m.WeightDecay(1e-3),
+ tz.m.Adam(),
+ tz.m.LR(1e-3)
+ )
+
+ Adam with decoupled weight decay that still scales with the learning rate
+
+ .. code-block:: python
+
+ opt = tz.Modular(
+ model.parameters(),
+ tz.m.Adam(),
+ tz.m.WeightDecay(1e-3),
+ tz.m.LR(1e-3)
+ )
+
+ Adam with fully decoupled weight decay that doesn't scale with the learning rate
+
+ .. code-block:: python
+
+ opt = tz.Modular(
+ model.parameters(),
+ tz.m.Adam(),
+ tz.m.LR(1e-3),
+ tz.m.WeightDecay(1e-6)
+ )
+
+ """
  def __init__(self, weight_decay: float, ord: int = 2, target: Target = 'update'):
+
  defaults = dict(weight_decay=weight_decay, ord=ord)
  super().__init__(defaults, uses_grad=False, target=target)

  @torch.no_grad
- def apply(self, tensors, params, grads, loss, states, settings):
+ def apply_tensors(self, tensors, params, grads, loss, states, settings):
  weight_decay = NumberList(s['weight_decay'] for s in settings)
  ord = settings[0]['ord']

  return weight_decay_(as_tensorlist(tensors), as_tensorlist(params), weight_decay, ord)

- class NormalizedWeightDecay(Transform):
+ class RelativeWeightDecay(Transform):
+ """Weight decay relative to the mean absolute value of the update, gradient or parameters, depending on the value of the :code:`norm_input` argument.
+
+ Args:
+ weight_decay (float): relative weight decay scale.
+ ord (int, optional): order of the penalty, e.g. 1 for L1 and 2 for L2. Defaults to 2.
+ norm_input (str, optional):
+ determines what the weight decay is relative to: "update", "grad" or "params".
+ Defaults to "update".
+ target (Target, optional): what to set on var. Defaults to 'update'.
+
+ Examples:
+ Adam with non-decoupled relative weight decay
+
+ .. code-block:: python
+
+ opt = tz.Modular(
+ model.parameters(),
+ tz.m.RelativeWeightDecay(1e-3),
+ tz.m.Adam(),
+ tz.m.LR(1e-3)
+ )
+
+ Adam with decoupled relative weight decay
+
+ .. code-block:: python
+
+ opt = tz.Modular(
+ model.parameters(),
+ tz.m.Adam(),
+ tz.m.RelativeWeightDecay(1e-3),
+ tz.m.LR(1e-3)
+ )
+
+ """
  def __init__(
  self,
  weight_decay: float = 0.1,
- ord: int = 2,
+ ord: int = 2,
  norm_input: Literal["update", "grad", "params"] = "update",
  target: Target = "update",
  ):
@@ -45,7 +122,7 @@ class NormalizedWeightDecay(Transform):
  super().__init__(defaults, uses_grad=norm_input == 'grad', target=target)

  @torch.no_grad
- def apply(self, tensors, params, grads, loss, states, settings):
+ def apply_tensors(self, tensors, params, grads, loss, states, settings):
  weight_decay = NumberList(s['weight_decay'] for s in settings)

  ord = settings[0]['ord']
@@ -60,9 +137,9 @@ class NormalizedWeightDecay(Transform):
  else:
  raise ValueError(norm_input)

- norm = src.global_vector_norm(ord)
+ mean_abs = src.abs().global_mean()

- return weight_decay_(as_tensorlist(tensors), as_tensorlist(params), weight_decay * norm, ord)
+ return weight_decay_(as_tensorlist(tensors), as_tensorlist(params), weight_decay * mean_abs, ord)


  @torch.no_grad
@@ -72,7 +149,12 @@ def decay_weights_(params: Iterable[torch.Tensor], weight_decay: float | NumberL
  weight_decay_(params, params, -weight_decay, ord)

  class DirectWeightDecay(Module):
- """directly decays weights in-place"""
+ """Directly applies weight decay to parameters.
+
+ Args:
+ weight_decay (float): weight decay scale.
+ ord (int, optional): order of the penalty, e.g. 1 for L1 and 2 for L2. Defaults to 2.
+ """
  def __init__(self, weight_decay: float, ord: int = 2,):
  defaults = dict(weight_decay=weight_decay, ord=ord)
  super().__init__(defaults)
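A compact way to read the three WeightDecay placements documented above, assuming the module simply adds λθ (for ord=2) to whatever tensor flows through it and η is the LR value (an interpretation of the docstring, not code from the package):

    \text{before Adam (coupled):}\qquad \theta_{t+1} = \theta_t - \eta\,\mathrm{Adam}(g_t + \lambda\theta_t)
    \text{after Adam, before LR:}\qquad \theta_{t+1} = \theta_t - \eta\,\bigl(\mathrm{Adam}(g_t) + \lambda\theta_t\bigr)
    \text{after LR (fully decoupled):}\qquad \theta_{t+1} = \theta_t - \eta\,\mathrm{Adam}(g_t) - \lambda\theta_t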
@@ -7,7 +7,35 @@ from ...utils import Params, _copy_param_groups, _make_param_groups


  class Wrap(Module):
- """Custom param groups are supported only by `set_param_groups`. Settings passed to Modular will be ignored."""
+ """
+ Wraps a PyTorch optimizer so it can be used as a module.
+
+ .. note::
+ Custom param groups are supported only by `set_param_groups`; settings passed to Modular will be ignored.
+
+ Args:
+ opt_fn (Callable[..., torch.optim.Optimizer] | torch.optim.Optimizer):
+ function that takes in parameters and returns the optimizer, for example :code:`torch.optim.Adam`
+ or :code:`lambda parameters: torch.optim.Adam(parameters, lr=1e-3)`
+ *args:
+ **kwargs:
+ Extra args to be passed to opt_fn. The function is called as :code:`opt_fn(parameters, *args, **kwargs)`.
+
+ Example:
+ wrapping pytorch_optimizer.StableAdamW
+
+ .. code-block:: py
+
+ from pytorch_optimizer import StableAdamW
+ opt = tz.Modular(
+ model.parameters(),
+ tz.m.Wrap(StableAdamW, lr=1),
+ tz.m.Cautious(),
+ tz.m.LR(1e-2)
+ )
+
+ """
  def __init__(self, opt_fn: Callable[..., torch.optim.Optimizer] | torch.optim.Optimizer, *args, **kwargs):
  super().__init__()
  self._opt_fn = opt_fn
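The same pattern works with a stock PyTorch optimizer; a minimal sketch (only torch.optim.SGD's own arguments are used, and the chain mirrors the StableAdamW example above):

    opt = tz.Modular(
        model.parameters(),
        tz.m.Wrap(torch.optim.SGD, lr=1.0, momentum=0.9),
        tz.m.LR(1e-2),
    )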