torchzero 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/source/conf.py +6 -4
- docs/source/docstring template.py +46 -0
- tests/test_identical.py +2 -3
- tests/test_opts.py +115 -68
- tests/test_tensorlist.py +2 -2
- tests/test_vars.py +62 -61
- torchzero/core/__init__.py +2 -3
- torchzero/core/module.py +185 -53
- torchzero/core/transform.py +327 -159
- torchzero/modules/__init__.py +3 -1
- torchzero/modules/clipping/clipping.py +120 -23
- torchzero/modules/clipping/ema_clipping.py +37 -22
- torchzero/modules/clipping/growth_clipping.py +20 -21
- torchzero/modules/experimental/__init__.py +30 -4
- torchzero/modules/experimental/absoap.py +53 -156
- torchzero/modules/experimental/adadam.py +22 -15
- torchzero/modules/experimental/adamY.py +21 -25
- torchzero/modules/experimental/adam_lambertw.py +149 -0
- torchzero/modules/{line_search/trust_region.py → experimental/adaptive_step_size.py} +37 -8
- torchzero/modules/experimental/adasoap.py +24 -129
- torchzero/modules/experimental/cosine.py +214 -0
- torchzero/modules/experimental/cubic_adam.py +97 -0
- torchzero/modules/experimental/curveball.py +12 -12
- torchzero/modules/{projections → experimental}/dct.py +11 -11
- torchzero/modules/experimental/eigendescent.py +120 -0
- torchzero/modules/experimental/etf.py +195 -0
- torchzero/modules/experimental/exp_adam.py +113 -0
- torchzero/modules/experimental/expanded_lbfgs.py +141 -0
- torchzero/modules/{projections → experimental}/fft.py +10 -10
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/hnewton.py +85 -0
- torchzero/modules/{quasi_newton/experimental → experimental}/modular_lbfgs.py +49 -50
- torchzero/modules/experimental/newton_solver.py +11 -11
- torchzero/modules/experimental/newtonnewton.py +92 -0
- torchzero/modules/experimental/parabolic_search.py +220 -0
- torchzero/modules/experimental/reduce_outward_lr.py +10 -7
- torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +12 -54
- torchzero/modules/experimental/subspace_preconditioners.py +20 -10
- torchzero/modules/experimental/tensor_adagrad.py +42 -0
- torchzero/modules/functional.py +12 -2
- torchzero/modules/grad_approximation/fdm.py +31 -4
- torchzero/modules/grad_approximation/forward_gradient.py +17 -7
- torchzero/modules/grad_approximation/grad_approximator.py +69 -24
- torchzero/modules/grad_approximation/rfdm.py +310 -50
- torchzero/modules/higher_order/__init__.py +1 -0
- torchzero/modules/higher_order/higher_order_newton.py +319 -0
- torchzero/modules/line_search/__init__.py +4 -4
- torchzero/modules/line_search/adaptive.py +99 -0
- torchzero/modules/line_search/backtracking.py +75 -31
- torchzero/modules/line_search/line_search.py +107 -49
- torchzero/modules/line_search/polynomial.py +233 -0
- torchzero/modules/line_search/scipy.py +20 -5
- torchzero/modules/line_search/strong_wolfe.py +52 -36
- torchzero/modules/misc/__init__.py +27 -0
- torchzero/modules/misc/debug.py +48 -0
- torchzero/modules/misc/escape.py +60 -0
- torchzero/modules/misc/gradient_accumulation.py +70 -0
- torchzero/modules/misc/misc.py +316 -0
- torchzero/modules/misc/multistep.py +158 -0
- torchzero/modules/misc/regularization.py +171 -0
- torchzero/modules/misc/split.py +103 -0
- torchzero/modules/{ops → misc}/switch.py +48 -7
- torchzero/modules/momentum/__init__.py +1 -1
- torchzero/modules/momentum/averaging.py +25 -10
- torchzero/modules/momentum/cautious.py +115 -40
- torchzero/modules/momentum/ema.py +92 -41
- torchzero/modules/momentum/experimental.py +21 -13
- torchzero/modules/momentum/matrix_momentum.py +145 -76
- torchzero/modules/momentum/momentum.py +25 -4
- torchzero/modules/ops/__init__.py +3 -31
- torchzero/modules/ops/accumulate.py +51 -25
- torchzero/modules/ops/binary.py +108 -62
- torchzero/modules/ops/multi.py +95 -34
- torchzero/modules/ops/reduce.py +31 -23
- torchzero/modules/ops/unary.py +37 -21
- torchzero/modules/ops/utility.py +53 -45
- torchzero/modules/optimizers/__init__.py +12 -3
- torchzero/modules/optimizers/adagrad.py +48 -29
- torchzero/modules/optimizers/adahessian.py +223 -0
- torchzero/modules/optimizers/adam.py +35 -37
- torchzero/modules/optimizers/adan.py +110 -0
- torchzero/modules/optimizers/adaptive_heavyball.py +57 -0
- torchzero/modules/optimizers/esgd.py +171 -0
- torchzero/modules/optimizers/ladagrad.py +183 -0
- torchzero/modules/optimizers/lion.py +4 -4
- torchzero/modules/optimizers/mars.py +91 -0
- torchzero/modules/optimizers/msam.py +186 -0
- torchzero/modules/optimizers/muon.py +32 -7
- torchzero/modules/optimizers/orthograd.py +4 -5
- torchzero/modules/optimizers/rmsprop.py +19 -19
- torchzero/modules/optimizers/rprop.py +89 -52
- torchzero/modules/optimizers/sam.py +163 -0
- torchzero/modules/optimizers/shampoo.py +55 -27
- torchzero/modules/optimizers/soap.py +40 -37
- torchzero/modules/optimizers/sophia_h.py +82 -25
- torchzero/modules/projections/__init__.py +2 -4
- torchzero/modules/projections/cast.py +51 -0
- torchzero/modules/projections/galore.py +4 -2
- torchzero/modules/projections/projection.py +212 -118
- torchzero/modules/quasi_newton/__init__.py +44 -5
- torchzero/modules/quasi_newton/cg.py +190 -39
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +163 -0
- torchzero/modules/quasi_newton/lbfgs.py +154 -97
- torchzero/modules/quasi_newton/lsr1.py +102 -58
- torchzero/modules/quasi_newton/quasi_newton.py +1032 -177
- torchzero/modules/quasi_newton/trust_region.py +397 -0
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/newton.py +245 -54
- torchzero/modules/second_order/newton_cg.py +311 -21
- torchzero/modules/second_order/nystrom.py +124 -21
- torchzero/modules/smoothing/gaussian.py +55 -21
- torchzero/modules/smoothing/laplacian.py +20 -12
- torchzero/modules/step_size/__init__.py +2 -0
- torchzero/modules/step_size/adaptive.py +122 -0
- torchzero/modules/step_size/lr.py +154 -0
- torchzero/modules/weight_decay/__init__.py +1 -1
- torchzero/modules/weight_decay/weight_decay.py +126 -10
- torchzero/modules/wrappers/optim_wrapper.py +40 -12
- torchzero/optim/wrappers/directsearch.py +281 -0
- torchzero/optim/wrappers/fcmaes.py +105 -0
- torchzero/optim/wrappers/mads.py +89 -0
- torchzero/optim/wrappers/nevergrad.py +20 -5
- torchzero/optim/wrappers/nlopt.py +28 -14
- torchzero/optim/wrappers/optuna.py +70 -0
- torchzero/optim/wrappers/scipy.py +167 -16
- torchzero/utils/__init__.py +3 -7
- torchzero/utils/derivatives.py +5 -4
- torchzero/utils/linalg/__init__.py +1 -1
- torchzero/utils/linalg/solve.py +251 -12
- torchzero/utils/numberlist.py +2 -0
- torchzero/utils/optimizer.py +55 -74
- torchzero/utils/python_tools.py +27 -4
- torchzero/utils/tensorlist.py +40 -28
- {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/METADATA +76 -51
- torchzero-0.3.11.dist-info/RECORD +159 -0
- {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/WHEEL +1 -1
- torchzero/core/preconditioner.py +0 -138
- torchzero/modules/experimental/algebraic_newton.py +0 -145
- torchzero/modules/experimental/soapy.py +0 -290
- torchzero/modules/experimental/spectral.py +0 -288
- torchzero/modules/experimental/structured_newton.py +0 -111
- torchzero/modules/experimental/tropical_newton.py +0 -136
- torchzero/modules/lr/__init__.py +0 -2
- torchzero/modules/lr/lr.py +0 -59
- torchzero/modules/lr/step_size.py +0 -97
- torchzero/modules/ops/debug.py +0 -25
- torchzero/modules/ops/misc.py +0 -419
- torchzero/modules/ops/split.py +0 -75
- torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
- torchzero/modules/quasi_newton/olbfgs.py +0 -196
- torchzero-0.3.9.dist-info/RECORD +0 -131
- {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/licenses/LICENSE +0 -0
- {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/top_level.txt +0 -0
torchzero/modules/smoothing/gaussian.py

@@ -6,7 +6,7 @@ from typing import Literal
 
 
 import torch
 
-from ...core import Modular, Module,
+from ...core import Modular, Module, Var
 from ...utils import NumberList, TensorList
 from ...utils.derivatives import jacobian_wrt
 from ..grad_approximation import GradApproximator, GradTarget

@@ -17,24 +17,24 @@ class Reformulation(Module, ABC):
         super().__init__(defaults)
 
     @abstractmethod
-    def closure(self, backward: bool, closure: Callable, params:list[torch.Tensor],
+    def closure(self, backward: bool, closure: Callable, params:list[torch.Tensor], var: Var) -> tuple[float | torch.Tensor, Sequence[torch.Tensor] | None]:
        """returns loss and gradient, if backward is False then gradient can be None"""
 
-    def pre_step(self,
+    def pre_step(self, var: Var) -> Var | None:
        """This runs once before each step, whereas `closure` may run multiple times per step if further modules
        evaluate gradients at multiple points. This is useful for example to pre-generate new random perturbations."""
-        return
+        return var
 
-    def step(self,
-        ret = self.pre_step(
-        if isinstance(ret,
+    def step(self, var):
+        ret = self.pre_step(var)
+        if isinstance(ret, Var): var = ret
 
-        if
-        params, closure =
+        if var.closure is None: raise RuntimeError("Reformulation requires closure")
+        params, closure = var.params, var.closure
 
 
        def modified_closure(backward=True):
-            loss, grad = self.closure(backward, closure, params,
+            loss, grad = self.closure(backward, closure, params, var)
 
            if grad is not None:
                for p,g in zip(params, grad):

@@ -42,8 +42,8 @@ class Reformulation(Module, ABC):
 
            return loss
 
-
-        return
+        var.closure = modified_closure
+        return var
 
 
 def _decay_sigma_(self: Module, params):

@@ -58,12 +58,46 @@ def _generate_perturbations_to_state_(self: Module, params: TensorList, n_sample
    for param, prt in zip(params, zip(*perturbations)):
        self.state[param]['perturbations'] = prt
 
-def _clear_state_hook(optimizer: Modular,
+def _clear_state_hook(optimizer: Modular, var: Var, self: Module):
    for m in optimizer.unrolled_modules:
        if m is not self:
            m.reset()
 
 class GaussianHomotopy(Reformulation):
+    """Approximately smoothes the function with a gaussian kernel by sampling it at random perturbed points around current point. Both function values and gradients are averaged over all samples. The perturbed points are generated before each
+    step and remain the same throughout the step.
+
+    .. note::
+        This module reformulates the objective, it modifies the closure to evaluate value and gradients of a smoothed function. All modules after this will operate on the modified objective.
+
+    .. note::
+        This module requires the a closure passed to the optimizer step,
+        as it needs to re-evaluate the loss and gradients at perturbed points.
+
+    Args:
+        n_samples (int): number of points to sample, larger values lead to a more accurate smoothing.
+        init_sigma (float): initial scale of perturbations.
+        tol (float | None, optional):
+            if maximal parameters change value is smaller than this, sigma is reduced by :code:`decay`. Defaults to 1e-4.
+        decay (float, optional): multiplier to sigma when converged on a smoothed function. Defaults to 0.5.
+        max_steps (int | None, optional): maximum number of steps before decaying sigma. Defaults to None.
+        clear_state (bool, optional):
+            whether to clear all other module states when sigma is decayed, because the objective function changes. Defaults to True.
+        seed (int | None, optional): seed for random perturbationss. Defaults to None.
+
+    Examples:
+        Gaussian-smoothed NewtonCG
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.GaussianHomotopy(100),
+                tz.m.NewtonCG(maxiter=20),
+                tz.m.AdaptiveBacktracking(),
+            )
+
+    """
    def __init__(
        self,
        n_samples: int,

@@ -85,12 +119,12 @@ class GaussianHomotopy(Reformulation):
        else: self.global_state['generator'] = None
        return self.global_state['generator']
 
-    def pre_step(self,
-        params = TensorList(
+    def pre_step(self, var):
+        params = TensorList(var.params)
        settings = self.settings[params[0]]
        n_samples = settings['n_samples']
-        init_sigma = self.
-        sigmas = self.get_state('sigma',
+        init_sigma = [self.settings[p]['init_sigma'] for p in params]
+        sigmas = self.get_state(params, 'sigma', init=init_sigma)
 
        if any('perturbations' not in self.state[p] for p in params):
            generator = self._get_generator(settings['seed'], params)

@@ -109,9 +143,9 @@ class GaussianHomotopy(Reformulation):
        tol = settings['tol']
        if tol is not None and not decayed:
            if not any('prev_params' in self.state[p] for p in params):
-                prev_params = self.get_state('prev_params',
+                prev_params = self.get_state(params, 'prev_params', cls=TensorList, init='param')
            else:
-                prev_params = self.get_state('prev_params',
+                prev_params = self.get_state(params, 'prev_params', cls=TensorList, init='param')
                s = params - prev_params
 
                if s.abs().global_max() <= tol:

@@ -124,10 +158,10 @@ class GaussianHomotopy(Reformulation):
            generator = self._get_generator(settings['seed'], params)
            _generate_perturbations_to_state_(self, params=params, n_samples=n_samples, sigmas=sigmas, generator=generator)
            if settings['clear_state']:
-
+                var.post_step_hooks.append(partial(_clear_state_hook, self=self))
 
    @torch.no_grad
-    def closure(self, backward, closure, params,
+    def closure(self, backward, closure, params, var):
        params = TensorList(params)
 
        settings = self.settings[params[0]]
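The GaussianHomotopy docstring above stresses that the optimizer step needs a closure it can re-evaluate at the perturbed points. Below is a minimal sketch of such a closure-based step, assuming the usual torchzero convention that the closure accepts a backward flag (the same convention as the modified_closure(backward=True) wrapper in the hunk above); the toy model and data are made up for illustration only.

    import torch
    import torchzero as tz

    model = torch.nn.Linear(4, 1)
    X, y = torch.randn(64, 4), torch.randn(64, 1)

    opt = tz.Modular(
        model.parameters(),
        tz.m.GaussianHomotopy(100),
        tz.m.NewtonCG(maxiter=20),
        tz.m.AdaptiveBacktracking(),
    )

    def closure(backward=True):
        # GaussianHomotopy calls this repeatedly at the pre-generated perturbed points
        loss = torch.nn.functional.mse_loss(model(X), y)
        if backward:
            model.zero_grad()
            loss.backward()
        return loss

    for _ in range(10):
        opt.step(closure)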
torchzero/modules/smoothing/laplacian.py

@@ -56,7 +56,7 @@ def _precompute_denominator(tensor: torch.Tensor, sigma) -> torch.Tensor:
    return 1 - sigma * torch.fft.fft(v) # pylint: disable = not-callable
 
 class LaplacianSmoothing(Transform):
-    """Applies laplacian smoothing via a fast Fourier transform solver.
+    """Applies laplacian smoothing via a fast Fourier transform solver which can improve generalization.
 
    Args:
        sigma (float, optional): controls the amount of smoothing. Defaults to 1.

@@ -67,11 +67,21 @@ class LaplacianSmoothing(Transform):
            minimum number of elements in a parameter to apply laplacian smoothing to.
            Only has effect if `layerwise` is True. Defaults to 4.
        target (str, optional):
-            what to set on
+            what to set on var.
+
+    Examples:
+        Laplacian Smoothing Gradient Descent optimizer as in the paper
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.LaplacianSmoothing(),
+                tz.m.LR(1e-2),
+            )
 
    Reference:
-
-        Laplacian smoothing gradient descent. Research in the Mathematical Sciences, 9(3), 55.*
+        Osher, S., Wang, B., Yin, P., Luo, X., Barekat, F., Pham, M., & Lin, A. (2022). Laplacian smoothing gradient descent. Research in the Mathematical Sciences, 9(3), 55.
 
    """
    def __init__(self, sigma:float = 1, layerwise=True, min_numel = 4, target: Target = 'update'):

@@ -82,19 +92,17 @@ class LaplacianSmoothing(Transform):
 
 
    @torch.no_grad
-    def
-        layerwise =
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        layerwise = settings[0]['layerwise']
 
        # layerwise laplacian smoothing
        if layerwise:
 
            # precompute the denominator for each layer and store it in each parameters state
            smoothed_target = TensorList()
-            for p, t in zip(params, tensors):
-
-
-                state = self.state[p]
-                if 'denominator' not in state: state['denominator'] = _precompute_denominator(p, settings['sigma'])
+            for p, t, state, setting in zip(params, tensors, states, settings):
+                if p.numel() > setting['min_numel']:
+                    if 'denominator' not in state: state['denominator'] = _precompute_denominator(p, setting['sigma'])
                    smoothed_target.append(torch.fft.ifft(torch.fft.fft(t.view(-1)) / state['denominator']).real.view_as(t)) #pylint:disable=not-callable
                else:
                    smoothed_target.append(t)

@@ -106,7 +114,7 @@ class LaplacianSmoothing(Transform):
        # precompute full denominator
        tensors = TensorList(tensors)
        if self.global_state.get('full_denominator', None) is None:
-            self.global_state['full_denominator'] = _precompute_denominator(tensors.to_vec(),
+            self.global_state['full_denominator'] = _precompute_denominator(tensors.to_vec(), settings[0]['sigma'])
 
        # apply the smoothing
        vec = tensors.to_vec()
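For reference, the smoothing applied in apply_tensors solves (I − σ·Laplacian)·u_smooth = u in the Fourier domain, which is why only the precomputed denominator 1 - sigma * torch.fft.fft(v) is needed. A standalone sketch of that computation on a plain vector, assuming the usual periodic second-difference stencil for v (its construction is outside the hunks shown here):

    import torch

    sigma = 1.0
    u = torch.randn(16)

    # assumed periodic 1-D Laplacian stencil, as in the referenced paper; not shown in this diff
    v = torch.zeros(16)
    v[0], v[1], v[-1] = -2.0, 1.0, 1.0

    denominator = 1 - sigma * torch.fft.fft(v)   # same form as _precompute_denominator above
    u_smooth = torch.fft.ifft(torch.fft.fft(u) / denominator).real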
torchzero/modules/step_size/adaptive.py (new file)

@@ -0,0 +1,122 @@
+"""Various step size strategies"""
+from typing import Any, Literal
+from operator import itemgetter
+import torch
+
+from ...core import Transform, Chainable
+from ...utils import TensorList, unpack_dicts, unpack_states, NumberList
+
+
+class PolyakStepSize(Transform):
+    """Polyak's subgradient method.
+
+    Args:
+        f_star (int, optional):
+            (estimated) minimal possible value of the objective function (lowest possible loss). Defaults to 0.
+        max (float | None, optional): maximum possible step size. Defaults to None.
+        use_grad (bool, optional):
+            if True, uses dot product of update and gradient to compute the step size.
+            Otherwise, dot product of update with itself is used, which has no geometric meaning so it probably won't work well.
+            Defaults to False.
+        alpha (float, optional): multiplier to Polyak step-size. Defaults to 1.
+    """
+    def __init__(self, f_star: float = 0, max: float | None = None, use_grad=False, alpha: float = 1, inner: Chainable | None = None):
+
+        defaults = dict(alpha=alpha, max=max, f_star=f_star, use_grad=use_grad)
+        super().__init__(defaults, uses_grad=use_grad, uses_loss=True, inner=inner)
+
+    def update_tensors(self, tensors, params, grads, loss, states, settings):
+        assert grads is not None and loss is not None
+        tensors = TensorList(tensors)
+        grads = TensorList(grads)
+
+        use_grad, max, f_star = itemgetter('use_grad', 'max', 'f_star')(settings[0])
+
+        if use_grad: gg = tensors.dot(grads)
+        else: gg = tensors.dot(tensors)
+
+        if gg.abs() <= torch.finfo(gg.dtype).eps: step_size = 0 # converged
+        else: step_size = (loss - f_star) / gg
+
+        if max is not None:
+            if step_size > max: step_size = max
+
+        self.global_state['step_size'] = step_size
+
+    @torch.no_grad
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        step_size = self.global_state.get('step_size', 1)
+        torch._foreach_mul_(tensors, step_size * unpack_dicts(settings, 'alpha', cls=NumberList))
+        return tensors
+
+
+
+def _bb_short(s: TensorList, y: TensorList, sy, eps, fallback):
+    yy = y.dot(y)
+    if yy < eps:
+        if sy < eps: return fallback # try to fallback on long
+        ss = s.dot(s)
+        return ss/sy
+    return sy/yy
+
+def _bb_long(s: TensorList, y: TensorList, sy, eps, fallback):
+    ss = s.dot(s)
+    if sy < eps:
+        yy = y.dot(y) # try to fallback on short
+        if yy < eps: return fallback
+        return sy/yy
+    return ss/sy
+
+def _bb_geom(s: TensorList, y: TensorList, sy, eps, fallback):
+    short = _bb_short(s, y, sy, eps, fallback)
+    long = _bb_long(s, y, sy, eps, fallback)
+    return (short * long) ** 0.5
+
+class BarzilaiBorwein(Transform):
+    """Barzilai-Borwein method.
+
+    Args:
+        type (str, optional):
+            one of "short" with formula sᵀy/yᵀy, "long" with formula sᵀs/sᵀy, or "geom" to use geometric mean of short and long.
+            Defaults to 'geom'.
+        scale_first (bool, optional):
+            whether to make first step very small when previous gradient is not available. Defaults to True.
+        fallback (float, optional): step size when denominator is less than 0 (will happen on negative curvature). Defaults to 1e-3.
+        inner (Chainable | None, optional):
+            step size will be applied to outputs of this module. Defaults to None.
+
+    """
+    def __init__(self, type: Literal['long', 'short', 'geom'] = 'geom', scale_first:bool=True, fallback:float=1e-3, inner:Chainable|None = None):
+        defaults = dict(type=type, fallback=fallback)
+        super().__init__(defaults, uses_grad=False, scale_first=scale_first, inner=inner)
+
+    def reset_for_online(self):
+        super().reset_for_online()
+        self.clear_state_keys('prev_p', 'prev_g')
+
+    @torch.no_grad
+    def update_tensors(self, tensors, params, grads, loss, states, settings):
+        prev_p, prev_g = unpack_states(states, tensors, 'prev_p', 'prev_g', cls=TensorList)
+        fallback = unpack_dicts(settings, 'fallback', cls=NumberList)
+        type = settings[0]['type']
+
+        s = params-prev_p
+        y = tensors-prev_g
+        sy = s.dot(y)
+        eps = torch.finfo(sy.dtype).eps
+
+        if type == 'short': step_size = _bb_short(s, y, sy, eps, fallback)
+        elif type == 'long': step_size = _bb_long(s, y, sy, eps, fallback)
+        elif type == 'geom': step_size = _bb_geom(s, y, sy, eps, fallback)
+        else: raise ValueError(type)
+
+        self.global_state['step_size'] = step_size
+
+        prev_p.copy_(params)
+        prev_g.copy_(tensors)
+
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        step_size = self.global_state.get('step_size', 1)
+        torch._foreach_mul_(tensors, step_size)
+        return tensors
+
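With s = x_k − x_{k−1} and y = g_k − g_{k−1}, the helpers above compute the short step sᵀy/yᵀy, the long step sᵀs/sᵀy, and their geometric mean for type='geom'. Neither new class carries a usage example yet; below is a hedged sketch, assuming these transforms are exported under tz.m like the other modules referenced in this diff, with a placeholder toy model.

    import torch
    import torchzero as tz

    model = torch.nn.Linear(4, 1)

    # BarzilaiBorwein supplies the whole step length, so no separate LR module is used here
    opt = tz.Modular(
        model.parameters(),
        tz.m.BarzilaiBorwein(type="geom", fallback=1e-3),
    )

    # PolyakStepSize sets uses_loss=True above, so stepping it presumably requires
    # a closure that returns the loss
    polyak = tz.Modular(
        model.parameters(),
        tz.m.PolyakStepSize(f_star=0.0),
    )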
torchzero/modules/step_size/lr.py (new file)

@@ -0,0 +1,154 @@
+"""Learning rate"""
+import torch
+import random
+
+from ...core import Transform
+from ...utils import NumberList, TensorList, generic_ne, unpack_dicts
+
+def lazy_lr(tensors: TensorList, lr: float | list, inplace:bool):
+    """multiplies by lr if lr is not 1"""
+    if generic_ne(lr, 1):
+        if inplace: return tensors.mul_(lr)
+        return tensors * lr
+    return tensors
+
+class LR(Transform):
+    """Learning rate. Adding this module also adds support for LR schedulers."""
+    def __init__(self, lr: float):
+        defaults=dict(lr=lr)
+        super().__init__(defaults, uses_grad=False)
+
+    @torch.no_grad
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        return lazy_lr(TensorList(tensors), lr=[s['lr'] for s in settings], inplace=True)
+
+class StepSize(Transform):
+    """this is exactly the same as LR, except the `lr` parameter can be renamed to any other name to avoid clashes"""
+    def __init__(self, step_size: float, key = 'step_size'):
+        defaults={"key": key, key: step_size}
+        super().__init__(defaults, uses_grad=False)
+
+    @torch.no_grad
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        return lazy_lr(TensorList(tensors), lr=[s[s['key']] for s in settings], inplace=True)
+
+
+def _warmup_lr(step: int, start_lr: float | NumberList, end_lr: float | NumberList, steps: float):
+    """returns warm up lr scalar"""
+    if step > steps: return end_lr
+    return start_lr + (end_lr - start_lr) * (step / steps)
+
+class Warmup(Transform):
+    """Learning rate warmup, linearly increases learning rate multiplier from :code:`start_lr` to :code:`end_lr` over :code:`steps` steps.
+
+    Args:
+        steps (int, optional): number of steps to perform warmup for. Defaults to 100.
+        start_lr (_type_, optional): initial learning rate multiplier on first step. Defaults to 1e-5.
+        end_lr (float, optional): learning rate multiplier at the end and after warmup. Defaults to 1.
+
+    Example:
+        Adam with 1000 steps warmup
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.Adam(),
+                tz.m.LR(1e-2),
+                tz.m.Warmup(steps=1000)
+            )
+
+    """
+    def __init__(self, steps = 100, start_lr = 1e-5, end_lr:float = 1):
+        defaults = dict(start_lr=start_lr,end_lr=end_lr, steps=steps)
+        super().__init__(defaults, uses_grad=False)
+
+    @torch.no_grad
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        start_lr, end_lr = unpack_dicts(settings, 'start_lr', 'end_lr', cls = NumberList)
+        num_steps = settings[0]['steps']
+        step = self.global_state.get('step', 0)
+
+        tensors = lazy_lr(
+            TensorList(tensors),
+            lr=_warmup_lr(step=step, start_lr=start_lr, end_lr=end_lr, steps=num_steps),
+            inplace=True
+        )
+        self.global_state['step'] = step + 1
+        return tensors
+
+class WarmupNormClip(Transform):
+    """Warmup via clipping of the update norm.
+
+    Args:
+        start_norm (_type_, optional): maximal norm on the first step. Defaults to 1e-5.
+        end_norm (float, optional): maximal norm on the last step. After that, norm clipping is disabled. Defaults to 1.
+        steps (int, optional): number of steps to perform warmup for. Defaults to 100.
+
+    Example:
+        Adam with 1000 steps norm clip warmup
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.Adam(),
+                tz.m.WarmupNormClip(steps=1000)
+                tz.m.LR(1e-2),
+            )
+    """
+    def __init__(self, steps = 100, start_norm = 1e-5, end_norm:float = 1):
+        defaults = dict(start_norm=start_norm,end_norm=end_norm, steps=steps)
+        super().__init__(defaults, uses_grad=False)
+
+    @torch.no_grad
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        start_norm, end_norm = unpack_dicts(settings, 'start_norm', 'end_norm', cls = NumberList)
+        num_steps = settings[0]['steps']
+        step = self.global_state.get('step', 0)
+        if step > num_steps: return tensors
+
+        tensors = TensorList(tensors)
+        norm = tensors.global_vector_norm()
+        current_max_norm = _warmup_lr(step, start_norm[0], end_norm[0], num_steps)
+        if norm > current_max_norm:
+            tensors.mul_(current_max_norm / norm)
+
+        self.global_state['step'] = step + 1
+        return tensors
+
+
+class RandomStepSize(Transform):
+    """Uses random global or layer-wise step size from `low` to `high`.
+
+    Args:
+        low (float, optional): minimum learning rate. Defaults to 0.
+        high (float, optional): maximum learning rate. Defaults to 1.
+        parameterwise (bool, optional):
+            if True, generate random step size for each parameter separately,
+            if False generate one global random step size. Defaults to False.
+    """
+    def __init__(self, low: float = 0, high: float = 1, parameterwise=False, seed:int|None=None):
+        defaults = dict(low=low, high=high, parameterwise=parameterwise,seed=seed)
+        super().__init__(defaults, uses_grad=False)
+
+    @torch.no_grad
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        s = settings[0]
+        parameterwise = s['parameterwise']
+
+        seed = s['seed']
+        if 'generator' not in self.global_state:
+            self.global_state['generator'] = random.Random(seed)
+        generator: random.Random = self.global_state['generator']
+
+        if parameterwise:
+            low, high = unpack_dicts(settings, 'low', 'high')
+            lr = [generator.uniform(l, h) for l, h in zip(low, high)]
+        else:
+            low = s['low']
+            high = s['high']
+            lr = generator.uniform(low, high)
+
+        torch._foreach_mul_(tensors, lr)
+        return tensors
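The StepSize docstring explains the key renaming but gives no example; below is a minimal hedged sketch, assuming these classes are exported under tz.m like the modules referenced elsewhere in this diff, with a placeholder toy model.

    import torch
    import torchzero as tz

    model = torch.nn.Linear(4, 1)

    # the extra multiplier lives under the custom 'radius' key, so it does not
    # clash with the 'lr' setting that LR (and LR schedulers) use
    opt = tz.Modular(
        model.parameters(),
        tz.m.StepSize(0.5, key="radius"),
        tz.m.LR(1e-2),
    )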
torchzero/modules/weight_decay/__init__.py

@@ -1 +1 @@
-from .weight_decay import WeightDecay, DirectWeightDecay, decay_weights_
+from .weight_decay import WeightDecay, DirectWeightDecay, decay_weights_, RelativeWeightDecay
torchzero/modules/weight_decay/weight_decay.py

@@ -1,9 +1,11 @@
 from collections.abc import Iterable, Sequence
+from typing import Literal
 
 import torch
 
 from ...core import Module, Target, Transform
-from ...utils import NumberList, TensorList, as_tensorlist
+from ...utils import NumberList, TensorList, as_tensorlist, unpack_dicts, unpack_states
+
 
 @torch.no_grad
 def weight_decay_(

@@ -20,17 +22,126 @@ def weight_decay_(
 
 
 class WeightDecay(Transform):
+    """Weight decay.
+
+    Args:
+        weight_decay (float): weight decay scale.
+        ord (int, optional): order of the penalty, e.g. 1 for L1 and 2 for L2. Defaults to 2.
+        target (Target, optional): what to set on var. Defaults to 'update'.
+
+    Examples:
+        Adam with non-decoupled weight decay
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.WeightDecay(1e-3),
+                tz.m.Adam(),
+                tz.m.LR(1e-3)
+            )
+
+        Adam with decoupled weight decay that still scales with learning rate
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.Adam(),
+                tz.m.WeightDecay(1e-3),
+                tz.m.LR(1e-3)
+            )
+
+        Adam with fully decoupled weight decay that doesn't scale with learning rate
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.Adam(),
+                tz.m.LR(1e-3),
+                tz.m.WeightDecay(1e-6)
+            )
+
+    """
    def __init__(self, weight_decay: float, ord: int = 2, target: Target = 'update'):
+
        defaults = dict(weight_decay=weight_decay, ord=ord)
        super().__init__(defaults, uses_grad=False, target=target)
 
    @torch.no_grad
-    def
-        weight_decay =
-        ord =
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        weight_decay = NumberList(s['weight_decay'] for s in settings)
+        ord = settings[0]['ord']
 
        return weight_decay_(as_tensorlist(tensors), as_tensorlist(params), weight_decay, ord)
 
+class RelativeWeightDecay(Transform):
+    """Weight decay relative to the mean absolute value of update, gradient or parameters depending on value of :code:`norm_input` argument.
+
+    Args:
+        weight_decay (float): relative weight decay scale.
+        ord (int, optional): order of the penalty, e.g. 1 for L1 and 2 for L2. Defaults to 2.
+        norm_input (str, optional):
+            determines what should weight decay be relative to. "update", "grad" or "params".
+            Defaults to "update".
+        target (Target, optional): what to set on var. Defaults to 'update'.
+
+    Examples:
+        Adam with non-decoupled relative weight decay
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.RelativeWeightDecay(1e-3),
+                tz.m.Adam(),
+                tz.m.LR(1e-3)
+            )
+
+        Adam with decoupled relative weight decay
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.Adam(),
+                tz.m.RelativeWeightDecay(1e-3),
+                tz.m.LR(1e-3)
+            )
+
+    """
+    def __init__(
+        self,
+        weight_decay: float = 0.1,
+        ord: int = 2,
+        norm_input: Literal["update", "grad", "params"] = "update",
+        target: Target = "update",
+    ):
+        defaults = dict(weight_decay=weight_decay, ord=ord, norm_input=norm_input)
+        super().__init__(defaults, uses_grad=norm_input == 'grad', target=target)
+
+    @torch.no_grad
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        weight_decay = NumberList(s['weight_decay'] for s in settings)
+
+        ord = settings[0]['ord']
+        norm_input = settings[0]['norm_input']
+
+        if norm_input == 'update': src = TensorList(tensors)
+        elif norm_input == 'grad':
+            assert grads is not None
+            src = TensorList(grads)
+        elif norm_input == 'params':
+            src = TensorList(params)
+        else:
+            raise ValueError(norm_input)
+
+        mean_abs = src.abs().global_mean()
+
+        return weight_decay_(as_tensorlist(tensors), as_tensorlist(params), weight_decay * mean_abs, ord)
+
+
 @torch.no_grad
 def decay_weights_(params: Iterable[torch.Tensor], weight_decay: float | NumberList, ord:int=2):
    """directly decays weights in-place"""

@@ -38,15 +149,20 @@ def decay_weights_(params: Iterable[torch.Tensor], weight_decay: float | NumberL
    weight_decay_(params, params, -weight_decay, ord)
 
 class DirectWeightDecay(Module):
-    """
+    """Directly applies weight decay to parameters.
+
+    Args:
+        weight_decay (float): weight decay scale.
+        ord (int, optional): order of the penalty, e.g. 1 for L1 and 2 for L2. Defaults to 2.
+    """
    def __init__(self, weight_decay: float, ord: int = 2,):
        defaults = dict(weight_decay=weight_decay, ord=ord)
        super().__init__(defaults)
 
    @torch.no_grad
-    def step(self,
-        weight_decay = self.get_settings('weight_decay',
-        ord = self.settings[
+    def step(self, var):
+        weight_decay = self.get_settings(var.params, 'weight_decay', cls=NumberList)
+        ord = self.settings[var.params[0]]['ord']
 
-        decay_weights_(
-        return
+        decay_weights_(var.params, weight_decay, ord)
+        return var
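WeightDecay and RelativeWeightDecay carry usage examples in their new docstrings, while DirectWeightDecay and decay_weights_ do not. Below is a hedged sketch under the same assumptions as before (tz.m export pattern; the placement of DirectWeightDecay is purely illustrative, since it acts on the parameters themselves rather than on the update):

    import torch
    import torchzero as tz
    from torchzero.modules.weight_decay import decay_weights_

    model = torch.nn.Linear(4, 1)

    # decays the weights in-place every step, independently of the computed update
    opt = tz.Modular(
        model.parameters(),
        tz.m.Adam(),
        tz.m.LR(1e-3),
        tz.m.DirectWeightDecay(1e-4),
    )

    # the functional form can also be applied manually, in-place
    decay_weights_(list(model.parameters()), weight_decay=1e-4)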