torchzero 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/source/conf.py +6 -4
- docs/source/docstring template.py +46 -0
- tests/test_identical.py +2 -3
- tests/test_opts.py +115 -68
- tests/test_tensorlist.py +2 -2
- tests/test_vars.py +62 -61
- torchzero/core/__init__.py +2 -3
- torchzero/core/module.py +185 -53
- torchzero/core/transform.py +327 -159
- torchzero/modules/__init__.py +3 -1
- torchzero/modules/clipping/clipping.py +120 -23
- torchzero/modules/clipping/ema_clipping.py +37 -22
- torchzero/modules/clipping/growth_clipping.py +20 -21
- torchzero/modules/experimental/__init__.py +30 -4
- torchzero/modules/experimental/absoap.py +53 -156
- torchzero/modules/experimental/adadam.py +22 -15
- torchzero/modules/experimental/adamY.py +21 -25
- torchzero/modules/experimental/adam_lambertw.py +149 -0
- torchzero/modules/{line_search/trust_region.py → experimental/adaptive_step_size.py} +37 -8
- torchzero/modules/experimental/adasoap.py +24 -129
- torchzero/modules/experimental/cosine.py +214 -0
- torchzero/modules/experimental/cubic_adam.py +97 -0
- torchzero/modules/experimental/curveball.py +12 -12
- torchzero/modules/{projections → experimental}/dct.py +11 -11
- torchzero/modules/experimental/eigendescent.py +120 -0
- torchzero/modules/experimental/etf.py +195 -0
- torchzero/modules/experimental/exp_adam.py +113 -0
- torchzero/modules/experimental/expanded_lbfgs.py +141 -0
- torchzero/modules/{projections → experimental}/fft.py +10 -10
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/hnewton.py +85 -0
- torchzero/modules/{quasi_newton/experimental → experimental}/modular_lbfgs.py +49 -50
- torchzero/modules/experimental/newton_solver.py +11 -11
- torchzero/modules/experimental/newtonnewton.py +92 -0
- torchzero/modules/experimental/parabolic_search.py +220 -0
- torchzero/modules/experimental/reduce_outward_lr.py +10 -7
- torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +12 -54
- torchzero/modules/experimental/subspace_preconditioners.py +20 -10
- torchzero/modules/experimental/tensor_adagrad.py +42 -0
- torchzero/modules/functional.py +12 -2
- torchzero/modules/grad_approximation/fdm.py +31 -4
- torchzero/modules/grad_approximation/forward_gradient.py +17 -7
- torchzero/modules/grad_approximation/grad_approximator.py +69 -24
- torchzero/modules/grad_approximation/rfdm.py +310 -50
- torchzero/modules/higher_order/__init__.py +1 -0
- torchzero/modules/higher_order/higher_order_newton.py +319 -0
- torchzero/modules/line_search/__init__.py +4 -4
- torchzero/modules/line_search/adaptive.py +99 -0
- torchzero/modules/line_search/backtracking.py +75 -31
- torchzero/modules/line_search/line_search.py +107 -49
- torchzero/modules/line_search/polynomial.py +233 -0
- torchzero/modules/line_search/scipy.py +20 -5
- torchzero/modules/line_search/strong_wolfe.py +52 -36
- torchzero/modules/misc/__init__.py +27 -0
- torchzero/modules/misc/debug.py +48 -0
- torchzero/modules/misc/escape.py +60 -0
- torchzero/modules/misc/gradient_accumulation.py +70 -0
- torchzero/modules/misc/misc.py +316 -0
- torchzero/modules/misc/multistep.py +158 -0
- torchzero/modules/misc/regularization.py +171 -0
- torchzero/modules/misc/split.py +103 -0
- torchzero/modules/{ops → misc}/switch.py +48 -7
- torchzero/modules/momentum/__init__.py +1 -1
- torchzero/modules/momentum/averaging.py +25 -10
- torchzero/modules/momentum/cautious.py +115 -40
- torchzero/modules/momentum/ema.py +92 -41
- torchzero/modules/momentum/experimental.py +21 -13
- torchzero/modules/momentum/matrix_momentum.py +145 -76
- torchzero/modules/momentum/momentum.py +25 -4
- torchzero/modules/ops/__init__.py +3 -31
- torchzero/modules/ops/accumulate.py +51 -25
- torchzero/modules/ops/binary.py +108 -62
- torchzero/modules/ops/multi.py +95 -34
- torchzero/modules/ops/reduce.py +31 -23
- torchzero/modules/ops/unary.py +37 -21
- torchzero/modules/ops/utility.py +53 -45
- torchzero/modules/optimizers/__init__.py +12 -3
- torchzero/modules/optimizers/adagrad.py +48 -29
- torchzero/modules/optimizers/adahessian.py +223 -0
- torchzero/modules/optimizers/adam.py +35 -37
- torchzero/modules/optimizers/adan.py +110 -0
- torchzero/modules/optimizers/adaptive_heavyball.py +57 -0
- torchzero/modules/optimizers/esgd.py +171 -0
- torchzero/modules/optimizers/ladagrad.py +183 -0
- torchzero/modules/optimizers/lion.py +4 -4
- torchzero/modules/optimizers/mars.py +91 -0
- torchzero/modules/optimizers/msam.py +186 -0
- torchzero/modules/optimizers/muon.py +32 -7
- torchzero/modules/optimizers/orthograd.py +4 -5
- torchzero/modules/optimizers/rmsprop.py +19 -19
- torchzero/modules/optimizers/rprop.py +89 -52
- torchzero/modules/optimizers/sam.py +163 -0
- torchzero/modules/optimizers/shampoo.py +55 -27
- torchzero/modules/optimizers/soap.py +40 -37
- torchzero/modules/optimizers/sophia_h.py +82 -25
- torchzero/modules/projections/__init__.py +2 -4
- torchzero/modules/projections/cast.py +51 -0
- torchzero/modules/projections/galore.py +4 -2
- torchzero/modules/projections/projection.py +212 -118
- torchzero/modules/quasi_newton/__init__.py +44 -5
- torchzero/modules/quasi_newton/cg.py +190 -39
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +163 -0
- torchzero/modules/quasi_newton/lbfgs.py +154 -97
- torchzero/modules/quasi_newton/lsr1.py +102 -58
- torchzero/modules/quasi_newton/quasi_newton.py +1032 -177
- torchzero/modules/quasi_newton/trust_region.py +397 -0
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/newton.py +245 -54
- torchzero/modules/second_order/newton_cg.py +311 -21
- torchzero/modules/second_order/nystrom.py +124 -21
- torchzero/modules/smoothing/gaussian.py +55 -21
- torchzero/modules/smoothing/laplacian.py +20 -12
- torchzero/modules/step_size/__init__.py +2 -0
- torchzero/modules/step_size/adaptive.py +122 -0
- torchzero/modules/step_size/lr.py +154 -0
- torchzero/modules/weight_decay/__init__.py +1 -1
- torchzero/modules/weight_decay/weight_decay.py +126 -10
- torchzero/modules/wrappers/optim_wrapper.py +40 -12
- torchzero/optim/wrappers/directsearch.py +281 -0
- torchzero/optim/wrappers/fcmaes.py +105 -0
- torchzero/optim/wrappers/mads.py +89 -0
- torchzero/optim/wrappers/nevergrad.py +20 -5
- torchzero/optim/wrappers/nlopt.py +28 -14
- torchzero/optim/wrappers/optuna.py +70 -0
- torchzero/optim/wrappers/scipy.py +167 -16
- torchzero/utils/__init__.py +3 -7
- torchzero/utils/derivatives.py +5 -4
- torchzero/utils/linalg/__init__.py +1 -1
- torchzero/utils/linalg/solve.py +251 -12
- torchzero/utils/numberlist.py +2 -0
- torchzero/utils/optimizer.py +55 -74
- torchzero/utils/python_tools.py +27 -4
- torchzero/utils/tensorlist.py +40 -28
- {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/METADATA +76 -51
- torchzero-0.3.11.dist-info/RECORD +159 -0
- {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/WHEEL +1 -1
- torchzero/core/preconditioner.py +0 -138
- torchzero/modules/experimental/algebraic_newton.py +0 -145
- torchzero/modules/experimental/soapy.py +0 -290
- torchzero/modules/experimental/spectral.py +0 -288
- torchzero/modules/experimental/structured_newton.py +0 -111
- torchzero/modules/experimental/tropical_newton.py +0 -136
- torchzero/modules/lr/__init__.py +0 -2
- torchzero/modules/lr/lr.py +0 -59
- torchzero/modules/lr/step_size.py +0 -97
- torchzero/modules/ops/debug.py +0 -25
- torchzero/modules/ops/misc.py +0 -419
- torchzero/modules/ops/split.py +0 -75
- torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
- torchzero/modules/quasi_newton/olbfgs.py +0 -196
- torchzero-0.3.9.dist-info/RECORD +0 -131
- {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/licenses/LICENSE +0 -0
- {torchzero-0.3.9.dist-info → torchzero-0.3.11.dist-info}/top_level.txt +0 -0
torchzero/modules/misc/regularization.py
@@ -0,0 +1,171 @@
+from collections import deque
+from collections.abc import Iterable
+from operator import itemgetter
+from typing import Literal
+
+import torch
+
+from ...core import Chainable, Module, Target, TensorwiseTransform, Transform, Var
+from ...utils import Distributions, NumberList, TensorList, unpack_dicts, unpack_states
+
+
+class Dropout(Transform):
+    """Applies dropout to the update.
+
+    For each weight the update to that weight has :code:`p` probability to be set to 0.
+    This can be used to implement gradient dropout or update dropout depending on placement.
+
+    Args:
+        p (float, optional): probability that update for a weight is replaced with 0. Defaults to 0.5.
+        graft (bool, optional):
+            if True, update after dropout is rescaled to have the same norm as before dropout. Defaults to False.
+        target (Target, optional): what to set on var, refer to documentation. Defaults to 'update'.
+
+
+    Examples:
+        Gradient dropout.
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.Dropout(0.5),
+                tz.m.Adam(),
+                tz.m.LR(1e-3)
+            )
+
+        Update dropout.
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.Adam(),
+                tz.m.Dropout(0.5),
+                tz.m.LR(1e-3)
+            )
+
+    """
+    def __init__(self, p: float = 0.5, graft: bool=False, target: Target = 'update'):
+        defaults = dict(p=p, graft=graft)
+        super().__init__(defaults, uses_grad=False, target=target)
+
+    @torch.no_grad
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        tensors = TensorList(tensors)
+        p = NumberList(s['p'] for s in settings)
+        graft = settings[0]['graft']
+
+        if graft:
+            target_norm = tensors.global_vector_norm()
+            tensors.mul_(tensors.rademacher_like(1-p).add_(1).div_(2))
+            return tensors.mul_(target_norm / tensors.global_vector_norm()) # graft
+
+        return tensors.mul_(tensors.rademacher_like(1-p).add_(1).div_(2))
+
+def _bernoulli_like(tensor, p = 0.5, generator = None):
+    """p is probability of a 1, other values will be 0."""
+    return torch.bernoulli(torch.full_like(tensor, p), generator = generator)
+
+class WeightDropout(Module):
+    """
+    Changes the closure so that it evaluates loss and gradients with random weights replaced with 0.
+
+    Dropout can be disabled for a parameter by setting :code:`use_dropout=False` in corresponding parameter group.
+
+    Args:
+        p (float, optional): probability that any weight is replaced with 0. Defaults to 0.5.
+        graft (bool, optional):
+            if True, parameters after dropout are rescaled to have the same norm as before dropout. Defaults to False.
+    """
+    def __init__(self, p: float = 0.5, graft: bool = True):
+        defaults = dict(p=p, graft=graft, use_dropout=True)
+        super().__init__(defaults)
+
+    @torch.no_grad
+    def step(self, var):
+        closure = var.closure
+        if closure is None: raise RuntimeError('WeightDropout requires closure')
+        params = TensorList(var.params)
+        p = NumberList(self.settings[p]['p'] for p in params)
+
+        # create masks
+        mask = []
+        for p, m in zip(params, mask):
+            prob = self.settings[p]['p']
+            use_dropout = self.settings[p]['use_dropout']
+            if use_dropout: mask.append(_bernoulli_like(p, prob))
+            else: mask.append(torch.ones_like(p))
+
+        @torch.no_grad
+        def dropout_closure(backward=True):
+            orig_params = params.clone()
+            params.mul_(mask)
+            if backward:
+                with torch.enable_grad(): loss = closure()
+            else:
+                loss = closure(False)
+            params.copy_(orig_params)
+            return loss
+
+        var.closure = dropout_closure
+        return var
+
+
+class PerturbWeights(Module):
+    """
+    Changes the closure so that it evaluates loss and gradients at weights perturbed by a random perturbation.
+
+    Can be disabled for a parameter by setting :code:`perturb=False` in corresponding parameter group.
+
+    Args:
+        alpha (float, optional): multiplier for perturbation magnitude. Defaults to 0.1.
+        relative (bool, optional): whether to multiply perturbation by mean absolute value of the parameter. Defaults to True.
+        graft (bool, optional):
+            if True, parameters after dropout are rescaled to have the same norm as before dropout. Defaults to False.
+    """
+    def __init__(self, alpha: float = 0.1, relative:bool=True, distribution:Distributions = 'normal'):
+        defaults = dict(alpha=alpha, relative=relative, distribution=distribution, perturb=True)
+        super().__init__(defaults)
+
+    @torch.no_grad
+    def step(self, var):
+        closure = var.closure
+        if closure is None: raise RuntimeError('WeightDropout requires closure')
+        params = TensorList(var.params)
+
+        # create perturbations
+        perts = []
+        for p in params:
+            settings = self.settings[p]
+            if not settings['perturb']:
+                perts.append(torch.zeros_like(p))
+                continue
+
+            alpha = settings['alpha']
+            if settings['relative']:
+                alpha *= p.abs().mean()
+
+            distribution = self.settings[p]['distribution'].lower()
+            if distribution in ('normal', 'gaussian'):
+                perts.append(torch.randn_like(p).mul_(alpha))
+            elif distribution == 'uniform':
+                perts.append(torch.empty_like(p).uniform_(-alpha,alpha))
+            elif distribution == 'sphere':
+                r = torch.randn_like(p)
+                perts.append((r * alpha) / torch.linalg.vector_norm(r)) # pylint:disable=not-callable
+            else:
+                raise ValueError(distribution)
+
+        @torch.no_grad
+        def perturbed_closure(backward=True):
+            params.add_(perts)
+            if backward:
+                with torch.enable_grad(): loss = closure()
+            else:
+                loss = closure(False)
+            params.sub_(perts)
+            return loss
+
+        var.closure = perturbed_closure
+        return var
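The Dropout docstring above already shows gradient and update dropout; WeightDropout and PerturbWeights instead rewrite the closure, so they sit in front of whatever consumes the gradient. A minimal usage sketch follows, assuming both classes are re-exported under `tz.m` like the other modules in this release (the hunk itself does not confirm the export), that `tz.Modular` follows the `torch.optim.Optimizer` API, and using the `closure(backward=True)` convention these hunks call into; both modules raise a RuntimeError if no closure is supplied to the step:

    import torch
    import torchzero as tz

    model = torch.nn.Linear(10, 2)
    X, y = torch.randn(64, 10), torch.randn(64, 2)

    opt = tz.Modular(
        model.parameters(),
        tz.m.WeightDropout(p=0.3),        # evaluate loss/grads with ~30% of weights zeroed
        # tz.m.PerturbWeights(alpha=0.1), # or: evaluate at randomly perturbed weights
        tz.m.Adam(),
        tz.m.LR(1e-3),
    )

    def closure(backward=True):
        loss = torch.nn.functional.mse_loss(model(X), y)
        if backward:
            opt.zero_grad()
            loss.backward()
        return loss

    opt.step(closure)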
torchzero/modules/misc/split.py
@@ -0,0 +1,103 @@
+from collections.abc import Callable
+from typing import cast
+
+import torch
+
+from ...core import Chainable, Module, Var
+
+
+def _split(
+    module: Module,
+    idxs,
+    params,
+    var: Var,
+):
+    split_params = [p for i,p in enumerate(params) if i in idxs]
+
+    split_grad = None
+    if var.grad is not None:
+        split_grad = [g for i,g in enumerate(var.grad) if i in idxs]
+
+    split_update = None
+    if var.update is not None:
+        split_update = [u for i,u in enumerate(var.update) if i in idxs]
+
+    split_var = var.clone(clone_update=False)
+    split_var.params = split_params
+    split_var.grad = split_grad
+    split_var.update = split_update
+
+    split_var = module.step(split_var)
+
+    if (var.grad is None) and (split_var.grad is not None):
+        var.grad = [p.grad if p.grad is not None else torch.zeros_like(p) for p in params]
+
+    if split_var.update is not None:
+
+        if var.update is None:
+            if var.grad is None: var.update = [cast(torch.Tensor, None) for _ in var.params]
+            else: var.update = [g.clone() for g in var.grad]
+
+        for idx, u in zip(idxs, split_var.update):
+            var.update[idx] = u
+
+    var.update_attrs_from_clone_(split_var)
+    return var
+
+class Split(Module):
+    """Apply `true` modules to all parameters filtered by `filter`, apply `false` modules to all other parameters.
+
+    Args:
+        filter (Callable[[torch.Tensor], bool]): a function that takes in a parameter tensor and returns a boolean value.
+        true (Chainable | None): modules that are applied to tensors where :code:`filter` returned True.
+        false (Chainable | None): modules that are applied to tensors where :code:`filter` returned False.
+
+    Examples:
+        standard Muon with Adam fallback
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.head.parameters(),
+                tz.m.Split(
+                    # apply muon only to 2D+ parameters
+                    filter = lambda t: t.ndim >= 2,
+                    true = [
+                        tz.m.HeavyBall(),
+                        tz.m.Orthogonalize(),
+                        tz.m.LR(1e-2),
+                    ],
+                    false = tz.m.Adam()
+                ),
+                tz.m.LR(1e-2)
+            )
+
+
+    """
+    def __init__(self, filter: Callable[[torch.Tensor], bool], true: Chainable | None, false: Chainable | None):
+        defaults = dict(filter=filter)
+        super().__init__(defaults)
+
+        if true is not None: self.set_child('true', true)
+        if false is not None: self.set_child('false', false)
+
+    def step(self, var):
+
+        params = var.params
+        filter = self.settings[params[0]]['filter']
+
+        true_idxs = []
+        false_idxs = []
+        for i,p in enumerate(params):
+            if filter(p): true_idxs.append(i)
+            else: false_idxs.append(i)
+
+        if 'true' in self.children:
+            true = self.children['true']
+            var = _split(true, idxs=true_idxs, params=params, var=var)
+
+        if 'false' in self.children:
+            false = self.children['false']
+            var = _split(false, idxs=false_idxs, params=params, var=var)
+
+        return var
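In `_split` above only the indices routed to a child module get their update overwritten, so passing `false=None` appears to leave the non-matching parameters' incoming update untouched. A sketch of using that to orthogonalize only the 2D+ updates, assuming `Split`, `Orthogonalize`, `Adam` and `LR` are exposed under `tz.m` as in the docstring example:

    import torch
    import torchzero as tz

    model = torch.nn.Sequential(torch.nn.Linear(10, 10), torch.nn.ReLU(), torch.nn.Linear(10, 2))

    opt = tz.Modular(
        model.parameters(),
        tz.m.Adam(),
        tz.m.Split(
            filter=lambda t: t.ndim >= 2,   # weight matrices only
            true=tz.m.Orthogonalize(),      # orthogonalize their Adam updates
            false=None,                     # biases keep the plain Adam update
        ),
        tz.m.LR(1e-2),
    )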
torchzero/modules/{ops → misc}/switch.py
@@ -7,7 +7,28 @@ from ...core import Chainable, Module
 
 
 class Alternate(Module):
-    """
+    """Alternates between stepping with :code:`modules`.
+
+    That is, first step is performed with 1st module, second step with second module, etc.
+
+    Args:
+        steps (int | Iterable[int], optional): number of steps to perform with each module. Defaults to 1.
+
+    Examples:
+        Alternate between Adam, SignSGD and RMSprop
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.Alternate(
+                    tz.m.Adam(),
+                    [tz.m.SignSGD(), tz.m.Mul(0.5)],
+                    tz.m.RMSprop(),
+                ),
+                tz.m.LR(1e-3),
+            )
+    """
     LOOP = True
     def __init__(self, *modules: Chainable, steps: int | Iterable[int] = 1):
         if isinstance(steps, Iterable):
@@ -23,16 +44,16 @@ class Alternate(Module):
         self.global_state['steps_to_next'] = steps[0] if isinstance(steps, list) else steps
 
     @torch.no_grad
-    def step(self,
+    def step(self, var):
         # get current module
         current_module_idx = self.global_state.setdefault('current_module_idx', 0)
         module = self.children[f'module_{current_module_idx}']
 
         # step
-
+        var = module.step(var.clone(clone_update=False))
 
         # number of steps until next module
-        steps = self.settings[
+        steps = self.settings[var.params[0]]['steps']
         if isinstance(steps, int): steps = [steps]*len(self.children)
 
         if 'steps_to_next' not in self.global_state:
@@ -51,17 +72,37 @@ class Alternate(Module):
 
         self.global_state['steps_to_next'] = steps[self.global_state['current_module_idx']]
 
-        return
+        return var
 
 class Switch(Alternate):
-    """
+    """After :code:`steps` steps switches to the next module.
+
+    Args:
+        steps (int | Iterable[int]): Number of steps to perform with each module.
+
+    Examples:
+        Start with Adam, switch to L-BFGS after 1000th step and Truncated Newton on 2000th step.
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.Switch(
+                    [tz.m.Adam(), tz.m.LR(1e-3)],
+                    [tz.m.LBFGS(), tz.m.Backtracking()],
+                    [tz.m.NewtonCG(maxiter=20), tz.m.Backtracking()],
+                    steps = (1000, 2000)
+                )
+            )
+    """
+
     LOOP = False
     def __init__(self, *modules: Chainable, steps: int | Iterable[int]):
 
         if isinstance(steps, Iterable):
            steps = list(steps)
            if len(steps) != len(modules) - 1:
-                raise ValueError(f"steps must be the same length as modules, got {len(modules) = }, {len(steps) = }")
+                raise ValueError(f"steps must be the same length as modules minus 1, got {len(modules) = }, {len(steps) = }")
 
            steps.append(1)
 
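The new `steps` argument documented above takes either a single count or one count per module, and with `LOOP = True` the schedule cycles. A short sketch under that reading, using only modules that already appear in the docstring examples and assuming the usual `tz.m` exports:

    import torch
    import torchzero as tz

    model = torch.nn.Linear(10, 2)

    opt = tz.Modular(
        model.parameters(),
        tz.m.Alternate(
            tz.m.Adam(),     # 5 steps with Adam...
            tz.m.SignSGD(),  # ...then 1 step with SignSGD, then repeat
            steps=(5, 1),
        ),
        tz.m.LR(1e-3),
    )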
torchzero/modules/momentum/__init__.py
@@ -11,4 +11,4 @@ from .experimental import CoordinateMomentum
 # from .matrix_momentum import MatrixMomentum
 
 from .momentum import NAG, HeavyBall
-from .matrix_momentum import MatrixMomentum, AdaptiveMatrixMomentum
+from .matrix_momentum import MatrixMomentum, AdaptiveMatrixMomentum
torchzero/modules/momentum/averaging.py
@@ -1,3 +1,4 @@
+"""Modules that perform averaging over a history of past updates."""
 from collections import deque
 from collections.abc import Sequence
 from typing import Any, Literal, cast
@@ -9,14 +10,19 @@ from ...utils import tolist
 
 
 class Averaging(TensorwiseTransform):
+    """Average of past :code:`history_size` updates.
+
+    Args:
+        history_size (int): Number of past updates to average
+        target (Target, optional): target. Defaults to 'update'.
+    """
     def __init__(self, history_size: int, target: Target = 'update'):
         defaults = dict(history_size=history_size)
         super().__init__(uses_grad=False, defaults=defaults, target=target)
 
     @torch.no_grad
-    def
-        history_size =
-        state = self.state[param]
+    def apply_tensor(self, tensor, param, grad, loss, state, setting):
+        history_size = setting['history_size']
         if 'history' not in state:
             state['history'] = deque(maxlen=history_size)
             state['average'] = torch.zeros_like(tensor)
@@ -29,15 +35,19 @@ class Averaging(TensorwiseTransform):
         return average / len(history)
 
 class WeightedAveraging(TensorwiseTransform):
-    """
+    """Weighted average of past :code:`len(weights)` updates.
+
+    Args:
+        weights (Sequence[float]): a sequence of weights from oldest to newest.
+        target (Target, optional): target. Defaults to 'update'.
+    """
     def __init__(self, weights: Sequence[float] | torch.Tensor | Any, target: Target = 'update'):
         defaults = dict(weights = tolist(weights))
         super().__init__(uses_grad=False, defaults=defaults, target=target)
 
     @torch.no_grad
-    def
-        weights =
-        state = self.state[param]
+    def apply_tensor(self, tensor, param, grad, loss, state, setting):
+        weights = setting['weights']
 
         if 'history' not in state:
             state['history'] = deque(maxlen=len(weights))
@@ -59,14 +69,19 @@ class WeightedAveraging(TensorwiseTransform):
 
 
 class MedianAveraging(TensorwiseTransform):
+    """Median of past :code:`history_size` updates.
+
+    Args:
+        history_size (int): Number of past updates to average
+        target (Target, optional): target. Defaults to 'update'.
+    """
     def __init__(self, history_size: int, target: Target = 'update'):
         defaults = dict(history_size = history_size)
         super().__init__(uses_grad=False, defaults=defaults, target=target)
 
     @torch.no_grad
-    def
-        history_size =
-        state = self.state[param]
+    def apply_tensor(self, tensor, param, grad, loss, state, setting):
+        history_size = setting['history_size']
 
         if 'history' not in state:
             state['history'] = deque(maxlen=history_size)
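The three transforms above differ only in how the stored history is reduced (mean, weighted mean, median). A hedged usage sketch, assuming they are exported under `tz.m`; by analogy with the Dropout examples earlier in this diff, placing them after Adam smooths the update, placing them before smooths the gradient:

    import torch
    import torchzero as tz

    model = torch.nn.Linear(10, 2)

    # mean of the last 10 Adam updates
    opt = tz.Modular(
        model.parameters(),
        tz.m.Adam(),
        tz.m.Averaging(history_size=10),
        tz.m.LR(1e-3),
    )

    # weighted mean, weights given from oldest to newest
    opt = tz.Modular(
        model.parameters(),
        tz.m.Adam(),
        tz.m.WeightedAveraging(weights=[0.1, 0.2, 0.3, 0.4]),
        tz.m.LR(1e-3),
    )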