torchzero 0.1.8__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200)
  1. docs/source/conf.py +57 -0
  2. tests/test_identical.py +230 -0
  3. tests/test_module.py +50 -0
  4. tests/test_opts.py +884 -0
  5. tests/test_tensorlist.py +1787 -0
  6. tests/test_utils_optimizer.py +170 -0
  7. tests/test_vars.py +184 -0
  8. torchzero/__init__.py +4 -4
  9. torchzero/core/__init__.py +3 -13
  10. torchzero/core/module.py +629 -510
  11. torchzero/core/preconditioner.py +137 -0
  12. torchzero/core/transform.py +252 -0
  13. torchzero/modules/__init__.py +13 -21
  14. torchzero/modules/clipping/__init__.py +3 -0
  15. torchzero/modules/clipping/clipping.py +320 -0
  16. torchzero/modules/clipping/ema_clipping.py +135 -0
  17. torchzero/modules/clipping/growth_clipping.py +187 -0
  18. torchzero/modules/experimental/__init__.py +13 -18
  19. torchzero/modules/experimental/absoap.py +350 -0
  20. torchzero/modules/experimental/adadam.py +111 -0
  21. torchzero/modules/experimental/adamY.py +135 -0
  22. torchzero/modules/experimental/adasoap.py +282 -0
  23. torchzero/modules/experimental/algebraic_newton.py +145 -0
  24. torchzero/modules/experimental/curveball.py +89 -0
  25. torchzero/modules/experimental/dsoap.py +290 -0
  26. torchzero/modules/experimental/gradmin.py +85 -0
  27. torchzero/modules/experimental/reduce_outward_lr.py +35 -0
  28. torchzero/modules/experimental/spectral.py +286 -0
  29. torchzero/modules/experimental/subspace_preconditioners.py +128 -0
  30. torchzero/modules/experimental/tropical_newton.py +136 -0
  31. torchzero/modules/functional.py +209 -0
  32. torchzero/modules/grad_approximation/__init__.py +4 -0
  33. torchzero/modules/grad_approximation/fdm.py +120 -0
  34. torchzero/modules/grad_approximation/forward_gradient.py +81 -0
  35. torchzero/modules/grad_approximation/grad_approximator.py +66 -0
  36. torchzero/modules/grad_approximation/rfdm.py +259 -0
  37. torchzero/modules/line_search/__init__.py +5 -30
  38. torchzero/modules/line_search/backtracking.py +186 -0
  39. torchzero/modules/line_search/line_search.py +181 -0
  40. torchzero/modules/line_search/scipy.py +37 -0
  41. torchzero/modules/line_search/strong_wolfe.py +260 -0
  42. torchzero/modules/line_search/trust_region.py +61 -0
  43. torchzero/modules/lr/__init__.py +2 -0
  44. torchzero/modules/lr/lr.py +59 -0
  45. torchzero/modules/lr/step_size.py +97 -0
  46. torchzero/modules/momentum/__init__.py +14 -4
  47. torchzero/modules/momentum/averaging.py +78 -0
  48. torchzero/modules/momentum/cautious.py +181 -0
  49. torchzero/modules/momentum/ema.py +173 -0
  50. torchzero/modules/momentum/experimental.py +189 -0
  51. torchzero/modules/momentum/matrix_momentum.py +124 -0
  52. torchzero/modules/momentum/momentum.py +43 -106
  53. torchzero/modules/ops/__init__.py +103 -0
  54. torchzero/modules/ops/accumulate.py +65 -0
  55. torchzero/modules/ops/binary.py +240 -0
  56. torchzero/modules/ops/debug.py +25 -0
  57. torchzero/modules/ops/misc.py +419 -0
  58. torchzero/modules/ops/multi.py +137 -0
  59. torchzero/modules/ops/reduce.py +149 -0
  60. torchzero/modules/ops/split.py +75 -0
  61. torchzero/modules/ops/switch.py +68 -0
  62. torchzero/modules/ops/unary.py +115 -0
  63. torchzero/modules/ops/utility.py +112 -0
  64. torchzero/modules/optimizers/__init__.py +18 -10
  65. torchzero/modules/optimizers/adagrad.py +146 -49
  66. torchzero/modules/optimizers/adam.py +112 -118
  67. torchzero/modules/optimizers/lion.py +18 -11
  68. torchzero/modules/optimizers/muon.py +222 -0
  69. torchzero/modules/optimizers/orthograd.py +55 -0
  70. torchzero/modules/optimizers/rmsprop.py +103 -51
  71. torchzero/modules/optimizers/rprop.py +342 -99
  72. torchzero/modules/optimizers/shampoo.py +197 -0
  73. torchzero/modules/optimizers/soap.py +286 -0
  74. torchzero/modules/optimizers/sophia_h.py +129 -0
  75. torchzero/modules/projections/__init__.py +5 -0
  76. torchzero/modules/projections/dct.py +73 -0
  77. torchzero/modules/projections/fft.py +73 -0
  78. torchzero/modules/projections/galore.py +10 -0
  79. torchzero/modules/projections/projection.py +218 -0
  80. torchzero/modules/projections/structural.py +151 -0
  81. torchzero/modules/quasi_newton/__init__.py +7 -4
  82. torchzero/modules/quasi_newton/cg.py +218 -0
  83. torchzero/modules/quasi_newton/experimental/__init__.py +1 -0
  84. torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +265 -0
  85. torchzero/modules/quasi_newton/lbfgs.py +228 -0
  86. torchzero/modules/quasi_newton/lsr1.py +170 -0
  87. torchzero/modules/quasi_newton/olbfgs.py +196 -0
  88. torchzero/modules/quasi_newton/quasi_newton.py +475 -0
  89. torchzero/modules/second_order/__init__.py +3 -4
  90. torchzero/modules/second_order/newton.py +142 -165
  91. torchzero/modules/second_order/newton_cg.py +84 -0
  92. torchzero/modules/second_order/nystrom.py +168 -0
  93. torchzero/modules/smoothing/__init__.py +2 -5
  94. torchzero/modules/smoothing/gaussian.py +164 -0
  95. torchzero/modules/smoothing/{laplacian_smoothing.py → laplacian.py} +115 -128
  96. torchzero/modules/weight_decay/__init__.py +1 -0
  97. torchzero/modules/weight_decay/weight_decay.py +52 -0
  98. torchzero/modules/wrappers/__init__.py +1 -0
  99. torchzero/modules/wrappers/optim_wrapper.py +91 -0
  100. torchzero/optim/__init__.py +2 -10
  101. torchzero/optim/utility/__init__.py +1 -0
  102. torchzero/optim/utility/split.py +45 -0
  103. torchzero/optim/wrappers/nevergrad.py +2 -28
  104. torchzero/optim/wrappers/nlopt.py +31 -16
  105. torchzero/optim/wrappers/scipy.py +79 -156
  106. torchzero/utils/__init__.py +27 -0
  107. torchzero/utils/compile.py +175 -37
  108. torchzero/utils/derivatives.py +513 -99
  109. torchzero/utils/linalg/__init__.py +5 -0
  110. torchzero/utils/linalg/matrix_funcs.py +87 -0
  111. torchzero/utils/linalg/orthogonalize.py +11 -0
  112. torchzero/utils/linalg/qr.py +71 -0
  113. torchzero/utils/linalg/solve.py +168 -0
  114. torchzero/utils/linalg/svd.py +20 -0
  115. torchzero/utils/numberlist.py +132 -0
  116. torchzero/utils/ops.py +10 -0
  117. torchzero/utils/optimizer.py +284 -0
  118. torchzero/utils/optuna_tools.py +40 -0
  119. torchzero/utils/params.py +149 -0
  120. torchzero/utils/python_tools.py +40 -25
  121. torchzero/utils/tensorlist.py +1081 -0
  122. torchzero/utils/torch_tools.py +48 -12
  123. torchzero-0.3.2.dist-info/METADATA +379 -0
  124. torchzero-0.3.2.dist-info/RECORD +128 -0
  125. {torchzero-0.1.8.dist-info → torchzero-0.3.2.dist-info}/WHEEL +1 -1
  126. {torchzero-0.1.8.dist-info → torchzero-0.3.2.dist-info/licenses}/LICENSE +0 -0
  127. torchzero-0.3.2.dist-info/top_level.txt +3 -0
  128. torchzero/core/tensorlist_optimizer.py +0 -219
  129. torchzero/modules/adaptive/__init__.py +0 -4
  130. torchzero/modules/adaptive/adaptive.py +0 -192
  131. torchzero/modules/experimental/experimental.py +0 -294
  132. torchzero/modules/experimental/quad_interp.py +0 -104
  133. torchzero/modules/experimental/subspace.py +0 -259
  134. torchzero/modules/gradient_approximation/__init__.py +0 -7
  135. torchzero/modules/gradient_approximation/_fd_formulas.py +0 -3
  136. torchzero/modules/gradient_approximation/base_approximator.py +0 -105
  137. torchzero/modules/gradient_approximation/fdm.py +0 -125
  138. torchzero/modules/gradient_approximation/forward_gradient.py +0 -163
  139. torchzero/modules/gradient_approximation/newton_fdm.py +0 -198
  140. torchzero/modules/gradient_approximation/rfdm.py +0 -125
  141. torchzero/modules/line_search/armijo.py +0 -56
  142. torchzero/modules/line_search/base_ls.py +0 -139
  143. torchzero/modules/line_search/directional_newton.py +0 -217
  144. torchzero/modules/line_search/grid_ls.py +0 -158
  145. torchzero/modules/line_search/scipy_minimize_scalar.py +0 -62
  146. torchzero/modules/meta/__init__.py +0 -12
  147. torchzero/modules/meta/alternate.py +0 -65
  148. torchzero/modules/meta/grafting.py +0 -195
  149. torchzero/modules/meta/optimizer_wrapper.py +0 -173
  150. torchzero/modules/meta/return_overrides.py +0 -46
  151. torchzero/modules/misc/__init__.py +0 -10
  152. torchzero/modules/misc/accumulate.py +0 -43
  153. torchzero/modules/misc/basic.py +0 -115
  154. torchzero/modules/misc/lr.py +0 -96
  155. torchzero/modules/misc/multistep.py +0 -51
  156. torchzero/modules/misc/on_increase.py +0 -53
  157. torchzero/modules/operations/__init__.py +0 -29
  158. torchzero/modules/operations/multi.py +0 -298
  159. torchzero/modules/operations/reduction.py +0 -134
  160. torchzero/modules/operations/singular.py +0 -113
  161. torchzero/modules/optimizers/sgd.py +0 -54
  162. torchzero/modules/orthogonalization/__init__.py +0 -2
  163. torchzero/modules/orthogonalization/newtonschulz.py +0 -159
  164. torchzero/modules/orthogonalization/svd.py +0 -86
  165. torchzero/modules/regularization/__init__.py +0 -22
  166. torchzero/modules/regularization/dropout.py +0 -34
  167. torchzero/modules/regularization/noise.py +0 -77
  168. torchzero/modules/regularization/normalization.py +0 -328
  169. torchzero/modules/regularization/ortho_grad.py +0 -78
  170. torchzero/modules/regularization/weight_decay.py +0 -92
  171. torchzero/modules/scheduling/__init__.py +0 -2
  172. torchzero/modules/scheduling/lr_schedulers.py +0 -131
  173. torchzero/modules/scheduling/step_size.py +0 -80
  174. torchzero/modules/smoothing/gaussian_smoothing.py +0 -90
  175. torchzero/modules/weight_averaging/__init__.py +0 -2
  176. torchzero/modules/weight_averaging/ema.py +0 -72
  177. torchzero/modules/weight_averaging/swa.py +0 -171
  178. torchzero/optim/experimental/__init__.py +0 -20
  179. torchzero/optim/experimental/experimental.py +0 -343
  180. torchzero/optim/experimental/ray_search.py +0 -83
  181. torchzero/optim/first_order/__init__.py +0 -18
  182. torchzero/optim/first_order/cautious.py +0 -158
  183. torchzero/optim/first_order/forward_gradient.py +0 -70
  184. torchzero/optim/first_order/optimizers.py +0 -570
  185. torchzero/optim/modular.py +0 -148
  186. torchzero/optim/quasi_newton/__init__.py +0 -1
  187. torchzero/optim/quasi_newton/directional_newton.py +0 -58
  188. torchzero/optim/second_order/__init__.py +0 -1
  189. torchzero/optim/second_order/newton.py +0 -94
  190. torchzero/optim/zeroth_order/__init__.py +0 -4
  191. torchzero/optim/zeroth_order/fdm.py +0 -87
  192. torchzero/optim/zeroth_order/newton_fdm.py +0 -146
  193. torchzero/optim/zeroth_order/rfdm.py +0 -217
  194. torchzero/optim/zeroth_order/rs.py +0 -85
  195. torchzero/random/__init__.py +0 -1
  196. torchzero/random/random.py +0 -46
  197. torchzero/tensorlist.py +0 -826
  198. torchzero-0.1.8.dist-info/METADATA +0 -130
  199. torchzero-0.1.8.dist-info/RECORD +0 -104
  200. torchzero-0.1.8.dist-info/top_level.txt +0 -1
torchzero/modules/operations/reduction.py +0 -134
@@ -1,134 +0,0 @@
- from collections.abc import Callable, Iterable
- import numpy as np
- import torch
-
- from ...core import OptimizerModule
-
- _Value = int | float | OptimizerModule | Iterable[OptimizerModule]
-
-
- class Sum(OptimizerModule):
-     """calculates sum of multiple updates.
-
-     Args:
-         *modules:
-             either OptimizerModules or iterables of OptimizerModules to chain. Scalars are also allowed."""
-     def __init__(
-         self,
-         *modules: _Value,
-     ):
-         super().__init__({})
-
-         scalars = [i for i in modules if isinstance(i, (int,float))]
-         self.scalar = sum(scalars) if len(scalars) > 0 else None
-
-         for i,module in enumerate(i for i in modules if not isinstance(i, (int, float))):
-             self._set_child_(i, module)
-
-     @torch.no_grad
-     def step(self, vars):
-         if len(self.children) == 1:
-             vars.ascent = self.children[0].return_ascent(vars)
-             if self.scalar is not None: vars.ascent += self.scalar
-             return self._update_params_or_step_with_next(vars)
-
-         sum = None
-         for i, c in sorted(self.children.items(), key=lambda x: x[0]):
-             if i == len(self.children) - 1: cur_state = vars
-             else: cur_state = vars.copy(clone_ascent = True)
-
-             if sum is None: sum = c.return_ascent(cur_state)
-             else: sum += c.return_ascent(cur_state)
-
-             if i != len(self.children) - 1: vars.update_attrs_(cur_state)
-
-         assert sum is not None
-         if self.scalar is not None: sum += self.scalar
-         vars.ascent = sum
-         return self._update_params_or_step_with_next(vars)
-
- class Mean(OptimizerModule):
-     """calculates mean of multiple updates.
-
-     Args:
-         *modules:
-             either OptimizerModules or iterables of OptimizerModules to chain. Scalars are also allowed."""
-
-     def __init__(
-         self,
-         *modules: _Value,
-     ):
-         super().__init__({})
-
-         scalars = [i for i in modules if isinstance(i, (int,float))]
-         self.scalar = sum(scalars) if len(scalars) > 0 else None
-
-         self.n_values = len(modules)
-
-         for i,module in enumerate(i for i in modules if not isinstance(i, (int, float))):
-             self._set_child_(i, module)
-
-     @torch.no_grad
-     def step(self, vars):
-         if len(self.children) == 1:
-             vars.ascent = self.children[0].return_ascent(vars)
-             if self.scalar is not None: vars.ascent += self.scalar
-             if self.n_values > 1: vars.ascent /= self.n_values
-             return self._update_params_or_step_with_next(vars)
-
-         sum = None
-         for i, c in sorted(self.children.items(), key=lambda x: x[0]):
-             if i == len(self.children) - 1: cur_state = vars
-             else: cur_state = vars.copy(clone_ascent = True)
-
-             if sum is None: sum = c.return_ascent(cur_state)
-             else: sum += c.return_ascent(cur_state)
-
-             if i != len(self.children) - 1: vars.update_attrs_(cur_state)
-
-         assert sum is not None
-         if self.scalar is not None: sum += self.scalar
-         if self.n_values > 1: sum /= self.n_values
-         vars.ascent = sum
-         return self._update_params_or_step_with_next(vars)
-
- class Product(OptimizerModule):
-     """calculates product of multiple updates.
-
-     Args:
-         *modules:
-             either OptimizerModules or iterables of OptimizerModules to chain. Scalars are also allowed."""
-
-     def __init__(
-         self,
-         *modules: _Value,
-     ):
-         super().__init__({})
-
-         scalars = [i for i in modules if isinstance(i, (int,float))]
-         self.scalar = np.prod(scalars).item() if len(scalars) > 0 else None
-
-         for i,module in enumerate(i for i in modules if not isinstance(i, (int, float))):
-             self._set_child_(i, module)
-
-     @torch.no_grad
-     def step(self, vars):
-         if len(self.children) == 1:
-             vars.ascent = self.children[0].return_ascent(vars)
-             if self.scalar is not None: vars.ascent *= self.scalar
-             return self._update_params_or_step_with_next(vars)
-
-         prod = None
-         for i, c in sorted(self.children.items(), key=lambda x: x[0]):
-             if i == len(self.children) - 1: cur_state = vars
-             else: cur_state = vars.copy(clone_ascent = True)
-
-             if prod is None: prod = c.return_ascent(cur_state)
-             else: prod *= c.return_ascent(cur_state)
-
-             if i != len(self.children) - 1: vars.update_attrs_(cur_state)
-
-         assert prod is not None
-         if self.scalar is not None: prod *= self.scalar
-         vars.ascent = prod
-         return self._update_params_or_step_with_next(vars)
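The Sum, Mean and Product modules above combined the updates returned by several child modules into a single update. A minimal standalone sketch of the same idea in plain PyTorch (illustrative only, not the torchzero API; the helper name is made up):

import torch

def mean_update(candidates: list[torch.Tensor]) -> torch.Tensor:
    # average several candidate update directions for the same parameter,
    # which is what the removed Mean module did with its child modules
    return torch.stack(candidates).mean(dim=0)

grad = torch.randn(10)
combined = mean_update([grad.sign(), 0.9 * grad])  # e.g. a sign update and a scaled update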
torchzero/modules/operations/singular.py +0 -113
@@ -1,113 +0,0 @@
- from collections.abc import Iterable
- from operator import methodcaller
-
- import torch
-
- from ...core import OptimizerModule
- from ...tensorlist import TensorList
-
-
- class Operation(OptimizerModule):
-     """Applies an operation to the ascent, supported operations:
-
-     `abs`, `sign`, `sin`, `cos`, `tan`, `asin`, `acos`, `atan`, `sinh`, `cosh`,
-     `tanh`, `log`, `log1p`, `log2`, `log10`, `erf`, `erfc`, `exp`, `neg`, `reciprocal`,
-     `copy`, `zero`, `sqrt`, `floor`, `ceil`, `round`."""
-     def __init__(self, operation: str):
-         super().__init__({})
-         self.operation = methodcaller(f'{operation}_')
-
-     @torch.no_grad
-     def _update(self, vars, ascent): return self.operation(ascent)
-
- class Reciprocal(OptimizerModule):
-     """*1 / update*"""
-     def __init__(self,):
-         super().__init__({})
-
-     @torch.no_grad()
-     def _update(self, vars, ascent): return ascent.reciprocal_()
-
- class Negate(OptimizerModule):
-     """minus update"""
-     def __init__(self,):
-         super().__init__({})
-
-     @torch.no_grad()
-     def _update(self, vars, ascent): return ascent.neg_()
-
-
- def sign_grad_(params: Iterable[torch.Tensor]):
-     """Apply sign function to gradients of an iterable of parameters.
-
-     Args:
-         params (abc.Iterable[torch.Tensor]): an iterable of Tensors or a single Tensor.
-     """
-     TensorList(params).get_existing_grads().sign_()
-
- class Sign(OptimizerModule):
-     """applies sign function to the update"""
-     def __init__(self):
-         super().__init__({})
-
-     @torch.no_grad
-     def _update(self, vars, ascent): return ascent.sign_()
-
- class Abs(OptimizerModule):
-     """takes absolute values of the update."""
-     def __init__(self):
-         super().__init__({})
-
-     @torch.no_grad
-     def _update(self, vars, ascent): return ascent.abs_()
-
- class Sin(OptimizerModule):
-     """applies sin function to the ascent"""
-     def __init__(self):
-         super().__init__({})
-
-     @torch.no_grad
-     def _update(self, vars, ascent): return ascent.sin_()
-
- class Cos(OptimizerModule):
-     """applies cos function to the ascent"""
-     def __init__(self):
-         super().__init__({})
-
-     @torch.no_grad
-     def _update(self, vars, ascent): return ascent.cos_()
-
-
- class NanToNum(OptimizerModule):
-     """Convert `nan`, `inf` and `-inf` to numbers.
-
-     Args:
-         nan (optional): the value to replace NaNs with. Default is zero.
-         posinf (optional): if a Number, the value to replace positive infinity values with.
-             If None, positive infinity values are replaced with the greatest finite value
-             representable by input's dtype. Default is None.
-         neginf (optional): if a Number, the value to replace negative infinity values with.
-             If None, negative infinity values are replaced with the lowest finite value
-             representable by input's dtype. Default is None.
-     """
-     def __init__(self, nan=None, posinf=None, neginf=None):
-         super().__init__({})
-         self.nan = nan
-         self.posinf = posinf
-         self.neginf = neginf
-
-     @torch.no_grad()
-     def _update(self, vars, ascent): return ascent.nan_to_num_(self.nan, self.posinf, self.neginf)
-
-
- class MagnitudePower(OptimizerModule):
-     """Raises update to the `value` power, but preserves the sign when the power is odd."""
-     def __init__(self, value: int | float):
-         super().__init__({})
-         self.value = value
-
-     @torch.no_grad()
-     def _update(self, vars, ascent):
-         if self.value % 2 == 1: return ascent.pow_(self.value)
-         return ascent.abs().pow_(self.value) * ascent.sign()
-
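The MagnitudePower module at the end of this file raises the update to a power while keeping each element's sign. A hedged sketch of that transform on a plain tensor (illustrative only, not the removed module itself):

import torch

def magnitude_power(update: torch.Tensor, value: float) -> torch.Tensor:
    # raise the magnitudes to `value` and restore the original signs
    return update.abs().pow(value) * update.sign()

u = torch.tensor([-2.0, 0.5, 3.0])
print(magnitude_power(u, 2.0))  # tensor([-4.0000, 0.2500, 9.0000])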
torchzero/modules/optimizers/sgd.py +0 -54
@@ -1,54 +0,0 @@
- import typing as T
-
- import torch
-
- from ...core import OptimizerModule
- from ..momentum.momentum import _heavyball_step, _nesterov_step_
-
- class SGD(OptimizerModule):
-     """Same as `torch.optim.SGD` but as an optimizer module. Exactly matches `torch.optim.SGD`, except
-     nesterov momentum additionally supports dampening, and negative momentum is allowed.
-
-     Args:
-         momentum (float, optional): momentum. Defaults to 0.
-         dampening (float, optional): momentum dampening. Defaults to 0.
-         weight_decay (float, optional): weight decay (L2 regularization). Defaults to 0.
-         nesterov (bool, optional):
-             enables nesterov momentum, otherwise uses heavyball momentum. Defaults to False.
-         alpha (float, optional): learning rate. Defaults to 1.
-     """
-     def __init__(
-         self,
-         momentum: float = 0,
-         dampening: float = 0,
-         weight_decay: float = 0,
-         nesterov: bool = False,
-         alpha: float = 1,
-     ):
-
-         defaults = dict(alpha=alpha, momentum=momentum, dampening=dampening, weight_decay=weight_decay,)
-         super().__init__(defaults)
-         self.nesterov = nesterov
-         self.current_step = 0
-
-     @torch.no_grad
-     def _update(self, vars, ascent):
-         params = self.get_params()
-         settings = self.get_all_group_keys()
-
-         if any(i != 0 for i in settings['weight_decay']):
-             ascent += params * settings['weight_decay']
-
-         if any(i != 1 for i in settings['alpha']):
-             ascent *= settings['alpha']
-
-         if any(i != 0 for i in settings['momentum']):
-             velocity = self.get_state_key('velocity', init = torch.zeros_like if self.nesterov else ascent)
-             # consistency with pytorch which on first step only initializes momentum
-             if self.current_step > 0 or self.nesterov:
-                 # nesterov step can be done in-place, polyak returns new direction
-                 if self.nesterov: _nesterov_step_(ascent, velocity, settings['momentum'], settings['dampening'])
-                 else: ascent = _heavyball_step(ascent, velocity, settings['momentum'], settings['dampening'])
-
-         self.current_step += 1
-         return ascent
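For context, the heavy-ball and Nesterov steps this module delegated to follow the usual torch.optim.SGD recurrences. A hedged sketch in plain PyTorch (these helpers are illustrative, not the removed _heavyball_step / _nesterov_step_):

import torch

def heavyball_step(update: torch.Tensor, velocity: torch.Tensor, momentum: float, dampening: float) -> torch.Tensor:
    # v <- momentum * v + (1 - dampening) * update; the velocity itself is the new direction
    velocity.mul_(momentum).add_(update, alpha=1 - dampening)
    return velocity.clone()

def nesterov_step(update: torch.Tensor, velocity: torch.Tensor, momentum: float, dampening: float) -> torch.Tensor:
    # same velocity recurrence, but the direction looks ahead: update + momentum * v
    velocity.mul_(momentum).add_(update, alpha=1 - dampening)
    return update + momentum * velocity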
torchzero/modules/orthogonalization/__init__.py +0 -2
@@ -1,2 +0,0 @@
- from .svd import Orthogonalize, orthogonalize_grad_
- from .newtonschulz import ZeropowerViaNewtonSchulz, zeropower_via_newtonschulz_, DualNormCorrection
torchzero/modules/orthogonalization/newtonschulz.py +0 -159
@@ -1,159 +0,0 @@
- """
- Newton-Schulz iteration code is taken from https://github.com/KellerJordan/Muon
-
- Keller Jordan and Yuchen Jin and Vlado Boza and You Jiacheng and Franz Cecista and Laker Newhouse and Jeremy Bernstein.
- Muon: An optimizer for hidden layers in neural networks (2024). URL: https://kellerjordan.github.io/posts/muon
- """
- from collections.abc import Iterable
-
- import torch
-
- from ...core import OptimizerModule, _Targets
- # from ...utils.compile import maybe_compile
-
- def _zeropower_via_newtonschulz5(G, steps):
-     """
-     code from https://github.com/KellerJordan/Muon
-
-     Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
-     quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
-     of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
-     zero even beyond the point where the iteration no longer converges all the way to one everywhere
-     on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
-     where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
-     performance at all relative to UV^T, where USV^T = G is the SVD.
-     """
-     assert len(G.shape) == 2
-     a, b, c = (3.4445, -4.7750, 2.0315)
-     X = G.bfloat16()
-     if G.size(0) > G.size(1):
-         X = X.T
-
-     # Ensure spectral norm is at most 1
-     X = X / (X.norm() + 1e-7)
-     # Perform the NS iterations
-     for _ in range(steps):
-         A = X @ X.T
-         B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
-         X = a * X + B @ X
-
-     if G.size(0) > G.size(1):
-         X = X.T
-
-     return X
-
- _compiled_zeropower_via_newtonschulz5 = torch.compile(_zeropower_via_newtonschulz5)
-
-
- def zeropower_via_newtonschulz_(params: Iterable[torch.Tensor], steps: int = 6, adaptive = False, compiled = True):
-     """Uses newton-Schulz iteration to compute the zeroth power / orthogonalization of gradients of an iterable of parameters.
-
-     This sets gradients in-place.
-
-     Note that the Muon page says that embeddings and classifier heads should not be orthogonalized.
-
-     The orthogonalization code is taken from https://github.com/KellerJordan/Muon
-     Args:
-         params (abc.Iterable[torch.Tensor]): parameters that hold gradients to orthogonalize.
-         steps (int): The number of Newton-Schulz iterations to run. (6 is probably always enough).
-             The number of Newton-Schulz iterations to run. (6 is probably always enough). Defaults to 6.
-         adaptive (bool, optional):
-             Enables adaptation to scale of gradients (from https://github.com/leloykun/adaptive-muon). Defaults to False.
-         compiled (bool, optional):
-             Uses compiled newton-Schulz iteration function. Faster but won't work on windows. Defaults to True.
-
-
-     """
-     if compiled: fn = _compiled_zeropower_via_newtonschulz5
-     else: fn = _zeropower_via_newtonschulz5
-     for p in params:
-         if p.grad is not None and p.grad.ndim >= 2 and min(p.grad.shape) >= 2:
-             G = p.grad.view(p.grad.shape[0], -1)
-             X = fn(G, steps)
-
-             if adaptive:
-                 # this is from https://github.com/leloykun/adaptive-muon
-                 X = torch.einsum('ij,ij,ab->ab', G.type_as(X), X, X) # Adaptive scaling,`(G * X).sum() * X` == (G.T @ X).trace() * X
-
-             p.grad = X.reshape_as(p.grad).to(p.grad, copy=False)
-
-
- class ZeropowerViaNewtonSchulz(OptimizerModule):
-     """Uses Newton-Schulz iteration to compute the zeroth power / orthogonalization of gradients of an iterable of parameters.
-
-     To disable orthogonalization for a parameter, put it into a parameter group with "newtonshultz" = False.
-     The Muon page says that embeddings and classifier heads should not be orthogonalized.
-
-     The orthogonalization code is taken from https://github.com/KellerJordan/Muon.
-
-     Note that unlike this module, Muon also uses Adam for gradients that are not orthogonalized,
-     so I'd still recommend using it. Maybe use `Wrap` to wrap it into a module (I will make muon
-     with selectable modules to optimize non-muon params soon)
-
-     However not using Adam, or putting Adam module after this to apply it to ALL updates, both seem
-     to work quite well too.
-
-     Args:
-         ns_steps (int, optional):
-             The number of Newton-Schulz iterations to run. (6 is probably always enough). Defaults to 6.
-         adaptive (bool, optional):
-             Enables adaptation to scale of gradients (from https://github.com/leloykun/adaptive-muon). Defaults to True.
-         compiled (bool, optional):
-             Uses compiled newton-Schulz iteration function. Faster but won't work on windows. Defaults to True.
-         target (str, optional):
-             determines what this module updates.
-
-             "ascent" - it updates the ascent
-
-             "grad" - it updates the gradient (and sets `.grad` attributes to updated gradient).
-
-             "closure" - it makes a new closure that sets the updated ascent to the .`grad` attributes.
-     """
-     def __init__(self, ns_steps = 6, adaptive = False, compiled=True, target:_Targets='ascent'):
-         defaults = dict(newtonshultz = True, ns_steps=ns_steps, adaptive=adaptive)
-         super().__init__(defaults, target=target)
-
-         if compiled: self._zeropower_via_newtonschulz5 = _compiled_zeropower_via_newtonschulz5
-         else: self._zeropower_via_newtonschulz5 = _zeropower_via_newtonschulz5
-
-     def _update(self, vars, ascent):
-         toggle, ns_steps, adaptive = self.get_group_keys('newtonshultz', 'ns_steps', 'adaptive', cls=list)
-
-         for asc, enable, steps, ada in zip(ascent, toggle, ns_steps, adaptive):
-             if enable and len([i for i in asc.shape if i > 1]) != 0:
-                 G = asc.view(asc.shape[0], -1)
-                 X = self._zeropower_via_newtonschulz5(G, steps)
-
-                 if ada:
-                     # this is from https://github.com/leloykun/adaptive-muon
-                     X = torch.einsum('ij,ij,ab->ab', G.type_as(X), X, X) # Adaptive scaling,`(G * X).sum() * X` == (G.T @ X).trace() * X
-
-                 asc.set_(X.reshape_as(asc).to(asc, copy=False)) # type:ignore
-
-         return ascent
-
-
-
- class DualNormCorrection(OptimizerModule):
-     """Dual norm correction from https://github.com/leloykun/adaptive-muon.
-
-     Description from the page:
-
-     Single-line modification to any (dualizer-based) optimizer that allows the optimizer to adapt to the scale of the gradients as they change during training.
-     This is done by scaling the dualized gradient by the clipped dual norm of the original gradient.
-     """
-     def __init__(self, adaptive_scale_min: int | None = -1, adaptive_scale_max: int | None = 1):
-         defaults = dict(adaptive_scale_min = adaptive_scale_min, adaptive_scale_max = adaptive_scale_max)
-         super().__init__(defaults)
-
-     def _update(self, vars, ascent):
-         params = self.get_params()
-         adaptive_scale_min, adaptive_scale_max = self.get_group_keys('adaptive_scale_min', 'adaptive_scale_max')
-
-         for asc, grad, min, max in zip(ascent, vars.maybe_compute_grad_(params), adaptive_scale_min, adaptive_scale_max):
-             if len([i for i in asc.shape if i > 1]) != 0:
-                 scale = torch.einsum('ij,ij->', grad.view(grad.shape[0], -1), asc.view(asc.shape[0], -1))
-                 if min is not None or max is not None: scale = scale.clip(min, max)
-                 asc *= scale
-
-         return ascent
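The `_zeropower_via_newtonschulz5` function above is self-contained, so its effect is easy to check: after a few iterations the singular values of the output are pushed towards the 0.5..1.5 band described in its docstring. A small check (illustrative only; uses float32 instead of the bfloat16 cast used above):

import torch

def newtonschulz5(G: torch.Tensor, steps: int = 6) -> torch.Tensor:
    # same quintic iteration as _zeropower_via_newtonschulz5 above, minus the bfloat16 cast
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G / (G.norm() + 1e-7)               # bound the spectral norm by 1
    if G.size(0) > G.size(1): X = X.T
    for _ in range(steps):
        A = X @ X.T
        X = a * X + (b * A + c * A @ A) @ X
    if G.size(0) > G.size(1): X = X.T
    return X

G = torch.randn(64, 32)
print(torch.linalg.svdvals(newtonschulz5(G)))  # all roughly within ~0.5..1.5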
torchzero/modules/orthogonalization/svd.py +0 -86
@@ -1,86 +0,0 @@
- """Orthogonalization code adapted from https://github.com/MarkTuddenham/Orthogonal-Optimisers
-
- Tuddenham, M., Prügel-Bennett, A., & Hare, J. (2022).
- Orthogonalising gradients to speed up neural network optimisation. arXiv preprint arXiv:2202.07052.
- """
- import logging
- from collections.abc import Iterable, Sequence
-
- import torch
-
- from ...core import OptimizerModule, _Targets
-
- @torch.no_grad()
- def _orthogonalize_update_(updates: Sequence[torch.Tensor], toggle = None, warn_fail=True) -> None:
-     """adapted from https://github.com/MarkTuddenham/Orthogonal-Optimisers"""
-     if toggle is None: toggle = [True] * len(updates)
-
-     # Orthogonalise the gradients using SVD
-     for grad, orth in zip(updates, toggle):
-         if orth and grad.ndim > 1:
-             G: torch.Tensor = grad.view(grad.shape[0], -1)
-             orth_G: torch.Tensor | None = None
-             try:
-                 u, s, vt = torch.linalg.svd(G, full_matrices=False) # pylint:disable=not-callable
-                 orth_G = u @ vt
-             except RuntimeError:
-                 # if warn: logging.warning('Failed to perform SVD, adding some noise.')
-                 try:
-                     u, s, v = torch.svd_lowrank(
-                         G,
-                         q=1, # assume rank is at least 1
-                         M=1e-4 * G.mean() * torch.randn_like(G))
-                     orth_G = u @ v.T
-                 except RuntimeError:
-                     if warn_fail: logging.error(('Failed to perform SVD with noise,'
-                                                  ' skipping gradient orthogonalisation'))
-             if orth_G is not None:
-                 grad.set_(orth_G.reshape_as(grad)) # type:ignore
-
-     return updates
-
- def orthogonalize_grad_(params: Iterable[torch.Tensor], warn_fail=False):
-     """orthogonalizes gradients of an iterable of parameters.
-
-     This updates gradients in-place.
-
-     The orthogonalization code is adapted from https://github.com/MarkTuddenham/Orthogonal-Optimisers
-     Args:
-         params (abc.Iterable[torch.Tensor]): parameters that hold gradients to orthogonalize.
-         warn_fail (bool, optional):
-             whether to print a warning when orthogonalization fails, and gradients are not
-             orthogonalized. Defaults to True.
-     """
-     grads = [p.grad for p in params if p.grad is not None]
-     _orthogonalize_update_(grads, warn_fail=warn_fail)
-
- class Orthogonalize(OptimizerModule):
-     """Orthogonalizes the update using SVD.
-
-     To disable orthogonalization for a parameter, put it into a parameter group with "orth" = False.
-
-     The orthogonalization code is adapted from https://github.com/MarkTuddenham/Orthogonal-Optimisers
-
-     Tip: :py:class:`tz.m.ZeropowerViaNewtonSchulz` is a significantly faster version of this.
-     Args:
-         warn_fail (bool, optional):
-             whether to print a warning when orthogonalization fails, and gradients are not
-             orthogonalized. Defaults to True.
-         target (str, optional):
-             determines what this module updates.
-
-             "ascent" - it updates the ascent
-
-             "grad" - it updates the gradient (and sets `.grad` attributes to updated gradient).
-
-             "closure" - it makes a new closure that sets the updated ascent to the .`grad` attributes.
-     """
-     def __init__(self, warn_fail=True, target: _Targets = 'ascent'):
-         defaults = dict(orth = True)
-         super().__init__(defaults, target = target)
-         self.warn_fail = warn_fail
-
-     def _update(self, vars, ascent):
-         toggle = self.get_group_key('orth', cls=list)
-         _orthogonalize_update_(ascent, toggle, self.warn_fail)
-         return ascent
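The core of `_orthogonalize_update_` is the exact SVD factorization: dropping S from U S V^T leaves a matrix whose singular values are all 1. On a single matrix (illustrative check only):

import torch

G = torch.randn(64, 32)
U, S, Vt = torch.linalg.svd(G, full_matrices=False)
G_orth = U @ Vt                                   # orthogonalized version of G
print(torch.allclose(G_orth.T @ G_orth, torch.eye(32), atol=1e-4))  # True: columns are orthonormal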
torchzero/modules/regularization/__init__.py +0 -22
@@ -1,22 +0,0 @@
- r"""
- This includes regularization modules like weight decay.
- """
- from .dropout import Dropout
- from .noise import AddNoise, Random, add_noise_
- from .normalization import (
-     Centralize,
-     ClipNorm,
-     ClipValue,
-     Normalize,
-     centralize_grad_,
-     clip_grad_norm_,
-     clip_grad_value_,
-     normalize_grad_,
- )
- from .weight_decay import (
-     WeightDecay,
-     l1_regularize_,
-     l2_regularize_,
-     weight_decay_penalty,
- )
- from .ortho_grad import OrthoGrad, orthograd_
torchzero/modules/regularization/dropout.py +0 -34
@@ -1,34 +0,0 @@
- import typing as T
- from collections import abc
-
- import torch
-
- from ...tensorlist import Distributions, TensorList
- from ...core import OptimizerModule
-
-
- class Dropout(OptimizerModule):
-     """
-     Applies dropout to the update - sets random elements to 0.
-
-     This can be used to apply learning rate dropout, if put after other modules, or gradient dropout,
-     if put first.
-
-     Args:
-         p (float, optional): probability to replace update value with zero. Defaults to 0.5.
-
-     reference
-         *Lin, H., Zeng, W., Zhuang, Y., Ding, X., Huang, Y., & Paisley, J. (2022).
-         Learning rate dropout. IEEE Transactions on Neural Networks and Learning Systems,
-         34(11), 9029-9039.*
-     """
-     def __init__(self, p: float = 0.5):
-         defaults = dict(p = p)
-         super().__init__(defaults)
-
-     @torch.no_grad
-     def _update(self, vars, ascent):
-         p = self.get_group_key('p')
-
-         ascent *= ascent.bernoulli_like(p)
-         return ascent
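The dropout applied here is an element-wise Bernoulli mask on the update rather than on activations. A hedged sketch in plain PyTorch (illustrative only; the removed module used the TensorList API):

import torch

def update_dropout(update: torch.Tensor, p: float = 0.5) -> torch.Tensor:
    # zero each element with probability p, keep it otherwise
    keep = torch.bernoulli(torch.full_like(update, 1.0 - p))
    return update * keep

u = torch.randn(8)
print(update_dropout(u))  # about half of the entries are exactly 0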
torchzero/modules/regularization/noise.py +0 -77
@@ -1,77 +0,0 @@
- from collections import abc
- from typing import Literal
-
- import torch
-
- from ...core import OptimizerModule
- from ...tensorlist import Distributions, TensorList, _Scalar, _ScalarSequence
-
-
- def add_noise_(
-     grads: abc.Iterable[torch.Tensor],
-     alpha: "_Scalar | _ScalarSequence" = 1e-2,
-     distribution: Distributions = "normal",
-     mode: Literal["absolute", "global", "param", "channel"] = "param",
- ):
-     if not isinstance(grads, TensorList): grads = TensorList(grads)
-     if mode == 'absolute':
-         grads += grads.sample_like(alpha, distribution)
-
-     elif mode == 'global':
-         grads += grads.sample_like((grads.total_vector_norm(1)/grads.total_numel() * alpha).detach().cpu().item(), distribution) # type:ignore
-
-     elif mode == 'param':
-         grads += grads.sample_like(grads.abs().mean()*alpha, distribution)
-
-     elif mode == 'channel':
-         grads = grads.unbind_channels()
-         grads += grads.sample_like(grads.abs().mean()*alpha, distribution)
-
- class AddNoise(OptimizerModule):
-     """Add noise to update. By default noise magnitude is relative to the mean of each parameter.
-
-     Args:
-         alpha (float, optional): magnitude of noise. Defaults to 1e-2.
-         distribution (Distributions, optional): distribution of noise. Defaults to 'normal'.
-         mode (str, optional):
-             how to calculate noise magnitude.
-
-             - "absolute": ignores gradient magnitude and always uses `alpha` as magnitude.
-
-             - "global": multiplies `alpha` by mean of the entire gradient, as if it was a single vector.
-
-             - "param": multiplies `alpha` by mean of each individual parameter (default).
-
-             - "channel": multiplies `alpha` by mean of each channel of each parameter.
-     """
-
-     def __init__(
-         self,
-         alpha: float = 1.,
-         distribution: Distributions = "normal",
-         mode: Literal["absolute", "global", "param", "channel"] = "param",
-     ):
-         defaults = dict(alpha = alpha)
-         super().__init__(defaults)
-         self.distribution: Distributions = distribution
-         self.mode: Literal["absolute", "global", "param", "channel"] = mode
-
-     @torch.no_grad
-     def _update(self, vars, ascent):
-         alpha = self.get_group_key('alpha')
-
-         add_noise_(ascent, alpha, self.distribution, self.mode)
-         return ascent
-
- class Random(OptimizerModule):
-     """uses a random vector as the update. The vector is completely random and isn't checked to be descent direction.
-     This is therefore mainly useful in combination with other modules like Sum, Multiply, etc."""
-     def __init__(self, alpha: float = 1, distribution: Distributions = "normal"):
-         defaults = dict(alpha = alpha)
-         super().__init__(defaults)
-         self.distribution: Distributions = distribution
-
-     @torch.no_grad
-     def _update(self, vars, ascent):
-         alpha = self.get_group_key('alpha')
-         return ascent.sample_like(alpha, self.distribution)
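The default "param" mode above scales the noise by the mean absolute value of each tensor, so the perturbation stays relative to the update's own magnitude. A hedged sketch in plain PyTorch (illustrative only, not the TensorList-based add_noise_):

import torch

def add_relative_noise_(tensors: list[torch.Tensor], alpha: float = 1e-2) -> None:
    # per-tensor noise scale: alpha times the mean absolute value of that tensor
    for t in tensors:
        t.add_(torch.randn_like(t), alpha=float(alpha * t.abs().mean()))

grads = [torch.randn(4, 4), torch.randn(10)]
add_relative_noise_(grads)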