torchzero 0.1.8__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- docs/source/conf.py +57 -0
- tests/test_identical.py +230 -0
- tests/test_module.py +50 -0
- tests/test_opts.py +884 -0
- tests/test_tensorlist.py +1787 -0
- tests/test_utils_optimizer.py +170 -0
- tests/test_vars.py +184 -0
- torchzero/__init__.py +4 -4
- torchzero/core/__init__.py +3 -13
- torchzero/core/module.py +629 -510
- torchzero/core/preconditioner.py +137 -0
- torchzero/core/transform.py +252 -0
- torchzero/modules/__init__.py +13 -21
- torchzero/modules/clipping/__init__.py +3 -0
- torchzero/modules/clipping/clipping.py +320 -0
- torchzero/modules/clipping/ema_clipping.py +135 -0
- torchzero/modules/clipping/growth_clipping.py +187 -0
- torchzero/modules/experimental/__init__.py +13 -18
- torchzero/modules/experimental/absoap.py +350 -0
- torchzero/modules/experimental/adadam.py +111 -0
- torchzero/modules/experimental/adamY.py +135 -0
- torchzero/modules/experimental/adasoap.py +282 -0
- torchzero/modules/experimental/algebraic_newton.py +145 -0
- torchzero/modules/experimental/curveball.py +89 -0
- torchzero/modules/experimental/dsoap.py +290 -0
- torchzero/modules/experimental/gradmin.py +85 -0
- torchzero/modules/experimental/reduce_outward_lr.py +35 -0
- torchzero/modules/experimental/spectral.py +286 -0
- torchzero/modules/experimental/subspace_preconditioners.py +128 -0
- torchzero/modules/experimental/tropical_newton.py +136 -0
- torchzero/modules/functional.py +209 -0
- torchzero/modules/grad_approximation/__init__.py +4 -0
- torchzero/modules/grad_approximation/fdm.py +120 -0
- torchzero/modules/grad_approximation/forward_gradient.py +81 -0
- torchzero/modules/grad_approximation/grad_approximator.py +66 -0
- torchzero/modules/grad_approximation/rfdm.py +259 -0
- torchzero/modules/line_search/__init__.py +5 -30
- torchzero/modules/line_search/backtracking.py +186 -0
- torchzero/modules/line_search/line_search.py +181 -0
- torchzero/modules/line_search/scipy.py +37 -0
- torchzero/modules/line_search/strong_wolfe.py +260 -0
- torchzero/modules/line_search/trust_region.py +61 -0
- torchzero/modules/lr/__init__.py +2 -0
- torchzero/modules/lr/lr.py +59 -0
- torchzero/modules/lr/step_size.py +97 -0
- torchzero/modules/momentum/__init__.py +14 -4
- torchzero/modules/momentum/averaging.py +78 -0
- torchzero/modules/momentum/cautious.py +181 -0
- torchzero/modules/momentum/ema.py +173 -0
- torchzero/modules/momentum/experimental.py +189 -0
- torchzero/modules/momentum/matrix_momentum.py +124 -0
- torchzero/modules/momentum/momentum.py +43 -106
- torchzero/modules/ops/__init__.py +103 -0
- torchzero/modules/ops/accumulate.py +65 -0
- torchzero/modules/ops/binary.py +240 -0
- torchzero/modules/ops/debug.py +25 -0
- torchzero/modules/ops/misc.py +419 -0
- torchzero/modules/ops/multi.py +137 -0
- torchzero/modules/ops/reduce.py +149 -0
- torchzero/modules/ops/split.py +75 -0
- torchzero/modules/ops/switch.py +68 -0
- torchzero/modules/ops/unary.py +115 -0
- torchzero/modules/ops/utility.py +112 -0
- torchzero/modules/optimizers/__init__.py +18 -10
- torchzero/modules/optimizers/adagrad.py +146 -49
- torchzero/modules/optimizers/adam.py +112 -118
- torchzero/modules/optimizers/lion.py +18 -11
- torchzero/modules/optimizers/muon.py +222 -0
- torchzero/modules/optimizers/orthograd.py +55 -0
- torchzero/modules/optimizers/rmsprop.py +103 -51
- torchzero/modules/optimizers/rprop.py +342 -99
- torchzero/modules/optimizers/shampoo.py +197 -0
- torchzero/modules/optimizers/soap.py +286 -0
- torchzero/modules/optimizers/sophia_h.py +129 -0
- torchzero/modules/projections/__init__.py +5 -0
- torchzero/modules/projections/dct.py +73 -0
- torchzero/modules/projections/fft.py +73 -0
- torchzero/modules/projections/galore.py +10 -0
- torchzero/modules/projections/projection.py +218 -0
- torchzero/modules/projections/structural.py +151 -0
- torchzero/modules/quasi_newton/__init__.py +7 -4
- torchzero/modules/quasi_newton/cg.py +218 -0
- torchzero/modules/quasi_newton/experimental/__init__.py +1 -0
- torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +265 -0
- torchzero/modules/quasi_newton/lbfgs.py +228 -0
- torchzero/modules/quasi_newton/lsr1.py +170 -0
- torchzero/modules/quasi_newton/olbfgs.py +196 -0
- torchzero/modules/quasi_newton/quasi_newton.py +475 -0
- torchzero/modules/second_order/__init__.py +3 -4
- torchzero/modules/second_order/newton.py +142 -165
- torchzero/modules/second_order/newton_cg.py +84 -0
- torchzero/modules/second_order/nystrom.py +168 -0
- torchzero/modules/smoothing/__init__.py +2 -5
- torchzero/modules/smoothing/gaussian.py +164 -0
- torchzero/modules/smoothing/{laplacian_smoothing.py → laplacian.py} +115 -128
- torchzero/modules/weight_decay/__init__.py +1 -0
- torchzero/modules/weight_decay/weight_decay.py +52 -0
- torchzero/modules/wrappers/__init__.py +1 -0
- torchzero/modules/wrappers/optim_wrapper.py +91 -0
- torchzero/optim/__init__.py +2 -10
- torchzero/optim/utility/__init__.py +1 -0
- torchzero/optim/utility/split.py +45 -0
- torchzero/optim/wrappers/nevergrad.py +2 -28
- torchzero/optim/wrappers/nlopt.py +31 -16
- torchzero/optim/wrappers/scipy.py +79 -156
- torchzero/utils/__init__.py +27 -0
- torchzero/utils/compile.py +175 -37
- torchzero/utils/derivatives.py +513 -99
- torchzero/utils/linalg/__init__.py +5 -0
- torchzero/utils/linalg/matrix_funcs.py +87 -0
- torchzero/utils/linalg/orthogonalize.py +11 -0
- torchzero/utils/linalg/qr.py +71 -0
- torchzero/utils/linalg/solve.py +168 -0
- torchzero/utils/linalg/svd.py +20 -0
- torchzero/utils/numberlist.py +132 -0
- torchzero/utils/ops.py +10 -0
- torchzero/utils/optimizer.py +284 -0
- torchzero/utils/optuna_tools.py +40 -0
- torchzero/utils/params.py +149 -0
- torchzero/utils/python_tools.py +40 -25
- torchzero/utils/tensorlist.py +1081 -0
- torchzero/utils/torch_tools.py +48 -12
- torchzero-0.3.2.dist-info/METADATA +379 -0
- torchzero-0.3.2.dist-info/RECORD +128 -0
- {torchzero-0.1.8.dist-info → torchzero-0.3.2.dist-info}/WHEEL +1 -1
- {torchzero-0.1.8.dist-info → torchzero-0.3.2.dist-info/licenses}/LICENSE +0 -0
- torchzero-0.3.2.dist-info/top_level.txt +3 -0
- torchzero/core/tensorlist_optimizer.py +0 -219
- torchzero/modules/adaptive/__init__.py +0 -4
- torchzero/modules/adaptive/adaptive.py +0 -192
- torchzero/modules/experimental/experimental.py +0 -294
- torchzero/modules/experimental/quad_interp.py +0 -104
- torchzero/modules/experimental/subspace.py +0 -259
- torchzero/modules/gradient_approximation/__init__.py +0 -7
- torchzero/modules/gradient_approximation/_fd_formulas.py +0 -3
- torchzero/modules/gradient_approximation/base_approximator.py +0 -105
- torchzero/modules/gradient_approximation/fdm.py +0 -125
- torchzero/modules/gradient_approximation/forward_gradient.py +0 -163
- torchzero/modules/gradient_approximation/newton_fdm.py +0 -198
- torchzero/modules/gradient_approximation/rfdm.py +0 -125
- torchzero/modules/line_search/armijo.py +0 -56
- torchzero/modules/line_search/base_ls.py +0 -139
- torchzero/modules/line_search/directional_newton.py +0 -217
- torchzero/modules/line_search/grid_ls.py +0 -158
- torchzero/modules/line_search/scipy_minimize_scalar.py +0 -62
- torchzero/modules/meta/__init__.py +0 -12
- torchzero/modules/meta/alternate.py +0 -65
- torchzero/modules/meta/grafting.py +0 -195
- torchzero/modules/meta/optimizer_wrapper.py +0 -173
- torchzero/modules/meta/return_overrides.py +0 -46
- torchzero/modules/misc/__init__.py +0 -10
- torchzero/modules/misc/accumulate.py +0 -43
- torchzero/modules/misc/basic.py +0 -115
- torchzero/modules/misc/lr.py +0 -96
- torchzero/modules/misc/multistep.py +0 -51
- torchzero/modules/misc/on_increase.py +0 -53
- torchzero/modules/operations/__init__.py +0 -29
- torchzero/modules/operations/multi.py +0 -298
- torchzero/modules/operations/reduction.py +0 -134
- torchzero/modules/operations/singular.py +0 -113
- torchzero/modules/optimizers/sgd.py +0 -54
- torchzero/modules/orthogonalization/__init__.py +0 -2
- torchzero/modules/orthogonalization/newtonschulz.py +0 -159
- torchzero/modules/orthogonalization/svd.py +0 -86
- torchzero/modules/regularization/__init__.py +0 -22
- torchzero/modules/regularization/dropout.py +0 -34
- torchzero/modules/regularization/noise.py +0 -77
- torchzero/modules/regularization/normalization.py +0 -328
- torchzero/modules/regularization/ortho_grad.py +0 -78
- torchzero/modules/regularization/weight_decay.py +0 -92
- torchzero/modules/scheduling/__init__.py +0 -2
- torchzero/modules/scheduling/lr_schedulers.py +0 -131
- torchzero/modules/scheduling/step_size.py +0 -80
- torchzero/modules/smoothing/gaussian_smoothing.py +0 -90
- torchzero/modules/weight_averaging/__init__.py +0 -2
- torchzero/modules/weight_averaging/ema.py +0 -72
- torchzero/modules/weight_averaging/swa.py +0 -171
- torchzero/optim/experimental/__init__.py +0 -20
- torchzero/optim/experimental/experimental.py +0 -343
- torchzero/optim/experimental/ray_search.py +0 -83
- torchzero/optim/first_order/__init__.py +0 -18
- torchzero/optim/first_order/cautious.py +0 -158
- torchzero/optim/first_order/forward_gradient.py +0 -70
- torchzero/optim/first_order/optimizers.py +0 -570
- torchzero/optim/modular.py +0 -148
- torchzero/optim/quasi_newton/__init__.py +0 -1
- torchzero/optim/quasi_newton/directional_newton.py +0 -58
- torchzero/optim/second_order/__init__.py +0 -1
- torchzero/optim/second_order/newton.py +0 -94
- torchzero/optim/zeroth_order/__init__.py +0 -4
- torchzero/optim/zeroth_order/fdm.py +0 -87
- torchzero/optim/zeroth_order/newton_fdm.py +0 -146
- torchzero/optim/zeroth_order/rfdm.py +0 -217
- torchzero/optim/zeroth_order/rs.py +0 -85
- torchzero/random/__init__.py +0 -1
- torchzero/random/random.py +0 -46
- torchzero/tensorlist.py +0 -826
- torchzero-0.1.8.dist-info/METADATA +0 -130
- torchzero-0.1.8.dist-info/RECORD +0 -104
- torchzero-0.1.8.dist-info/top_level.txt +0 -1
torchzero/modules/optimizers/adagrad.py
@@ -1,49 +1,146 @@
+from operator import itemgetter
+
+import torch
+
+from ...core import (
+    Chainable,
+    Module,
+    Preconditioner,
+    Target,
+    TensorwisePreconditioner,
+    Transform,
+    Vars,
+    apply,
+)
+from ...utils import NumberList, TensorList
+from ...utils.linalg import matrix_power_eigh
+from ..functional import add_power_, lerp_power_, root
+
+
+def adagrad_(
+    tensors_: TensorList,
+    sq_sum_: TensorList,
+    alpha: float | NumberList,
+    lr_decay: float | NumberList,
+    eps: float | NumberList,
+    step: int,
+    pow: float = 2,
+    use_sqrt: bool = True,
+
+    # inner args
+    inner: Module | None = None,
+    params: list[torch.Tensor] | None = None,
+    grads: list[torch.Tensor] | None = None,
+    vars: Vars | None = None,
+):
+    """returns `tensors_`"""
+    clr = alpha / (1 + step * lr_decay)
+
+    sq_sum_ = add_power_(tensors_, sum_=sq_sum_, pow=pow)
+
+    if inner is not None:
+        assert params is not None
+        tensors_ = TensorList(apply(inner, tensors_, params=params, grads=grads, vars=vars))
+
+    if use_sqrt: tensors_.div_(root(sq_sum_, p=pow, inplace=False).add_(eps)).mul_(clr)
+    else: tensors_.div_(sq_sum_.add(eps)).mul_(clr)
+
+    return tensors_
+
+
+class Adagrad(Transform):
+    """Adagrad, divides by sum of past squares of gradients, matches pytorch Adagrad.
+
+    Args:
+        lr_decay (float, optional): learning rate decay. Defaults to 0.
+        initial_accumulator_value (float, optional): initial value of the sum of squares of gradients. Defaults to 0.
+        eps (float, optional): division epsilon. Defaults to 1e-10.
+        alpha (float, optional): step size. Defaults to 1.
+        pow (float, optional): power for gradients and accumulator root. Defaults to 2.
+        use_sqrt (bool, optional): whether to take the root of the accumulator. Defaults to True.
+        inner (Chainable | None, optional): Inner modules that are applied after updating accumulator and before preconditioning. Defaults to None.
+    """
+    def __init__(
+        self,
+        lr_decay: float = 0,
+        initial_accumulator_value: float = 0,
+        eps: float = 1e-10,
+        alpha: float = 1,
+        pow: float = 2,
+        use_sqrt: bool = True,
+        inner: Chainable | None = None,
+    ):
+        defaults = dict(alpha=alpha, lr_decay=lr_decay, initial_accumulator_value=initial_accumulator_value,
+                        eps=eps, pow=pow, use_sqrt=use_sqrt)
+        super().__init__(defaults=defaults, uses_grad=False)
+
+        if inner is not None:
+            self.set_child('inner', inner)
+
+    @torch.no_grad
+    def transform(self, tensors, params, grads, vars):
+        tensors = TensorList(tensors)
+        step = self.global_state['step'] = self.global_state.get('step', 0) + 1
+
+        lr_decay, alpha, eps = self.get_settings('lr_decay', 'alpha', 'eps', params=params, cls=NumberList)
+
+        pow, use_sqrt = itemgetter('pow', 'use_sqrt')(self.settings[params[0]])
+
+        sq_sum = self.get_state('sq_sum', params=params, cls=TensorList)
+
+        # initialize accumulator on 1st step
+        if step == 1:
+            sq_sum.set_(tensors.full_like(self.get_settings('initial_accumulator_value', params=params)))
+
+        return adagrad_(
+            tensors,
+            sq_sum_=sq_sum,
+            alpha=alpha,
+            lr_decay=lr_decay,
+            eps=eps,
+            step=self.global_state["step"],
+            pow=pow,
+            use_sqrt=use_sqrt,
+
+            # inner args
+            inner=self.children.get("inner", None),
+            params=params,
+            grads=grads,
+            vars=vars,
+        )
+
+
+class FullMatrixAdagrad(TensorwisePreconditioner):
+    def __init__(self, beta: float | None = None, decay: float | None = None, concat_params=False, update_freq=1, inner: Chainable | None = None):
+        defaults = dict(beta=beta, decay=decay)
+        super().__init__(defaults, uses_grad=False, concat_params=concat_params, update_freq=update_freq, inner=inner)
+
+    @torch.no_grad
+    def update_tensor(self, tensor, param, grad, state, settings):
+        G = tensor.ravel()
+        GG = torch.outer(G, G)
+        decay = settings['decay']
+        beta = settings['beta']
+
+        if 'GG' not in state: state['GG'] = torch.eye(GG.size(0), device=GG.device, dtype=GG.dtype)
+        if decay is not None: state['GG'].mul_(decay)
+
+        if beta is not None: state['GG'].lerp_(GG, 1-beta)
+        else: state['GG'].add_(GG)
+
+    @torch.no_grad
+    def apply_tensor(self, tensor, param, grad, state, settings):
+        GG = state['GG']
+
+        if tensor.numel() == 1:
+            return tensor / (GG**(1/2)).squeeze()
+
+        try:
+            B = matrix_power_eigh(GG, -1/2)
+        except torch.linalg.LinAlgError:
+            return tensor.div_(tensor.abs().max()) # conservative scaling
+
+        return (B @ tensor.ravel()).view_as(tensor)
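The adagrad_ helper above accumulates element-wise squared gradients and divides the update by the root of that accumulator, adding eps after the root; FullMatrixAdagrad does the analogous whitening with GG^(-1/2) built from gradient outer products. A rough standalone sketch of the diagonal rule on plain torch tensors follows (illustrative names and lr, not torchzero's TensorList API):

import torch

def adagrad_step(params, grads, sq_sums, lr=1e-2, lr_decay=0.0, eps=1e-10, step=1):
    # decayed step size, as in adagrad_ above
    clr = lr / (1 + step * lr_decay)
    for p, g, s in zip(params, grads, sq_sums):
        s.add_(g * g)                       # accumulate squared gradients
        p.sub_(clr * g / (s.sqrt() + eps))  # divide by root of accumulator, then step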
torchzero/modules/optimizers/adam.py
@@ -1,118 +1,112 @@
+from operator import itemgetter
+from functools import partial
+
+import torch
+
+from ...core import Module, Target, Transform
+from ...utils import NumberList, TensorList
+from ..functional import (
+    debias, debiased_step_size,
+    ema_,
+    sqrt_ema_sq_,
+)
+from ..lr.lr import lazy_lr
+from ..momentum.experimental import sqrt_nag_ema_sq_
+from ..momentum.momentum import nag_
+
+
+def adam_(
+    tensors: TensorList,
+    exp_avg_: TensorList,
+    exp_avg_sq_: TensorList,
+    alpha: float | NumberList,
+    beta1: float | NumberList,
+    beta2: float | NumberList,
+    eps: float | NumberList,
+    step: int,
+    pow: float = 2,
+    debiased: bool = True,
+    max_exp_avg_sq_: TensorList | None = None,
+    params_: TensorList | None = None,
+):
+    """Returns new tensors or updates params in-place."""
+    exp_avg_ = ema_(tensors, exp_avg_=exp_avg_, beta=beta1, dampening=0, lerp=True)
+
+    sqrt_exp_avg_sq = sqrt_ema_sq_(tensors, exp_avg_sq_=exp_avg_sq_, beta=beta2, max_exp_avg_sq_=max_exp_avg_sq_,
+                                   debiased=False, step=step, pow=pow)
+
+    if debiased: alpha = debiased_step_size(step, beta1=beta1, beta2=beta2, pow=pow, alpha=alpha)
+
+    # params is None, return update
+    if params_ is None: return (exp_avg_ / sqrt_exp_avg_sq.add_(eps)).lazy_mul(alpha)
+
+    # update params in-place
+    params_.addcdiv_(exp_avg_, sqrt_exp_avg_sq.add_(eps), -alpha)
+    return None
+
+
+class Adam(Module):
+    """Adam. Divides gradient EMA by EMA of gradient squares with debiased step size. This implementation is slightly different from
+    pytorch in that debiasing is applied after adding epsilon.
+
+    Args:
+        beta1 (float, optional): momentum. Defaults to 0.9.
+        beta2 (float, optional): second momentum. Defaults to 0.999.
+        eps (float, optional): epsilon. Defaults to 1e-8.
+        alpha (float, optional): learning rate. Defaults to 1.
+        amsgrad (bool, optional): whether to divide by maximum of EMA of gradient squares instead. Defaults to False.
+        pow (float, optional): power used in second momentum power and root. Defaults to 2.
+        debiased (bool, optional): whether to apply debiasing to momentums based on current step. Defaults to True.
+    """
+    def __init__(
+        self,
+        beta1: float = 0.9,
+        beta2: float = 0.999,
+        eps: float = 1e-8,
+        amsgrad: bool = False,
+        alpha: float = 1.,
+        pow: float = 2,
+        debiased: bool = True,
+    ):
+        defaults = dict(beta1=beta1, beta2=beta2, eps=eps, alpha=alpha, amsgrad=amsgrad, pow=pow, debiased=debiased)
+        super().__init__(defaults)
+        self.getter = itemgetter('amsgrad', 'pow', 'debiased')
+
+    @torch.no_grad
+    def step(self, vars):
+        step = self.global_state['step'] = self.global_state.get('step', 0) + 1
+
+        beta1, beta2, eps, alpha = self.get_settings('beta1', 'beta2', 'eps', 'alpha', params=vars.params, cls=NumberList)
+        amsgrad, pow, debiased = self.getter(self.settings[vars.params[0]])
+
+        if amsgrad:
+            exp_avg, exp_avg_sq, max_exp_avg_sq = self.get_state('exp_avg', 'exp_avg_sq', 'max_exp_avg_sq', params=vars.params, cls=TensorList)
+        else:
+            exp_avg, exp_avg_sq = self.get_state('exp_avg', 'exp_avg_sq', params=vars.params, cls=TensorList)
+            max_exp_avg_sq = None
+
+        # if this is last module, update parameters in-place with slightly more efficient addcdiv_
+        if vars.is_last:
+            if vars.last_module_lrs is not None: alpha = alpha * vars.last_module_lrs
+            passed_params = TensorList(vars.params)
+            vars.stop = True
+            vars.skip_update = True
+
+        else:
+            passed_params = None
+
+        vars.update = adam_(
+            tensors=TensorList(vars.get_update()),
+            exp_avg_=exp_avg,
+            exp_avg_sq_=exp_avg_sq,
+            alpha=alpha,
+            beta1=beta1,
+            beta2=beta2,
+            eps=eps,
+            step=step,
+            pow=pow,
+            debiased=debiased,
+            max_exp_avg_sq_=max_exp_avg_sq,
+            params_=passed_params,
+        )
+
+        return vars
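As the docstring notes, this Adam folds bias correction into the step size and applies it after eps has been added to the root of the second moment, rather than debiasing the moments themselves as torch.optim.Adam does. A rough single-tensor sketch, assuming the usual sqrt(1 - beta2^t) / (1 - beta1^t) correction (the exact debiased_step_size helper may differ):

import math
import torch

def adam_step(p, g, exp_avg, exp_avg_sq, step, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8):
    exp_avg.lerp_(g, 1 - beta1)                              # first-moment EMA
    exp_avg_sq.mul_(beta2).addcmul_(g, g, value=1 - beta2)   # second-moment EMA
    alpha = lr * math.sqrt(1 - beta2**step) / (1 - beta1**step)  # debiased step size
    p.addcdiv_(exp_avg, exp_avg_sq.sqrt().add_(eps), value=-alpha)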
torchzero/modules/optimizers/lion.py
@@ -1,15 +1,21 @@
 import torch
 
+from ...core import Module, Target, Transform
+from ...utils import NumberList, TensorList
 
 
+def lion_(tensors: TensorList, exp_avg_: TensorList, beta1, beta2,):
+    """
+    Lion update rule.
+
+    Returns new tensors.
+    """
+    update = exp_avg_.lerp(tensors, 1-beta1).sign_()
+    exp_avg_.lerp_(tensors, 1-beta2)
     return update
 
+
+class Lion(Transform):
     """Lion (EvoLved Sign Momentum) optimizer from https://arxiv.org/abs/2302.06675.
 
     Args:
@@ -19,10 +25,11 @@ class Lion(OptimizerModule):
 
     def __init__(self, beta1: float = 0.9, beta2: float = 0.99):
         defaults = dict(beta1=beta1, beta2=beta2)
-        super().__init__(defaults)
+        super().__init__(defaults, uses_grad=False)
 
     @torch.no_grad
+    def transform(self, tensors, params, grads, vars):
+        beta1, beta2 = self.get_settings('beta1', 'beta2', params=params, cls=NumberList)
+        exp_avg = self.get_state('ema', params=params, cls=TensorList)
+        return lion_(TensorList(tensors), exp_avg, beta1, beta2)
+
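lion_ above takes the sign of the momentum interpolated toward the incoming update with beta1, then refreshes the stored momentum with beta2. A rough single-tensor sketch (lr is illustrative; the module itself only returns the sign update and leaves the step size to later modules):

import torch

def lion_step(p, g, exp_avg, lr=1e-4, beta1=0.9, beta2=0.99):
    update = exp_avg.lerp(g, 1 - beta1).sign_()  # sign of interpolated momentum
    exp_avg.lerp_(g, 1 - beta2)                  # momentum EMA with beta2
    p.sub_(lr * update)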