torchzero 0.3.11__py3-none-any.whl → 0.3.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_opts.py +95 -76
- tests/test_tensorlist.py +8 -7
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +2 -2
- torchzero/core/module.py +229 -72
- torchzero/core/reformulation.py +65 -0
- torchzero/core/transform.py +44 -24
- torchzero/modules/__init__.py +13 -5
- torchzero/modules/{optimizers → adaptive}/__init__.py +5 -2
- torchzero/modules/adaptive/adagrad.py +356 -0
- torchzero/modules/{optimizers → adaptive}/adahessian.py +53 -52
- torchzero/modules/{optimizers → adaptive}/adam.py +0 -3
- torchzero/modules/{optimizers → adaptive}/adan.py +26 -40
- torchzero/modules/{optimizers → adaptive}/adaptive_heavyball.py +3 -6
- torchzero/modules/adaptive/aegd.py +54 -0
- torchzero/modules/{optimizers → adaptive}/esgd.py +1 -1
- torchzero/modules/{optimizers/ladagrad.py → adaptive/lmadagrad.py} +42 -39
- torchzero/modules/{optimizers → adaptive}/mars.py +24 -36
- torchzero/modules/adaptive/matrix_momentum.py +146 -0
- torchzero/modules/{optimizers → adaptive}/msam.py +14 -12
- torchzero/modules/{optimizers → adaptive}/muon.py +19 -20
- torchzero/modules/adaptive/natural_gradient.py +175 -0
- torchzero/modules/{optimizers → adaptive}/rprop.py +0 -2
- torchzero/modules/{optimizers → adaptive}/sam.py +1 -1
- torchzero/modules/{optimizers → adaptive}/shampoo.py +8 -4
- torchzero/modules/{optimizers → adaptive}/soap.py +27 -50
- torchzero/modules/{optimizers → adaptive}/sophia_h.py +2 -3
- torchzero/modules/clipping/clipping.py +85 -92
- torchzero/modules/clipping/ema_clipping.py +5 -5
- torchzero/modules/conjugate_gradient/__init__.py +11 -0
- torchzero/modules/{quasi_newton → conjugate_gradient}/cg.py +355 -369
- torchzero/modules/experimental/__init__.py +9 -32
- torchzero/modules/experimental/dct.py +2 -2
- torchzero/modules/experimental/fft.py +2 -2
- torchzero/modules/experimental/gradmin.py +4 -3
- torchzero/modules/experimental/l_infinity.py +111 -0
- torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +3 -40
- torchzero/modules/experimental/newton_solver.py +79 -17
- torchzero/modules/experimental/newtonnewton.py +27 -14
- torchzero/modules/experimental/scipy_newton_cg.py +105 -0
- torchzero/modules/experimental/spsa1.py +93 -0
- torchzero/modules/experimental/structural_projections.py +1 -1
- torchzero/modules/functional.py +50 -14
- torchzero/modules/grad_approximation/__init__.py +1 -1
- torchzero/modules/grad_approximation/fdm.py +19 -20
- torchzero/modules/grad_approximation/forward_gradient.py +6 -7
- torchzero/modules/grad_approximation/grad_approximator.py +43 -47
- torchzero/modules/grad_approximation/rfdm.py +114 -175
- torchzero/modules/higher_order/__init__.py +1 -1
- torchzero/modules/higher_order/higher_order_newton.py +31 -23
- torchzero/modules/least_squares/__init__.py +1 -0
- torchzero/modules/least_squares/gn.py +161 -0
- torchzero/modules/line_search/__init__.py +2 -2
- torchzero/modules/line_search/_polyinterp.py +289 -0
- torchzero/modules/line_search/adaptive.py +69 -44
- torchzero/modules/line_search/backtracking.py +83 -70
- torchzero/modules/line_search/line_search.py +159 -68
- torchzero/modules/line_search/scipy.py +16 -4
- torchzero/modules/line_search/strong_wolfe.py +319 -220
- torchzero/modules/misc/__init__.py +8 -0
- torchzero/modules/misc/debug.py +4 -4
- torchzero/modules/misc/escape.py +9 -7
- torchzero/modules/misc/gradient_accumulation.py +88 -22
- torchzero/modules/misc/homotopy.py +59 -0
- torchzero/modules/misc/misc.py +82 -15
- torchzero/modules/misc/multistep.py +47 -11
- torchzero/modules/misc/regularization.py +5 -9
- torchzero/modules/misc/split.py +55 -35
- torchzero/modules/misc/switch.py +1 -1
- torchzero/modules/momentum/__init__.py +1 -5
- torchzero/modules/momentum/averaging.py +3 -3
- torchzero/modules/momentum/cautious.py +42 -47
- torchzero/modules/momentum/momentum.py +35 -1
- torchzero/modules/ops/__init__.py +9 -1
- torchzero/modules/ops/binary.py +9 -8
- torchzero/modules/{momentum/ema.py → ops/higher_level.py} +10 -33
- torchzero/modules/ops/multi.py +15 -15
- torchzero/modules/ops/reduce.py +1 -1
- torchzero/modules/ops/utility.py +12 -8
- torchzero/modules/projections/projection.py +4 -4
- torchzero/modules/quasi_newton/__init__.py +1 -16
- torchzero/modules/quasi_newton/damping.py +105 -0
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -163
- torchzero/modules/quasi_newton/lbfgs.py +256 -200
- torchzero/modules/quasi_newton/lsr1.py +167 -132
- torchzero/modules/quasi_newton/quasi_newton.py +346 -446
- torchzero/modules/restarts/__init__.py +7 -0
- torchzero/modules/restarts/restars.py +253 -0
- torchzero/modules/second_order/__init__.py +2 -1
- torchzero/modules/second_order/multipoint.py +238 -0
- torchzero/modules/second_order/newton.py +133 -88
- torchzero/modules/second_order/newton_cg.py +207 -170
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/sampling.py +300 -0
- torchzero/modules/step_size/__init__.py +1 -1
- torchzero/modules/step_size/adaptive.py +312 -47
- torchzero/modules/termination/__init__.py +14 -0
- torchzero/modules/termination/termination.py +207 -0
- torchzero/modules/trust_region/__init__.py +5 -0
- torchzero/modules/trust_region/cubic_regularization.py +170 -0
- torchzero/modules/trust_region/dogleg.py +92 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
- torchzero/modules/trust_region/trust_cg.py +99 -0
- torchzero/modules/trust_region/trust_region.py +350 -0
- torchzero/modules/variance_reduction/__init__.py +1 -0
- torchzero/modules/variance_reduction/svrg.py +208 -0
- torchzero/modules/weight_decay/weight_decay.py +65 -64
- torchzero/modules/zeroth_order/__init__.py +1 -0
- torchzero/modules/zeroth_order/cd.py +122 -0
- torchzero/optim/root.py +65 -0
- torchzero/optim/utility/split.py +8 -8
- torchzero/optim/wrappers/directsearch.py +0 -1
- torchzero/optim/wrappers/fcmaes.py +3 -2
- torchzero/optim/wrappers/nlopt.py +0 -2
- torchzero/optim/wrappers/optuna.py +2 -2
- torchzero/optim/wrappers/scipy.py +81 -22
- torchzero/utils/__init__.py +40 -4
- torchzero/utils/compile.py +1 -1
- torchzero/utils/derivatives.py +123 -111
- torchzero/utils/linalg/__init__.py +9 -2
- torchzero/utils/linalg/linear_operator.py +329 -0
- torchzero/utils/linalg/matrix_funcs.py +2 -2
- torchzero/utils/linalg/orthogonalize.py +2 -1
- torchzero/utils/linalg/qr.py +2 -2
- torchzero/utils/linalg/solve.py +226 -154
- torchzero/utils/metrics.py +83 -0
- torchzero/utils/optimizer.py +2 -2
- torchzero/utils/python_tools.py +7 -0
- torchzero/utils/tensorlist.py +105 -34
- torchzero/utils/torch_tools.py +9 -4
- torchzero-0.3.14.dist-info/METADATA +14 -0
- torchzero-0.3.14.dist-info/RECORD +167 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.14.dist-info}/top_level.txt +0 -1
- docs/source/conf.py +0 -59
- docs/source/docstring template.py +0 -46
- torchzero/modules/experimental/absoap.py +0 -253
- torchzero/modules/experimental/adadam.py +0 -118
- torchzero/modules/experimental/adamY.py +0 -131
- torchzero/modules/experimental/adam_lambertw.py +0 -149
- torchzero/modules/experimental/adaptive_step_size.py +0 -90
- torchzero/modules/experimental/adasoap.py +0 -177
- torchzero/modules/experimental/cosine.py +0 -214
- torchzero/modules/experimental/cubic_adam.py +0 -97
- torchzero/modules/experimental/eigendescent.py +0 -120
- torchzero/modules/experimental/etf.py +0 -195
- torchzero/modules/experimental/exp_adam.py +0 -113
- torchzero/modules/experimental/expanded_lbfgs.py +0 -141
- torchzero/modules/experimental/hnewton.py +0 -85
- torchzero/modules/experimental/modular_lbfgs.py +0 -265
- torchzero/modules/experimental/parabolic_search.py +0 -220
- torchzero/modules/experimental/subspace_preconditioners.py +0 -145
- torchzero/modules/experimental/tensor_adagrad.py +0 -42
- torchzero/modules/line_search/polynomial.py +0 -233
- torchzero/modules/momentum/matrix_momentum.py +0 -193
- torchzero/modules/optimizers/adagrad.py +0 -165
- torchzero/modules/quasi_newton/trust_region.py +0 -397
- torchzero/modules/smoothing/gaussian.py +0 -198
- torchzero-0.3.11.dist-info/METADATA +0 -404
- torchzero-0.3.11.dist-info/RECORD +0 -159
- torchzero-0.3.11.dist-info/licenses/LICENSE +0 -21
- /torchzero/modules/{optimizers → adaptive}/lion.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/orthograd.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/rmsprop.py +0 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.14.dist-info}/WHEEL +0 -0
torchzero/modules/{optimizers → adaptive}/adahessian.py

@@ -1,38 +1,42 @@
 import math
-from collections.abc import Callable
 from typing import Literal

 import torch

 from ...core import Chainable, Module, Target, Transform, apply_transform
 from ...utils import NumberList, TensorList, as_tensorlist
-from
+from ..functional import debiased_step_size

+def _full_average(hvp: torch.Tensor):
+    if hvp.ndim >= 3: # Conv kernel
+        return torch.mean(hvp.abs(), dim=[2, *range(3,hvp.ndim)], keepdim=True)
+    return hvp

 def _block_average(x: torch.Tensor, block_size: int | None, enable: bool):
     """averages x over first dimension in blocks"""
     if enable and x.ndim >= 2:
         if math.prod(x.shape[1:]) <= 1: return x
+        if block_size is None: return _full_average(x)
         size = x.size(0)
-        if block_size is None: return x.mean(0, keepdim=True)

         n_blocks = size // block_size
-        if n_blocks <= 1: return x.mean(0, keepdim = True)
+        if n_blocks <= 1: return x.abs().mean(0, keepdim = True)

         n_remaining = size - n_blocks * block_size
         remaining = None
         if n_remaining > 0:
-            remaining = x[-n_remaining:].mean(0, keepdim=True).repeat_interleave(n_remaining, 0)
+            remaining = x[-n_remaining:].abs().mean(0, keepdim=True).repeat_interleave(n_remaining, 0)
             x = x[:-n_remaining]

         x = x.view(block_size, n_blocks, *x.shape[1:])
-        x_mean = x.mean(0).repeat_interleave(block_size, 0)
+        x_mean = x.abs().mean(0).repeat_interleave(block_size, 0)

         if remaining is None: return x_mean
         return torch.cat([x_mean, remaining], 0)

     return x

+
 def _rademacher_like(tensor, p = 0.5, generator = None):
     """p is probability of a 1, other values will be -1."""
     return torch.bernoulli(torch.full_like(tensor, p), generator = generator).mul_(2).sub_(1)

@@ -46,11 +50,11 @@ def adahessian(
     beta2: float | NumberList,
     update_freq: int,
     eps: float | NumberList,
+    hessian_power: float | NumberList,
     step: int,
 ):
     # momentum
     exp_avg_.lerp_(tensors, 1-beta1)
-    num = exp_avg_ / (1-beta1)

     # update preconditioner
     if step % update_freq == 0:

@@ -60,7 +64,9 @@ def adahessian(
     else:
         assert D is None

-
+
+    denom = D_exp_avg_sq_.sqrt().pow_(hessian_power).add_(eps)
+    num = exp_avg_ * debiased_step_size(step+1, beta1, beta2)

     return num.div_(denom)


@@ -70,16 +76,12 @@ class AdaHessian(Module):

     This is similar to Adam, but the second momentum is replaced by square root of an exponential moving average of random hessian-vector products.

-
-    In most cases AdaHessian should be the first module in the chain because it relies on autograd. Use the
+    Notes:
+        - In most cases AdaHessian should be the first module in the chain because it relies on autograd. Use the ``inner`` argument if you wish to apply AdaHessian preconditioning to another module's output.

-
-    If you are using gradient estimators or reformulations, set :code:`hvp_method` to "forward" or "central".
+        - If you are using gradient estimators or reformulations, set ``hvp_method`` to "forward" or "central".

-
-    This module requires a closure passed to the optimizer step,
-    as it needs to re-evaluate the loss and gradients for calculating HVPs.
-    The closure must accept a ``backward`` argument (refer to documentation).
+        - This module requires a closure passed to the optimizer step, as it needs to re-evaluate the loss and gradients for calculating HVPs. The closure must accept a ``backward`` argument (refer to documentation).

     Args:
         beta1 (float, optional): first momentum. Defaults to 0.9.

@@ -105,7 +107,7 @@ class AdaHessian(Module):
             more accurate HVP approximation. This requires two extra
             gradient evaluations.
             Defaults to "autograd".
-
+        fd_h (float, optional): finite difference step size if ``hvp_method`` is "forward" or "central". Defaults to 1e-3.
         n_samples (int, optional):
             number of hessian-vector products with random vectors to evaluate each time when updating
             the preconditioner. Larger values may lead to better hessian diagonal estimate. Defaults to 1.

@@ -113,48 +115,49 @@ class AdaHessian(Module):
         inner (Chainable | None, optional):
             Inner module. If this is specified, operations are performed in the following order.
             1. compute hessian diagonal estimate.
-            2. pass inputs to
-            3. momentum and preconditioning are applied to the ouputs of
-
-    Examples:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            2. pass inputs to ``inner``.
+            3. momentum and preconditioning are applied to the ouputs of ``inner``.
+
+    ## Examples:
+
+    Using AdaHessian:
+
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.AdaHessian(),
+        tz.m.LR(0.1)
+    )
+    ```
+
+    AdaHessian preconditioner can be applied to any other module by passing it to the ``inner`` argument.
+    Turn off AdaHessian's first momentum to get just the preconditioning. Here is an example of applying
+    AdaHessian preconditioning to nesterov momentum (``tz.m.NAG``):
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.AdaHessian(beta1=0, inner=tz.m.NAG(0.9)),
+        tz.m.LR(0.1)
+    )
+    ```

     """
     def __init__(
         self,
         beta1: float = 0.9,
         beta2: float = 0.999,
-        averaging: bool =
-        block_size: int | None =
+        averaging: bool = True,
+        block_size: int | None = None,
         update_freq: int = 1,
         eps: float = 1e-8,
+        hessian_power: float = 1,
         hvp_method: Literal['autograd', 'forward', 'central'] = 'autograd',
         fd_h: float = 1e-3,
         n_samples = 1,
         seed: int | None = None,
         inner: Chainable | None = None
     ):
-        defaults = dict(beta1=beta1, beta2=beta2, update_freq=update_freq, averaging=averaging, block_size=block_size, eps=eps, hvp_method=hvp_method, n_samples=n_samples, fd_h=fd_h, seed=seed)
+        defaults = dict(beta1=beta1, beta2=beta2, update_freq=update_freq, averaging=averaging, block_size=block_size, eps=eps, hessian_power=hessian_power, hvp_method=hvp_method, n_samples=n_samples, fd_h=fd_h, seed=seed)
         super().__init__(defaults)

         if inner is not None:

@@ -170,14 +173,10 @@ class AdaHessian(Module):
         n_samples = settings['n_samples']

         seed = settings['seed']
-        generator =
-        if seed is not None:
-            if 'generator' not in self.global_state:
-                self.global_state['generator'] = torch.Generator(params[0].device).manual_seed(seed)
-            generator = self.global_state['generator']
+        generator = self.get_generator(params[0].device, seed)

-        beta1, beta2, eps, averaging, block_size = self.get_settings(params,
-            'beta1', 'beta2', 'eps', 'averaging', 'block_size', cls=NumberList)
+        beta1, beta2, eps, averaging, block_size, hessian_power = self.get_settings(params,
+            'beta1', 'beta2', 'eps', 'averaging', 'block_size', "hessian_power", cls=NumberList)

         exp_avg, D_exp_avg_sq = self.get_state(params, 'exp_avg', 'h_exp_avg', cls=TensorList)


@@ -196,6 +195,7 @@ class AdaHessian(Module):

                 Hvp, rgrad = self.Hvp(u, at_x0=True, var=var, rgrad=rgrad, hvp_method=hvp_method,
                                       h=fd_h, normalize=True, retain_grad=i < n_samples-1)
+                Hvp = tuple(Hvp)

                 if D is None: D = Hvp
                 else: torch._foreach_add_(D, Hvp)

@@ -218,6 +218,7 @@ class AdaHessian(Module):
             beta2=beta2,
             update_freq=update_freq,
             eps=eps,
+            hessian_power=hessian_power,
             step=step,
         )
         return var
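The hunks above add a ``hessian_power`` parameter (default 1) to AdaHessian and switch the update to ``debiased_step_size``. Below is a hedged sketch of exercising the new parameter; the toy model, data, value 0.5, and closure names are illustrative assumptions, with the closure following the ``backward``-argument convention the docstring refers to.

```python
# Hedged sketch, not an official example: exercises the hessian_power parameter
# added in 0.3.14. The toy model/data and the value 0.5 are purely illustrative.
import torch
import torchzero as tz

model = torch.nn.Linear(10, 1)                  # toy model (assumption)
X, y = torch.randn(64, 10), torch.randn(64, 1)  # toy data (assumption)

opt = tz.Modular(
    model.parameters(),
    tz.m.AdaHessian(hessian_power=0.5),  # default is 1, which matches plain AdaHessian
    tz.m.LR(0.1),
)

# AdaHessian re-evaluates the loss/gradients for hessian-vector products,
# so the closure must accept a ``backward`` argument (per the docstring above).
def closure(backward=True):
    loss = torch.nn.functional.mse_loss(model(X), y)
    if backward:
        opt.zero_grad()
        loss.backward()
    return loss

opt.step(closure)
```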
torchzero/modules/{optimizers → adaptive}/adan.py

@@ -9,37 +9,38 @@ def adan_(
     m_: TensorList, # exponential moving average
     v_: TensorList, # exponential moving average of gradient differences
     n_: TensorList, # kinda like squared momentum
-    n_prev_: TensorList | None,
     beta1: float | NumberList,
     beta2: float | NumberList,
     beta3: float | NumberList,
     eps: float | NumberList,
-
+    step: int,
 ):
-    """Returns new tensors
-    m_.lerp_(g, 1-beta1)
+    """Returns new tensors"""
+    m_.lerp_(g, 1 - beta1)

-
-
+    if step == 1:
+        term = g
+    else:
+        diff = g - g_prev_
+        v_.lerp_(diff, 1 - beta2)
+        term = g + beta2 * diff

-
-    n_.mul_(beta3).addcmul_(y, y, 1-beta3)
+    n_.mul_(beta3).addcmul_(term, term, value=(1 - beta3))

-
-
-
-    n_prev_.copy_(n_)
-    n_ = ns
+    m = m_ / (1.0 - beta1**step)
+    v = v_ / (1.0 - beta2**step)
+    n = n_ / (1.0 - beta3**step)

-
-
-    update = eta.mul_(term)
+    denom = n.sqrt_().add_(eps)
+    num = m + beta2 * v

+    update = num.div_(denom)
     g_prev_.copy_(g)

     return update


+
 class Adan(Transform):
     """Adaptive Nesterov Momentum Algorithm from https://arxiv.org/abs/2208.06677


@@ -51,6 +52,13 @@ class Adan(Transform):
         use_n_prev (bool, optional):
             whether to use previous gradient differences momentum.

+    Example:
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.Adan(),
+        tz.m.LR(1e-3),
+    )
     Reference:
         Xie, X., Zhou, P., Li, H., Lin, Z., & Yan, S. (2024). Adan: Adaptive nesterov momentum algorithm for faster optimizing deep models. IEEE Transactions on Pattern Analysis and Machine Intelligence. https://arxiv.org/abs/2208.06677
     """

@@ -60,9 +68,8 @@ class Adan(Transform):
         beta2: float = 0.92,
         beta3: float = 0.99,
         eps: float = 1e-8,
-        use_n_prev: bool = False,
     ):
-        defaults=dict(beta1=beta1,beta2=beta2,beta3=beta3,eps=eps
+        defaults=dict(beta1=beta1,beta2=beta2,beta3=beta3,eps=eps)
         super().__init__(defaults, uses_grad=False)

     @torch.no_grad

@@ -71,40 +78,19 @@ class Adan(Transform):
         step = self.global_state['step'] = self.global_state.get('step', 0) + 1

         beta1,beta2,beta3,eps=unpack_dicts(settings, 'beta1','beta2','beta3','eps', cls=NumberList)
-        s = settings[0]
-        use_n_prev = s['use_n_prev']
-
         g_prev, m, v, n = unpack_states(states, tensors, 'g_prev','m','v','n', cls=TensorList)

-
-        if use_n_prev:
-            n_prev = unpack_states(states, tensors, 'n_prev', cls=TensorList)
-        else:
-            n_prev = None
-
-        if step == 1:
-            # initial values, also runs on restarts
-            m.copy_(tensors)
-            n.set_(tensors ** 2)
-            v.zero_()
-            g_prev.copy_(tensors)
-            if n_prev is not None: n_prev.set_(tensors ** 2)
-
-        if step == 2:
-            v.set_(tensors - g_prev)
-
         update = adan_(
             g=tensors,
             g_prev_=g_prev,
             m_=m,
             v_=v,
             n_=n,
-            n_prev_=n_prev,
             beta1=beta1,
             beta2=beta2,
             beta3=beta3,
             eps=eps,
-
+            step=step,
         )

         return update
torchzero/modules/{optimizers → adaptive}/adaptive_heavyball.py

@@ -4,7 +4,7 @@ from ...utils import TensorList, unpack_dicts, unpack_states


 def adaptive_heavy_ball(f, f_star, f_prev, g: TensorList, g_prev: TensorList, p: TensorList, p_prev: TensorList):
-    if f - f_star <= torch.finfo(p[0].dtype).
+    if f - f_star <= torch.finfo(p[0].dtype).tiny * 2: return g

     g_g = g.dot(g)
     g_gp = g.dot(g_prev)

@@ -21,14 +21,12 @@ class AdaptiveHeavyBall(Transform):

     This is related to conjugate gradient methods, it may be very good for non-stochastic convex objectives, but won't work on stochastic ones.

-
+    note:
         The step size is determined by the algorithm, so learning rate modules shouldn't be used.

     Args:
         f_star (int, optional):
             (estimated) minimal possible value of the objective function (lowest possible loss). Defaults to 0.
-        tol (float, optional):
-            tolerance on objective value change.
     """
     def __init__(self, f_star: float = 0):
         defaults = dict(f_star=f_star)

@@ -38,8 +36,7 @@ class AdaptiveHeavyBall(Transform):
     def apply_tensors(self, tensors, params, grads, loss, states, settings):
         assert loss is not None
         tensors = TensorList(tensors)
-
-        f_star = setting['f_star']
+        f_star = self.defaults['f_star']

         f_prev = self.global_state.get('f_prev', None)
         p_prev, g_prev = unpack_states(states, tensors, 'p_prev', 'g_prev', init=[params,tensors], cls=TensorList)
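Since the hunks above note that AdaptiveHeavyBall determines its own step size (so LR modules shouldn't be chained) and that it asserts a loss is available, here is a hedged usage sketch; it assumes the module is exported as ``tz.m.AdaptiveHeavyBall`` and uses the standard closure convention, with a toy model for illustration.

```python
# Hedged sketch, assuming the module is exported as tz.m.AdaptiveHeavyBall.
# No tz.m.LR is chained: the step size is chosen by the algorithm itself.
import torch
import torchzero as tz

model = torch.nn.Linear(10, 1)                  # toy model (assumption)
X, y = torch.randn(64, 10), torch.randn(64, 1)  # toy data (assumption)

opt = tz.Modular(
    model.parameters(),
    tz.m.AdaptiveHeavyBall(f_star=0),  # f_star: (estimated) minimal possible loss
)

# apply_tensors asserts that a loss is available, so a closure is required.
def closure(backward=True):
    loss = torch.nn.functional.mse_loss(model(X), y)
    if backward:
        opt.zero_grad()
        loss.backward()
    return loss

opt.step(closure)
```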
torchzero/modules/adaptive/aegd.py

@@ -0,0 +1,54 @@
+import math
+
+import torch
+
+from ...core import Transform
+from ...utils import NumberList, TensorList, unpack_dicts, unpack_states
+
+# i've verified, it is identical to official
+# https://github.com/txping/AEGD/blob/master/aegd.py
+def aegd_(f: torch.Tensor | float, g: TensorList, r_: TensorList, c:float|NumberList=1, eta:float|NumberList=0.1) -> TensorList:
+    v = g / (2 * (f + c)**0.5)
+    r_ /= 1 + (v ** 2).mul_(2*eta) # update energy
+    return 2*eta * r_*v # pyright:ignore[reportReturnType]
+
+class AEGD(Transform):
+    """AEGD (Adaptive gradient descent with energy) from https://arxiv.org/abs/2010.05109#page=10.26.
+
+    Note:
+        AEGD has a learning rate hyperparameter that can't really be removed from the update rule.
+        To avoid compounding learning rate mofications, remove the ``tz.m.LR`` module if you had it.
+
+    Args:
+        eta (float, optional): step size. Defaults to 0.1.
+        c (float, optional): c. Defaults to 1.
+        beta3 (float, optional): thrid (squared) momentum. Defaults to 0.1.
+        eps (float, optional): epsilon. Defaults to 1e-8.
+        use_n_prev (bool, optional):
+            whether to use previous gradient differences momentum.
+    """
+    def __init__(
+        self,
+        lr: float = 0.1,
+        c: float = 1,
+    ):
+        defaults=dict(c=c,lr=lr)
+        super().__init__(defaults, uses_loss=True)
+
+    @torch.no_grad
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        assert loss is not None
+        tensors = TensorList(tensors)
+
+        c,lr=unpack_dicts(settings, 'c','lr', cls=NumberList)
+        r = unpack_states(states, tensors, 'r', init=lambda t: torch.full_like(t, float(loss+c[0])**0.5), cls=TensorList)
+
+        update = aegd_(
+            f=loss,
+            g=tensors,
+            r_=r,
+            c=c,
+            eta=lr,
+        )
+
+        return update
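The new ``aegd.py`` above ships without a usage example, and its note warns against stacking ``tz.m.LR`` on top since the ``lr``/``eta`` factor is part of the update rule itself. A hedged sketch follows; it assumes AEGD is exported as ``tz.m.AEGD`` like the other adaptive modules, and the toy model and closure are illustrative.

```python
# Hedged sketch, assuming the new module is exported as tz.m.AEGD.
# Per the note above, no tz.m.LR is chained: lr/eta is baked into AEGD's update rule.
import torch
import torchzero as tz

model = torch.nn.Linear(10, 1)                  # toy model (assumption)
X, y = torch.randn(64, 10), torch.randn(64, 1)  # toy data (assumption)

opt = tz.Modular(
    model.parameters(),
    tz.m.AEGD(lr=0.1, c=1),  # defaults written out explicitly
)

# AEGD is constructed with uses_loss=True, so the loss must reach it via a closure.
def closure(backward=True):
    loss = torch.nn.functional.mse_loss(model(X), y)
    if backward:
        opt.zero_grad()
        loss.backward()
    return loss

opt.step(closure)
```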
torchzero/modules/{optimizers → adaptive}/esgd.py

@@ -61,7 +61,7 @@ class ESGD(Module):
             more accurate HVP approximation. This requires two extra
             gradient evaluations.
             Defaults to "autograd".
-
+        fd_h (float, optional): finite difference step size if :code:`hvp_method` is "forward" or "central". Defaults to 1e-3.
         n_samples (int, optional):
             number of hessian-vector products with random vectors to evaluate each time when updating
             the preconditioner. Larger values may lead to better hessian diagonal estimate. Defaults to 1.
torchzero/modules/{optimizers/ladagrad.py → adaptive/lmadagrad.py}

@@ -5,8 +5,12 @@ import warnings
 import torch
 from ...core import Chainable, TensorwiseTransform

-def lm_adagrad_update(history: deque[torch.Tensor], damping, rdamping):
-
+def lm_adagrad_update(history: deque[torch.Tensor] | torch.Tensor, damping, rdamping):
+    if isinstance(history, torch.Tensor):
+        M = history
+    else:
+        M = torch.stack(tuple(history), dim=1)# / len(history)
+
     MTM = M.T @ M
     if damping != 0:
         MTM.add_(torch.eye(MTM.size(0), device=MTM.device, dtype=MTM.dtype).mul_(damping))

@@ -58,47 +62,45 @@ class LMAdagrad(TensorwiseTransform):
             order=2 means gradient differences are used in place of gradients. Higher order uses higher order differences. Defaults to 1.
         true_damping (bool, optional):
             If True, damping is added to squared singular values to mimic Adagrad. Defaults to True.
-        eigh (bool, optional): uses a more efficient way to calculate U and S. Defaults to True.
         U_beta (float | None, optional): momentum for U (too unstable, don't use). Defaults to None.
-
+        L_beta (float | None, optional): momentum for L (too unstable, don't use). Defaults to None.
         interval (int, optional): Interval between gradients that are added to history (2 means every second gradient is used). Defaults to 1.
         concat_params (bool, optional): if True, treats all parameters as a single vector, meaning it will also whiten inter-parameters. Defaults to True.
         inner (Chainable | None, optional): preconditioner will be applied to output of this module. Defaults to None.

-    Examples:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    ## Examples:
+
+    Limited-memory Adagrad
+
+    ```python
+    optimizer = tz.Modular(
+        model.parameters(),
+        tz.m.LMAdagrad(),
+        tz.m.LR(0.1)
+    )
+    ```
+    Adam with L-Adagrad preconditioner (for debiasing second beta is 0.999 arbitrarily)
+
+    ```python
+    optimizer = tz.Modular(
+        model.parameters(),
+        tz.m.LMAdagrad(inner=tz.m.EMA()),
+        tz.m.Debias(0.9, 0.999),
+        tz.m.LR(0.01)
+    )
+    ```
+
+    Stable Adam with L-Adagrad preconditioner (this is what I would recommend)
+
+    ```python
+    optimizer = tz.Modular(
+        model.parameters(),
+        tz.m.LMAdagrad(inner=tz.m.EMA()),
+        tz.m.Debias(0.9, 0.999),
+        tz.m.ClipNormByEMA(max_ema_growth=1.2),
+        tz.m.LR(0.01)
+    )
+    ```
     Reference:
         Agarwal N. et al. Efficient full-matrix adaptive regularization //International Conference on Machine Learning. – PMLR, 2019. – С. 102-110.
     """

@@ -143,6 +145,7 @@ class LMAdagrad(TensorwiseTransform):
             # scaled by parameter differences
             cur_p = param.clone()
             cur_g = tensor.clone()
+            eps = torch.finfo(cur_p.dtype).tiny * 2
             for i in range(1, order):
                 if f'prev_g_{i}' not in state:
                     state[f'prev_p_{i}'] = cur_p

@@ -157,7 +160,7 @@ class LMAdagrad(TensorwiseTransform):
                     cur_g = y

                 if i == order - 1:
-                    cur_g = cur_g / torch.linalg.norm(cur_p).clip(min=
+                    cur_g = cur_g / torch.linalg.norm(cur_p).clip(min=eps) # pylint:disable=not-callable
             history.append(cur_g.view(-1))

             step = state.get('step', 0)
torchzero/modules/{optimizers → adaptive}/mars.py

@@ -1,18 +1,7 @@
-from operator import itemgetter
-from functools import partial
-
 import torch

-from ...core import
+from ...core import Transform
 from ...utils import NumberList, TensorList, unpack_dicts, unpack_states
-from ..functional import (
-    debias, debiased_step_size,
-    ema_,
-    sqrt_ema_sq_,
-)
-from ..step_size.lr import lazy_lr
-from ..momentum.experimental import sqrt_nag_ema_sq_
-from ..momentum.momentum import nag_


 def mars_correction_(

@@ -35,36 +24,35 @@ class MARSCorrection(Transform):
     """MARS variance reduction correction.

     Place any other momentum-based optimizer after this,
-    make sure
+    make sure ``beta`` parameter matches with momentum in the optimizer.

     Args:
         beta (float, optional): use the same beta as you use in the momentum module. Defaults to 0.9.
         scaling (float, optional): controls the scale of gradient correction in variance reduction. Defaults to 0.025.
         max_norm (float, optional): clips norm of corrected gradients, None to disable. Defaults to 1.

-    Examples:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    )
+    ## Examples:
+
+    Mars-AdamW
+    ```python
+    optimizer = tz.Modular(
+        model.parameters(),
+        tz.m.MARSCorrection(beta=0.95),
+        tz.m.Adam(beta1=0.95, beta2=0.99),
+        tz.m.WeightDecay(1e-3),
+        tz.m.LR(0.1)
+    )
+    ```
+
+    Mars-Lion
+    ```python
+    optimizer = tz.Modular(
+        model.parameters(),
+        tz.m.MARSCorrection(beta=0.9),
+        tz.m.Lion(beta1=0.9),
+        tz.m.LR(0.1)
+    )
+    ```

     """
     def __init__(