torchzero 0.3.10__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/source/conf.py +6 -4
- docs/source/docstring template.py +46 -0
- tests/test_identical.py +2 -3
- tests/test_opts.py +64 -50
- tests/test_vars.py +1 -0
- torchzero/core/module.py +138 -6
- torchzero/core/transform.py +158 -51
- torchzero/modules/__init__.py +3 -2
- torchzero/modules/clipping/clipping.py +114 -17
- torchzero/modules/clipping/ema_clipping.py +27 -13
- torchzero/modules/clipping/growth_clipping.py +8 -7
- torchzero/modules/experimental/__init__.py +22 -5
- torchzero/modules/experimental/absoap.py +5 -2
- torchzero/modules/experimental/adadam.py +8 -2
- torchzero/modules/experimental/adamY.py +8 -2
- torchzero/modules/experimental/adam_lambertw.py +149 -0
- torchzero/modules/{line_search/trust_region.py → experimental/adaptive_step_size.py} +21 -4
- torchzero/modules/experimental/adasoap.py +7 -2
- torchzero/modules/experimental/cosine.py +214 -0
- torchzero/modules/experimental/cubic_adam.py +97 -0
- torchzero/modules/{projections → experimental}/dct.py +11 -11
- torchzero/modules/experimental/eigendescent.py +4 -1
- torchzero/modules/experimental/etf.py +32 -9
- torchzero/modules/experimental/exp_adam.py +113 -0
- torchzero/modules/experimental/expanded_lbfgs.py +141 -0
- torchzero/modules/{projections → experimental}/fft.py +10 -10
- torchzero/modules/experimental/hnewton.py +85 -0
- torchzero/modules/{quasi_newton/experimental → experimental}/modular_lbfgs.py +27 -28
- torchzero/modules/experimental/newtonnewton.py +7 -3
- torchzero/modules/experimental/parabolic_search.py +220 -0
- torchzero/modules/experimental/reduce_outward_lr.py +4 -4
- torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +12 -54
- torchzero/modules/experimental/subspace_preconditioners.py +11 -4
- torchzero/modules/experimental/{tada.py → tensor_adagrad.py} +10 -6
- torchzero/modules/functional.py +12 -2
- torchzero/modules/grad_approximation/fdm.py +30 -3
- torchzero/modules/grad_approximation/forward_gradient.py +13 -3
- torchzero/modules/grad_approximation/grad_approximator.py +51 -6
- torchzero/modules/grad_approximation/rfdm.py +285 -38
- torchzero/modules/higher_order/higher_order_newton.py +152 -89
- torchzero/modules/line_search/__init__.py +4 -4
- torchzero/modules/line_search/adaptive.py +99 -0
- torchzero/modules/line_search/backtracking.py +34 -9
- torchzero/modules/line_search/line_search.py +70 -12
- torchzero/modules/line_search/polynomial.py +233 -0
- torchzero/modules/line_search/scipy.py +2 -2
- torchzero/modules/line_search/strong_wolfe.py +34 -7
- torchzero/modules/misc/__init__.py +27 -0
- torchzero/modules/{ops → misc}/debug.py +24 -1
- torchzero/modules/misc/escape.py +60 -0
- torchzero/modules/misc/gradient_accumulation.py +70 -0
- torchzero/modules/misc/misc.py +316 -0
- torchzero/modules/misc/multistep.py +158 -0
- torchzero/modules/misc/regularization.py +171 -0
- torchzero/modules/{ops → misc}/split.py +29 -1
- torchzero/modules/{ops → misc}/switch.py +44 -3
- torchzero/modules/momentum/__init__.py +1 -1
- torchzero/modules/momentum/averaging.py +6 -6
- torchzero/modules/momentum/cautious.py +45 -8
- torchzero/modules/momentum/ema.py +7 -7
- torchzero/modules/momentum/experimental.py +2 -2
- torchzero/modules/momentum/matrix_momentum.py +90 -63
- torchzero/modules/momentum/momentum.py +2 -1
- torchzero/modules/ops/__init__.py +3 -31
- torchzero/modules/ops/accumulate.py +6 -10
- torchzero/modules/ops/binary.py +72 -26
- torchzero/modules/ops/multi.py +77 -16
- torchzero/modules/ops/reduce.py +15 -7
- torchzero/modules/ops/unary.py +29 -13
- torchzero/modules/ops/utility.py +20 -12
- torchzero/modules/optimizers/__init__.py +12 -3
- torchzero/modules/optimizers/adagrad.py +23 -13
- torchzero/modules/optimizers/adahessian.py +223 -0
- torchzero/modules/optimizers/adam.py +7 -6
- torchzero/modules/optimizers/adan.py +110 -0
- torchzero/modules/optimizers/adaptive_heavyball.py +57 -0
- torchzero/modules/optimizers/esgd.py +171 -0
- torchzero/modules/{experimental/spectral.py → optimizers/ladagrad.py} +91 -71
- torchzero/modules/optimizers/lion.py +1 -1
- torchzero/modules/optimizers/mars.py +91 -0
- torchzero/modules/optimizers/msam.py +186 -0
- torchzero/modules/optimizers/muon.py +30 -5
- torchzero/modules/optimizers/orthograd.py +1 -1
- torchzero/modules/optimizers/rmsprop.py +7 -4
- torchzero/modules/optimizers/rprop.py +42 -8
- torchzero/modules/optimizers/sam.py +163 -0
- torchzero/modules/optimizers/shampoo.py +39 -5
- torchzero/modules/optimizers/soap.py +29 -19
- torchzero/modules/optimizers/sophia_h.py +71 -14
- torchzero/modules/projections/__init__.py +2 -4
- torchzero/modules/projections/cast.py +51 -0
- torchzero/modules/projections/galore.py +3 -1
- torchzero/modules/projections/projection.py +188 -94
- torchzero/modules/quasi_newton/__init__.py +12 -2
- torchzero/modules/quasi_newton/cg.py +160 -59
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +163 -0
- torchzero/modules/quasi_newton/lbfgs.py +154 -97
- torchzero/modules/quasi_newton/lsr1.py +101 -57
- torchzero/modules/quasi_newton/quasi_newton.py +863 -215
- torchzero/modules/quasi_newton/trust_region.py +397 -0
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/newton.py +220 -41
- torchzero/modules/second_order/newton_cg.py +300 -11
- torchzero/modules/second_order/nystrom.py +104 -1
- torchzero/modules/smoothing/gaussian.py +34 -0
- torchzero/modules/smoothing/laplacian.py +14 -4
- torchzero/modules/step_size/__init__.py +2 -0
- torchzero/modules/step_size/adaptive.py +122 -0
- torchzero/modules/step_size/lr.py +154 -0
- torchzero/modules/weight_decay/__init__.py +1 -1
- torchzero/modules/weight_decay/weight_decay.py +89 -7
- torchzero/modules/wrappers/optim_wrapper.py +29 -1
- torchzero/optim/wrappers/directsearch.py +39 -2
- torchzero/optim/wrappers/fcmaes.py +21 -13
- torchzero/optim/wrappers/mads.py +5 -6
- torchzero/optim/wrappers/nevergrad.py +16 -1
- torchzero/optim/wrappers/optuna.py +1 -1
- torchzero/optim/wrappers/scipy.py +5 -3
- torchzero/utils/__init__.py +2 -2
- torchzero/utils/derivatives.py +3 -3
- torchzero/utils/linalg/__init__.py +1 -1
- torchzero/utils/linalg/solve.py +251 -12
- torchzero/utils/numberlist.py +2 -0
- torchzero/utils/python_tools.py +10 -0
- torchzero/utils/tensorlist.py +40 -28
- {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/METADATA +65 -40
- torchzero-0.3.11.dist-info/RECORD +159 -0
- torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
- torchzero/modules/experimental/soapy.py +0 -163
- torchzero/modules/experimental/structured_newton.py +0 -111
- torchzero/modules/lr/__init__.py +0 -2
- torchzero/modules/lr/adaptive.py +0 -93
- torchzero/modules/lr/lr.py +0 -63
- torchzero/modules/ops/misc.py +0 -418
- torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
- torchzero/modules/quasi_newton/olbfgs.py +0 -196
- torchzero-0.3.10.dist-info/RECORD +0 -139
- {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/WHEEL +0 -0
- {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/licenses/LICENSE +0 -0
- {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/top_level.txt +0 -0
torchzero/modules/clipping/ema_clipping.py

@@ -5,7 +5,7 @@ from collections.abc import Iterable, Sequence
 import torch

 from ...core import Module, Target, Transform, apply_transform, Chainable
-from ...utils import NumberList, TensorList,
+from ...utils import NumberList, TensorList, unpack_dicts, unpack_states

 class ClipNormByEMA(Transform):
     """Clips norm to be no larger than the norm of an exponential moving average of past updates.
@@ -14,9 +14,10 @@ class ClipNormByEMA(Transform):
         beta (float, optional): beta for the exponential moving average. Defaults to 0.99.
         ord (float, optional): order of the norm. Defaults to 2.
         eps (float, optional): epsilon for division. Defaults to 1e-6.
-        tensorwise (bool, optional):
+        tensorwise (bool, optional):
+            if True, norms are calculated parameter-wise, otherwise treats all parameters as single vector. Defaults to True.
         max_ema_growth (float | None, optional):
-            if specified, exponential moving average norm can grow
+            if specified, restricts how quickly exponential moving average norm can grow. The norm is allowed to grow by at most this value per step. Defaults to 1.5.
         ema_init (str, optional):
             How to initialize exponential moving average on first step, "update" to use the first update or "zeros". Defaults to 'zeros'.
         """
@@ -29,12 +30,13 @@ class ClipNormByEMA(Transform):
         tensorwise:bool=True,
         max_ema_growth: float | None = 1.5,
         ema_init: Literal['zeros', 'update'] = 'zeros',
+        inner: Chainable | None = None,
     ):
         defaults = dict(beta=beta, ord=ord, tensorwise=tensorwise, ema_init=ema_init, eps=eps, max_ema_growth=max_ema_growth)
-        super().__init__(defaults,
+        super().__init__(defaults, inner=inner)

     @torch.no_grad
-    def
+    def update_tensors(self, tensors, params, grads, loss, states, settings):
         tensors = TensorList(tensors)
         ord, tensorwise, ema_init, max_ema_growth = itemgetter('ord', 'tensorwise', 'ema_init', 'max_ema_growth')(settings[0])

@@ -78,7 +80,12 @@ class ClipNormByEMA(Transform):
         if self.NORMALIZE: denom.clip_(min=eps[0])
         else: denom.clip_(min=1)

-
+        self.global_state['denom'] = denom
+
+    @torch.no_grad
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        denom = self.global_state.pop('denom')
+        torch._foreach_div_(tensors, denom)
         return tensors

 class NormalizeByEMA(ClipNormByEMA):
@@ -88,9 +95,10 @@ class NormalizeByEMA(ClipNormByEMA):
         beta (float, optional): beta for the exponential moving average. Defaults to 0.99.
         ord (float, optional): order of the norm. Defaults to 2.
         eps (float, optional): epsilon for division. Defaults to 1e-6.
-        tensorwise (bool, optional):
+        tensorwise (bool, optional):
+            if True, norms are calculated parameter-wise, otherwise treats all parameters as single vector. Defaults to True.
         max_ema_growth (float | None, optional):
-            if specified, exponential moving average norm can grow
+            if specified, restricts how quickly exponential moving average norm can grow. The norm is allowed to grow by at most this value per step. Defaults to 1.5.
         ema_init (str, optional):
             How to initialize exponential moving average on first step, "update" to use the first update or "zeros". Defaults to 'zeros'.
         """
@@ -99,28 +107,30 @@ class NormalizeByEMA(ClipNormByEMA):
     # TODO Centralize by EMA?

 class ClipValueByEMA(Transform):
-    """Clips magnitude of update to be no larger than magnitude of
+    """Clips magnitude of update to be no larger than magnitude of exponential moving average of past (unclipped) updates.

     Args:
         beta (float, optional): beta for the exponential moving average. Defaults to 0.99.
         ema_init (str, optional):
             How to initialize exponential moving average on first step, "update" to use the first update or "zeros". Defaults to 'zeros'.
-        ema_tfm (Chainable | None, optional):
+        ema_tfm (Chainable | None, optional):
+            optional modules applied to exponential moving average before clipping by it. Defaults to None.
     """
     def __init__(
         self,
         beta=0.99,
         ema_init: Literal['zeros', 'update'] = 'zeros',
         ema_tfm:Chainable | None=None,
+        inner: Chainable | None = None,
     ):
         defaults = dict(beta=beta, ema_init=ema_init)
-        super().__init__(defaults,
+        super().__init__(defaults, inner=inner)

         if ema_tfm is not None:
             self.set_child('ema_tfm', ema_tfm)

     @torch.no_grad
-    def
+    def update_tensors(self, tensors, params, grads, loss, states, settings):
         ema_init = itemgetter('ema_init')(settings[0])

         beta = unpack_dicts(settings, 'beta', cls=NumberList)
@@ -129,8 +139,12 @@ class ClipValueByEMA(Transform):
         ema = unpack_states(states, tensors, 'ema', init = (torch.zeros_like if ema_init=='zeros' else lambda t: t.abs()), cls=TensorList)
         ema.lerp_(tensors.abs(), 1-beta)

+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        tensors = TensorList(tensors)
+        ema = unpack_states(states, tensors, 'ema', cls=TensorList)
+
         if 'ema_tfm' in self.children:
-            ema = TensorList(apply_transform(self.children['ema_tfm'], ema, params, grads, loss))
+            ema = TensorList(apply_transform(self.children['ema_tfm'], ema.clone(), params, grads, loss))

         tensors.clip_(-ema, ema)
         return tensors
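The changes above split ClipNormByEMA into an update_tensors/apply_tensors pair and add an `inner` argument so another module can run between the two phases. A minimal usage sketch, assuming the `tz.Modular` / `tz.m.*` API shown in the TrustRegion docstring further down; whether `ClipNormByEMA` is exported under `tz.m` is an assumption, not verified from this diff:

```python
# Hypothetical usage sketch, not taken from the diff: clip the Adagrad update's
# norm against an EMA of past update norms. Assumes the module is reachable as
# tz.m.ClipNormByEMA.
import torch
import torchzero as tz

model = torch.nn.Linear(10, 1)
opt = tz.Modular(
    model.parameters(),
    tz.m.Adagrad(),                 # produces the raw update
    tz.m.ClipNormByEMA(beta=0.99),  # clips its norm to the EMA of past update norms
)
```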
torchzero/modules/clipping/growth_clipping.py

@@ -19,7 +19,7 @@ class ClipValueGrowth(TensorwiseTransform):
            bounds the tracked multiplicative clipping decay to prevent collapse to 0.
            Next update is at most :code:`max(previous update * mul, max_decay)`.
            Defaults to 2.
-        target (Target, optional): what to set on var
+        target (Target, optional): what to set on var. Defaults to "update".
     """
     def __init__(
         self,
@@ -30,11 +30,11 @@ class ClipValueGrowth(TensorwiseTransform):
         target: Target = "update",
     ):
         defaults = dict(add=add, mul=mul, min_value=min_value, max_decay=max_decay)
-        super().__init__(defaults,
+        super().__init__(defaults, target=target)


-    def apply_tensor(self, tensor, param, grad, loss, state,
-        add, mul, min_value, max_decay = itemgetter('add','mul','min_value','max_decay')(
+    def apply_tensor(self, tensor, param, grad, loss, state, setting):
+        add, mul, min_value, max_decay = itemgetter('add','mul','min_value','max_decay')(setting)
         add: float | None

         if add is None and mul is None:
@@ -120,7 +120,8 @@ class ClipNormGrowth(Transform):

     Args:
         add (float | None, optional): additive clipping, next update norm is at most `previous norm + add`. Defaults to None.
-        mul (float | None, optional):
+        mul (float | None, optional):
+            multiplicative clipping, next update norm is at most `previous norm * mul`. Defaults to 1.5.
         min_value (float | None, optional):
             minimum value for multiplicative clipping to prevent collapse to 0.
             Next norm is at most :code:`max(prev_norm, min_value) * mul`. Defaults to 1e-4.
@@ -144,11 +145,11 @@ class ClipNormGrowth(Transform):
         target: Target = "update",
     ):
         defaults = dict(add=add, mul=mul, min_value=min_value, max_decay=max_decay, ord=ord, parameterwise=parameterwise)
-        super().__init__(defaults,
+        super().__init__(defaults, target=target)



-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         parameterwise = settings[0]['parameterwise']
         tensors = TensorList(tensors)

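The ClipNormGrowth docstring above describes the rule: the next update norm may exceed the previous one by at most `add` additively, and by at most `max(prev_norm, min_value) * mul` multiplicatively. A standalone sketch of that rule on a plain tensor, illustrative only and with the `max_decay` tracking omitted; it is not the module's implementation:

```python
# Illustrative sketch of the norm-growth clipping rule described above.
import torch

def clip_norm_growth(update: torch.Tensor, prev_norm: float | None,
                     add: float | None = None, mul: float | None = 1.5,
                     min_value: float = 1e-4) -> tuple[torch.Tensor, float]:
    norm = float(update.norm())
    if prev_norm is None or norm == 0.0:
        return update, norm                                        # first step: nothing to compare against
    allowed = float('inf')
    if add is not None:
        allowed = min(allowed, prev_norm + add)                     # additive bound
    if mul is not None:
        allowed = min(allowed, max(prev_norm, min_value) * mul)     # multiplicative bound
    if norm > allowed:
        update = update * (allowed / norm)                          # rescale so the new norm equals the bound
        norm = allowed
    return update, norm

# Example: a norm-8 update following a norm-2 update gets scaled down to norm 3 (2 * 1.5).
u, n = clip_norm_growth(torch.full((4,), 4.0), prev_norm=2.0)
print(round(n, 4))  # 3.0
```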
torchzero/modules/experimental/__init__.py

@@ -1,24 +1,41 @@
+"""This submodule contains various untested experimental modules, some of them are to be moved out of experimental when properly tested, some are to remain here forever or to be deleted depending on the degree of their usefulness."""
 from .absoap import ABSOAP
 from .adadam import Adadam
+from .adam_lambertw import AdamLambertW
 from .adamY import AdamY
+from .adaptive_step_size import AdaptiveStepSize
 from .adasoap import AdaSOAP
+from .cosine import (
+    AdaptiveDifference,
+    AdaptiveDifferenceEMA,
+    CosineDebounce,
+    CosineMomentum,
+    CosineStepSize,
+    ScaledAdaptiveDifference,
+)
+from .cubic_adam import CubicAdam
 from .curveball import CurveBall
+
+# from dct import DCTProjection
 from .eigendescent import EigenDescent
 from .etf import (
     ExponentialTrajectoryFit,
     ExponentialTrajectoryFitV2,
     PointwiseExponential,
 )
+from .exp_adam import ExpAdam
+from .expanded_lbfgs import ExpandedLBFGS
+from .fft import FFTProjection
 from .gradmin import GradMin
+from .hnewton import HNewton
+from .modular_lbfgs import ModularLBFGS
 from .newton_solver import NewtonSolver
 from .newtonnewton import NewtonNewton
+from .parabolic_search import CubicParabolaSearch, ParabolaSearch
 from .reduce_outward_lr import ReduceOutwardLR
-from .
-from .spectral import SpectralPreconditioner
-from .structured_newton import StructuredNewton
+from .structural_projections import BlockPartition, TensorizeProjection
 from .subspace_preconditioners import (
     HistorySubspacePreconditioning,
     RandomSubspacePreconditioning,
 )
-from .
-from .diagonal_higher_order_newton import DiagonalHigherOrderNewton
+from .tensor_adagrad import TensorAdagrad
torchzero/modules/experimental/absoap.py

@@ -24,7 +24,10 @@ def update_absoap_covariances_(

 Source=Literal['p','g','s','y', 'gy', 'sy', 'sn', 'yn', 'gys', 'sys']
 class ABSOAP(Transform):
-    """SOAP but with some extra options for testing.
+    """SOAP but with some extra options for testing.
+
+    .. warning::
+        This module is just for testing my stupid ideas.

     Args:
         scale_by_s - whether to scale y by s
@@ -94,7 +97,7 @@ class ABSOAP(Transform):
         super().__init__(defaults, uses_grad=False)

     @torch.no_grad
-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         updates = []
         # update preconditioners
         for i,(p,t, state, setting) in enumerate(zip(params, tensors, states, settings)):
torchzero/modules/experimental/adadam.py

@@ -10,7 +10,7 @@ from ..functional import (
     ema_,
     sqrt_ema_sq_,
 )
-from ..
+from ..step_size.lr import lazy_lr
 from ..momentum.experimental import sqrt_nag_ema_sq_
 from ..momentum.momentum import nag_

@@ -50,7 +50,13 @@ def adadam_(
     return None

 class Adadam(Module):
-    """Adam with a diagonally preconditioned preconditioner.
+    """Adam with a diagonally preconditioned preconditioner.
+
+    Verdict: I haven't tested this yet.
+
+    .. warning::
+        Experimental.
+    """
     def __init__(
         self,
         beta1: float = 0.9,
torchzero/modules/experimental/adamY.py

@@ -10,7 +10,7 @@ from ..functional import (
     ema_,
     sqrt_ema_sq_,
 )
-from ..
+from ..step_size.lr import lazy_lr
 from ..momentum.experimental import sqrt_nag_ema_sq_
 from ..momentum.momentum import nag_

@@ -62,7 +62,13 @@ def adamy_(
     return None

 class AdamY(Module):
-    """Adam but uses scaled gradient differences for second momentum.
+    """Adam but uses scaled gradient differences for second momentum.
+
+    Verdict: I haven't tested this yet.
+
+    .. warning::
+        Experimental.
+    """
     def __init__(
         self,
         beta1: float = 0.9,
torchzero/modules/experimental/adam_lambertw.py (new file)

@@ -0,0 +1,149 @@
+from operator import itemgetter
+from functools import partial
+import math
+import torch
+
+from ...core import Module, Target, Transform, apply_transform, Chainable
+from ...utils import NumberList, TensorList, unpack_dicts, unpack_states
+from ..functional import (
+    debias, debiased_step_size,
+    ema_,
+    sqrt_ema_sq_,
+)
+from ..step_size.lr import lazy_lr
+from ..momentum.experimental import sqrt_nag_ema_sq_
+from ..momentum.momentum import nag_
+
+
+def _lambertw_newton_raphson(x: TensorList, iterations=5):
+    # z = torch.zeros_like(x)
+    # mask_neg = x < 0
+    # mask_pos = ~mask_neg
+
+    # z[mask_pos] = torch.log(x[mask_pos] + 1.0)
+
+    # x_neg = x[mask_neg]
+    # z_neg = -1.0 + torch.sqrt(2.0 * (1.0 + math.e * x_neg))
+    # z[mask_neg] = z_neg
+
+    # x is always positive
+    z = (x+1).log_()
+    for _ in range(iterations):
+        exp_z = z.exp()
+        numerator = z * exp_z - x
+        denominator = exp_z * (z + 1.0) + 1e-8
+        delta = numerator / denominator
+        z -= delta
+    return z
+
+# https://github.com/gmgeorg/torchlambertw/blob/main/torchlambertw/special.py
+def _lambertw_winitzki(x: TensorList):
+    x_log1p = x.log1p()
+    return x_log1p * (1.0 - x_log1p.log1p() / (2.0 + x_log1p))
+
+
+def adam_lambertw_(
+    tensors: TensorList,
+    exp_avg_: TensorList,
+    exp_avg_xpx_: TensorList,
+    alpha: float | NumberList,
+    beta1: float | NumberList,
+    beta2: float | NumberList,
+    eps: float | NumberList,
+    step: int,
+    pow: float = 2,
+    debiased: bool = True,
+    max_exp_avg_xpx_: TensorList | None = None,
+    iterations: int | None = 5,
+
+    # inner args
+    inner: Module | None = None,
+    params: list[torch.Tensor] | None = None,
+    grads: list[torch.Tensor] | None = None,
+):
+    """Returns new tensors."""
+    tensors_abs = tensors.abs().clip_(max=20)
+    tensors_xpx = tensors_abs.pow_(tensors_abs)
+    exp_avg_xpx_.lerp_(tensors_xpx, 1-beta2)
+
+    if max_exp_avg_xpx_ is not None:
+        max_exp_avg_xpx_.maximum_(exp_avg_xpx_)
+        exp_avg_xpx_ = max_exp_avg_xpx_
+
+    if inner is not None:
+        assert params is not None
+        tensors = TensorList(apply_transform(inner, tensors, params=params, grads=grads))
+
+    exp_avg_ = ema_(tensors, exp_avg_=exp_avg_, beta=beta1, dampening=0,lerp=True)
+    if debiased: alpha = debiased_step_size(step, beta1=beta1, beta2=beta2, pow=pow, alpha=alpha)
+
+    if iterations is None or iterations < 1: exp_avg_xpx_ = _lambertw_winitzki(exp_avg_xpx_)
+    else: exp_avg_xpx_ = _lambertw_newton_raphson(exp_avg_xpx_, iterations)
+
+    return (exp_avg_.lazy_mul(alpha) / exp_avg_xpx_.add_(eps))
+
+class AdamLambertW(Transform):
+    """Adam but uses abs x^x and LambertW instead of square and sqrt.
+    The gradient will be clipped to 20 because float32 which you have to use otherwise you're PC will explode.
+
+    Args:
+        beta1 (float, optional): momentum. Defaults to 0.9.
+        beta2 (float, optional): second momentum. Defaults to 0.999.
+        eps (float, optional): epsilon. Defaults to 1e-8.
+        alpha (float, optional): learning rate. Defaults to 1.
+        amsgrad (bool, optional): Whether to divide by maximum of EMA of gradient squares instead. Defaults to False.
+        pow (float, optional): power used in second momentum power and root. Defaults to 2.
+        debiased (bool, optional): whether to apply debiasing to momentums based on current step. Defaults to True.
+        iterations (int, optional): 0 or None means Winitzki approximation otherwise number of newton raphson iterations.
+    """
+    def __init__(
+        self,
+        beta1: float = 0.9,
+        beta2: float = 0.999,
+        eps: float = 1e-8,
+        amsgrad: bool = False,
+        alpha: float = 1.,
+        pow: float = 2,
+        debiased: bool = True,
+        iterations: int | None = 5,
+        inner: Chainable | None = None
+    ):
+        defaults=dict(beta1=beta1,beta2=beta2,eps=eps,alpha=alpha,amsgrad=amsgrad,pow=pow,debiased=debiased, iterations=iterations)
+        super().__init__(defaults, uses_grad=False)
+
+        if inner is not None: self.set_child('inner', inner)
+
+    @torch.no_grad
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+        step = self.global_state['step'] = self.global_state.get('step', 0) + 1
+
+        beta1,beta2,eps,alpha=unpack_dicts(settings, 'beta1','beta2','eps','alpha', cls=NumberList)
+        amsgrad,pow,debiased,iterations = itemgetter('amsgrad','pow','debiased','iterations')(settings[0])
+
+        if amsgrad:
+            exp_avg, exp_avg_xpx, max_exp_avg_xpx = unpack_states(states, tensors, 'exp_avg', 'exp_avg_xpx', 'max_exp_avg_xpx', cls=TensorList)
+        else:
+            exp_avg, exp_avg_xpx = unpack_states(states, tensors, 'exp_avg', 'exp_avg_xpx', cls=TensorList)
+            max_exp_avg_xpx = None
+
+
+        return adam_lambertw_(
+            tensors=TensorList(tensors),
+            exp_avg_=exp_avg,
+            exp_avg_xpx_=exp_avg_xpx,
+            alpha=alpha,
+            beta1=beta1,
+            beta2=beta2,
+            eps=eps,
+            step=step,
+            pow=pow,
+            debiased=debiased,
+            max_exp_avg_xpx_=max_exp_avg_xpx,
+            iterations=iterations,
+
+            # inner args
+            inner=self.children.get("inner", None),
+            params=params,
+            grads=grads,
+
+        )
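The new AdamLambertW module replaces Adam's squared-gradient accumulator with an EMA of |g|^|g| and inverts it with the Lambert W function, the inverse of w ↦ w·e^w, solved either by Newton-Raphson or by the Winitzki approximation. A self-contained sketch of the same Newton-Raphson solver on plain tensors, to show what `_lambertw_newton_raphson` computes; the module's version above operates in-place on a torchzero `TensorList`:

```python
# Minimal sketch of the Lambert W solver used above, on plain torch tensors.
# Solves w * exp(w) = x for x >= 0 by Newton-Raphson, starting from log(1 + x).
import torch

def lambertw_newton(x: torch.Tensor, iterations: int = 8) -> torch.Tensor:
    z = torch.log1p(x)                        # same initial guess as the module
    for _ in range(iterations):
        ez = torch.exp(z)
        z = z - (z * ez - x) / (ez * (z + 1.0) + 1e-8)
    return z

x = torch.tensor([0.1, 1.0, 5.0, 20.0])       # updates are clipped to 20 above
w = lambertw_newton(x)
print(torch.allclose(w * torch.exp(w), x, atol=1e-4))  # True: W(x)·e^{W(x)} ≈ x
```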
torchzero/modules/{line_search/trust_region.py → experimental/adaptive_step_size.py}

@@ -2,12 +2,16 @@ from operator import itemgetter

 import torch

-from
+from ..line_search import LineSearchBase


-class
-    """Basic first order
-    step size is increased. If value increased, step size is decreased.
+class AdaptiveStepSize(LineSearchBase):
+    """Basic first order step size adaptation method. Re-evaluates the function after stepping, if value decreased sufficiently,
+    step size is increased. If value increased, step size is decreased.
+
+    .. note::
+        This works well in some cases, but it is often prone to collapsing.
+        For a more robust alternative use :code:`tz.m.AdaptiveBacktracking`.

     Args:
         nplus (float, optional): multiplier to step size on successful steps. Defaults to 1.5.
@@ -18,6 +22,19 @@ class TrustRegion(LineSearch):
         adaptive (bool, optional):
             If enabled, when multiple consecutive steps have been successful or unsuccessful,
             the corresponding multipliers are increased, otherwise they are reset. Defaults to True.
+
+
+    Examples:
+        Adagrad with trust region:
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.Adagrad(),
+                tz.m.TrustRegion()
+            )
+
     """
     def __init__(self, nplus: float=1.5, nminus: float=0.75, c: float=1e-4, init: float = 1, backtrack: bool = True, adaptive: bool = True):
         defaults = dict(nplus=nplus, nminus=nminus, c=c, init=init, backtrack=backtrack, adaptive=adaptive)
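The AdaptiveStepSize docstring above describes the rule: after a trial step, grow the step size by `nplus` when the value decreased sufficiently and shrink it by `nminus` otherwise. A toy sketch of that rule; the Armijo-style sufficient-decrease test with parameter `c` is an assumption for illustration, not taken from the diff:

```python
# Toy sketch of the step-size adaptation rule, not the module's implementation.
# `directional_derivative` is g·d for the step direction d (negative for descent).
def adapt_step_size(f_old: float, f_new: float, step_size: float,
                    directional_derivative: float,
                    nplus: float = 1.5, nminus: float = 0.75, c: float = 1e-4) -> float:
    sufficient_decrease = f_new <= f_old + c * step_size * directional_derivative
    return step_size * nplus if sufficient_decrease else step_size * nminus
```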
torchzero/modules/experimental/adasoap.py

@@ -33,9 +33,14 @@ def update_adasoap_covariances_(


 class AdaSOAP(Transform):
-    """SOAP with diagonally preconditioned GG^Ts.
+    """SOAP with diagonally preconditioned GG^Ts.
+
+    .. warning::
+        Experimental.

     precond_beta - beta for GG^T squares
+
+    Verdict: It works, but it is about the same performance as Adam, but maybe more tuning potential?
     """
     def __init__(
         self,
@@ -71,7 +76,7 @@ class AdaSOAP(Transform):
         super().__init__(defaults, uses_grad=False)

     @torch.no_grad
-    def
+    def apply_tensors(self, tensors, params, grads, loss, states, settings):
         updates = []
         # update preconditioners
         for i,(p,t, state, setting) in enumerate(zip(params, tensors, states, settings)):