torchzero 0.3.15__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +2 -2
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +43 -33
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +7 -4
- torchzero/core/chain.py +20 -23
- torchzero/core/functional.py +90 -24
- torchzero/core/modular.py +48 -52
- torchzero/core/module.py +130 -50
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +55 -24
- torchzero/core/transform.py +261 -367
- torchzero/linalg/__init__.py +10 -0
- torchzero/linalg/eigh.py +34 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +95 -0
- torchzero/{utils/linalg → linalg}/qr.py +4 -2
- torchzero/{utils/linalg → linalg}/solve.py +76 -88
- torchzero/linalg/svd.py +20 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/adaptive/__init__.py +1 -1
- torchzero/modules/adaptive/adagrad.py +163 -213
- torchzero/modules/adaptive/adahessian.py +74 -103
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +49 -30
- torchzero/modules/adaptive/adaptive_heavyball.py +11 -6
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/lion.py +5 -10
- torchzero/modules/adaptive/lmadagrad.py +87 -32
- torchzero/modules/adaptive/mars.py +5 -5
- torchzero/modules/adaptive/matrix_momentum.py +47 -51
- torchzero/modules/adaptive/msam.py +70 -52
- torchzero/modules/adaptive/muon.py +59 -124
- torchzero/modules/adaptive/natural_gradient.py +33 -28
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +123 -129
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +15 -18
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +26 -37
- torchzero/modules/experimental/__init__.py +2 -6
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/higher_order_newton.py +14 -40
- torchzero/modules/experimental/newton_solver.py +22 -53
- torchzero/modules/experimental/newtonnewton.py +15 -12
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +3 -3
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/functional.py +1 -1
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +20 -17
- torchzero/modules/least_squares/gn.py +90 -42
- torchzero/modules/line_search/backtracking.py +2 -2
- torchzero/modules/line_search/line_search.py +32 -32
- torchzero/modules/line_search/strong_wolfe.py +2 -2
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +10 -78
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +120 -122
- torchzero/modules/misc/multistep.py +50 -48
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +30 -28
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +34 -28
- torchzero/modules/momentum/momentum.py +11 -11
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +19 -19
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +43 -43
- torchzero/modules/quasi_newton/damping.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +7 -7
- torchzero/modules/quasi_newton/lsr1.py +7 -7
- torchzero/modules/quasi_newton/quasi_newton.py +10 -10
- torchzero/modules/quasi_newton/sg2.py +19 -19
- torchzero/modules/restarts/restars.py +26 -24
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/ifn.py +31 -62
- torchzero/modules/second_order/inm.py +49 -53
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +57 -90
- torchzero/modules/second_order/newton_cg.py +102 -154
- torchzero/modules/second_order/nystrom.py +157 -177
- torchzero/modules/second_order/rsn.py +106 -96
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +11 -10
- torchzero/modules/step_size/adaptive.py +23 -23
- torchzero/modules/step_size/lr.py +15 -15
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +2 -2
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +1 -1
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +21 -18
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +12 -13
- torchzero/modules/wrappers/optim_wrapper.py +10 -10
- torchzero/modules/zeroth_order/cd.py +9 -6
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -4
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +6 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +93 -69
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.0.dist-info}/METADATA +1 -1
- torchzero-0.4.0.dist-info/RECORD +191 -0
- tests/test_vars.py +0 -185
- torchzero/core/var.py +0 -376
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.15.dist-info/RECORD +0 -175
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.0.dist-info}/WHEEL +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.0.dist-info}/top_level.txt +0 -0
torchzero/modules/adaptive/sam.py

````diff
@@ -1,10 +1,10 @@
 from contextlib import nullcontext
 import torch
-from ...utils import TensorList, NumberList
-from ...core import
+from ...utils import TensorList, NumberList, unpack_dicts, unpack_states
+from ...core import Transform


-class SAM(
+class SAM(Transform):
     """Sharpness-Aware Minimization from https://arxiv.org/pdf/2010.01412

     SAM functions by seeking parameters that lie in neighborhoods having uniformly low loss value.
@@ -22,50 +22,51 @@ class SAM(Module):
         p (float, optional): norm of the SAM objective. Defaults to 2.
         asam (bool, optional):
             enables ASAM variant which makes perturbation relative to weight magnitudes.
-            ASAM requires a much larger
-            The
-            it has larger
+            ASAM requires a much larger ``rho``, like 0.5 or 1.
+            The ``tz.m.ASAM`` class is idential to setting this argument to True, but
+            it has larger ``rho`` by default.

-    Examples:
-        SAM-SGD:
+    ### Examples:

-
+    SAM-SGD:

-
-
-
-
-
+    ```py
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.SAM(),
+        tz.m.LR(1e-2)
+    )
+    ```

-
+    SAM-Adam:

-
-
-
-
-
-
-
-
+    ```
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.SAM(),
+        tz.m.Adam(),
+        tz.m.LR(1e-2)
+    )
+    ```

     References:
-        Foret, P., Kleiner, A., Mobahi, H., & Neyshabur, B. (2020). Sharpness-aware minimization for efficiently improving generalization. arXiv preprint arXiv:2010.01412.
+        [Foret, P., Kleiner, A., Mobahi, H., & Neyshabur, B. (2020). Sharpness-aware minimization for efficiently improving generalization. arXiv preprint arXiv:2010.01412.](https://arxiv.org/abs/2010.01412#page=3.16)
     """
     def __init__(self, rho: float = 0.05, p: float = 2, eps=1e-10, asam=False):
         defaults = dict(rho=rho, p=p, eps=eps, asam=asam)
         super().__init__(defaults)

     @torch.no_grad
-    def
+    def update_states(self, objective, states, settings):

-        params =
-        closure =
-        zero_grad =
+        params = objective.params
+        closure = objective.closure
+        zero_grad = objective.zero_grad
         if closure is None: raise RuntimeError("SAM requires a closure passed to the optimizer step")
-        p, rho =
-
-        eps =
-        asam =
+        p, rho = unpack_dicts(settings, 'p', 'rho', cls=NumberList)
+        fs = settings[0]
+        eps = fs['eps']
+        asam = fs['asam']

         # 1/p + 1/q = 1
         # okay, authors of SAM paper, I will manually solve your equation
@@ -123,8 +124,7 @@ class SAM(Module):

             return sam_loss

-
-        return var
+        objective.closure = sam_closure

 # different class because defaults for SAM are bad for ASAM
 class ASAM(SAM):
@@ -136,7 +136,7 @@ class ASAM(SAM):
     This implementation modifies the closure to return loss and calculate gradients
     of the SAM objective. All modules after this will use the modified objective.

-
+    Note:
         This module requires a closure passed to the optimizer step,
         as it needs to re-evaluate the loss and gradients at two points on each step.

@@ -144,20 +144,30 @@ class ASAM(SAM):
         rho (float, optional): Neighborhood size. Defaults to 0.05.
         p (float, optional): norm of the SAM objective. Defaults to 2.

-    Examples:
-
+    ### Examples:
+
+    ASAM-SGD:

-
+    ```py
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.ASAM(),
+        tz.m.LR(1e-2)
+    )
+    ```

-
-        model.parameters(),
-        tz.m.ASAM(),
-        tz.m.Adam(),
-        tz.m.LR(1e-2)
-    )
+    ASAM-Adam:

+    ```
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.ASAM(),
+        tz.m.Adam(),
+        tz.m.LR(1e-2)
+    )
+    ```
     References:
-        Kwon, J., Kim, J., Park, H., & Choi, I. K. (2021, July).
+        [Kwon, J., Kim, J., Park, H., & Choi, I. K. (2021, July). ASAM: Adaptive sharpness-aware minimization for scale-invariant learning of deep neural networks. In International Conference on Machine Learning (pp. 5905-5914). PMLR.](https://arxiv.org/abs/2102.11600)
     """
     def __init__(self, rho: float = 0.5, p: float = 2, eps=1e-10):
         super().__init__(rho=rho, p=p, eps=eps, asam=True)
````
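The rewritten `update_states` builds a `sam_closure` that perturbs the parameters toward the worst case within a ``rho``-ball (the dual-norm direction of the gradient), re-evaluates loss and gradients there, and restores the weights, so downstream modules see gradients of the sharpness-aware objective. As a rough standalone illustration of that step for the common p=2 case (plain PyTorch only, not torchzero's implementation; `sam_step`, its arguments, and the closure convention here are illustrative assumptions):

```python
# Sketch of one SAM step with p=2: perturb by rho * g / ||g||_2, take the
# gradient at the perturbed point, restore weights, then let a base optimizer step.
import torch

def sam_step(params, closure, base_opt, rho=0.05, eps=1e-10):
    """closure() should zero gradients, compute the loss, call backward(), and return the loss."""
    loss = closure()  # gradients at the current point
    grads = [p.grad.detach().clone() if p.grad is not None else torch.zeros_like(p)
             for p in params]
    grad_norm = torch.sqrt(sum(g.pow(2).sum() for g in grads))

    with torch.no_grad():
        # ascend to the (approximate) worst-case point inside the rho-ball
        perturbs = [g * (rho / (grad_norm + eps)) for g in grads]
        for p, e in zip(params, perturbs):
            p.add_(e)

    closure()  # gradients of the SAM objective at the perturbed point
    with torch.no_grad():
        for p, e in zip(params, perturbs):
            p.sub_(e)  # restore original parameters

    base_opt.step()       # update using the perturbed-point gradients
    base_opt.zero_grad()
    return loss
```

For general ``p``, the perturbation follows the closed-form maximizer from the SAM paper (with 1/p + 1/q = 1, the comment the code above refers to); the ASAM variant additionally scales the perturbation by the weight magnitudes, which is why it needs a larger ``rho``.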
torchzero/modules/adaptive/shampoo.py

````diff
@@ -1,11 +1,10 @@
 from collections.abc import Sequence
-
-from functools import partial
+
 import numpy as np
 import torch

-from ...core import Chainable,
-from ...
+from ...core import Chainable, TensorTransform
+from ...linalg.matrix_power import MatrixPowerMethod, matrix_power as _matrix_power
 from ...utils import set_storage_


@@ -14,10 +13,11 @@ def update_shampoo_preconditioner_(
     accumulators_: list[torch.Tensor | None],
     preconditioners_: list[torch.Tensor | None],
     step: int,
-
-
+    precond_freq: int,
+    matrix_power: float | None,
     beta: float | None,
-    reg: float
+    reg: float,
+    matrix_power_method: MatrixPowerMethod,
 ):
     for i, (accumulator, preconditioner) in enumerate(zip(accumulators_, preconditioners_)):
         if accumulator is None: continue
@@ -27,22 +27,20 @@ def update_shampoo_preconditioner_(
         if beta is None: accumulator.add_(torch.tensordot(grad, grad, (axes, axes))) # pyright:ignore[reportArgumentType]
         else: accumulator.lerp_(torch.tensordot(grad, grad, (axes, axes)), 1-beta) # pyright:ignore[reportArgumentType]

-        if step %
-            matrix_exp = -1/(grad.ndim*2) if exp_override is None else -1/exp_override
+        if step % precond_freq == 0:
             if reg != 0:
                 accumulator = accumulator + torch.eye(accumulator.size(0), device=accumulator.device, dtype=accumulator.dtype).mul_(reg)
-            set_storage_(preconditioner, matrix_power_eigh(accumulator, matrix_exp))

+            if matrix_power is None: matrix_power = -1 / max(grad.ndim, 2)
+            set_storage_(preconditioner, _matrix_power(accumulator, matrix_power, method=matrix_power_method))

 def apply_shampoo_preconditioner(
     tensor: torch.Tensor,
     preconditioners_: list[torch.Tensor | None],
-    decay: float | None,
 ):
     for i, preconditioner in enumerate(preconditioners_):
         if preconditioner is None: continue
         tensor = torch.tensordot(tensor, preconditioner, ([0], [0])) # pyright:ignore[reportArgumentType]
-        if decay is not None: preconditioner.mul_(decay)
     return tensor


@@ -50,9 +48,8 @@ def update_diagonal_(grad: torch.Tensor, diagonal_accumulator_: torch.Tensor, be
     if beta is None: diagonal_accumulator_.add_(grad.pow(2))
     else: diagonal_accumulator_.mul_(beta).addcmul_(grad, grad, value=1-beta)

-def apply_diagonal_(grad_: torch.Tensor, diagonal_accumulator_: torch.Tensor,
+def apply_diagonal_(grad_: torch.Tensor, diagonal_accumulator_: torch.Tensor, eps: float):
     grad_.div_(diagonal_accumulator_.sqrt() + eps)
-    if decay is not None: diagonal_accumulator_.mul_(decay)
     return grad_

 def _merge_small_dims(tensor: torch.Tensor, max_dim: int):
@@ -86,144 +83,141 @@ def _unmerge_small_dims(tensor: torch.Tensor, flat_sizes: Sequence[int] | None,
     return tensor.permute(*np.argsort(sort_idxs).tolist())


-class Shampoo(
+class Shampoo(TensorTransform):
     """Shampoo from Preconditioned Stochastic Tensor Optimization (https://arxiv.org/abs/1802.09568).

-
+    Notes:
         Shampoo is usually grafted to another optimizer like Adam, otherwise it can be unstable. An example of how to do grafting is given below in the Examples section.

-
-        Shampoo is a very computationally expensive optimizer, increase :code:`update_freq` if it is too slow.
+        Shampoo is a very computationally expensive optimizer, increase ``update_freq`` if it is too slow.

-
-        SOAP optimizer usually outperforms Shampoo and is also not as computationally expensive. SOAP implementation is available as :code:`tz.m.SOAP`.
+        SOAP optimizer usually outperforms Shampoo and is also not as computationally expensive. SOAP implementation is available as ``tz.m.SOAP``.

     Args:
-        decay (float | None, optional): slowly decays preconditioners. Defaults to None.
-        beta (float | None, optional):
-            if None calculates sum as in standard shampoo, otherwise uses EMA of preconditioners. Defaults to None.
         update_freq (int, optional): preconditioner update frequency. Defaults to 10.
-
+        matrix_power (float | None, optional): overrides matrix exponent. By default uses ``-1/grad.ndim``. Defaults to None.
         merge_small (bool, optional): whether to merge small dims on tensors. Defaults to True.
-        max_dim (int, optional): maximum dimension size for preconditioning. Defaults to
+        max_dim (int, optional): maximum dimension size for preconditioning. Defaults to 10_000.
         precondition_1d (bool, optional): whether to precondition 1d tensors. Defaults to True.
         adagrad_eps (float, optional): epsilon for adagrad division for tensors where shampoo can't be applied. Defaults to 1e-8.
+        matrix_power_method (MatrixPowerMethod, optional): how to compute matrix power.
+        beta (float | None, optional):
+            if None calculates sum as in standard Shampoo, otherwise uses EMA of preconditioners. Defaults to None.
         inner (Chainable | None, optional):
             module applied after updating preconditioners and before applying preconditioning.
             For example if beta≈0.999 and `inner=tz.m.EMA(0.9)`, this becomes Adam with shampoo preconditioner (ignoring debiasing).
             Defaults to None.

     Examples:
-        … (23 removed lines; their contents are not preserved in this diff view)
+        Shampoo grafted to Adam
+
+        ```python
+        opt = tz.Modular(
+            model.parameters(),
+            tz.m.GraftModules(
+                direction = tz.m.Shampoo(),
+                magnitude = tz.m.Adam(),
+            ),
+            tz.m.LR(1e-3)
+        )
+        ```
+
+        Adam with Shampoo preconditioner
+
+        ```python
+        opt = tz.Modular(
+            model.parameters(),
+            tz.m.Shampoo(beta=0.999, inner=tz.m.EMA(0.9)),
+            tz.m.Debias(0.9, 0.999),
+            tz.m.LR(1e-3)
+        )
+        ```
     """
     def __init__(
         self,
-        decay: float | None = None,
-        beta: float | None = None,
         reg: float = 1e-12,
-
-
+        precond_freq: int = 10,
+        matrix_power: float | None = None,
         merge_small: bool = True,
-        max_dim: int =
+        max_dim: int = 10_000,
         precondition_1d: bool = True,
         adagrad_eps: float = 1e-8,
+        matrix_power_method: MatrixPowerMethod = "eigh_abs",
+        beta: float | None = None,
+        beta_debias: bool = True,
+
         inner: Chainable | None = None,
     ):
-        defaults =
-        … (5 removed lines; not preserved in this diff view)
-    def
-        … (66 removed lines; not preserved in this diff view)
-            state['step'] += 1
-
-        return tensors
+        defaults = locals().copy()
+        del defaults['self'], defaults["inner"]
+
+        super().__init__(defaults, inner=inner)
+
+    @torch.no_grad
+    def single_tensor_initialize(self, tensor, param, grad, loss, state, setting):
+        if setting["merge_small"]:
+            tensor, state['flat_sizes'], state['sort_idxs'] = _merge_small_dims(tensor, setting["max_dim"])
+
+        if tensor.ndim <= 1 and not setting["precondition_1d"]:
+            state["accumulators"] = []
+
+        else:
+            max_dim = setting["max_dim"]
+            state['accumulators'] = [
+                torch.eye(s, dtype=tensor.dtype, device=tensor.device) if 1<s<max_dim else None for s in tensor.shape
+            ]
+            state['preconditioners'] = [
+                torch.eye(s, dtype=tensor.dtype, device=tensor.device) if 1<s<max_dim else None for s in tensor.shape
+            ]
+
+        # either scalar parameter, 1d with precondition_1d=False, or too big, then diagonal preconditioner is used.
+        if len([i is not None for i in state['accumulators']]) == 0:
+            state['diagonal_accumulator'] = torch.zeros_like(tensor)
+
+        state['step'] = 0
+        state["num_GTG"] = 0
+
+    @torch.no_grad
+    def single_tensor_update(self, tensor, param, grad, loss, state, setting):
+        if setting["merge_small"]:
+            tensor, state['flat_sizes'], state['sort_idxs'] = _merge_small_dims(tensor, setting["max_dim"])
+
+        if 'diagonal_accumulator' in state:
+            update_diagonal_(tensor, state['diagonal_accumulator'], beta=setting["beta"])
+        else:
+            update_shampoo_preconditioner_(
+                tensor,
+                accumulators_=state['accumulators'],
+                preconditioners_=state['preconditioners'],
+                step=state['step'],
+                precond_freq=setting["precond_freq"],
+                matrix_power=setting["matrix_power"],
+                beta=setting["beta"],
+                reg=setting["reg"],
+                matrix_power_method=setting["matrix_power_method"],
+            )
+
+        if state["step"] % setting["precond_freq"] == 0:
+            state["num_GTG"] += 1
+
+        state["step"] += 1
+
+    @torch.no_grad
+    def single_tensor_apply(self, tensor, param, grad, loss, state, setting):
+        if setting["merge_small"]:
+            tensor, state['flat_sizes'], state['sort_idxs'] = _merge_small_dims(tensor, setting["max_dim"])
+
+        if 'diagonal_accumulator' in state:
+            dir = apply_diagonal_(tensor, state['diagonal_accumulator'], eps=setting["adagrad_eps"])
+        else:
+            dir = apply_shampoo_preconditioner(tensor, preconditioners_=state['preconditioners'])
+
+        if setting["merge_small"]:
+            dir = _unmerge_small_dims(dir, state['flat_sizes'], state['sort_idxs'])
+
+        if setting['beta_debias'] and setting["beta"] is not None:
+            bias_correction = 1 - (setting["beta"] ** state["num_GTG"])
+            dir *= bias_correction ** 0.5
+
+        return dir
+
````
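For a matrix-shaped gradient, the helper functions in this file amount to: accumulate one Gram matrix per tensor dimension, take a fractional matrix power of each accumulator every ``precond_freq`` steps, and contract the gradient against the results. A compact standalone sketch of that computation (plain PyTorch; `matrix_power_eigh` and `shampoo_direction` are illustrative names, not torchzero API — the package routes this through `torchzero.linalg.matrix_power` with a configurable `matrix_power_method`):

```python
# Minimal sketch of the core Shampoo update for a single 2-D gradient, mirroring
# update_shampoo_preconditioner_ / apply_shampoo_preconditioner per dimension.
import torch

def matrix_power_eigh(mat: torch.Tensor, power: float, reg: float = 1e-12) -> torch.Tensor:
    """Fractional power of a symmetric PSD matrix via an eigendecomposition."""
    mat = mat + torch.eye(mat.size(0), dtype=mat.dtype, device=mat.device) * reg
    eigvals, eigvecs = torch.linalg.eigh(mat)
    eigvals = eigvals.clamp(min=reg) ** power
    return (eigvecs * eigvals) @ eigvecs.mH     # V diag(lambda^power) V^T

def shampoo_direction(grad: torch.Tensor, L: torch.Tensor, R: torch.Tensor,
                      exponent: float | None = None) -> torch.Tensor:
    """One update for a matrix gradient; L and R are running accumulators (start at identity)."""
    L += grad @ grad.T                          # row-space Gram matrix
    R += grad.T @ grad                          # column-space Gram matrix
    if exponent is None:
        exponent = -1.0 / max(grad.ndim, 2)     # default exponent used in the diff above
    PL = matrix_power_eigh(L, exponent)
    PR = matrix_power_eigh(R, exponent)
    # contracting dim 0 with each preconditioner in turn equals PL @ grad @ PR here
    return torch.tensordot(torch.tensordot(grad, PL, ([0], [0])), PR, ([0], [0]))
```

Starting from `L = torch.eye(grad.shape[0])` and `R = torch.eye(grad.shape[1])`, repeated calls reproduce the basic behaviour of the class above for one 2-D parameter, leaving out the merging of small dimensions, the diagonal (Adagrad) fallback, the update frequency, and the EMA/debias handling that `Shampoo` layers on top.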