torchzero 0.3.15__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (163)
  1. tests/test_identical.py +2 -2
  2. tests/test_module_autograd.py +586 -0
  3. tests/test_objective.py +188 -0
  4. tests/test_opts.py +43 -33
  5. tests/test_tensorlist.py +0 -8
  6. tests/test_utils_optimizer.py +0 -1
  7. torchzero/__init__.py +1 -1
  8. torchzero/core/__init__.py +7 -4
  9. torchzero/core/chain.py +20 -23
  10. torchzero/core/functional.py +90 -24
  11. torchzero/core/modular.py +48 -52
  12. torchzero/core/module.py +130 -50
  13. torchzero/core/objective.py +948 -0
  14. torchzero/core/reformulation.py +55 -24
  15. torchzero/core/transform.py +261 -367
  16. torchzero/linalg/__init__.py +10 -0
  17. torchzero/linalg/eigh.py +34 -0
  18. torchzero/linalg/linalg_utils.py +14 -0
  19. torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
  20. torchzero/linalg/matrix_power.py +28 -0
  21. torchzero/linalg/orthogonalize.py +95 -0
  22. torchzero/{utils/linalg → linalg}/qr.py +4 -2
  23. torchzero/{utils/linalg → linalg}/solve.py +76 -88
  24. torchzero/linalg/svd.py +20 -0
  25. torchzero/linalg/torch_linalg.py +168 -0
  26. torchzero/modules/adaptive/__init__.py +1 -1
  27. torchzero/modules/adaptive/adagrad.py +163 -213
  28. torchzero/modules/adaptive/adahessian.py +74 -103
  29. torchzero/modules/adaptive/adam.py +53 -76
  30. torchzero/modules/adaptive/adan.py +49 -30
  31. torchzero/modules/adaptive/adaptive_heavyball.py +11 -6
  32. torchzero/modules/adaptive/aegd.py +12 -12
  33. torchzero/modules/adaptive/esgd.py +98 -119
  34. torchzero/modules/adaptive/lion.py +5 -10
  35. torchzero/modules/adaptive/lmadagrad.py +87 -32
  36. torchzero/modules/adaptive/mars.py +5 -5
  37. torchzero/modules/adaptive/matrix_momentum.py +47 -51
  38. torchzero/modules/adaptive/msam.py +70 -52
  39. torchzero/modules/adaptive/muon.py +59 -124
  40. torchzero/modules/adaptive/natural_gradient.py +33 -28
  41. torchzero/modules/adaptive/orthograd.py +11 -15
  42. torchzero/modules/adaptive/rmsprop.py +83 -75
  43. torchzero/modules/adaptive/rprop.py +48 -47
  44. torchzero/modules/adaptive/sam.py +55 -45
  45. torchzero/modules/adaptive/shampoo.py +123 -129
  46. torchzero/modules/adaptive/soap.py +207 -143
  47. torchzero/modules/adaptive/sophia_h.py +106 -130
  48. torchzero/modules/clipping/clipping.py +15 -18
  49. torchzero/modules/clipping/ema_clipping.py +31 -25
  50. torchzero/modules/clipping/growth_clipping.py +14 -17
  51. torchzero/modules/conjugate_gradient/cg.py +26 -37
  52. torchzero/modules/experimental/__init__.py +2 -6
  53. torchzero/modules/experimental/coordinate_momentum.py +36 -0
  54. torchzero/modules/experimental/curveball.py +25 -41
  55. torchzero/modules/experimental/gradmin.py +2 -2
  56. torchzero/modules/experimental/higher_order_newton.py +14 -40
  57. torchzero/modules/experimental/newton_solver.py +22 -53
  58. torchzero/modules/experimental/newtonnewton.py +15 -12
  59. torchzero/modules/experimental/reduce_outward_lr.py +7 -7
  60. torchzero/modules/experimental/scipy_newton_cg.py +21 -24
  61. torchzero/modules/experimental/spsa1.py +3 -3
  62. torchzero/modules/experimental/structural_projections.py +1 -4
  63. torchzero/modules/functional.py +1 -1
  64. torchzero/modules/grad_approximation/forward_gradient.py +7 -7
  65. torchzero/modules/grad_approximation/grad_approximator.py +23 -16
  66. torchzero/modules/grad_approximation/rfdm.py +20 -17
  67. torchzero/modules/least_squares/gn.py +90 -42
  68. torchzero/modules/line_search/backtracking.py +2 -2
  69. torchzero/modules/line_search/line_search.py +32 -32
  70. torchzero/modules/line_search/strong_wolfe.py +2 -2
  71. torchzero/modules/misc/debug.py +12 -12
  72. torchzero/modules/misc/escape.py +10 -10
  73. torchzero/modules/misc/gradient_accumulation.py +10 -78
  74. torchzero/modules/misc/homotopy.py +16 -8
  75. torchzero/modules/misc/misc.py +120 -122
  76. torchzero/modules/misc/multistep.py +50 -48
  77. torchzero/modules/misc/regularization.py +49 -44
  78. torchzero/modules/misc/split.py +30 -28
  79. torchzero/modules/misc/switch.py +37 -32
  80. torchzero/modules/momentum/averaging.py +14 -14
  81. torchzero/modules/momentum/cautious.py +34 -28
  82. torchzero/modules/momentum/momentum.py +11 -11
  83. torchzero/modules/ops/__init__.py +4 -4
  84. torchzero/modules/ops/accumulate.py +21 -21
  85. torchzero/modules/ops/binary.py +67 -66
  86. torchzero/modules/ops/higher_level.py +19 -19
  87. torchzero/modules/ops/multi.py +44 -41
  88. torchzero/modules/ops/reduce.py +26 -23
  89. torchzero/modules/ops/unary.py +53 -53
  90. torchzero/modules/ops/utility.py +47 -46
  91. torchzero/modules/projections/galore.py +1 -1
  92. torchzero/modules/projections/projection.py +43 -43
  93. torchzero/modules/quasi_newton/damping.py +1 -1
  94. torchzero/modules/quasi_newton/lbfgs.py +7 -7
  95. torchzero/modules/quasi_newton/lsr1.py +7 -7
  96. torchzero/modules/quasi_newton/quasi_newton.py +10 -10
  97. torchzero/modules/quasi_newton/sg2.py +19 -19
  98. torchzero/modules/restarts/restars.py +26 -24
  99. torchzero/modules/second_order/__init__.py +2 -2
  100. torchzero/modules/second_order/ifn.py +31 -62
  101. torchzero/modules/second_order/inm.py +49 -53
  102. torchzero/modules/second_order/multipoint.py +40 -80
  103. torchzero/modules/second_order/newton.py +57 -90
  104. torchzero/modules/second_order/newton_cg.py +102 -154
  105. torchzero/modules/second_order/nystrom.py +157 -177
  106. torchzero/modules/second_order/rsn.py +106 -96
  107. torchzero/modules/smoothing/laplacian.py +13 -12
  108. torchzero/modules/smoothing/sampling.py +11 -10
  109. torchzero/modules/step_size/adaptive.py +23 -23
  110. torchzero/modules/step_size/lr.py +15 -15
  111. torchzero/modules/termination/termination.py +32 -30
  112. torchzero/modules/trust_region/cubic_regularization.py +2 -2
  113. torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
  114. torchzero/modules/trust_region/trust_cg.py +1 -1
  115. torchzero/modules/trust_region/trust_region.py +27 -22
  116. torchzero/modules/variance_reduction/svrg.py +21 -18
  117. torchzero/modules/weight_decay/__init__.py +2 -1
  118. torchzero/modules/weight_decay/reinit.py +83 -0
  119. torchzero/modules/weight_decay/weight_decay.py +12 -13
  120. torchzero/modules/wrappers/optim_wrapper.py +10 -10
  121. torchzero/modules/zeroth_order/cd.py +9 -6
  122. torchzero/optim/root.py +3 -3
  123. torchzero/optim/utility/split.py +2 -1
  124. torchzero/optim/wrappers/directsearch.py +27 -63
  125. torchzero/optim/wrappers/fcmaes.py +14 -35
  126. torchzero/optim/wrappers/mads.py +11 -31
  127. torchzero/optim/wrappers/moors.py +66 -0
  128. torchzero/optim/wrappers/nevergrad.py +4 -4
  129. torchzero/optim/wrappers/nlopt.py +31 -25
  130. torchzero/optim/wrappers/optuna.py +6 -13
  131. torchzero/optim/wrappers/pybobyqa.py +124 -0
  132. torchzero/optim/wrappers/scipy/__init__.py +7 -0
  133. torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
  134. torchzero/optim/wrappers/scipy/brute.py +48 -0
  135. torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
  136. torchzero/optim/wrappers/scipy/direct.py +69 -0
  137. torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
  138. torchzero/optim/wrappers/scipy/experimental.py +141 -0
  139. torchzero/optim/wrappers/scipy/minimize.py +151 -0
  140. torchzero/optim/wrappers/scipy/sgho.py +111 -0
  141. torchzero/optim/wrappers/wrapper.py +121 -0
  142. torchzero/utils/__init__.py +7 -25
  143. torchzero/utils/compile.py +2 -2
  144. torchzero/utils/derivatives.py +93 -69
  145. torchzero/utils/optimizer.py +4 -77
  146. torchzero/utils/python_tools.py +31 -0
  147. torchzero/utils/tensorlist.py +11 -5
  148. torchzero/utils/thoad_tools.py +68 -0
  149. {torchzero-0.3.15.dist-info → torchzero-0.4.0.dist-info}/METADATA +1 -1
  150. torchzero-0.4.0.dist-info/RECORD +191 -0
  151. tests/test_vars.py +0 -185
  152. torchzero/core/var.py +0 -376
  153. torchzero/modules/experimental/momentum.py +0 -160
  154. torchzero/optim/wrappers/scipy.py +0 -572
  155. torchzero/utils/linalg/__init__.py +0 -12
  156. torchzero/utils/linalg/matrix_funcs.py +0 -87
  157. torchzero/utils/linalg/orthogonalize.py +0 -12
  158. torchzero/utils/linalg/svd.py +0 -20
  159. torchzero/utils/ops.py +0 -10
  160. torchzero-0.3.15.dist-info/RECORD +0 -175
  161. /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
  162. {torchzero-0.3.15.dist-info → torchzero-0.4.0.dist-info}/WHEEL +0 -0
  163. {torchzero-0.3.15.dist-info → torchzero-0.4.0.dist-info}/top_level.txt +0 -0
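Of the 163 changed files, only one diff is expanded below; from its contents (the ``Adagrad``, ``AdagradNorm`` and ``FullMatrixAdagrad`` classes) it appears to be ``torchzero/modules/adaptive/adagrad.py`` (item 27). The file list also shows the linear-algebra helpers moving from ``torchzero/utils/linalg/`` to a new top-level ``torchzero/linalg/`` package (items 19, 22, 23, 161). A minimal, hypothetical compatibility sketch for downstream code that imports those submodules directly; whether anything is re-exported at package level is an assumption, not something this diff confirms:

```python
# Hypothetical shim for code that must import torchzero's linear-algebra helpers
# across both releases. The submodule name is taken from the file list above
# (torchzero/{utils/linalg -> linalg}/solve.py); nothing else is assumed.
try:
    from torchzero.linalg import solve  # 0.4.0 layout
except ImportError:
    from torchzero.utils.linalg import solve  # 0.3.15 layout
```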
@@ -1,62 +1,14 @@
-from operator import itemgetter
 from typing import Literal
-
 import torch
+
 from ...core import (
     Chainable,
-    Module,
-    Target,
-    TensorwiseTransform,
-    Transform,
-    Var,
-    apply_transform,
+    TensorTransform,
 )
-from ...utils import NumberList, TensorList, unpack_dicts, unpack_states
-from ...utils.linalg import matrix_power_eigh
-from ..functional import add_power_, lerp_power_, root, epsilon_step_size
-from ...utils.linalg.linear_operator import Dense
-
-def adagrad_(
-    tensors_: TensorList,
-    sq_sum_: TensorList,
-    alpha: float | NumberList,
-    lr_decay: float | NumberList,
-    eps: float | NumberList,
-    step: int,
-    pow: float = 2,
-    use_sqrt: bool = True,
-    divide: bool = False,
-
-    decay: float | None = None,
-    beta: float | None = None,
-
-    # inner args
-    inner: Module | None = None,
-    params: list[torch.Tensor] | None = None,
-    grads: list[torch.Tensor] | None = None,
-):
-    """returns `tensors_`"""
-    clr = alpha / (1 + step * lr_decay)
-
-    if beta is None or step == 1: sq_sum_ = add_power_(tensors_, sum_=sq_sum_, pow=pow)
-    else: sq_sum_ = lerp_power_(tensors_, exp_avg_pow_=sq_sum_, beta=beta, pow=pow)
-    if decay is not None:
-        sq_sum_.mul_(1-decay)
-
-    if inner is not None:
-        assert params is not None
-        tensors_ = TensorList(apply_transform(inner, tensors_, params=params, grads=grads))
-
-    if divide: sq_sum_ = sq_sum_ / max(step, 1)
-
-    if use_sqrt: tensors_.div_(root(sq_sum_, p=pow, inplace=False).add_(eps)).mul_(clr)
-    else: tensors_.div_(sq_sum_.add(eps)).mul_(clr)
+from ...utils import NumberList, TensorList, unpack_dicts
+from ...linalg.matrix_power import matrix_power as _matrix_power, MatrixPowerMethod
 
-    return tensors_
-
-
-
-class Adagrad(Transform):
+class Adagrad(TensorTransform):
     """Adagrad, divides by sum of past squares of gradients.
 
     This implementation is identical to ``torch.optim.Adagrad``.
@@ -72,103 +24,53 @@ class Adagrad(Transform):
     """
     def __init__(
         self,
+
+        # hyperparams
         lr_decay: float = 0,
         initial_accumulator_value: float = 0,
         eps: float = 1e-10,
         alpha: float = 1,
-        pow: float = 2,
-        use_sqrt: bool = True,
-        divide: bool=False,
-        beta:float | None = None,
-        decay: float | None = None,
+
+        # tfms
         inner: Chainable | None = None,
+        accumulator_tfm: Chainable | None = None
     ):
-        defaults = dict(alpha = alpha, lr_decay = lr_decay, initial_accumulator_value=initial_accumulator_value,
-                        eps = eps, pow=pow, use_sqrt = use_sqrt, divide=divide, beta=beta, decay=decay)
-        super().__init__(defaults=defaults, uses_grad=False)
+        defaults = locals().copy()
+        del defaults['self'], defaults['inner'], defaults["accumulator_tfm"]
+        super().__init__(defaults=defaults, inner=inner)
 
-        if inner is not None:
-            self.set_child('inner', inner)
+        self.set_child('accumulator', accumulator_tfm)
 
     @torch.no_grad
-    def apply_tensors(self, tensors, params, grads, loss, states, settings):
-        tensors = TensorList(tensors)
-        step = self.global_state['step'] = self.global_state.get('step', 0) + 1
+    def single_tensor_initialize(self, tensor, param, grad, loss, state, setting):
+        state["accumulator"] = torch.full_like(tensor, fill_value=setting["initial_accumulator_value"])
 
-        lr_decay,alpha,eps = unpack_dicts(settings, 'lr_decay', 'alpha', 'eps', cls=NumberList)
-
-        pow, use_sqrt, divide = itemgetter('pow', 'use_sqrt', 'divide')(settings[0])
-
-        sq_sum = unpack_states(states, tensors, 'sq_sum', cls=TensorList)
-
-        # initialize accumulator on 1st step
-        if step == 1:
-            sq_sum.set_(tensors.full_like([s['initial_accumulator_value'] for s in settings]))
-
-        return adagrad_(
-            tensors,
-            sq_sum_=sq_sum,
-            alpha=alpha,
-            lr_decay=lr_decay,
-            eps=eps,
-            step=step,
-            pow=pow,
-            use_sqrt=use_sqrt,
-            divide=divide,
-
-            beta = self.defaults["beta"],
-            decay = self.defaults["decay"],
-            # inner args
-            inner=self.children.get("inner", None),
-            params=params,
-            grads=grads,
-        )
-
-
-def lerp(start, end, weight):
-    return start + weight * (end - start)
-
-def adagrad_norm_(
-    tensors_: TensorList,
-    accumulator: float | torch.Tensor,
-    alpha: float | NumberList,
-    lr_decay: float | NumberList,
-    eps: float | NumberList,
-    step: int,
-    use_sqrt: bool = True,
-    divide: bool = False,
-
-    decay: float | None = None,
-    beta: float | None = None,
-
-    # inner args
-    inner: Module | None = None,
-    params: list[torch.Tensor] | None = None,
-    grads: list[torch.Tensor] | None = None,
-):
-    """returns `tensors_`"""
-    clr = alpha / (1 + step * lr_decay)
+    @torch.no_grad
+    def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
+        torch._foreach_addcmul_([state["accumulator"] for state in states], tensors, tensors)
+        self.increment_counter("step", start=0)
 
-    gg = tensors_.dot(tensors_)
+    @torch.no_grad
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
+        tensors_ = TensorList(tensors)
+        step = self.global_state["step"] # 0 on first apply
+        eps, alpha, lr_decay = unpack_dicts(settings, "eps", "alpha", "lr_decay", cls=NumberList)
 
-    if beta is None or step == 1: accumulator += gg
-    else: accumulator = lerp(accumulator, gg, 1-beta)
+        accumulator = [state["accumulator"] for state in states]
+        accumulator = TensorList(self.inner_step_tensors(
+            "accumulator", tensors=accumulator, clone=True, params=params, grads=grads, loss=loss, must_exist=False))
 
-    if decay is not None:
-        accumulator *= 1-decay
+        denom = accumulator.sqrt().add_(eps)
+        tensors_ /= denom
 
-    if inner is not None:
-        assert params is not None
-        tensors_ = TensorList(apply_transform(inner, tensors_, params=params, grads=grads))
+        clr = alpha / (1 + step * lr_decay)
+        tensors_.lazy_mul_(clr)
 
-    if divide: accumulator = accumulator / max(step, 1)
+        return tensors_
 
-    if use_sqrt: tensors_.div_(eps + accumulator.sqrt()).mul_(clr)
-    else: tensors_.div_(eps + accumulator).mul_(clr)
 
-    return tensors_, accumulator
 
-class AdagradNorm(Transform):
+class AdagradNorm(TensorTransform):
     """Adagrad-Norm, divides by sum of past means of squares of gradients.
 
     Args:
@@ -176,7 +78,6 @@ class AdagradNorm(Transform):
         initial_accumulator_value (float, optional): initial value of the sum of squares of gradients. Defaults to 0.
         eps (float, optional): division epsilon. Defaults to 1e-10.
         alpha (float, optional): step size. Defaults to 1.
-        pow (float, optional): power for gradients and accumulator root. Defaults to 2.
         use_sqrt (bool, optional): whether to take the root of the accumulator. Defaults to True.
         inner (Chainable | None, optional): Inner modules that are applied after updating accumulator and before preconditioning. Defaults to None.
     """
@@ -185,71 +86,104 @@ class AdagradNorm(Transform):
         lr_decay: float = 0,
         initial_accumulator_value: float = 0,
         eps: float = 1e-10,
-        alpha: float = 1,
-        pow: float = 2,
-        use_sqrt: bool = True,
-        divide: bool=False,
         beta:float | None = None,
-        decay: float | None = None,
+        beta_debias: bool = True,
+        layerwise: bool = True,
+        use_sqrt: bool = True,
+        alpha: float = 1,
         inner: Chainable | None = None,
     ):
-        defaults = dict(alpha = alpha, lr_decay = lr_decay, initial_accumulator_value=initial_accumulator_value,
-                        eps = eps, pow=pow, use_sqrt = use_sqrt, divide=divide, beta=beta, decay=decay)
-        super().__init__(defaults=defaults, uses_grad=False)
+        defaults = locals().copy()
+        del defaults['self'], defaults['inner']
+        super().__init__(defaults=defaults, inner=inner)
+
+    @torch.no_grad
+    def multi_tensor_initialize(self, tensors, params, grads, loss, states, settings):
+
+        # layerwise initialize in each state
+        if settings[0]["layerwise"]:
+            for tensor, state, setting in zip(tensors, states, settings):
+
+                initial_accumulator_value = setting["initial_accumulator_value"]
+                state["accumulator"] = torch.tensor(initial_accumulator_value, device=tensor.device, dtype=tensor.dtype)
+
+        # global initialize in global state
+        else:
+            initial_accumulator_value = settings[0]["initial_accumulator_value"]
+            tensor = tensors[0]
+            self.global_state["accumulator"] = torch.tensor(initial_accumulator_value, device=tensor.device, dtype=tensor.dtype)
+
+    def _get_accumulator(self, states, settings) -> torch.Tensor | TensorList:
+        layerwise = settings[0]["layerwise"]
+        if layerwise:
+            return TensorList(s["accumulator"] for s in states)
+
+        return self.global_state["accumulator"]
 
-        if inner is not None:
-            self.set_child('inner', inner)
+    @torch.no_grad
+    def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
+        tensors = TensorList(tensors)
+        accumulator = self._get_accumulator(states, settings)
+        self.increment_counter("step", start=0)
+
+        # compute squared gradient norm (gg)
+        if isinstance(accumulator, TensorList): gg = tensors.tensorwise_dot(tensors)
+        else: gg = tensors.dot(tensors)
+
+        # update the accumulator
+        beta = settings[0]["beta"]
+        if beta is None: accumulator.add_(gg) # pyright:ignore[reportArgumentType]
+        else: accumulator.lerp_(gg, weight=1-beta) # pyright:ignore[reportArgumentType, reportCallIssue]
 
     @torch.no_grad
-    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         tensors = TensorList(tensors)
-        step = self.global_state['step'] = self.global_state.get('step', 0) + 1
-        lr_decay,alpha,eps = unpack_dicts(settings, 'lr_decay', 'alpha', 'eps', cls=NumberList)
+        accumulator = self._get_accumulator(states, settings)
+        eps, alpha, lr_decay = unpack_dicts(settings, "eps", "alpha", "lr_decay", cls=NumberList)
+        step = self.global_state["step"] # 0 on 1st step
+        fs = settings[0]
+        beta = fs["beta"]
 
-        use_sqrt, divide, initial_accumulator_value = itemgetter('use_sqrt', 'divide', "initial_accumulator_value")(settings[0])
+        # ------------------------ debias if beta is not None ------------------------ #
+        if fs["beta_debias"] and beta is not None:
+            accumulator = accumulator / (1 - beta ** (step + 1))
 
-        accumulator = self.global_state.get("accumulator", initial_accumulator_value)
 
-        d, self.global_state["accumulator"] = adagrad_norm_(
-            tensors,
-            accumulator=accumulator,
-            alpha=alpha,
-            lr_decay=lr_decay,
-            eps=eps,
-            step=step,
-            use_sqrt=use_sqrt,
-            divide=divide,
+        # ---------------------------- compute denominator --------------------------- #
+        if fs["use_sqrt"]:
+            denom = accumulator.sqrt().add_(eps) # pyright:ignore[reportArgumentType]
+        else:
+            denom = accumulator + eps # pyright:ignore[reportOperatorIssue]
 
-            beta = self.defaults["beta"],
-            decay = self.defaults["decay"],
-            # inner args
-            inner=self.children.get("inner", None),
-            params=params,
-            grads=grads,
-        )
 
-        return d
+        # ---------------------------- compute the update ---------------------------- #
+        tensors /= denom
+        clr = alpha / (1 + step * lr_decay) # lr decay
+        tensors.lazy_mul_(clr)
 
+        return tensors
 
-class FullMatrixAdagrad(TensorwiseTransform):
+
+
+class FullMatrixAdagrad(TensorTransform):
     """Full-matrix version of Adagrad, can be customized to make RMSprop or Adam (see examples).
 
     Note:
         A more memory-efficient version equivalent to full matrix Adagrad on last n gradients is implemented in ``tz.m.LMAdagrad``.
 
     Args:
-        beta (float | None, optional): momentum for gradient outer product accumulators. if None, uses sum. Defaults to None.
-        decay (float | None, optional): decay for gradient outer product accumulators. Defaults to None.
-        sqrt (bool, optional): whether to take the square root of the accumulator. Defaults to True.
-        concat_params (bool, optional): if False, each parameter will have it's own accumulator. Defaults to True.
+        reg (float, optional): regularization, scale of identity matrix added to accumulator. Defaults to 1e-12.
         precond_freq (int, optional): frequency of updating the inverse square root of the accumulator. Defaults to 1.
+        beta (float | None, optional): momentum for gradient outer product accumulators. if None, uses sum. Defaults to None.
+        beta_debias (bool, optional): whether to use debiasing, only has effect when ``beta`` is not ``None``. Defaults to True.
         init (Literal[str], optional):
             how to initialize the accumulator.
            - "identity" - with identity matrix (default).
            - "zeros" - with zero matrix.
            - "ones" - with matrix of ones.
            -"GGT" - with the first outer product
-        divide (bool, optional): whether to divide the accumulator by number of gradients in it. Defaults to False.
+        matrix_power (float, optional): accumulator matrix power. Defaults to -1/2.
+        concat_params (bool, optional): if False, each parameter will have it's own accumulator. Defaults to True.
         inner (Chainable | None, optional): inner modules to apply preconditioning to. Defaults to None.
 
     ## Examples:
@@ -284,73 +218,89 @@ class FullMatrixAdagrad(TensorwiseTransform):
     """
     def __init__(
         self,
+        reg: float = 1e-12,
+        precond_freq: int = 1,
         beta: float | None = None,
-        decay: float | None = None,
-        sqrt: bool = True,
+        beta_debias: bool=True,
+        init: Literal["identity", "zeros", "GGT"] = "identity",
+        matrix_power: float = -1/2,
+        matrix_power_method: MatrixPowerMethod = "eigh_abs",
         concat_params=True,
-        precond_freq: int = 1,
-        init: Literal["identity", "zeros", "ones", "GGT"] = "identity",
-        reg: float = 1e-12,
-        divide: bool = False,
+
         inner: Chainable | None = None,
+        accumulator_tfm: Chainable | None = None
     ):
-        defaults = dict(beta=beta, decay=decay, sqrt=sqrt, precond_freq=precond_freq, init=init, divide=divide, reg=reg)
-        super().__init__(defaults, uses_grad=False, concat_params=concat_params, inner=inner,)
+        defaults = locals().copy()
+        del defaults['self'], defaults['inner'], defaults["concat_params"], defaults["accumulator_tfm"]
+        super().__init__(defaults=defaults, inner=inner, concat_params=concat_params)
+
+        self.set_child("accumulator", accumulator_tfm)
 
     @torch.no_grad
-    def update_tensor(self, tensor, param, grad, loss, state, setting):
+    def single_tensor_update(self, tensor, param, grad, loss, state, setting):
+
         G = tensor.ravel()
-        GG = torch.outer(G, G)
-        decay = setting['decay']
+        GGᵀ = torch.outer(G, G)
+
+        # initialize
+        if "accumulator" not in state:
+            init = setting['init']
+            if init == 'identity': state['accumulator'] = torch.eye(GGᵀ.size(0), device=GGᵀ.device, dtype=GGᵀ.dtype)
+            elif init == 'zeros': state['accumulator'] = torch.zeros_like(GGᵀ)
+            elif init == 'GGT': state['accumulator'] = GGᵀ.clone()
+            else: raise ValueError(init)
+
+        # update
         beta = setting['beta']
-        init = setting['init']
+        accumulator: torch.Tensor = state["accumulator"]
 
-        if 'GG' not in state:
-            if init == 'identity': state['GG'] = torch.eye(GG.size(0), device=GG.device, dtype=GG.dtype)
-            elif init == 'zeros': state['GG'] = torch.zeros_like(GG)
-            elif init == 'ones': state['GG'] = torch.ones_like(GG)
-            elif init == 'GGT': state['GG'] = GG.clone()
-            else: raise ValueError(init)
-            if decay is not None: state['GG'].mul_(decay)
+        if beta is None: accumulator.add_(GGᵀ)
+        else: accumulator.lerp_(GGᵀ, 1-beta)
 
-        if beta is not None: state['GG'].lerp_(GG, 1-beta)
-        else: state['GG'].add_(GG)
-        state['i'] = state.get('i', 0) + 1 # number of GGTs in sum
+        # update number of GG in accumulator for divide
+        state['num_GGTs'] = state.get('num_GGTs', 0) + 1
 
     @torch.no_grad
-    def apply_tensor(self, tensor, param, grad, loss, state, setting):
+    def single_tensor_apply(self, tensor, param, grad, loss, state, setting):
         step = state.get('step', 0)
         state['step'] = step + 1
 
-        GG: torch.Tensor = state['GG']
-        sqrt = setting['sqrt']
-        divide = setting['divide']
+        accumulator: torch.Tensor = state['accumulator']
+        accumulator = self.inner_step_tensors("accumulator", [accumulator], clone=True, must_exist=False)[0]
+
         precond_freq = setting['precond_freq']
         reg = setting['reg']
+        beta = setting["beta"]
 
-        if divide: GG = GG/state.get('i', 1)
-
+        # add regularizer
         if reg != 0:
-            GG = GG + torch.eye(GG.size(0), device=GG.device, dtype=GG.dtype).mul_(reg)
+            device = accumulator.device; dtype = accumulator.dtype
+            accumulator = accumulator + torch.eye(accumulator.size(0), device=device, dtype=dtype).mul_(reg)
 
+        # for single value use sqrt
         if tensor.numel() == 1:
-            GG = GG.squeeze()
-            if sqrt: return tensor / GG.sqrt()
-            return tensor / GG
+            dir = tensor.mul_(accumulator.squeeze() ** setting["matrix_power"])
 
-        try:
-            if sqrt:
+        # otherwise use matrix inverse square root
+        else:
+
+            # compute inverse square root and store to state
+            try:
                 if "B" not in state or step % precond_freq == 0:
-                    B = state["B"] = matrix_power_eigh(GG, -1/2)
+                    B = state["B"] = _matrix_power(accumulator, setting["matrix_power"], method=setting["matrix_power_method"])
                 else:
                     B = state["B"]
 
-            else: return torch.linalg.solve(GG, tensor.ravel()).view_as(tensor) # pylint:disable = not-callable
+                dir = (B @ tensor.ravel()).view_as(tensor)
+
+            # fallback to diagonal Adagrad on fail
+            except torch.linalg.LinAlgError:
+                dir = tensor.mul_(accumulator.diagonal() ** setting["matrix_power"])
 
-        except torch.linalg.LinAlgError:
-            # fallback to diagonal AdaGrad
-            denom = GG.diagonal()
-            if sqrt: denom = denom.sqrt()
-            return tensor.div_(denom + max(reg, 1e-12))
+        # debias
+        if setting["beta_debias"] and beta is not None:
+            num_GGTs = state.get('num_GGTs', 1)
+            bias_correction = 1 - beta ** num_GGTs
+            dir *= bias_correction ** 0.5
 
-        return (B @ tensor.ravel()).view_as(tensor)
+        return dir
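
For reference, the preconditioning performed by the new ``single_tensor_update`` / ``single_tensor_apply`` pair can be written out with plain tensor ops. This is a sketch, not library code: it covers only the default path (``beta=None``, ``init="identity"``, ``matrix_power=-1/2``, no inner transforms), and the eigendecomposition with absolute eigenvalues is an assumption about what ``_matrix_power(..., method="eigh_abs")`` computes.

```python
import torch

def full_matrix_adagrad_step(g: torch.Tensor, accumulator: torch.Tensor,
                             reg: float = 1e-12, matrix_power: float = -0.5) -> torch.Tensor:
    """One step on a flattened gradient g of shape (n,), with an (n, n) accumulator."""
    # single_tensor_update, beta=None branch: accumulate the gradient outer product
    accumulator += torch.outer(g, g)

    # single_tensor_apply: regularize the accumulator with reg * I ...
    A = accumulator + reg * torch.eye(accumulator.size(0),
                                      dtype=accumulator.dtype, device=accumulator.device)

    # ... raise it to matrix_power (eigh with absolute eigenvalues is an assumed
    # stand-in for the "eigh_abs" method), then precondition the gradient
    eigvals, Q = torch.linalg.eigh(A)
    B = (Q * eigvals.abs().pow(matrix_power)) @ Q.T
    return B @ g

# toy usage: precondition a 3-element gradient over a few steps
acc = torch.eye(3)  # mirrors init="identity" (the default); init="zeros" would start from torch.zeros(3, 3)
for _ in range(5):
    g = torch.randn(3)
    update = full_matrix_adagrad_step(g, acc)
```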