torchzero 0.3.15__py3-none-any.whl → 0.4.1__py3-none-any.whl
- tests/test_identical.py +22 -22
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +225 -214
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +2 -2
- torchzero/core/__init__.py +7 -4
- torchzero/core/chain.py +20 -23
- torchzero/core/functional.py +90 -24
- torchzero/core/modular.py +53 -57
- torchzero/core/module.py +132 -52
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +55 -24
- torchzero/core/transform.py +261 -367
- torchzero/linalg/__init__.py +11 -0
- torchzero/linalg/eigh.py +253 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +93 -0
- torchzero/{utils/linalg → linalg}/qr.py +16 -2
- torchzero/{utils/linalg → linalg}/solve.py +74 -88
- torchzero/linalg/svd.py +47 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/__init__.py +4 -3
- torchzero/modules/adaptive/__init__.py +11 -3
- torchzero/modules/adaptive/adagrad.py +167 -217
- torchzero/modules/adaptive/adahessian.py +76 -105
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +50 -31
- torchzero/modules/adaptive/adaptive_heavyball.py +12 -7
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/ggt.py +186 -0
- torchzero/modules/adaptive/lion.py +7 -11
- torchzero/modules/adaptive/lre_optimizers.py +299 -0
- torchzero/modules/adaptive/mars.py +7 -7
- torchzero/modules/adaptive/matrix_momentum.py +48 -52
- torchzero/modules/adaptive/msam.py +71 -53
- torchzero/modules/adaptive/muon.py +67 -129
- torchzero/modules/adaptive/natural_gradient.py +63 -41
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/psgd/__init__.py +5 -0
- torchzero/modules/adaptive/psgd/_psgd_utils.py +37 -0
- torchzero/modules/adaptive/psgd/psgd.py +1390 -0
- torchzero/modules/adaptive/psgd/psgd_dense_newton.py +174 -0
- torchzero/modules/adaptive/psgd/psgd_kron_newton.py +203 -0
- torchzero/modules/adaptive/psgd/psgd_kron_whiten.py +185 -0
- torchzero/modules/adaptive/psgd/psgd_lra_newton.py +118 -0
- torchzero/modules/adaptive/psgd/psgd_lra_whiten.py +116 -0
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +149 -130
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +22 -25
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +27 -38
- torchzero/modules/experimental/__init__.py +7 -6
- torchzero/modules/experimental/adanystrom.py +258 -0
- torchzero/modules/experimental/common_directions_whiten.py +142 -0
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/cubic_adam.py +160 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/eigen_sr1.py +182 -0
- torchzero/modules/experimental/eigengrad.py +207 -0
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/higher_order_newton.py +14 -40
- torchzero/modules/experimental/l_infinity.py +1 -1
- torchzero/modules/experimental/matrix_nag.py +122 -0
- torchzero/modules/experimental/newton_solver.py +23 -54
- torchzero/modules/experimental/newtonnewton.py +45 -48
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +3 -3
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/grad_approximation/fdm.py +2 -2
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +24 -21
- torchzero/modules/least_squares/gn.py +121 -50
- torchzero/modules/line_search/backtracking.py +4 -4
- torchzero/modules/line_search/line_search.py +33 -33
- torchzero/modules/line_search/strong_wolfe.py +4 -4
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +11 -79
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +121 -123
- torchzero/modules/misc/multistep.py +52 -53
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +31 -29
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +37 -31
- torchzero/modules/momentum/momentum.py +12 -12
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +20 -20
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/{functional.py → opt_utils.py} +1 -1
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +46 -43
- torchzero/modules/quasi_newton/__init__.py +1 -1
- torchzero/modules/quasi_newton/damping.py +2 -2
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +10 -10
- torchzero/modules/quasi_newton/lsr1.py +10 -10
- torchzero/modules/quasi_newton/quasi_newton.py +54 -39
- torchzero/modules/quasi_newton/sg2.py +69 -205
- torchzero/modules/restarts/restars.py +39 -37
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/ifn.py +31 -62
- torchzero/modules/second_order/inm.py +57 -53
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +165 -196
- torchzero/modules/second_order/newton_cg.py +105 -157
- torchzero/modules/second_order/nystrom.py +216 -185
- torchzero/modules/second_order/rsn.py +132 -125
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +10 -10
- torchzero/modules/step_size/adaptive.py +24 -24
- torchzero/modules/step_size/lr.py +17 -17
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +3 -3
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +2 -2
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +23 -21
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +17 -18
- torchzero/modules/wrappers/optim_wrapper.py +14 -14
- torchzero/modules/zeroth_order/cd.py +10 -7
- torchzero/optim/mbs.py +291 -0
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -13
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +8 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/benchmarks/__init__.py +0 -0
- torchzero/utils/benchmarks/logistic.py +122 -0
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +97 -73
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/METADATA +1 -1
- torchzero-0.4.1.dist-info/RECORD +209 -0
- tests/test_vars.py +0 -185
- torchzero/core/var.py +0 -376
- torchzero/modules/adaptive/lmadagrad.py +0 -186
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.15.dist-info/RECORD +0 -175
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/WHEEL +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/top_level.txt +0 -0
torchzero/modules/adaptive/lre_optimizers.py (new file)
@@ -0,0 +1,299 @@
+"""subspace optimizers to be used in a low rank eigenbasis
+
+three opts support this - GGT and experimental AdaNystrom and Eigengrad
+
+I could define repoject on a module but because most opts use per-parameter state that is complicated"""
+
+import math
+from abc import ABC, abstractmethod
+from typing import Any, cast
+
+import torch
+
+from ...linalg import matrix_power_eigh, torch_linalg
+from .lion import lion_
+
+class LREOptimizerBase(ABC):
+    """Optimizer to run in a low rank eigenbasis.
+
+    notes:
+
+    1. it shouldn't store any states in self, everything should be in state.
+    This is because this may be called on multiple parameters in a sequence
+
+    2. apply is always called first, than reproject whenever eigenbasis gets updated
+
+    3. L is variance in the eigenbasis.
+    """
+    @abstractmethod
+    def step(self, g: torch.Tensor, L: torch.Tensor, Q: torch.Tensor, state: dict) -> torch.Tensor:
+        ...
+
+    @abstractmethod
+    def reproject(self, L_old: torch.Tensor, Q_old: torch.Tensor,
+                  L_new: torch.Tensor, Q_new: torch.Tensor, state: dict) -> None:
+        ...
+
+class Whiten(LREOptimizerBase):
+    """This simply applies whitening and is equivalent to not running an optimizer in the eigenbasis"""
+    def step(self, g, L, Q, state): return (Q * L.rsqrt()) @ (Q.T @ g)
+    def reproject(self, L_old, Q_old, L_new, Q_new, state): pass
+
+class EMA(LREOptimizerBase):
+    """Maintains exponential moving average of gradients in the low rank eigenbasis. Nesterov setting is experimental"""
+    def __init__(self, beta=0.9, nesterov:bool=False, cautious:bool=False, whiten:bool=True):
+        self.beta = beta
+        self.nesterov = nesterov
+        self.whiten = whiten
+        self.cautious = cautious
+
+    def step(self, g, L, Q, state):
+        g = Q.T @ g
+
+        if "exp_avg" not in state:
+            state["exp_avg"] = torch.zeros_like(g)
+
+        exp_avg = state["exp_avg"]
+        exp_avg.lerp_(g, 1-self.beta)
+
+        if self.nesterov:
+            dir = (g + exp_avg * self.beta) / (1 + self.beta)
+        else:
+            dir = exp_avg
+
+        if self.cautious:
+            mask = (g * dir) > 0
+            dir *= mask
+
+        if self.whiten: return (Q * L.rsqrt()) @ dir
+        return Q @ dir
+
+    def reproject(self, L_old, Q_old, L_new, Q_new, state):
+        if "exp_avg" not in state: return
+        C = Q_new.T @ Q_old
+        state["exp_avg"] = C @ state["exp_avg"]
+
+
+def adam(g:torch.Tensor, state:dict, beta1, beta2, eps):
+
+    if "exp_avg" not in state:
+        state["exp_avg"] = torch.zeros_like(g)
+        state["exp_avg_sq"] = torch.zeros_like(g)
+        state["current_step"] = 1
+
+    exp_avg = state["exp_avg"]
+    exp_avg_sq = state["exp_avg_sq"]
+    current_step = state["current_step"]
+
+    exp_avg.lerp_(g, 1-beta1)
+    exp_avg_sq.mul_(beta2).addcmul_(g, g, value=1-beta2)
+    denom = exp_avg_sq.sqrt().add_(eps)
+
+    bias_correction1 = 1.0 - (beta1 ** current_step)
+    bias_correction2 = 1.0 - (beta2 ** current_step)
+    alpha = math.sqrt(bias_correction2) / bias_correction1
+    state["current_step"] = current_step + 1
+
+    return (exp_avg * alpha) / denom
+
+def _squared_reproject(C: torch.Tensor, sq: torch.Tensor, exact: bool):
+    if exact:
+        return (C @ sq.diag_embed() @ C.T).diagonal()
+
+    return C.square() @ sq
+
+class Adam(LREOptimizerBase):
+    """Runs Adam in low rank eigenbasis."""
+    def __init__(self, beta1=0.9, beta2=0.95, cautious:bool=False, eps=1e-8, exact_reproject:bool=True):
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.eps = eps
+        self.cautious = cautious
+        self.exact_reproject = exact_reproject
+
+    def step(self, g, L, Q, state):
+        g = Q.T @ g
+
+        dir = adam(g, state, self.beta1, self.beta2, self.eps)
+
+        if self.cautious:
+            mask = (g * dir) > 0
+            dir *= mask
+
+        return Q @ dir
+
+    def reproject(self, L_old, Q_old, L_new, Q_new, state):
+        if "exp_avg" not in state: return
+        C = Q_new.T @ Q_old
+
+        state["exp_avg"] = C @ state["exp_avg"]
+        state["exp_avg_sq"] = _squared_reproject(C, state["exp_avg_sq"], self.exact_reproject)
+
+
+class FullMatrixAdam(LREOptimizerBase):
+    """Runs full-matrix Adam in low rank eigenbasis.
+    The preconditioner is updated whenever basis is updated"""
+    def __init__(self, beta1=0.9, beta2=0.95, eps=1e-8, matrix_power=-1/2, abs=True, cautious:bool=False):
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.eps = eps
+        self.matrix_power = matrix_power
+        self.abs = abs
+        self.cautious = cautious
+
+    def step(self, g, L, Q, state):
+        g = Q.T @ g
+
+        # initialize
+        if "exp_avg" not in state:
+            state["exp_avg"] = torch.zeros_like(g)
+            state["covariance"] = torch.eye(g.numel(), device=g.device, dtype=g.dtype)
+            state["preconditioner"] = torch.eye(g.numel(), device=g.device, dtype=g.dtype)
+            state["reprojected"] = True
+            state["current_step"] = 1
+
+        exp_avg = state["exp_avg"]
+        covariance = state["covariance"]
+        current_step = state["current_step"]
+
+        # update buffers
+        exp_avg.lerp_(g, 1-self.beta1)
+        covariance.lerp_(g.outer(g), weight=1-self.beta2)
+
+        # correct bias
+        bias_correction1 = 1.0 - (self.beta1 ** current_step)
+        exp_avg = exp_avg / bias_correction1
+
+        # after reprojecting update the preconditioner
+        if state["reprojected"]:
+            state["reprojected"] = False
+
+            bias_correction2 = 1.0 - (self.beta2 ** current_step)
+            covariance = covariance / bias_correction2
+
+            reg = torch.eye(covariance.size(0), device=covariance.device, dtype=covariance.dtype).mul_(self.eps)
+            covariance = covariance + reg
+
+            # compute matrix power
+            try:
+                state["preconditioner"] = matrix_power_eigh(covariance, self.matrix_power, abs=self.abs)
+
+            except torch.linalg.LinAlgError:
+
+                # fallback to diagonal
+                state["preconditioner"] = covariance.diagonal().rsqrt().diag_embed()
+
+        # compute the update
+        state["current_step"] = current_step + 1
+        preconditioner = state["preconditioner"]
+        dir = preconditioner @ exp_avg
+
+        if self.cautious:
+            mask = (g * dir) > 0
+            dir *= mask
+
+        return Q @ dir
+
+    def reproject(self, L_old, Q_old, L_new, Q_new, state):
+        if "exp_avg" not in state: return
+
+        state["reprojected"] = True
+
+        C = Q_new.T @ Q_old
+        state["exp_avg"] = C @ state["exp_avg"]
+        state["covariance"] = C @ state["covariance"] @ C.T
+
+class Lion(LREOptimizerBase):
+    """Runs Lion in the low rank eigenbasis."""
+    def __init__(self, beta1=0.9, beta2=0.99, cautious:bool=False):
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.cautious = cautious
+
+    def step(self, g, L, Q, state):
+        g = Q.T @ g
+
+        if "exp_avg" not in state:
+            state["exp_avg"] = torch.zeros_like(g)
+
+        dir = cast(torch.Tensor, lion_(g, state["exp_avg"], beta1=self.beta1, beta2=self.beta2))
+
+        if self.cautious:
+            mask = (g * dir) > 0
+            dir *= mask
+
+        return Q @ dir
+
+    def reproject(self, L_old, Q_old, L_new, Q_new, state):
+        if "exp_avg" not in state: return
+        C = Q_new.T @ Q_old
+        state["exp_avg"] = C @ state["exp_avg"]
+
+
+class Grams(LREOptimizerBase):
+    """Runs Grams in low rank eigenbasis."""
+    def __init__(self, beta1=0.9, beta2=0.95, eps=1e-8, exact_reproject=True):
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.eps = eps
+        self.exact_reproject = exact_reproject
+
+    def step(self, g, L, Q, state):
+        g = Q.T @ g
+        dir = adam(g, state, self.beta1, self.beta2, self.eps)
+        return Q @ dir.copysign(g)
+
+    def reproject(self, L_old, Q_old, L_new, Q_new, state):
+        if "exp_avg" not in state: return
+        C = Q_new.T @ Q_old
+
+        state["exp_avg"] = C @ state["exp_avg"]
+        state["exp_avg_sq"] = _squared_reproject(C, state["exp_avg_sq"], self.exact_reproject)
+
+
+class LaProp(LREOptimizerBase):
+    """Runs LaProp in low rank eigenbasis."""
+    def __init__(self, beta1=0.9, beta2=0.95, eps=1e-8, cautious:bool=False, exact_reproject=True):
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.eps = eps
+        self.cautious = cautious
+        self.exact_reproject = exact_reproject
+
+    def step(self, g, L, Q, state):
+        g = Q.T @ g
+
+        if "exp_avg" not in state:
+            state["exp_avg"] = torch.zeros_like(g)
+            state["exp_avg_sq"] = torch.zeros_like(g)
+            state["current_step"] = 1
+
+        exp_avg = state["exp_avg"]
+        exp_avg_sq = state["exp_avg_sq"]
+        current_step = state["current_step"]
+
+        # update second moments
+        exp_avg_sq.mul_(self.beta2).addcmul_(g, g, value=1-self.beta2)
+        bias_correction2 = 1.0 - (self.beta2 ** current_step)
+
+        # divide by bias corrected second moments
+        dir = g / (exp_avg_sq / bias_correction2).sqrt().add_(self.eps)
+
+        # update first moments and bias correct
+        exp_avg.lerp_(dir, 1-self.beta1)
+        bias_correction1 = 1.0 - (self.beta1 ** current_step)
+        dir = exp_avg / bias_correction1
+
+        if self.cautious:
+            mask = (g * dir) > 0
+            dir *= mask
+
+        state["current_step"] = current_step + 1
+        return Q @ dir
+
+    def reproject(self, L_old, Q_old, L_new, Q_new, state):
+        if "exp_avg" not in state: return
+        C = Q_new.T @ Q_old
+
+        state["exp_avg"] = C @ state["exp_avg"]
+        state["exp_avg_sq"] = _squared_reproject(C, state["exp_avg_sq"], self.exact_reproject)
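The `reproject` methods above all transport per-basis buffers with the change-of-basis matrix `C = Q_new.T @ Q_old`. A minimal standalone sketch of that arithmetic (illustrative only, not part of the package; the sizes, seed, and buffer names are made up for the example):

```python
import torch

torch.manual_seed(0)
n, k = 8, 3

# two orthonormal bases standing in for consecutive low-rank eigenbases
Q_old, _ = torch.linalg.qr(torch.randn(n, k))
Q_new, _ = torch.linalg.qr(torch.randn(n, k))

# a first-moment buffer that was accumulated in the old basis
m_old = torch.randn(k)

# change of basis used by reproject(): C = Q_new.T @ Q_old
C = Q_new.T @ Q_old
m_new = C @ m_old

# the transported buffer matches the projection of the old full-space
# vector Q_old @ m_old onto the span of the new basis
lhs = Q_new @ m_new
rhs = Q_new @ (Q_new.T @ (Q_old @ m_old))
print(torch.allclose(lhs, rhs, atol=1e-6))
```

Second-moment buffers get the quadratic version of the same transport, which is what `_squared_reproject` computes from `C` and the stored `exp_avg_sq`.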
torchzero/modules/adaptive/mars.py
@@ -1,6 +1,6 @@
 import torch
 
-from ...core import
+from ...core import TensorTransform
 from ...utils import NumberList, TensorList, unpack_dicts, unpack_states
 
 
@@ -20,7 +20,7 @@ def mars_correction_(
 
     return c
 
-class MARSCorrection(
+class MARSCorrection(TensorTransform):
     """MARS variance reduction correction.
 
     Place any other momentum-based optimizer after this,
@@ -35,7 +35,7 @@ class MARSCorrection(Transform):
 
     Mars-AdamW
     ```python
-    optimizer = tz.
+    optimizer = tz.Optimizer(
         model.parameters(),
         tz.m.MARSCorrection(beta=0.95),
         tz.m.Adam(beta1=0.95, beta2=0.99),
@@ -46,7 +46,7 @@ class MARSCorrection(Transform):
 
     Mars-Lion
    ```python
-    optimizer = tz.
+    optimizer = tz.Optimizer(
         model.parameters(),
         tz.m.MARSCorrection(beta=0.9),
         tz.m.Lion(beta1=0.9),
@@ -61,11 +61,11 @@ class MARSCorrection(Transform):
         scaling: float = 0.025,
         max_norm: float | None = 1,
     ):
-        defaults=dict(beta=beta, scaling=scaling, max_norm=max_norm)
-        super().__init__(defaults
+        defaults = dict(beta=beta, scaling=scaling, max_norm=max_norm)
+        super().__init__(defaults)
 
     @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         prev = unpack_states(states, tensors, 'prev', init=tensors, cls=TensorList)
         beta, scaling = unpack_dicts(settings, 'beta', 'scaling', cls=NumberList)
         max_norm = settings[0]['max_norm']
torchzero/modules/adaptive/matrix_momentum.py
@@ -1,14 +1,13 @@
 from typing import Literal
-
+
 import torch
 
-from ...core import
-from ...utils import NumberList, TensorList,
-from
-from ..functional import initial_step_size
+from ...core import Chainable, Transform, HVPMethod
+from ...utils import NumberList, TensorList, unpack_states, unpack_dicts
+from ..opt_utils import initial_step_size
 
 
-class MatrixMomentum(
+class MatrixMomentum(Transform):
     """Second order momentum method.
 
     Matrix momentum is useful for convex objectives, also for some reason it has very really good generalization on elastic net logistic regression.
@@ -23,17 +22,17 @@ class MatrixMomentum(Module):
     Args:
         mu (float, optional): this has a similar role to (1 - beta) in normal momentum. Defaults to 0.1.
         hvp_method (str, optional):
-            Determines how
-
-            - ``"
-
-            - ``"
-
-
-
-
-
-
+            Determines how hessian-vector products are computed.
+
+            - ``"batched_autograd"`` - uses autograd with batched hessian-vector products. If a single hessian-vector is evaluated, equivalent to ``"autograd"``. Faster than ``"autograd"`` but uses more memory.
+            - ``"autograd"`` - uses autograd hessian-vector products. If multiple hessian-vector products are evaluated, uses a for-loop. Slower than ``"batched_autograd"`` but uses less memory.
+            - ``"fd_forward"`` - uses gradient finite difference approximation with a less accurate forward formula which requires one extra gradient evaluation per hessian-vector product.
+            - ``"fd_central"`` - uses gradient finite difference approximation with a more accurate central formula which requires two gradient evaluations per hessian-vector product.
+
+            Defaults to ``"autograd"``.
+        h (float, optional):
+            The step size for finite difference if ``hvp_method`` is
+            ``"fd_forward"`` or ``"fd_central"``. Defaults to 1e-3.
         hvp_tfm (Chainable | None, optional): optional module applied to hessian-vector products. Defaults to None.
 
     Reference:
@@ -44,51 +43,45 @@ class MatrixMomentum(Module):
         self,
         lr:float,
         mu=0.1,
-        hvp_method:
+        hvp_method: HVPMethod = "autograd",
         h: float = 1e-3,
         adaptive:bool = False,
         adapt_freq: int | None = None,
-
+
+        inner: Chainable | None = None,
     ):
         defaults = dict(lr=lr, mu=mu, hvp_method=hvp_method, h=h, adaptive=adaptive, adapt_freq=adapt_freq)
-        super().__init__(defaults)
-
-        if hvp_tfm is not None:
-            self.set_child('hvp_tfm', hvp_tfm)
+        super().__init__(defaults, inner=inner)
 
     def reset_for_online(self):
         super().reset_for_online()
         self.clear_state_keys('p_prev')
 
     @torch.no_grad
-    def
-
-        p = TensorList(
-        p_prev =
+    def update_states(self, objective, states, settings):
+        step = self.increment_counter("step", 0)
+        p = TensorList(objective.params)
+        p_prev = unpack_states(states, p, 'p_prev', init=p)
 
-
-
-
-        self.global_state["step"] = step + 1
+        fs = settings[0]
+        hvp_method = fs['hvp_method']
+        h = fs['h']
 
         if step > 0:
             s = p - p_prev
 
-            Hs, _ =
+            Hs, _ = objective.hessian_vector_product(s, at_x0=True, rgrad=None, hvp_method=hvp_method, h=h, retain_graph=False)
             Hs = [t.detach() for t in Hs]
 
-            if 'hvp_tfm' in self.children:
-                Hs = TensorList(apply_transform(self.children['hvp_tfm'], Hs, params=p, grads=var.grad, var=var))
-
             self.store(p, ("Hs", "s"), (Hs, s))
 
         # -------------------------------- adaptive mu ------------------------------- #
-        if
-            g = TensorList(
+        if fs["adaptive"]:
+            g = TensorList(objective.get_grads())
 
-            if
+            if fs["adapt_freq"] is None:
                 # ---------------------------- deterministic case ---------------------------- #
-                g_prev =
+                g_prev = unpack_states(states, p, "g_prev", cls=TensorList)
                 y = g - g_prev
                 g_prev.copy_(g)
                 denom = y.global_vector_norm()
@@ -101,14 +94,14 @@ class MatrixMomentum(Module):
 
                 # we start on 1nd step, and want to adapt when we start, so use (step - 1)
                 if (step - 1) % adapt_freq == 0:
-                    assert
-                    params = TensorList(
+                    assert objective.closure is not None
+                    params = TensorList(objective.params)
                     p_cur = params.clone()
 
                     # move to previous params and evaluate p_prev with current mini-batch
-                    params.copy_(
+                    params.copy_(unpack_states(states, p, 'p_prev'))
                     with torch.enable_grad():
-
+                        objective.closure()
                     g_prev = [p.grad if p.grad is not None else torch.zeros_like(p) for p in params]
                     y = g - g_prev
 
@@ -119,12 +112,12 @@ class MatrixMomentum(Module):
             denom = denom.clip(min=torch.finfo(denom.dtype).tiny * 2)
             self.global_state["mu_mul"] = s.global_vector_norm() / denom
 
-        torch._foreach_copy_(p_prev,
+        torch._foreach_copy_(p_prev, objective.params)
 
     @torch.no_grad
-    def
-        update = TensorList(
-        lr,mu =
+    def apply_states(self, objective, states, settings):
+        update = TensorList(objective.get_updates())
+        lr, mu = unpack_dicts(settings, "lr", 'mu', cls=NumberList)
 
         if "mu_mul" in self.global_state:
             mu = mu * self.global_state["mu_mul"]
@@ -133,14 +126,17 @@ class MatrixMomentum(Module):
         # p_prev is not available so make a small step
         step = self.global_state["step"]
         if step == 1:
-            if self.defaults["adaptive"]:
+            if self.defaults["adaptive"]:
+                # initialize
+                unpack_states(states, objective.params, "g_prev", init=objective.get_grads())
+
             update.mul_(lr) # separate so that initial_step_size can clip correctly
             update.mul_(initial_step_size(update, 1e-7))
-            return
+            return objective
 
         # -------------------------- matrix momentum update -------------------------- #
-        s, Hs =
+        s, Hs = unpack_states(states, objective.params, 's', 'Hs', cls=TensorList)
 
         update.mul_(lr).sub_(s).add_(Hs*mu)
-
-        return
+        objective.updates = update
+        return objective