torchzero 0.3.15__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
- tests/test_identical.py +22 -22
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +225 -214
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +2 -2
- torchzero/core/__init__.py +7 -4
- torchzero/core/chain.py +20 -23
- torchzero/core/functional.py +90 -24
- torchzero/core/modular.py +53 -57
- torchzero/core/module.py +132 -52
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +55 -24
- torchzero/core/transform.py +261 -367
- torchzero/linalg/__init__.py +11 -0
- torchzero/linalg/eigh.py +253 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +93 -0
- torchzero/{utils/linalg → linalg}/qr.py +16 -2
- torchzero/{utils/linalg → linalg}/solve.py +74 -88
- torchzero/linalg/svd.py +47 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/__init__.py +4 -3
- torchzero/modules/adaptive/__init__.py +11 -3
- torchzero/modules/adaptive/adagrad.py +167 -217
- torchzero/modules/adaptive/adahessian.py +76 -105
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +50 -31
- torchzero/modules/adaptive/adaptive_heavyball.py +12 -7
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/ggt.py +186 -0
- torchzero/modules/adaptive/lion.py +7 -11
- torchzero/modules/adaptive/lre_optimizers.py +299 -0
- torchzero/modules/adaptive/mars.py +7 -7
- torchzero/modules/adaptive/matrix_momentum.py +48 -52
- torchzero/modules/adaptive/msam.py +71 -53
- torchzero/modules/adaptive/muon.py +67 -129
- torchzero/modules/adaptive/natural_gradient.py +63 -41
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/psgd/__init__.py +5 -0
- torchzero/modules/adaptive/psgd/_psgd_utils.py +37 -0
- torchzero/modules/adaptive/psgd/psgd.py +1390 -0
- torchzero/modules/adaptive/psgd/psgd_dense_newton.py +174 -0
- torchzero/modules/adaptive/psgd/psgd_kron_newton.py +203 -0
- torchzero/modules/adaptive/psgd/psgd_kron_whiten.py +185 -0
- torchzero/modules/adaptive/psgd/psgd_lra_newton.py +118 -0
- torchzero/modules/adaptive/psgd/psgd_lra_whiten.py +116 -0
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +149 -130
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +22 -25
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +27 -38
- torchzero/modules/experimental/__init__.py +7 -6
- torchzero/modules/experimental/adanystrom.py +258 -0
- torchzero/modules/experimental/common_directions_whiten.py +142 -0
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/cubic_adam.py +160 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/eigen_sr1.py +182 -0
- torchzero/modules/experimental/eigengrad.py +207 -0
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/higher_order_newton.py +14 -40
- torchzero/modules/experimental/l_infinity.py +1 -1
- torchzero/modules/experimental/matrix_nag.py +122 -0
- torchzero/modules/experimental/newton_solver.py +23 -54
- torchzero/modules/experimental/newtonnewton.py +45 -48
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +3 -3
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/grad_approximation/fdm.py +2 -2
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +24 -21
- torchzero/modules/least_squares/gn.py +121 -50
- torchzero/modules/line_search/backtracking.py +4 -4
- torchzero/modules/line_search/line_search.py +33 -33
- torchzero/modules/line_search/strong_wolfe.py +4 -4
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +11 -79
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +121 -123
- torchzero/modules/misc/multistep.py +52 -53
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +31 -29
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +37 -31
- torchzero/modules/momentum/momentum.py +12 -12
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +20 -20
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/{functional.py → opt_utils.py} +1 -1
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +46 -43
- torchzero/modules/quasi_newton/__init__.py +1 -1
- torchzero/modules/quasi_newton/damping.py +2 -2
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +10 -10
- torchzero/modules/quasi_newton/lsr1.py +10 -10
- torchzero/modules/quasi_newton/quasi_newton.py +54 -39
- torchzero/modules/quasi_newton/sg2.py +69 -205
- torchzero/modules/restarts/restars.py +39 -37
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/ifn.py +31 -62
- torchzero/modules/second_order/inm.py +57 -53
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +165 -196
- torchzero/modules/second_order/newton_cg.py +105 -157
- torchzero/modules/second_order/nystrom.py +216 -185
- torchzero/modules/second_order/rsn.py +132 -125
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +10 -10
- torchzero/modules/step_size/adaptive.py +24 -24
- torchzero/modules/step_size/lr.py +17 -17
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +3 -3
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +2 -2
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +23 -21
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +17 -18
- torchzero/modules/wrappers/optim_wrapper.py +14 -14
- torchzero/modules/zeroth_order/cd.py +10 -7
- torchzero/optim/mbs.py +291 -0
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -13
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +8 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/benchmarks/__init__.py +0 -0
- torchzero/utils/benchmarks/logistic.py +122 -0
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +97 -73
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/METADATA +1 -1
- torchzero-0.4.1.dist-info/RECORD +209 -0
- tests/test_vars.py +0 -185
- torchzero/core/var.py +0 -376
- torchzero/modules/adaptive/lmadagrad.py +0 -186
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.15.dist-info/RECORD +0 -175
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/WHEEL +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/top_level.txt +0 -0
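The path moves listed above (for example `torchzero/{utils/linalg → linalg}/linear_operator.py`) and the docstring hunks below suggest two user-visible changes: linear-algebra helpers now live under `torchzero.linalg`, and the examples construct optimizers with `tz.Optimizer`. A minimal sketch under that assumption (names are taken from this diff, not verified against the 0.4.1 documentation):

```python
import torch
from torch import nn
import torchzero as tz

# moved in 0.4.1: previously importable as torchzero.utils.linalg.linear_operator
from torchzero.linalg import linear_operator

# optimizer construction as shown in the updated NaturalGradient docstring below
model = nn.Sequential(nn.Linear(20, 64), nn.ELU(), nn.Linear(64, 10))
opt = tz.Optimizer(
    model.parameters(),
    tz.m.NaturalGradient(),
    tz.m.LR(3e-2),
)
```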
torchzero/modules/adaptive/natural_gradient.py:

@@ -1,12 +1,12 @@
 import torch
-from ...core import
+from ...core import Transform
 
 from ...utils.derivatives import jacobian_wrt, flatten_jacobian
-from ...utils import vec_to_tensors
-from ...
-from .
+from ...utils import vec_to_tensors
+from ...linalg import linear_operator
+from .ggt import ggt_update
 
-class NaturalGradient(
+class NaturalGradient(Transform):
     """Natural gradient approximated via empirical fisher information matrix.
 
     To use this, either pass vector of per-sample losses to the step method, or make sure
@@ -27,9 +27,9 @@ class NaturalGradient(Module):
             with a vector that isn't strictly per-sample gradients, but rather for example different losses.
         gn_grad (bool, optional):
             if True, uses Gauss-Newton G^T @ f as the gradient, which is effectively sum weighted by value
-            and is equivalent to squaring the values.
-
-            This has an effect when ``sqrt=
+            and is equivalent to squaring the values. That makes the kernel trick solver incorrect, but for
+            some reason it still works. If False, uses sum of per-sample gradients.
+            This has an effect when ``sqrt=False``, and affects the ``grad`` attribute.
             Defaults to False.
         batched (bool, optional): whether to use vmapping. Defaults to True.
 
@@ -41,7 +41,7 @@ class NaturalGradient(Module):
         y = torch.randn(64, 10)
 
         model = nn.Sequential(nn.Linear(20, 64), nn.ELU(), nn.Linear(64, 10))
-        opt = tz.
+        opt = tz.Optimizer(
             model.parameters(),
             tz.m.NaturalGradient(),
             tz.m.LR(3e-2)
@@ -61,7 +61,7 @@ class NaturalGradient(Module):
         y = torch.randn(64, 10)
 
         model = nn.Sequential(nn.Linear(20, 64), nn.ELU(), nn.Linear(64, 10))
-        opt = tz.
+        opt = tz.Optimizer(
             model.parameters(),
             tz.m.NaturalGradient(),
             tz.m.LR(3e-2)
@@ -84,7 +84,7 @@ class NaturalGradient(Module):
             return torch.stack([(1 - x1).abs(), (10 * (x2 - x1**2).abs())])
 
         X = torch.tensor([-1.1, 2.5], requires_grad=True)
-        opt = tz.
+        opt = tz.Optimizer([X], tz.m.NaturalGradient(sqrt=True, gn_grad=True), tz.m.LR(0.05))
 
         for iter in range(200):
             losses = rosenbrock(X)
@@ -97,20 +97,27 @@ class NaturalGradient(Module):
         super().__init__(defaults=dict(batched=batched, reg=reg, sqrt=sqrt, gn_grad=gn_grad))
 
     @torch.no_grad
-    def
-        params =
-
-
-
-
-
-
+    def update_states(self, objective, states, settings):
+        params = objective.params
+        closure = objective.closure
+        fs = settings[0]
+        batched = fs['batched']
+        gn_grad = fs['gn_grad']
+
+        # compute per-sample losses
+        f = objective.loss
+        if f is None:
+            assert closure is not None
+            with torch.enable_grad():
+                f = objective.get_loss(backward=False) # n_out
+        assert isinstance(f, torch.Tensor)
+
+        # compute per-sample gradients
         with torch.enable_grad():
-            f = var.get_loss(backward=False) # n_out
-            assert isinstance(f, torch.Tensor)
             G_list = jacobian_wrt([f.ravel()], params, batched=batched)
 
-
+        # set scalar loss and it's grad to objective
+        objective.loss = f.sum()
         G = self.global_state["G"] = flatten_jacobian(G_list) # (n_samples, ndim)
 
         if gn_grad:
@@ -119,13 +126,15 @@ class NaturalGradient(Module):
         else:
             g = self.global_state["g"] = G.sum(0)
 
-
+        objective.grads = vec_to_tensors(g, params)
 
         # set closure to calculate scalar value for line searches etc
-        if
+        if closure is not None:
+
             def ngd_closure(backward=True):
+
                 if backward:
-
+                    objective.zero_grad()
                     with torch.enable_grad():
                         loss = closure(False)
                         if gn_grad: loss = loss.pow(2)
@@ -137,39 +146,52 @@ class NaturalGradient(Module):
                     if gn_grad: loss = loss.pow(2)
                 return loss.sum()
 
-
+            objective.closure = ngd_closure
 
     @torch.no_grad
-    def
-        params =
-
-
+    def apply_states(self, objective, states, settings):
+        params = objective.params
+        fs = settings[0]
+        reg = fs['reg']
+        sqrt = fs['sqrt']
 
         G: torch.Tensor = self.global_state['G'] # (n_samples, n_dim)
 
         if sqrt:
             # this computes U, S <- SVD(M), then calculate update as U S^-1 Uᵀg,
             # but it computes it through eigendecompotision
-
-
+            L, U = ggt_update(G.H, damping=reg, rdamping=1e-16, truncate=0, eig_tol=1e-12)
+
+            if U is None or L is None:
+
+                # fallback to element-wise
+                g = self.global_state["g"]
+                g /= G.square().mean(0).sqrt().add(reg)
+                objective.updates = vec_to_tensors(g, params)
+                return objective
 
-
-
-
+            # whiten
+            z = U.T @ self.global_state["g"]
+            v = (U * L.rsqrt()) @ z
+            objective.updates = vec_to_tensors(v, params)
+            return objective
 
-
+        # we need (G^T G)v = g
+        # where g = G^T
+        # so we need to solve (G^T G)v = G^T
+        GGt = G @ G.H # (n_samples, n_samples)
 
         if reg != 0:
-
+            GGt.add_(torch.eye(GGt.size(0), device=GGt.device, dtype=GGt.dtype).mul_(reg))
 
-        z, _ = torch.linalg.solve_ex(
+        z, _ = torch.linalg.solve_ex(GGt, torch.ones_like(GGt[0])) # pylint:disable=not-callable
         v = G.H @ z
 
-
-        return
+        objective.updates = vec_to_tensors(v, params)
+        return objective
 
 
-    def get_H(self,
+    def get_H(self, objective=...):
         if "G" not in self.global_state: return linear_operator.ScaledIdentity()
         G = self.global_state['G']
         return linear_operator.AtA(G)
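The non-`sqrt` branch of the new `apply_states` relies on the kernel trick spelled out in its comments: with per-sample gradients `G` of shape `(n_samples, ndim)` and `g = Gᵀ·1`, solving the small system `(GGᵀ + reg·I)z = 1` and taking `v = Gᵀz` yields the same vector as the parameter-space solve `(GᵀG + reg·I)v = g`, without ever forming an `ndim × ndim` matrix. A standalone check of that identity in plain torch (not the torchzero module API):

```python
import torch

n_samples, ndim, reg = 8, 100, 1e-3
G = torch.randn(n_samples, ndim, dtype=torch.float64)  # per-sample gradients

# small (n_samples x n_samples) system, as in apply_states
GGt = G @ G.T + reg * torch.eye(n_samples, dtype=G.dtype)
z = torch.linalg.solve(GGt, torch.ones(n_samples, dtype=G.dtype))
v = G.T @ z

# same direction from the direct (ndim x ndim) regularized solve
g = G.sum(0)  # g = G^T @ ones
v_direct = torch.linalg.solve(G.T @ G + reg * torch.eye(ndim, dtype=G.dtype), g)
print(torch.allclose(v, v_direct))  # True, up to numerical error
```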
torchzero/modules/adaptive/orthograd.py:

@@ -1,13 +1,9 @@
-from
-import math
-import warnings
-from collections.abc import Iterable, Sequence
-from typing import Literal
+from collections.abc import Iterable
 
 import torch
 
-from ...core import
-from ...utils import
+from ...core import TensorTransform
+from ...utils import TensorList
 
 def orthograd_(params: Iterable[torch.Tensor], eps: float = 1e-30):
     """Applies ⟂Grad - projects gradient of an iterable of parameters to be orthogonal to the weights.
@@ -19,29 +15,29 @@ def orthograd_(params: Iterable[torch.Tensor], eps: float = 1e-30):
     reference
         https://arxiv.org/abs/2501.04697
     """
-    params =
+    params = TensorList(params).with_grad()
    grad = params.grad
    grad -= (params.dot(grad)/(params.dot(params) + eps)) * params
 
 
-class OrthoGrad(
+class OrthoGrad(TensorTransform):
    """Applies ⟂Grad - projects gradient of an iterable of parameters to be orthogonal to the weights.
 
    Args:
        eps (float, optional): epsilon added to the denominator for numerical stability (default: 1e-30)
        renormalize (bool, optional): whether to graft projected gradient to original gradient norm. Defaults to True.
-        target (Target, optional): what to set on var. Defaults to 'update'.
    """
-    def __init__(self, eps: float = 1e-8, renormalize=True
+    def __init__(self, eps: float = 1e-8, renormalize=True):
        defaults = dict(eps=eps, renormalize=renormalize)
-        super().__init__(defaults
+        super().__init__(defaults)
 
-
+    @torch.no_grad
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
        eps = settings[0]['eps']
        renormalize = settings[0]['renormalize']
 
-        params =
-        target =
+        params = TensorList(params)
+        target = TensorList(tensors)
 
        scale = params.dot(target)/(params.dot(params) + eps)
        if renormalize:
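For reference, the projection applied by `orthograd_` and `OrthoGrad.multi_tensor_apply` above is `g ← g − (⟨w, g⟩ / (⟨w, w⟩ + eps))·w`, optionally regrafted to the original gradient norm. A minimal single-tensor sketch (the module itself works on a whole `TensorList`, i.e. one dot product across all parameters):

```python
import torch

def orthogonalize(w: torch.Tensor, g: torch.Tensor,
                  eps: float = 1e-30, renormalize: bool = True) -> torch.Tensor:
    w, g = w.reshape(-1), g.reshape(-1)
    # remove the component of g that is parallel to the weights w
    proj = g - (w.dot(g) / (w.dot(w) + eps)) * w
    if renormalize:
        # graft: keep the projected direction, restore the original gradient norm
        proj = proj * (g.norm() / proj.norm().clamp(min=eps))
    return proj

w, g = torch.randn(10), torch.randn(10)
print(orthogonalize(w, g).dot(w))  # ~0: orthogonal to the weights
```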
torchzero/modules/adaptive/psgd/_psgd_utils.py (new file):

@@ -0,0 +1,37 @@
+# pylint:disable=not-callable
+import warnings
+
+import torch
+
+from .psgd import lift2single
+
+
+def _initialize_lra_state_(tensor: torch.Tensor, state, setting):
+    n = tensor.numel()
+    rank = max(min(setting["rank"], n-1), 1)
+    dtype=tensor.dtype
+    device=tensor.device
+
+    U = torch.randn((n, rank), dtype=dtype, device=device)
+    U *= 0.1**0.5 / torch.linalg.vector_norm(U)
+
+    V = torch.randn((n, rank), dtype=dtype, device=device)
+    V *= 0.1**0.5 / torch.linalg.vector_norm(V)
+
+    if setting["init_scale"] is None:
+        # warnings.warn("FYI: Will set the preconditioner initial scale on the fly. Recommend to set it manually.")
+        d = None
+    else:
+        d = torch.ones(n, 1, dtype=dtype, device=device) * setting["init_scale"]
+
+    state["UVd"] = [U, V, d]
+    state["Luvd"] = [lift2single(torch.zeros([], dtype=dtype, device=device)) for _ in range(3)]
+
+
+
+def _wrap_with_no_backward(opt):
+    """to use original psgd opts with visualbench"""
+    class _Wrapped:
+        def step(self, closure):
+            return opt.step(lambda: closure(False))
+    return _Wrapped()