torchzero 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +22 -22
- tests/test_opts.py +199 -198
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +1 -1
- torchzero/core/functional.py +1 -1
- torchzero/core/modular.py +5 -5
- torchzero/core/module.py +2 -2
- torchzero/core/objective.py +10 -10
- torchzero/core/transform.py +1 -1
- torchzero/linalg/__init__.py +3 -2
- torchzero/linalg/eigh.py +223 -4
- torchzero/linalg/orthogonalize.py +2 -4
- torchzero/linalg/qr.py +12 -0
- torchzero/linalg/solve.py +1 -3
- torchzero/linalg/svd.py +47 -20
- torchzero/modules/__init__.py +4 -3
- torchzero/modules/adaptive/__init__.py +11 -3
- torchzero/modules/adaptive/adagrad.py +10 -10
- torchzero/modules/adaptive/adahessian.py +2 -2
- torchzero/modules/adaptive/adam.py +1 -1
- torchzero/modules/adaptive/adan.py +1 -1
- torchzero/modules/adaptive/adaptive_heavyball.py +1 -1
- torchzero/modules/adaptive/esgd.py +2 -2
- torchzero/modules/adaptive/ggt.py +186 -0
- torchzero/modules/adaptive/lion.py +2 -1
- torchzero/modules/adaptive/lre_optimizers.py +299 -0
- torchzero/modules/adaptive/mars.py +2 -2
- torchzero/modules/adaptive/matrix_momentum.py +1 -1
- torchzero/modules/adaptive/msam.py +4 -4
- torchzero/modules/adaptive/muon.py +9 -6
- torchzero/modules/adaptive/natural_gradient.py +32 -15
- torchzero/modules/adaptive/psgd/__init__.py +5 -0
- torchzero/modules/adaptive/psgd/_psgd_utils.py +37 -0
- torchzero/modules/adaptive/psgd/psgd.py +1390 -0
- torchzero/modules/adaptive/psgd/psgd_dense_newton.py +174 -0
- torchzero/modules/adaptive/psgd/psgd_kron_newton.py +203 -0
- torchzero/modules/adaptive/psgd/psgd_kron_whiten.py +185 -0
- torchzero/modules/adaptive/psgd/psgd_lra_newton.py +118 -0
- torchzero/modules/adaptive/psgd/psgd_lra_whiten.py +116 -0
- torchzero/modules/adaptive/rprop.py +2 -2
- torchzero/modules/adaptive/sam.py +4 -4
- torchzero/modules/adaptive/shampoo.py +28 -3
- torchzero/modules/adaptive/soap.py +3 -3
- torchzero/modules/adaptive/sophia_h.py +2 -2
- torchzero/modules/clipping/clipping.py +7 -7
- torchzero/modules/conjugate_gradient/cg.py +2 -2
- torchzero/modules/experimental/__init__.py +5 -0
- torchzero/modules/experimental/adanystrom.py +258 -0
- torchzero/modules/experimental/common_directions_whiten.py +142 -0
- torchzero/modules/experimental/cubic_adam.py +160 -0
- torchzero/modules/experimental/eigen_sr1.py +182 -0
- torchzero/modules/experimental/eigengrad.py +207 -0
- torchzero/modules/experimental/l_infinity.py +1 -1
- torchzero/modules/experimental/matrix_nag.py +122 -0
- torchzero/modules/experimental/newton_solver.py +2 -2
- torchzero/modules/experimental/newtonnewton.py +34 -40
- torchzero/modules/grad_approximation/fdm.py +2 -2
- torchzero/modules/grad_approximation/rfdm.py +4 -4
- torchzero/modules/least_squares/gn.py +68 -45
- torchzero/modules/line_search/backtracking.py +2 -2
- torchzero/modules/line_search/line_search.py +1 -1
- torchzero/modules/line_search/strong_wolfe.py +2 -2
- torchzero/modules/misc/escape.py +1 -1
- torchzero/modules/misc/gradient_accumulation.py +1 -1
- torchzero/modules/misc/misc.py +1 -1
- torchzero/modules/misc/multistep.py +4 -7
- torchzero/modules/misc/regularization.py +2 -2
- torchzero/modules/misc/split.py +1 -1
- torchzero/modules/misc/switch.py +2 -2
- torchzero/modules/momentum/cautious.py +3 -3
- torchzero/modules/momentum/momentum.py +1 -1
- torchzero/modules/ops/higher_level.py +1 -1
- torchzero/modules/ops/multi.py +1 -1
- torchzero/modules/projections/projection.py +5 -2
- torchzero/modules/quasi_newton/__init__.py +1 -1
- torchzero/modules/quasi_newton/damping.py +1 -1
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +3 -3
- torchzero/modules/quasi_newton/lsr1.py +3 -3
- torchzero/modules/quasi_newton/quasi_newton.py +44 -29
- torchzero/modules/quasi_newton/sg2.py +69 -205
- torchzero/modules/restarts/restars.py +17 -17
- torchzero/modules/second_order/inm.py +33 -25
- torchzero/modules/second_order/newton.py +132 -130
- torchzero/modules/second_order/newton_cg.py +3 -3
- torchzero/modules/second_order/nystrom.py +83 -32
- torchzero/modules/second_order/rsn.py +41 -44
- torchzero/modules/smoothing/laplacian.py +1 -1
- torchzero/modules/smoothing/sampling.py +2 -3
- torchzero/modules/step_size/adaptive.py +6 -6
- torchzero/modules/step_size/lr.py +2 -2
- torchzero/modules/trust_region/cubic_regularization.py +1 -1
- torchzero/modules/trust_region/levenberg_marquardt.py +2 -2
- torchzero/modules/trust_region/trust_cg.py +1 -1
- torchzero/modules/variance_reduction/svrg.py +4 -5
- torchzero/modules/weight_decay/reinit.py +2 -2
- torchzero/modules/weight_decay/weight_decay.py +5 -5
- torchzero/modules/wrappers/optim_wrapper.py +4 -4
- torchzero/modules/zeroth_order/cd.py +1 -1
- torchzero/optim/mbs.py +291 -0
- torchzero/optim/wrappers/nevergrad.py +0 -9
- torchzero/optim/wrappers/optuna.py +2 -0
- torchzero/utils/benchmarks/__init__.py +0 -0
- torchzero/utils/benchmarks/logistic.py +122 -0
- torchzero/utils/derivatives.py +4 -4
- {torchzero-0.4.0.dist-info → torchzero-0.4.1.dist-info}/METADATA +1 -1
- torchzero-0.4.1.dist-info/RECORD +209 -0
- torchzero/modules/adaptive/lmadagrad.py +0 -241
- torchzero-0.4.0.dist-info/RECORD +0 -191
- /torchzero/modules/{functional.py → opt_utils.py} +0 -0
- {torchzero-0.4.0.dist-info → torchzero-0.4.1.dist-info}/WHEEL +0 -0
- {torchzero-0.4.0.dist-info → torchzero-0.4.1.dist-info}/top_level.txt +0 -0

--- a/torchzero/modules/adaptive/muon.py
+++ b/torchzero/modules/adaptive/muon.py
@@ -10,8 +10,10 @@ from ...linalg.orthogonalize import orthogonalize as _orthogonalize, Orthogonali
 def reverse_dims(t:torch.Tensor):
     return t.permute(*reversed(range(t.ndim)))
 
-def _is_at_least_2d(p: torch.Tensor):
-    if
+def _is_at_least_2d(p: torch.Tensor, channel_first:bool):
+    if p.ndim < 2: return False
+    if channel_first and (p.size(0) > 1) and (p.size(1) > 1): return True
+    if (not channel_first) and (p.size(-2) > 1) and (p.size(-1) > 1): return True
     return False
 
 def _orthogonalize_format(
@@ -19,6 +21,7 @@ _orthogonalize_format(
     method: OrthogonalizeMethod,
     channel_first: bool,
 ):
+    """orthogonalize either 1st two dims if channel first or last two otherwise"""
     if channel_first:
         return reverse_dims(_orthogonalize(reverse_dims(tensor), method=method))
 
@@ -69,7 +72,7 @@ orthogonalize_grads_(
     are considered batch dimensions.
     """
     for p in params:
-        if (p.grad is not None) and _is_at_least_2d(p.grad):
+        if (p.grad is not None) and _is_at_least_2d(p.grad, channel_first=channel_first):
             X = _orthogonalize_format(p.grad, method=method, channel_first=channel_first)
             if dual_norm_correction: X = _dual_norm_correction(X, p.grad, channel_first=False)
             p.grad.set_(X.view_as(p)) # pyright:ignore[reportArgumentType]
@@ -100,7 +103,7 @@ class Orthogonalize(TensorTransform):
 
     standard Muon with Adam fallback
     ```py
-    opt = tz.
+    opt = tz.Optimizer(
         model.head.parameters(),
         tz.m.Split(
             # apply muon only to 2D+ parameters
@@ -131,7 +134,7 @@ class Orthogonalize(TensorTransform):
 
         if not orthogonalize: return tensor
 
-        if _is_at_least_2d(tensor):
+        if _is_at_least_2d(tensor, channel_first=channel_first):
 
             X = _orthogonalize_format(tensor, method, channel_first=channel_first)
 
@@ -173,7 +176,7 @@ class MuonAdjustLR(Transform):
         alphas = [s['alpha'] for s in settings]
         channel_first = [s["channel_first=channel_first"] for s in settings]
         tensors_alphas = [
-            (t, adjust_lr_for_muon(a, t.shape, cf)) for t, a, cf in zip(tensors, alphas, channel_first) if _is_at_least_2d(t)
+            (t, adjust_lr_for_muon(a, t.shape, cf)) for t, a, cf in zip(tensors, alphas, channel_first) if _is_at_least_2d(t, channel_first=cf)
         ]
         tensors = [i[0] for i in tensors_alphas]
         a = [i[1] for i in alphas]
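The change above threads `channel_first` into `_is_at_least_2d`, so the "is this worth orthogonalizing" check now inspects the same pair of dimensions Muon will actually orthogonalize. A minimal standalone sketch of that predicate (illustrative only, not an import from the package):

```python
import torch

def is_at_least_2d(p: torch.Tensor, channel_first: bool) -> bool:
    # qualify only if both dimensions being orthogonalized are non-trivial (> 1)
    if p.ndim < 2:
        return False
    if channel_first:
        return p.size(0) > 1 and p.size(1) > 1    # first two dims
    return p.size(-2) > 1 and p.size(-1) > 1      # last two dims

conv_w = torch.randn(16, 3, 3, 3)   # conv weight: dims 0 and 1 are 16 and 3
col = torch.randn(16, 1)            # effectively a vector: dim 1 is trivial
print(is_at_least_2d(conv_w, channel_first=True))   # True  -> orthogonalize
print(is_at_least_2d(col, channel_first=False))     # False -> leave to the fallback (e.g. Adam)
```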

--- a/torchzero/modules/adaptive/natural_gradient.py
+++ b/torchzero/modules/adaptive/natural_gradient.py
@@ -4,7 +4,7 @@ from ...core import Transform
 from ...utils.derivatives import jacobian_wrt, flatten_jacobian
 from ...utils import vec_to_tensors
 from ...linalg import linear_operator
-from .
+from .ggt import ggt_update
 
 class NaturalGradient(Transform):
     """Natural gradient approximated via empirical fisher information matrix.
@@ -41,7 +41,7 @@ class NaturalGradient(Transform):
         y = torch.randn(64, 10)
 
         model = nn.Sequential(nn.Linear(20, 64), nn.ELU(), nn.Linear(64, 10))
-        opt = tz.
+        opt = tz.Optimizer(
             model.parameters(),
             tz.m.NaturalGradient(),
             tz.m.LR(3e-2)
@@ -61,7 +61,7 @@ class NaturalGradient(Transform):
         y = torch.randn(64, 10)
 
         model = nn.Sequential(nn.Linear(20, 64), nn.ELU(), nn.Linear(64, 10))
-        opt = tz.
+        opt = tz.Optimizer(
             model.parameters(),
             tz.m.NaturalGradient(),
             tz.m.LR(3e-2)
@@ -84,7 +84,7 @@ class NaturalGradient(Transform):
             return torch.stack([(1 - x1).abs(), (10 * (x2 - x1**2).abs())])
 
         X = torch.tensor([-1.1, 2.5], requires_grad=True)
-        opt = tz.
+        opt = tz.Optimizer([X], tz.m.NaturalGradient(sqrt=True, gn_grad=True), tz.m.LR(0.05))
 
         for iter in range(200):
             losses = rosenbrock(X)
@@ -99,18 +99,24 @@ class NaturalGradient(Transform):
     @torch.no_grad
     def update_states(self, objective, states, settings):
         params = objective.params
+        closure = objective.closure
         fs = settings[0]
         batched = fs['batched']
         gn_grad = fs['gn_grad']
 
-
-
+        # compute per-sample losses
+        f = objective.loss
+        if f is None:
+            assert closure is not None
+            with torch.enable_grad():
+                f = objective.get_loss(backward=False) # n_out
+            assert isinstance(f, torch.Tensor)
 
+        # compute per-sample gradients
         with torch.enable_grad():
-            f = objective.get_loss(backward=False) # n_out
-            assert isinstance(f, torch.Tensor)
             G_list = jacobian_wrt([f.ravel()], params, batched=batched)
 
+        # set scalar loss and it's grad to objective
         objective.loss = f.sum()
         G = self.global_state["G"] = flatten_jacobian(G_list) # (n_samples, ndim)
 
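For orientation on the hunk above: each row of `G` is one per-sample gradient, so `G` has shape `(n_samples, ndim)` and the empirical Fisher used later is `GᵀG`. A hedged plain-autograd illustration of that matrix, without the `jacobian_wrt`/`flatten_jacobian` helpers (the toy loss and names here are made up):

```python
import torch

theta = torch.randn(3, requires_grad=True)
x = torch.randn(5, 3)
f = (x @ theta) ** 2                       # 5 per-sample losses

rows = []
for fi in f:                               # one gradient row per sample
    (gi,) = torch.autograd.grad(fi, (theta,), retain_graph=True)
    rows.append(gi.reshape(-1))
G = torch.stack(rows)                      # (n_samples, ndim) = (5, 3)

fisher = G.T @ G                           # empirical Fisher approximation, (3, 3)
print(G.shape, fisher.shape)
```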
@@ -123,8 +129,10 @@ class NaturalGradient(Transform):
         objective.grads = vec_to_tensors(g, params)
 
         # set closure to calculate scalar value for line searches etc
-        if
+        if closure is not None:
+
             def ngd_closure(backward=True):
+
                 if backward:
                     objective.zero_grad()
                     with torch.enable_grad():
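The `ngd_closure` wrapper added above exists so that later modules (line searches and similar) can still ask for a plain scalar loss after the per-sample machinery has replaced it. A rough sketch of that closure convention, with every name hypothetical rather than the torchzero API:

```python
import torch

def make_scalar_closure(per_sample_loss_fn, params):
    # closure(backward=True) returns the summed scalar loss and, if asked,
    # repopulates .grad, which is what a line search needs to re-evaluate points
    def closure(backward: bool = True):
        if backward:
            for p in params:
                p.grad = None
            with torch.enable_grad():
                loss = per_sample_loss_fn().sum()
                loss.backward()
            return loss
        with torch.no_grad():
            return per_sample_loss_fn().sum()
    return closure

theta = torch.randn(3, requires_grad=True)
x = torch.randn(5, 3)
closure = make_scalar_closure(lambda: (x @ theta) ** 2, [theta])
print(closure(False).item(), closure().item(), theta.grad.shape)
```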
@@ -152,22 +160,31 @@ class NaturalGradient(Transform):
         if sqrt:
             # this computes U, S <- SVD(M), then calculate update as U S^-1 Uᵀg,
             # but it computes it through eigendecompotision
-
-
+            L, U = ggt_update(G.H, damping=reg, rdamping=1e-16, truncate=0, eig_tol=1e-12)
+
+            if U is None or L is None:
+
+                # fallback to element-wise
+                g = self.global_state["g"]
+                g /= G.square().mean(0).sqrt().add(reg)
+                objective.updates = vec_to_tensors(g, params)
+                return objective
 
-
+            # whiten
+            z = U.T @ self.global_state["g"]
+            v = (U * L.rsqrt()) @ z
             objective.updates = vec_to_tensors(v, params)
             return objective
 
         # we need (G^T G)v = g
         # where g = G^T
         # so we need to solve (G^T G)v = G^T
-
+        GGt = G @ G.H # (n_samples, n_samples)
 
         if reg != 0:
-
+            GGt.add_(torch.eye(GGt.size(0), device=GGt.device, dtype=GGt.dtype).mul_(reg))
 
-        z, _ = torch.linalg.solve_ex(
+        z, _ = torch.linalg.solve_ex(GGt, torch.ones_like(GGt[0])) # pylint:disable=not-callable
         v = G.H @ z
 
         objective.updates = vec_to_tensors(v, params)
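The comments in the last hunk compress the key linear-algebra step: with `g = Gᵀ1` (the summed gradient), a solution of `(GᵀG) v = g` can be written `v = Gᵀ z` with `G Gᵀ z = 1`, so only an `n_samples × n_samples` system is factorized instead of an `ndim × ndim` one. A hedged standalone sketch of that Gram-matrix trick (regularization added to `GGᵀ`, as in the hunk; the function name is illustrative):

```python
import torch

def empirical_fisher_solve(G: torch.Tensor, reg: float = 1e-8) -> torch.Tensor:
    # G: (n_samples, ndim) per-sample gradients; returns v with (G^T G) v ≈ G^T 1
    GGt = G @ G.T                                         # small Gram matrix
    if reg != 0:
        GGt = GGt + reg * torch.eye(GGt.size(0), dtype=G.dtype)
    z = torch.linalg.solve(GGt, torch.ones(GGt.size(0), dtype=G.dtype))
    return G.T @ z                                        # v = G^T (G G^T)^-1 1

G = torch.randn(8, 100, dtype=torch.float64)              # 8 samples, 100 parameters
v = empirical_fisher_solve(G)
g = G.T @ torch.ones(8, dtype=torch.float64)
print(torch.allclose(G.T @ (G @ v), g, atol=1e-6))        # True up to the tiny regularizer
```

For 8 samples this means factorizing an 8×8 matrix rather than a 100×100 one, which is why the module only ever forms `G @ G.H`.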

--- /dev/null
+++ b/torchzero/modules/adaptive/psgd/_psgd_utils.py
@@ -0,0 +1,37 @@
+# pylint:disable=not-callable
+import warnings
+
+import torch
+
+from .psgd import lift2single
+
+
+def _initialize_lra_state_(tensor: torch.Tensor, state, setting):
+    n = tensor.numel()
+    rank = max(min(setting["rank"], n-1), 1)
+    dtype=tensor.dtype
+    device=tensor.device
+
+    U = torch.randn((n, rank), dtype=dtype, device=device)
+    U *= 0.1**0.5 / torch.linalg.vector_norm(U)
+
+    V = torch.randn((n, rank), dtype=dtype, device=device)
+    V *= 0.1**0.5 / torch.linalg.vector_norm(V)
+
+    if setting["init_scale"] is None:
+        # warnings.warn("FYI: Will set the preconditioner initial scale on the fly. Recommend to set it manually.")
+        d = None
+    else:
+        d = torch.ones(n, 1, dtype=dtype, device=device) * setting["init_scale"]
+
+    state["UVd"] = [U, V, d]
+    state["Luvd"] = [lift2single(torch.zeros([], dtype=dtype, device=device)) for _ in range(3)]
+
+
+
+def _wrap_with_no_backward(opt):
+    """to use original psgd opts with visualbench"""
+    class _Wrapped:
+        def step(self, closure):
+            return opt.step(lambda: closure(False))
+    return _Wrapped()
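For orientation, `_initialize_lra_state_` seeds the state of PSGD's low-rank (LRA) preconditioner: two `(n, rank)` factors `U` and `V` drawn at random and rescaled to Frobenius norm `sqrt(0.1)`, plus an optional diagonal scale `d` that stays `None` when it is to be set on the fly. A hedged standalone re-creation of just that initialization, without the torchzero `state`/`setting` plumbing or the `lift2single` helper:

```python
import torch

def init_lra_factors(tensor: torch.Tensor, rank: int, init_scale=None):
    n = tensor.numel()
    rank = max(min(rank, n - 1), 1)                  # clamp rank into [1, n-1]
    U = torch.randn(n, rank)
    U *= 0.1**0.5 / torch.linalg.vector_norm(U)      # ||U||_F = sqrt(0.1)
    V = torch.randn(n, rank)
    V *= 0.1**0.5 / torch.linalg.vector_norm(V)
    # d stays None when the initial scale is to be determined on the fly
    d = None if init_scale is None else torch.full((n, 1), float(init_scale))
    return U, V, d

U, V, d = init_lra_factors(torch.randn(4, 5), rank=10)
print(U.shape, V.shape, d)                            # torch.Size([20, 10]) torch.Size([20, 10]) None
print(round(torch.linalg.vector_norm(U).item(), 4))   # 0.3162 ≈ sqrt(0.1)
```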