torchzero 0.3.15__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those package versions.
- tests/test_identical.py +22 -22
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +225 -214
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +2 -2
- torchzero/core/__init__.py +7 -4
- torchzero/core/chain.py +20 -23
- torchzero/core/functional.py +90 -24
- torchzero/core/modular.py +53 -57
- torchzero/core/module.py +132 -52
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +55 -24
- torchzero/core/transform.py +261 -367
- torchzero/linalg/__init__.py +11 -0
- torchzero/linalg/eigh.py +253 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +93 -0
- torchzero/{utils/linalg → linalg}/qr.py +16 -2
- torchzero/{utils/linalg → linalg}/solve.py +74 -88
- torchzero/linalg/svd.py +47 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/__init__.py +4 -3
- torchzero/modules/adaptive/__init__.py +11 -3
- torchzero/modules/adaptive/adagrad.py +167 -217
- torchzero/modules/adaptive/adahessian.py +76 -105
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +50 -31
- torchzero/modules/adaptive/adaptive_heavyball.py +12 -7
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/ggt.py +186 -0
- torchzero/modules/adaptive/lion.py +7 -11
- torchzero/modules/adaptive/lre_optimizers.py +299 -0
- torchzero/modules/adaptive/mars.py +7 -7
- torchzero/modules/adaptive/matrix_momentum.py +48 -52
- torchzero/modules/adaptive/msam.py +71 -53
- torchzero/modules/adaptive/muon.py +67 -129
- torchzero/modules/adaptive/natural_gradient.py +63 -41
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/psgd/__init__.py +5 -0
- torchzero/modules/adaptive/psgd/_psgd_utils.py +37 -0
- torchzero/modules/adaptive/psgd/psgd.py +1390 -0
- torchzero/modules/adaptive/psgd/psgd_dense_newton.py +174 -0
- torchzero/modules/adaptive/psgd/psgd_kron_newton.py +203 -0
- torchzero/modules/adaptive/psgd/psgd_kron_whiten.py +185 -0
- torchzero/modules/adaptive/psgd/psgd_lra_newton.py +118 -0
- torchzero/modules/adaptive/psgd/psgd_lra_whiten.py +116 -0
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +149 -130
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +22 -25
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +27 -38
- torchzero/modules/experimental/__init__.py +7 -6
- torchzero/modules/experimental/adanystrom.py +258 -0
- torchzero/modules/experimental/common_directions_whiten.py +142 -0
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/cubic_adam.py +160 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/eigen_sr1.py +182 -0
- torchzero/modules/experimental/eigengrad.py +207 -0
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/higher_order_newton.py +14 -40
- torchzero/modules/experimental/l_infinity.py +1 -1
- torchzero/modules/experimental/matrix_nag.py +122 -0
- torchzero/modules/experimental/newton_solver.py +23 -54
- torchzero/modules/experimental/newtonnewton.py +45 -48
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +3 -3
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/grad_approximation/fdm.py +2 -2
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +24 -21
- torchzero/modules/least_squares/gn.py +121 -50
- torchzero/modules/line_search/backtracking.py +4 -4
- torchzero/modules/line_search/line_search.py +33 -33
- torchzero/modules/line_search/strong_wolfe.py +4 -4
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +11 -79
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +121 -123
- torchzero/modules/misc/multistep.py +52 -53
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +31 -29
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +37 -31
- torchzero/modules/momentum/momentum.py +12 -12
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +20 -20
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/{functional.py → opt_utils.py} +1 -1
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +46 -43
- torchzero/modules/quasi_newton/__init__.py +1 -1
- torchzero/modules/quasi_newton/damping.py +2 -2
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +10 -10
- torchzero/modules/quasi_newton/lsr1.py +10 -10
- torchzero/modules/quasi_newton/quasi_newton.py +54 -39
- torchzero/modules/quasi_newton/sg2.py +69 -205
- torchzero/modules/restarts/restars.py +39 -37
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/ifn.py +31 -62
- torchzero/modules/second_order/inm.py +57 -53
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +165 -196
- torchzero/modules/second_order/newton_cg.py +105 -157
- torchzero/modules/second_order/nystrom.py +216 -185
- torchzero/modules/second_order/rsn.py +132 -125
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +10 -10
- torchzero/modules/step_size/adaptive.py +24 -24
- torchzero/modules/step_size/lr.py +17 -17
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +3 -3
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +2 -2
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +23 -21
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +17 -18
- torchzero/modules/wrappers/optim_wrapper.py +14 -14
- torchzero/modules/zeroth_order/cd.py +10 -7
- torchzero/optim/mbs.py +291 -0
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -13
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +8 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/benchmarks/__init__.py +0 -0
- torchzero/utils/benchmarks/logistic.py +122 -0
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +97 -73
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/METADATA +1 -1
- torchzero-0.4.1.dist-info/RECORD +209 -0
- tests/test_vars.py +0 -185
- torchzero/core/var.py +0 -376
- torchzero/modules/adaptive/lmadagrad.py +0 -186
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.15.dist-info/RECORD +0 -175
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/WHEEL +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/top_level.txt +0 -0
@@ -5,46 +5,49 @@ from typing import Literal
 
 import torch
 
-from ...core import Chainable,
-from ...utils import
-from ...
-
+from ...core import Chainable, Transform, HVPMethod
+from ...utils import vec_to_tensors_
+from ...linalg.linear_operator import Sketched
+
+from .newton import _newton_update_state_, _newton_solve
 
 def _qr_orthonormalize(A:torch.Tensor):
     m,n = A.shape
     if m < n:
         q, _ = torch.linalg.qr(A.T) # pylint:disable=not-callable
         return q.T
-
-
-
+
+    q, _ = torch.linalg.qr(A) # pylint:disable=not-callable
+    return q
+
 
 def _orthonormal_sketch(m, n, dtype, device, generator):
     return _qr_orthonormalize(torch.randn(m, n, dtype=dtype, device=device, generator=generator))
 
-def
-
+def _rademacher_sketch(m, n, dtype, device, generator):
+    rademacher = torch.bernoulli(torch.full((m,n), 0.5), generator = generator).mul_(2).sub_(1)
+    return rademacher.mul_(1 / math.sqrt(m))
 
-class
-    """
+class SubspaceNewton(Transform):
+    """Subspace Newton. Performs a Newton step in a subspace (random or spanned by past gradients).
 
     Args:
         sketch_size (int):
            size of the random sketch. This many hessian-vector products will need to be evaluated each step.
        sketch_type (str, optional):
+            - "common_directions" - uses history steepest descent directions as the basis[2]. It is orthonormalized on-line using Gram-Schmidt (default).
            - "orthonormal" - random orthonormal basis. Orthonormality is necessary to use linear operator based modules such as trust region, but it can be slower to compute.
-            - "
-            - "
-            - "mixed" - random orthonormal basis but with three directions set to gradient, slow EMA and fast EMA (default).
+            - "rademacher" - approximately orthonormal (if dimension is large) scaled random rademacher basis. It is recommended to use at least "orthonormal" - it requires QR but it is still very cheap.
+            - "mixed" - random orthonormal basis but with four directions set to gradient, slow and fast gradient EMAs, and previous update direction.
        damping (float, optional): hessian damping (scale of identity matrix added to hessian). Defaults to 0.
        hvp_method (str, optional):
            How to compute hessian-matrix product:
-            - "
+            - "batched_autograd" - uses batched autograd
            - "autograd" - uses unbatched autograd
            - "forward" - uses finite difference with forward formula, performing 1 backward pass per Hvp.
            - "central" - uses finite difference with a more accurate central formula, performing 2 backward passes per Hvp.
 
-            . Defaults to "
+            . Defaults to "batched_autograd".
        h (float, optional): finite difference step size. Defaults to 1e-2.
        use_lstsq (bool, optional): whether to use least squares to solve ``Hx=g``. Defaults to False.
        update_freq (int, optional): frequency of updating the hessian. Defaults to 1.
@@ -67,7 +70,7 @@ class RSN(Module):
 
    RSN with line search
    ```python
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.RSN(),
        tz.m.Backtracking()
@@ -76,7 +79,7 @@ class RSN(Module):
 
    RSN with trust region
    ```python
-    opt = tz.
+    opt = tz.Optimizer(
        model.parameters(),
        tz.m.LevenbergMarquardt(tz.m.RSN()),
    )
@@ -91,137 +94,141 @@ class RSN(Module):
    def __init__(
        self,
        sketch_size: int,
-        sketch_type: Literal["orthonormal", "
+        sketch_type: Literal["orthonormal", "common_directions", "mixed", "rademacher"] = "common_directions",
        damping:float=0,
-        hvp_method: Literal["batched", "autograd", "forward", "central"] = "batched",
-        h: float = 1e-2,
-        use_lstsq: bool = True,
-        update_freq: int = 1,
-        H_tfm: Callable[[torch.Tensor, torch.Tensor], tuple[torch.Tensor, bool]] | Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
        eigval_fn: Callable[[torch.Tensor], torch.Tensor] | None = None,
+        update_freq: int = 1,
+        precompute_inverse: bool = False,
+        use_lstsq: bool = True,
+        hvp_method: HVPMethod = "batched_autograd",
+        h: float = 1e-2,
        seed: int | None = None,
        inner: Chainable | None = None,
    ):
-        defaults =
-
-
-        if inner is not None:
-            self.set_child("inner", inner)
+        defaults = locals().copy()
+        del defaults['self'], defaults['inner'], defaults["update_freq"]
+        super().__init__(defaults, update_freq=update_freq, inner=inner)
 
    @torch.no_grad
-    def
-
-
-
-        if step % self.defaults['update_freq'] == 0:
+    def update_states(self, objective, states, settings):
+        fs = settings[0]
+        params = objective.params
+        generator = self.get_generator(params[0].device, fs["seed"])
 
-
-        if closure is None:
-            raise RuntimeError("RSN requires closure")
-        params = var.params
-        generator = self.get_generator(params[0].device, self.defaults["seed"])
+        ndim = sum(p.numel() for p in params)
 
-
+        device=params[0].device
+        dtype=params[0].dtype
 
-
-
+        # sample sketch matrix S: (ndim, sketch_size)
+        sketch_size = min(fs["sketch_size"], ndim)
+        sketch_type = fs["sketch_type"]
+        hvp_method = fs["hvp_method"]
 
-
-
-        sketch_type = self.defaults["sketch_type"]
-        hvp_method = self.defaults["hvp_method"]
+        if sketch_type == "rademacher":
+            S = _rademacher_sketch(ndim, sketch_size, device=device, dtype=dtype, generator=generator)
 
-
-
+        elif sketch_type == 'orthonormal':
+            S = _orthonormal_sketch(ndim, sketch_size, device=device, dtype=dtype, generator=generator)
 
-
-
+        elif sketch_type == 'common_directions':
+            # Wang, Po-Wei, Ching-pei Lee, and Chih-Jen Lin. "The common-directions method for regularized empirical risk minimization." Journal of Machine Learning Research 20.58 (2019): 1-49.
+            g_list = objective.get_grads(create_graph=hvp_method in ("batched_autograd", "autograd"))
+            g = torch.cat([t.ravel() for t in g_list])
 
-
-
-        g_list = var.get_grad(create_graph=hvp_method in ("batched", "autograd"))
-        g = torch.cat([t.ravel() for t in g_list])
-
-        # initialize directions deque
-        if "directions" not in self.global_state:
+            # initialize directions deque
+            if "directions" not in self.global_state:
 
+                g_norm = torch.linalg.vector_norm(g) # pylint:disable=not-callable
+                if g_norm < torch.finfo(g.dtype).tiny * 2:
+                    g = torch.randn_like(g)
                    g_norm = torch.linalg.vector_norm(g) # pylint:disable=not-callable
-            if g_norm < torch.finfo(g.dtype).tiny * 2:
-                g = torch.randn_like(g)
-                g_norm = torch.linalg.vector_norm(g) # pylint:disable=not-callable
-
-            self.global_state["directions"] = deque([g / g_norm], maxlen=sketch_size)
-            S = self.global_state["directions"][0].unsqueeze(1)
-
-        # add new steepest descent direction orthonormal to existing columns
-        else:
-            S = torch.stack(tuple(self.global_state["directions"]), dim=1)
-            p = g - S @ (S.T @ g)
-            p_norm = torch.linalg.vector_norm(p) # pylint:disable=not-callable
-            if p_norm > torch.finfo(p.dtype).tiny * 2:
-                p = p / p_norm
-                self.global_state["directions"].append(p)
-                S = torch.cat([S, p.unsqueeze(1)], dim=1)
-
-        elif sketch_type == "mixed":
-            g_list = var.get_grad(create_graph=hvp_method in ("batched", "autograd"))
-            g = torch.cat([t.ravel() for t in g_list])
-
-            if "slow_ema" not in self.global_state:
-                self.global_state["slow_ema"] = torch.randn_like(g) * 1e-2
-                self.global_state["fast_ema"] = torch.randn_like(g) * 1e-2
-
-            slow_ema = self.global_state["slow_ema"]
-            fast_ema = self.global_state["fast_ema"]
-            slow_ema.lerp_(g, 0.001)
-            fast_ema.lerp_(g, 0.1)
-
-            S = torch.stack([g, slow_ema, fast_ema], dim=1)
-            if sketch_size > 3:
-                S_random = _gaussian_sketch(ndim, sketch_size - 3, device=device, dtype=dtype, generator=generator)
-                S = torch.cat([S, S_random], dim=1)
-
-            S = _qr_orthonormalize(S)
 
+                self.global_state["directions"] = deque([g / g_norm], maxlen=sketch_size)
+                S = self.global_state["directions"][0].unsqueeze(1)
+
+            # add new steepest descent direction orthonormal to existing columns
            else:
-
+                S = torch.stack(tuple(self.global_state["directions"]), dim=1)
+                p = g - S @ (S.T @ g)
+                p_norm = torch.linalg.vector_norm(p) # pylint:disable=not-callable
+                if p_norm > torch.finfo(p.dtype).tiny * 2:
+                    p = p / p_norm
+                    self.global_state["directions"].append(p)
+                    S = torch.cat([S, p.unsqueeze(1)], dim=1)
+
+        elif sketch_type == "mixed":
+            g_list = objective.get_grads(create_graph=hvp_method in ("batched_autograd", "autograd"))
+            g = torch.cat([t.ravel() for t in g_list])
+
+            # initialize state
+            if "slow_ema" not in self.global_state:
+                self.global_state["slow_ema"] = torch.randn_like(g) * 1e-2
+                self.global_state["fast_ema"] = torch.randn_like(g) * 1e-2
+                self.global_state["p_prev"] = torch.randn_like(g)
+
+            # previous update direction
+            p_cur = torch.cat([t.ravel() for t in params])
+            prev_dir = p_cur - self.global_state["p_prev"]
+            self.global_state["p_prev"] = p_cur
+
+            # EMAs
+            slow_ema = self.global_state["slow_ema"]
+            fast_ema = self.global_state["fast_ema"]
+            slow_ema.lerp_(g, 0.001)
+            fast_ema.lerp_(g, 0.1)
+
+            # form and orthogonalize sketching matrix
+            S = torch.stack([g, slow_ema, fast_ema, prev_dir], dim=1)
+            if sketch_size > 4:
+                S_random = torch.randn(ndim, sketch_size - 3, device=device, dtype=dtype, generator=generator) / math.sqrt(ndim)
+                S = torch.cat([S, S_random], dim=1)
+
+            S = _qr_orthonormalize(S)
+
+        else:
+            raise ValueError(f'Unknown sketch_type {sketch_type}')
+
+        # form sketched hessian
+        HS, _ = objective.hessian_matrix_product(S, rgrad=None, at_x0=True,
+                                                 hvp_method=fs["hvp_method"], h=fs["h"])
+        H_sketched = S.T @ HS
+
+        # update state
+        _newton_update_state_(
+            state = self.global_state,
+            H = H_sketched,
+            damping = fs["damping"],
+            eigval_fn = fs["eigval_fn"],
+            precompute_inverse = fs["precompute_inverse"],
+            use_lstsq = fs["use_lstsq"]
 
-
-        HS, _ = var.hessian_matrix_product(S, at_x0=True, rgrad=None, hvp_method=self.defaults["hvp_method"], normalize=True, retain_graph=False, h=self.defaults["h"])
-        H_sketched = S.T @ HS
+        )
 
-
-        self.global_state["S"] = S
+        self.global_state["S"] = S
 
-    def
-
-
-            var=var,
-            H=self.global_state["H_sketched"],
-            damping=self.defaults["damping"],
-            inner=self.children.get("inner", None),
-            H_tfm=self.defaults["H_tfm"],
-            eigval_fn=self.defaults["eigval_fn"],
-            use_lstsq=self.defaults["use_lstsq"],
-            g_proj = lambda g: S.T @ g
-        )
-        d = S @ d_proj
-        var.update = vec_to_tensors(d, var.params)
+    def apply_states(self, objective, states, settings):
+        updates = objective.get_updates()
+        fs = settings[0]
 
-
+        S = self.global_state["S"]
+        b = torch.cat([t.ravel() for t in updates])
+        b_proj = S.T @ b
 
-
-        eigval_fn = self.defaults["eigval_fn"]
-        H_sketched: torch.Tensor = self.global_state["H_sketched"]
-        S: torch.Tensor = self.global_state["S"]
+        d_proj = _newton_solve(b=b_proj, state=self.global_state, use_lstsq=fs["use_lstsq"])
 
-
-
-
-            L: torch.Tensor = eigval_fn(L)
-            H_sketched = Q @ L.diag_embed() @ Q.mH
+        d = S @ d_proj
+        vec_to_tensors_(d, updates)
+        return objective
 
-
-
+    def get_H(self, objective=...):
+        if "H" in self.global_state:
+            H_sketched = self.global_state["H"]
 
+        else:
+            L = self.global_state["L"]
+            Q = self.global_state["Q"]
+            H_sketched = Q @ L.diag_embed() @ Q.mH
+
+        S: torch.Tensor = self.global_state["S"]
        return Sketched(S, H_sketched)
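The rewritten `SubspaceNewton` (formerly `RSN`) builds a sketch matrix `S`, forms the sketched Hessian `S.T @ HS` from Hessian-vector products, solves the small Newton system, and lifts the solution back through `S`. As a reference point, here is a minimal self-contained sketch of that update on a toy quadratic; it uses plain `torch.linalg` calls rather than torchzero's `Objective`/`_newton_solve` machinery, and the variable names are illustrative only.

```python
import torch

torch.manual_seed(0)
n, k = 100, 10                       # problem dimension and sketch size

# toy quadratic f(x) = 0.5 x^T A x - b^T x with SPD Hessian A
M = torch.randn(n, n)
A = M @ M.T + 0.1 * torch.eye(n)
b = torch.randn(n)
x = torch.randn(n)

g = A @ x - b                        # gradient at x

# random orthonormal sketch S of shape (n, k), as in _orthonormal_sketch
S, _ = torch.linalg.qr(torch.randn(n, k))

# k Hessian-vector products give the sketched Hessian S^T H S of shape (k, k)
HS = A @ S
H_sketched = S.T @ HS

# Newton step restricted to range(S): solve (S^T H S) d_proj = S^T g, then lift back
d_proj = torch.linalg.solve(H_sketched, S.T @ g)
x = x - S @ d_proj
```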
@@ -4,7 +4,7 @@ from collections.abc import Iterable
 import torch
 
 from ...utils.tensorlist import TensorList
-from ...core import
+from ...core import TensorTransform
 
 
 def vector_laplacian_smoothing(input: torch.Tensor, sigma: float = 1) -> torch.Tensor:
@@ -55,7 +55,7 @@ def _precompute_denominator(tensor: torch.Tensor, sigma) -> torch.Tensor:
    v[-1] = 1
    return 1 - sigma * torch.fft.fft(v) # pylint: disable = not-callable
 
-class LaplacianSmoothing(
+class LaplacianSmoothing(TensorTransform):
    """Applies laplacian smoothing via a fast Fourier transform solver which can improve generalization.
 
    Args:
@@ -70,29 +70,30 @@ class LaplacianSmoothing(Transform):
        what to set on var.
 
    Examples:
-
+        Laplacian Smoothing Gradient Descent optimizer as in the paper
 
-
+        ```python
 
-
-
-
-
-
+        opt = tz.Optimizer(
+            model.parameters(),
+            tz.m.LaplacianSmoothing(),
+            tz.m.LR(1e-2),
+        )
+        ```
 
    Reference:
        Osher, S., Wang, B., Yin, P., Luo, X., Barekat, F., Pham, M., & Lin, A. (2022). Laplacian smoothing gradient descent. Research in the Mathematical Sciences, 9(3), 55.
 
    """
-    def __init__(self, sigma:float = 1, layerwise=True, min_numel = 4
+    def __init__(self, sigma:float = 1, layerwise=True, min_numel = 4):
        defaults = dict(sigma = sigma, layerwise=layerwise, min_numel=min_numel)
-        super().__init__(defaults
+        super().__init__(defaults)
        # precomputed denominator for when layerwise=False
        self.global_state['full_denominator'] = None
 
 
    @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
        layerwise = settings[0]['layerwise']
 
        # layerwise laplacian smoothing
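`LaplacianSmoothing` smooths each update `d` by solving `(I - sigma * L) d_smooth = d` with a circulant discrete Laplacian `L`, which the module does in the Fourier domain using the precomputed denominator `1 - sigma * fft(v)`. A rough standalone sketch of that operation follows; the stencil values `v[0] = -2, v[1] = 1` come from the Osher et al. (2022) formulation (the hunk above only shows `v[-1] = 1`), so treat them as an assumption.

```python
import torch

def laplacian_smooth(d: torch.Tensor, sigma: float = 1.0) -> torch.Tensor:
    # periodic discrete Laplacian stencil [-2, 1, 0, ..., 0, 1] (assumed, per Osher et al. 2022)
    v = torch.zeros_like(d)
    v[0], v[1], v[-1] = -2.0, 1.0, 1.0
    denominator = 1 - sigma * torch.fft.fft(v)   # same form as _precompute_denominator
    return torch.fft.ifft(torch.fft.fft(d) / denominator).real

g = torch.randn(1024)
g_smooth = laplacian_smooth(g, sigma=1.0)
```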
@@ -7,14 +7,14 @@ from typing import Literal, cast
 
 import torch
 
-from ...core import Chainable,
+from ...core import Chainable, Optimizer, Module, Objective
 from ...core.reformulation import Reformulation
 from ...utils import Distributions, NumberList, TensorList
 from ..termination import TerminationCriteriaBase, make_termination_criteria
 
 
-def _reset_except_self(
-    for m in
+def _reset_except_self(objective: Objective, modules, self: Module):
+    for m in modules:
        if m is not self:
            m.reset()
 
@@ -98,15 +98,15 @@ class GradientSampling(Reformulation):
        self.set_child('termination', make_termination_criteria(extra=termination))
 
    @torch.no_grad
-    def pre_step(self,
-        params = TensorList(
+    def pre_step(self, objective):
+        params = TensorList(objective.params)
 
        fixed = self.defaults['fixed']
 
        # check termination criteria
        if 'termination' in self.children:
            termination = cast(TerminationCriteriaBase, self.children['termination'])
-            if termination.should_terminate(
+            if termination.should_terminate(objective):
 
                # decay sigmas
                states = [self.state[p] for p in params]
@@ -118,7 +118,7 @@ class GradientSampling(Reformulation):
 
                # reset on sigmas decay
                if self.defaults['reset_on_termination']:
-
+                    objective.post_step_hooks.append(partial(_reset_except_self, self=self))
 
                # clear perturbations
                self.global_state.pop('perts', None)
@@ -136,7 +136,7 @@ class GradientSampling(Reformulation):
        self.global_state['perts'] = perts
 
    @torch.no_grad
-    def closure(self, backward, closure, params,
+    def closure(self, backward, closure, params, objective):
        params = TensorList(params)
        loss_agg = None
        grad_agg = None
@@ -160,7 +160,7 @@ class GradientSampling(Reformulation):
 
        # evaluate at x_0
        if include_x0:
-            f_0 =
+            f_0 = objective.get_loss(backward=backward)
 
            isfinite = math.isfinite(f_0)
            if isfinite:
@@ -168,7 +168,7 @@ class GradientSampling(Reformulation):
                loss_agg = f_0
 
                if backward:
-                    g_0 =
+                    g_0 = objective.get_grads()
                    if isfinite: grad_agg = g_0
 
        # evaluate at x_0 + p for each perturbation
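`GradientSampling` reformulates the objective as an average of the loss and gradient over the current point plus random perturbations (the `f_0`/`g_0` aggregation in the closure above), with optional termination-driven sigma decay. A rough standalone sketch of the core averaging step, assuming Gaussian perturbations and without torchzero's `Reformulation` plumbing; the function name is illustrative only.

```python
import torch

def sampled_loss_and_grad(f, x, sigma=1e-2, n_samples=4, include_x0=True):
    # average loss and gradient of f over x and perturbed copies x + sigma * eps
    points = [x.clone()] if include_x0 else []
    points += [x + sigma * torch.randn_like(x) for _ in range(n_samples)]

    loss_agg = torch.zeros((), dtype=x.dtype)
    grad_agg = torch.zeros_like(x)
    for p in points:
        p = p.detach().requires_grad_(True)
        loss = f(p)
        (g,) = torch.autograd.grad(loss, p)
        loss_agg += loss.detach()
        grad_agg += g
    return loss_agg / len(points), grad_agg / len(points)

f = lambda v: (v ** 4).sum()          # any differentiable scalar objective
loss, grad = sampled_loss_and_grad(f, torch.randn(8))
```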
|
@@ -5,10 +5,10 @@ from typing import Any, Literal
|
|
|
5
5
|
|
|
6
6
|
import torch
|
|
7
7
|
|
|
8
|
-
from ...core import Chainable,
|
|
8
|
+
from ...core import Chainable, TensorTransform
|
|
9
9
|
from ...utils import NumberList, TensorList, tofloat, unpack_dicts, unpack_states
|
|
10
|
-
from ...
|
|
11
|
-
from ..
|
|
10
|
+
from ...linalg.linear_operator import ScaledIdentity
|
|
11
|
+
from ..opt_utils import epsilon_step_size
|
|
12
12
|
|
|
13
13
|
def _acceptable_alpha(alpha, param:torch.Tensor):
|
|
14
14
|
finfo = torch.finfo(param.dtype)
|
|
@@ -16,7 +16,7 @@ def _acceptable_alpha(alpha, param:torch.Tensor):
|
|
|
16
16
|
return False
|
|
17
17
|
return True
|
|
18
18
|
|
|
19
|
-
def
|
|
19
|
+
def _get_scaled_identity_H(self: TensorTransform, var):
|
|
20
20
|
n = sum(p.numel() for p in var.params)
|
|
21
21
|
p = var.params[0]
|
|
22
22
|
alpha = self.global_state.get('alpha', 1)
|
|
@@ -25,7 +25,7 @@ def _get_H(self: Transform, var):
|
|
|
25
25
|
return ScaledIdentity(1 / alpha, shape=(n,n), device=p.device, dtype=p.dtype)
|
|
26
26
|
|
|
27
27
|
|
|
28
|
-
class PolyakStepSize(
|
|
28
|
+
class PolyakStepSize(TensorTransform):
|
|
29
29
|
"""Polyak's subgradient method with known or unknown f*.
|
|
30
30
|
|
|
31
31
|
Args:
|
|
@@ -47,7 +47,7 @@ class PolyakStepSize(Transform):
|
|
|
47
47
|
super().__init__(defaults, uses_grad=use_grad, uses_loss=True, inner=inner)
|
|
48
48
|
|
|
49
49
|
@torch.no_grad
|
|
50
|
-
def
|
|
50
|
+
def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
|
|
51
51
|
assert grads is not None and loss is not None
|
|
52
52
|
tensors = TensorList(tensors)
|
|
53
53
|
grads = TensorList(grads)
|
|
@@ -79,15 +79,15 @@ class PolyakStepSize(Transform):
|
|
|
79
79
|
self.global_state['alpha'] = alpha
|
|
80
80
|
|
|
81
81
|
@torch.no_grad
|
|
82
|
-
def
|
|
82
|
+
def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
|
|
83
83
|
alpha = self.global_state.get('alpha', 1)
|
|
84
84
|
if not _acceptable_alpha(alpha, tensors[0]): alpha = epsilon_step_size(TensorList(tensors))
|
|
85
85
|
|
|
86
86
|
torch._foreach_mul_(tensors, alpha * unpack_dicts(settings, 'alpha', cls=NumberList))
|
|
87
87
|
return tensors
|
|
88
88
|
|
|
89
|
-
def get_H(self,
|
|
90
|
-
return
|
|
89
|
+
def get_H(self, objective):
|
|
90
|
+
return _get_scaled_identity_H(self, objective)
|
|
91
91
|
|
|
92
92
|
|
|
93
93
|
def _bb_short(s: TensorList, y: TensorList, sy, eps):
|
|
@@ -116,7 +116,7 @@ def _bb_geom(s: TensorList, y: TensorList, sy, eps, fallback:bool):
|
|
|
116
116
|
return None
|
|
117
117
|
return (short * long) ** 0.5
|
|
118
118
|
|
|
119
|
-
class BarzilaiBorwein(
|
|
119
|
+
class BarzilaiBorwein(TensorTransform):
|
|
120
120
|
"""Barzilai-Borwein step size method.
|
|
121
121
|
|
|
122
122
|
Args:
|
|
@@ -144,7 +144,7 @@ class BarzilaiBorwein(Transform):
|
|
|
144
144
|
self.global_state['reset'] = True
|
|
145
145
|
|
|
146
146
|
@torch.no_grad
|
|
147
|
-
def
|
|
147
|
+
def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
|
|
148
148
|
step = self.global_state.get('step', 0)
|
|
149
149
|
self.global_state['step'] = step + 1
|
|
150
150
|
|
|
@@ -175,11 +175,11 @@ class BarzilaiBorwein(Transform):
|
|
|
175
175
|
prev_p.copy_(params)
|
|
176
176
|
prev_g.copy_(g)
|
|
177
177
|
|
|
178
|
-
def get_H(self,
|
|
179
|
-
return
|
|
178
|
+
def get_H(self, objective):
|
|
179
|
+
return _get_scaled_identity_H(self, objective)
|
|
180
180
|
|
|
181
181
|
@torch.no_grad
|
|
182
|
-
def
|
|
182
|
+
def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
|
|
183
183
|
alpha = self.global_state.get('alpha', None)
|
|
184
184
|
|
|
185
185
|
if not _acceptable_alpha(alpha, tensors[0]):
|
|
@@ -189,7 +189,7 @@ class BarzilaiBorwein(Transform):
|
|
|
189
189
|
return tensors
|
|
190
190
|
|
|
191
191
|
|
|
192
|
-
class BBStab(
|
|
192
|
+
class BBStab(TensorTransform):
|
|
193
193
|
"""Stabilized Barzilai-Borwein method (https://arxiv.org/abs/1907.06409).
|
|
194
194
|
|
|
195
195
|
This clips the norm of the Barzilai-Borwein update by ``delta``, where ``delta`` can be adaptive if ``c`` is specified.
|
|
@@ -228,7 +228,7 @@ class BBStab(Transform):
|
|
|
228
228
|
self.global_state['reset'] = True
|
|
229
229
|
|
|
230
230
|
@torch.no_grad
|
|
231
|
-
def
|
|
231
|
+
def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
|
|
232
232
|
step = self.global_state.get('step', 0)
|
|
233
233
|
self.global_state['step'] = step + 1
|
|
234
234
|
|
|
@@ -287,11 +287,11 @@ class BBStab(Transform):
|
|
|
287
287
|
prev_p.copy_(params)
|
|
288
288
|
prev_g.copy_(g)
|
|
289
289
|
|
|
290
|
-
def get_H(self,
|
|
291
|
-
return
|
|
290
|
+
def get_H(self, objective):
|
|
291
|
+
return _get_scaled_identity_H(self, objective)
|
|
292
292
|
|
|
293
293
|
@torch.no_grad
|
|
294
|
-
def
|
|
294
|
+
def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
|
|
295
295
|
alpha = self.global_state.get('alpha', None)
|
|
296
296
|
|
|
297
297
|
if not _acceptable_alpha(alpha, tensors[0]):
|
|
@@ -301,7 +301,7 @@ class BBStab(Transform):
|
|
|
301
301
|
return tensors
|
|
302
302
|
|
|
303
303
|
|
|
304
|
-
class AdGD(
|
|
304
|
+
class AdGD(TensorTransform):
|
|
305
305
|
"""AdGD and AdGD-2 (https://arxiv.org/abs/2308.02261)"""
|
|
306
306
|
def __init__(self, variant:Literal[1,2]=2, alpha_0:float = 1e-7, sqrt:bool=True, use_grad=True, inner: Chainable | None = None,):
|
|
307
307
|
defaults = dict(variant=variant, alpha_0=alpha_0, sqrt=sqrt)
|
|
@@ -313,7 +313,7 @@ class AdGD(Transform):
|
|
|
313
313
|
self.global_state['reset'] = True
|
|
314
314
|
|
|
315
315
|
@torch.no_grad
|
|
316
|
-
def
|
|
316
|
+
def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
|
|
317
317
|
variant = settings[0]['variant']
|
|
318
318
|
theta_0 = 0 if variant == 1 else 1/3
|
|
319
319
|
theta = self.global_state.get('theta', theta_0)
|
|
@@ -371,7 +371,7 @@ class AdGD(Transform):
|
|
|
371
371
|
prev_g.copy_(g)
|
|
372
372
|
|
|
373
373
|
@torch.no_grad
|
|
374
|
-
def
|
|
374
|
+
def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
|
|
375
375
|
alpha = self.global_state.get('alpha', None)
|
|
376
376
|
|
|
377
377
|
if not _acceptable_alpha(alpha, tensors[0]):
|
|
@@ -383,5 +383,5 @@ class AdGD(Transform):
|
|
|
383
383
|
torch._foreach_mul_(tensors, alpha)
|
|
384
384
|
return tensors
|
|
385
385
|
|
|
386
|
-
def get_H(self,
|
|
387
|
-
return
|
|
386
|
+
def get_H(self, objective):
|
|
387
|
+
return _get_scaled_identity_H(self, objective)
|
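The step-size modules above implement classical adaptive rules: Polyak's rule `alpha = (f(x) - f*) / ||g||^2`, and the Barzilai-Borwein long and short step sizes with their geometric mean (cf. `_bb_short` and `_bb_geom`). A minimal sketch of those formulas on flattened tensors; the function names are illustrative and not part of torchzero's API.

```python
import torch

def polyak_step_size(loss: float, f_star: float, g: torch.Tensor, eps: float = 1e-12) -> float:
    # Polyak's rule: alpha = (f(x) - f*) / ||g||^2
    return max(loss - f_star, 0.0) / (g.dot(g).item() + eps)

def bb_step_sizes(s: torch.Tensor, y: torch.Tensor, eps: float = 1e-12):
    # s = x_k - x_{k-1}, y = g_k - g_{k-1}
    sy = s.dot(y).item()
    if sy <= eps:
        return None, None, None       # curvature condition failed, caller falls back
    bb_long = s.dot(s).item() / sy    # "long" BB1 step size
    bb_short = sy / y.dot(y).item()   # "short" BB2 step size
    bb_geom = (bb_long * bb_short) ** 0.5
    return bb_long, bb_short, bb_geom
```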