torchzero 0.3.14__py3-none-any.whl → 0.3.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_opts.py +4 -3
- torchzero/core/__init__.py +4 -1
- torchzero/core/chain.py +50 -0
- torchzero/core/functional.py +37 -0
- torchzero/core/modular.py +237 -0
- torchzero/core/module.py +8 -599
- torchzero/core/reformulation.py +3 -1
- torchzero/core/transform.py +7 -5
- torchzero/core/var.py +376 -0
- torchzero/modules/__init__.py +0 -1
- torchzero/modules/adaptive/adahessian.py +2 -2
- torchzero/modules/adaptive/esgd.py +2 -2
- torchzero/modules/adaptive/matrix_momentum.py +1 -1
- torchzero/modules/adaptive/sophia_h.py +2 -2
- torchzero/modules/experimental/__init__.py +1 -0
- torchzero/modules/experimental/newtonnewton.py +5 -5
- torchzero/modules/experimental/spsa1.py +2 -2
- torchzero/modules/functional.py +7 -0
- torchzero/modules/line_search/__init__.py +1 -1
- torchzero/modules/line_search/_polyinterp.py +3 -1
- torchzero/modules/line_search/adaptive.py +3 -3
- torchzero/modules/line_search/backtracking.py +1 -1
- torchzero/modules/line_search/interpolation.py +160 -0
- torchzero/modules/line_search/line_search.py +11 -20
- torchzero/modules/line_search/strong_wolfe.py +3 -3
- torchzero/modules/misc/misc.py +2 -2
- torchzero/modules/misc/multistep.py +13 -13
- torchzero/modules/quasi_newton/__init__.py +2 -0
- torchzero/modules/quasi_newton/quasi_newton.py +15 -6
- torchzero/modules/quasi_newton/sg2.py +292 -0
- torchzero/modules/second_order/__init__.py +6 -3
- torchzero/modules/second_order/ifn.py +89 -0
- torchzero/modules/second_order/inm.py +105 -0
- torchzero/modules/second_order/newton.py +103 -193
- torchzero/modules/second_order/nystrom.py +1 -1
- torchzero/modules/second_order/rsn.py +227 -0
- torchzero/modules/wrappers/optim_wrapper.py +49 -42
- torchzero/utils/derivatives.py +19 -19
- torchzero/utils/linalg/linear_operator.py +50 -2
- {torchzero-0.3.14.dist-info → torchzero-0.3.15.dist-info}/METADATA +1 -1
- {torchzero-0.3.14.dist-info → torchzero-0.3.15.dist-info}/RECORD +44 -36
- torchzero/modules/higher_order/__init__.py +0 -1
- /torchzero/modules/{higher_order → experimental}/higher_order_newton.py +0 -0
- {torchzero-0.3.14.dist-info → torchzero-0.3.15.dist-info}/WHEEL +0 -0
- {torchzero-0.3.14.dist-info → torchzero-0.3.15.dist-info}/top_level.txt +0 -0
torchzero/modules/second_order/rsn.py  ADDED

@@ -0,0 +1,227 @@
+import math
+from collections import deque
+from collections.abc import Callable
+from typing import Literal
+
+import torch
+
+from ...core import Chainable, Module, apply_transform
+from ...utils import Distributions, TensorList, vec_to_tensors
+from ...utils.linalg.linear_operator import Sketched
+from .newton import _newton_step
+
+def _qr_orthonormalize(A:torch.Tensor):
+    m,n = A.shape
+    if m < n:
+        q, _ = torch.linalg.qr(A.T) # pylint:disable=not-callable
+        return q.T
+    else:
+        q, _ = torch.linalg.qr(A) # pylint:disable=not-callable
+        return q
+
+def _orthonormal_sketch(m, n, dtype, device, generator):
+    return _qr_orthonormalize(torch.randn(m, n, dtype=dtype, device=device, generator=generator))
+
+def _gaussian_sketch(m, n, dtype, device, generator):
+    return torch.randn(m, n, dtype=dtype, device=device, generator=generator) / math.sqrt(m)
+
+class RSN(Module):
+    """Randomized Subspace Newton. Performs a Newton step in a random subspace.
+
+    Args:
+        sketch_size (int):
+            size of the random sketch. This many hessian-vector products will need to be evaluated each step.
+        sketch_type (str, optional):
+            - "orthonormal" - random orthonormal basis. Orthonormality is necessary to use linear operator based modules such as trust region, but it can be slower to compute.
+            - "gaussian" - random gaussian (not orthonormal) basis.
+            - "common_directions" - uses history steepest descent directions as the basis[2]. It is orthonormalized on-line using Gram-Schmidt.
+            - "mixed" - random orthonormal basis but with three directions set to gradient, slow EMA and fast EMA (default).
+        damping (float, optional): hessian damping (scale of identity matrix added to hessian). Defaults to 0.
+        hvp_method (str, optional):
+            How to compute hessian-matrix product:
+            - "batched" - uses batched autograd
+            - "autograd" - uses unbatched autograd
+            - "forward" - uses finite difference with forward formula, performing 1 backward pass per Hvp.
+            - "central" - uses finite difference with a more accurate central formula, performing 2 backward passes per Hvp.
+
+            . Defaults to "batched".
+        h (float, optional): finite difference step size. Defaults to 1e-2.
+        use_lstsq (bool, optional): whether to use least squares to solve ``Hx=g``. Defaults to False.
+        update_freq (int, optional): frequency of updating the hessian. Defaults to 1.
+        H_tfm (Callable | None, optional):
+            optional hessian transforms, takes in two arguments - `(hessian, gradient)`.
+
+            must return either a tuple: `(hessian, is_inverted)` with transformed hessian and a boolean value
+            which must be True if transform inverted the hessian and False otherwise.
+
+            Or it returns a single tensor which is used as the update.
+
+            Defaults to None.
+        eigval_fn (Callable | None, optional):
+            optional eigenvalues transform, for example ``torch.abs`` or ``lambda L: torch.clip(L, min=1e-8)``.
+            If this is specified, eigendecomposition will be used to invert the hessian.
+        seed (int | None, optional): seed for random generator. Defaults to None.
+        inner (Chainable | None, optional): preconditions output of this module. Defaults to None.
+
+    ### Examples
+
+    RSN with line search
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.RSN(),
+        tz.m.Backtracking()
+    )
+    ```
+
+    RSN with trust region
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.LevenbergMarquardt(tz.m.RSN()),
+    )
+    ```
+
+
+    References:
+        1. [Gower, Robert, et al. "RSN: randomized subspace Newton." Advances in Neural Information Processing Systems 32 (2019).](https://arxiv.org/abs/1905.10874)
+        2. Wang, Po-Wei, Ching-pei Lee, and Chih-Jen Lin. "The common-directions method for regularized empirical risk minimization." Journal of Machine Learning Research 20.58 (2019): 1-49.
+    """
+
+    def __init__(
+        self,
+        sketch_size: int,
+        sketch_type: Literal["orthonormal", "gaussian", "common_directions", "mixed"] = "mixed",
+        damping:float=0,
+        hvp_method: Literal["batched", "autograd", "forward", "central"] = "batched",
+        h: float = 1e-2,
+        use_lstsq: bool = True,
+        update_freq: int = 1,
+        H_tfm: Callable[[torch.Tensor, torch.Tensor], tuple[torch.Tensor, bool]] | Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None,
+        eigval_fn: Callable[[torch.Tensor], torch.Tensor] | None = None,
+        seed: int | None = None,
+        inner: Chainable | None = None,
+    ):
+        defaults = dict(sketch_size=sketch_size, sketch_type=sketch_type,seed=seed,hvp_method=hvp_method, h=h, damping=damping, use_lstsq=use_lstsq, H_tfm=H_tfm, eigval_fn=eigval_fn, update_freq=update_freq)
+        super().__init__(defaults)
+
+        if inner is not None:
+            self.set_child("inner", inner)
+
+    @torch.no_grad
+    def update(self, var):
+        step = self.global_state.get('step', 0)
+        self.global_state['step'] = step + 1
+
+        if step % self.defaults['update_freq'] == 0:
+
+            closure = var.closure
+            if closure is None:
+                raise RuntimeError("RSN requires closure")
+            params = var.params
+            generator = self.get_generator(params[0].device, self.defaults["seed"])
+
+            ndim = sum(p.numel() for p in params)
+
+            device=params[0].device
+            dtype=params[0].dtype
+
+            # sample sketch matrix S: (ndim, sketch_size)
+            sketch_size = min(self.defaults["sketch_size"], ndim)
+            sketch_type = self.defaults["sketch_type"]
+            hvp_method = self.defaults["hvp_method"]
+
+            if sketch_type in ('normal', 'gaussian'):
+                S = _gaussian_sketch(ndim, sketch_size, device=device, dtype=dtype, generator=generator)
+
+            elif sketch_type == 'orthonormal':
+                S = _orthonormal_sketch(ndim, sketch_size, device=device, dtype=dtype, generator=generator)
+
+            elif sketch_type == 'common_directions':
+                # Wang, Po-Wei, Ching-pei Lee, and Chih-Jen Lin. "The common-directions method for regularized empirical risk minimization." Journal of Machine Learning Research 20.58 (2019): 1-49.
+                g_list = var.get_grad(create_graph=hvp_method in ("batched", "autograd"))
+                g = torch.cat([t.ravel() for t in g_list])
+
+                # initialize directions deque
+                if "directions" not in self.global_state:
+
+                    g_norm = torch.linalg.vector_norm(g) # pylint:disable=not-callable
+                    if g_norm < torch.finfo(g.dtype).tiny * 2:
+                        g = torch.randn_like(g)
+                        g_norm = torch.linalg.vector_norm(g) # pylint:disable=not-callable
+
+                    self.global_state["directions"] = deque([g / g_norm], maxlen=sketch_size)
+                    S = self.global_state["directions"][0].unsqueeze(1)
+
+                # add new steepest descent direction orthonormal to existing columns
+                else:
+                    S = torch.stack(tuple(self.global_state["directions"]), dim=1)
+                    p = g - S @ (S.T @ g)
+                    p_norm = torch.linalg.vector_norm(p) # pylint:disable=not-callable
+                    if p_norm > torch.finfo(p.dtype).tiny * 2:
+                        p = p / p_norm
+                        self.global_state["directions"].append(p)
+                        S = torch.cat([S, p.unsqueeze(1)], dim=1)
+
+            elif sketch_type == "mixed":
+                g_list = var.get_grad(create_graph=hvp_method in ("batched", "autograd"))
+                g = torch.cat([t.ravel() for t in g_list])
+
+                if "slow_ema" not in self.global_state:
+                    self.global_state["slow_ema"] = torch.randn_like(g) * 1e-2
+                    self.global_state["fast_ema"] = torch.randn_like(g) * 1e-2
+
+                slow_ema = self.global_state["slow_ema"]
+                fast_ema = self.global_state["fast_ema"]
+                slow_ema.lerp_(g, 0.001)
+                fast_ema.lerp_(g, 0.1)
+
+                S = torch.stack([g, slow_ema, fast_ema], dim=1)
+                if sketch_size > 3:
+                    S_random = _gaussian_sketch(ndim, sketch_size - 3, device=device, dtype=dtype, generator=generator)
+                    S = torch.cat([S, S_random], dim=1)
+
+                S = _qr_orthonormalize(S)
+
+            else:
+                raise ValueError(f'Unknown sketch_type {sketch_type}')
+
+            # form sketched hessian
+            HS, _ = var.hessian_matrix_product(S, at_x0=True, rgrad=None, hvp_method=self.defaults["hvp_method"], normalize=True, retain_graph=False, h=self.defaults["h"])
+            H_sketched = S.T @ HS
+
+            self.global_state["H_sketched"] = H_sketched
+            self.global_state["S"] = S
+
+    def apply(self, var):
+        S: torch.Tensor = self.global_state["S"]
+        d_proj = _newton_step(
+            var=var,
+            H=self.global_state["H_sketched"],
+            damping=self.defaults["damping"],
+            inner=self.children.get("inner", None),
+            H_tfm=self.defaults["H_tfm"],
+            eigval_fn=self.defaults["eigval_fn"],
+            use_lstsq=self.defaults["use_lstsq"],
+            g_proj=lambda g: S.T @ g
+        )
+        d = S @ d_proj
+        var.update = vec_to_tensors(d, var.params)
+
+        return var
+
+    def get_H(self, var=...):
+        eigval_fn = self.defaults["eigval_fn"]
+        H_sketched: torch.Tensor = self.global_state["H_sketched"]
+        S: torch.Tensor = self.global_state["S"]
+
+        if eigval_fn is not None:
+            try:
+                L, Q = torch.linalg.eigh(H_sketched) # pylint:disable=not-callable
+                L: torch.Tensor = eigval_fn(L)
+                H_sketched = Q @ L.diag_embed() @ Q.mH
+
+            except torch.linalg.LinAlgError:
+                pass
+
+        return Sketched(S, H_sketched)
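For a concrete picture of what ``update`` and ``apply`` compute, here is a minimal standalone sketch of the subspace Newton step on a dense quadratic (plain PyTorch with illustrative names, not the torchzero API): build the sketched Hessian ``S.T @ H @ S``, solve against the projected gradient ``S.T @ g``, and lift the step back with ``S @ d_proj``.

```python
import torch

torch.manual_seed(0)
n, k = 50, 5
A = torch.randn(n, n)
H = A @ A.T + torch.eye(n)                  # SPD Hessian of f(x) = 0.5 x^T H x - b^T x
b = torch.randn(n)
x = torch.randn(n)
f = lambda v: 0.5 * v @ (H @ v) - b @ v

g = H @ x - b                               # gradient at x
S, _ = torch.linalg.qr(torch.randn(n, k))   # orthonormal sketch, like _orthonormal_sketch
H_sketched = S.T @ (H @ S)                  # (k, k); RSN builds this from k Hessian-vector products
d_proj = torch.linalg.solve(H_sketched, S.T @ g)
x_new = x - S @ d_proj                      # lift the subspace Newton step back to R^n

print(f(x).item(), f(x_new).item())         # f decreases: the step is exact Newton within span(S)
```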
torchzero/modules/wrappers/optim_wrapper.py  CHANGED

@@ -10,34 +10,48 @@ class Wrap(Module):
     """
     Wraps a pytorch optimizer to use it as a module.
 
-
-    Custom param groups are supported only by
+    Note:
+        Custom param groups are supported only by ``set_param_groups``, settings passed to Modular will be applied to all parameters.
 
     Args:
         opt_fn (Callable[..., torch.optim.Optimizer] | torch.optim.Optimizer):
-            function that takes in parameters and returns the optimizer, for example
-            or
+            function that takes in parameters and returns the optimizer, for example ``torch.optim.Adam``
+            or ``lambda parameters: torch.optim.Adam(parameters, lr=1e-3)``
         *args:
         **kwargs:
-            Extra args to be passed to opt_fn. The function is called as
+            Extra args to be passed to opt_fn. The function is called as ``opt_fn(parameters, *args, **kwargs)``.
+        use_param_groups:
+            Whether to pass settings passed to Modular to the wrapped optimizer.
 
-
-
+            Note that settings to the first parameter are used for all parameters,
+            so if you specified per-parameter settings, they will be ignored.
 
-
+    ### Example:
+    wrapping pytorch_optimizer.StableAdamW
 
-
-    opt = tz.Modular(
-        model.parameters(),
-        tz.m.Wrap(StableAdamW, lr=1),
-        tz.m.Cautious(),
-        tz.m.LR(1e-2)
-    )
+    ```python
 
+    from pytorch_optimizer import StableAdamW
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.Wrap(StableAdamW, lr=1),
+        tz.m.Cautious(),
+        tz.m.LR(1e-2)
+    )
+    ```
 
     """
-
-
+
+    def __init__(
+        self,
+        opt_fn: Callable[..., torch.optim.Optimizer] | torch.optim.Optimizer,
+        *args,
+        use_param_groups: bool = True,
+        **kwargs,
+    ):
+        defaults = dict(use_param_groups=use_param_groups)
+        super().__init__(defaults=defaults)
+
         self._opt_fn = opt_fn
         self._opt_args = args
         self._opt_kwargs = kwargs

@@ -48,7 +62,7 @@ class Wrap(Module):
         self.optimizer = self._opt_fn
 
     def set_param_groups(self, param_groups):
-        self._custom_param_groups = param_groups
+        self._custom_param_groups = _make_param_groups(param_groups, differentiable=False)
         return super().set_param_groups(param_groups)
 
     @torch.no_grad

@@ -61,37 +75,29 @@ class Wrap(Module):
         param_groups = params if self._custom_param_groups is None else self._custom_param_groups
         self.optimizer = self._opt_fn(param_groups, *self._opt_args, **self._opt_kwargs)
 
+        # set optimizer per-parameter settings
+        if self.defaults["use_param_groups"] and var.modular is not None:
+            for group in self.optimizer.param_groups:
+                first_param = group['params'][0]
+                setting = self.settings[first_param]
+
+                # settings passed in `set_param_groups` are the highest priority
+                # schedulers will override defaults but not settings passed in `set_param_groups`
+                # this is consistent with how Modular does it.
+                if self._custom_param_groups is not None:
+                    setting = {k:v for k,v in setting if k not in self._custom_param_groups[0]}
+
+                group.update(setting)
+
         # set grad to update
         orig_grad = [p.grad for p in params]
         for p, u in zip(params, var.get_update()):
             p.grad = u
 
-        # if this
-
-        # and if there are multiple different per-parameter lrs (would be annoying to support)
-        if var.is_last and (
-            (var.last_module_lrs is None)
-            or
-            (('lr' in self.optimizer.defaults) and (len(set(var.last_module_lrs)) == 1))
-        ):
-            lr = 1 if var.last_module_lrs is None else var.last_module_lrs[0]
-
-            # update optimizer lr with desired lr
-            if lr != 1:
-                self.optimizer.defaults['__original_lr__'] = self.optimizer.defaults['lr']
-                for g in self.optimizer.param_groups:
-                    g['__original_lr__'] = g['lr']
-                    g['lr'] = g['lr'] * lr
-
-            # step
+        # if this is last module, simply use optimizer to update parameters
+        if var.modular is not None and self is var.modular.modules[-1]:
            self.optimizer.step()
 
-            # restore original lr
-            if lr != 1:
-                self.optimizer.defaults['lr'] = self.optimizer.defaults.pop('__original_lr__')
-                for g in self.optimizer.param_groups:
-                    g['lr'] = g.pop('__original_lr__')
-
            # restore grad
            for p, g in zip(params, orig_grad):
                p.grad = g

@@ -100,6 +106,7 @@ class Wrap(Module):
            return var
 
        # this is not the last module, meaning update is difference in parameters
+       # and passed to next module
        params_before_step = [p.clone() for p in params]
        self.optimizer.step() # step and update params
        for p, g in zip(params, orig_grad):
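A hedged usage sketch of the new ``use_param_groups`` flag (illustrative only, not taken from the package docs): turn it off when the wrapped optimizer should keep its own hyperparameters instead of inheriting settings propagated by ``Modular``.

```python
# Hypothetical example: wrap torch.optim.SGD and opt out of Modular-propagated settings.
import torch
import torchzero as tz  # assuming the conventional alias used by the docstrings above

model = torch.nn.Linear(4, 2)
opt = tz.Modular(
    model.parameters(),
    # lr and momentum are forwarded to SGD as opt_fn(parameters, *args, **kwargs)
    tz.m.Wrap(torch.optim.SGD, lr=1e-2, momentum=0.9, use_param_groups=False),
)
```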
torchzero/utils/derivatives.py  CHANGED

@@ -5,13 +5,13 @@ import torch.autograd.forward_ad as fwAD
 
 from .torch_tools import swap_tensors_no_use_count_check, vec_to_tensors
 
-def _jacobian(
-
-    grad_ouputs = torch.eye(len(
+def _jacobian(outputs: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False):
+    flat_outputs = torch.cat([i.reshape(-1) for i in outputs])
+    grad_ouputs = torch.eye(len(flat_outputs), device=outputs[0].device, dtype=outputs[0].dtype)
     jac = []
-    for i in range(
+    for i in range(flat_outputs.numel()):
         jac.append(torch.autograd.grad(
-
+            flat_outputs,
             wrt,
             grad_ouputs[i],
             retain_graph=True,

@@ -22,12 +22,12 @@ def _jacobian(output: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], creat
     return [torch.stack(z) for z in zip(*jac)]
 
 
-def _jacobian_batched(
-
+def _jacobian_batched(outputs: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False):
+    flat_outputs = torch.cat([i.reshape(-1) for i in outputs])
     return torch.autograd.grad(
-
+        flat_outputs,
         wrt,
-        torch.eye(len(
+        torch.eye(len(flat_outputs), device=outputs[0].device, dtype=outputs[0].dtype),
         retain_graph=True,
         create_graph=create_graph,
         allow_unused=True,

@@ -51,13 +51,13 @@ def flatten_jacobian(jacs: Sequence[torch.Tensor]) -> torch.Tensor:
     return torch.cat([j.reshape(n_out, -1) for j in jacs], dim=1)
 
 
-def jacobian_wrt(
+def jacobian_wrt(outputs: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False, batched=True) -> Sequence[torch.Tensor]:
     """Calculate jacobian of a sequence of tensors w.r.t another sequence of tensors.
     Returns a sequence of tensors with the length as `wrt`.
     Each tensor will have the shape `(*output.shape, *wrt[i].shape)`.
 
     Args:
-
+        outputs (Sequence[torch.Tensor]): input sequence of tensors.
         wrt (Sequence[torch.Tensor]): sequence of tensors to differentiate w.r.t.
         create_graph (bool, optional):
             pytorch option, if True, graph of the derivative will be constructed,

@@ -68,16 +68,16 @@ def jacobian_wrt(output: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], cr
     Returns:
         sequence of tensors with the length as `wrt`.
     """
-    if batched: return _jacobian_batched(
-    return _jacobian(
+    if batched: return _jacobian_batched(outputs, wrt, create_graph)
+    return _jacobian(outputs, wrt, create_graph)
 
-def jacobian_and_hessian_wrt(
+def jacobian_and_hessian_wrt(outputs: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False, batched=True):
     """Calculate jacobian and hessian of a sequence of tensors w.r.t another sequence of tensors.
     Calculating hessian requires calculating the jacobian. So this function is more efficient than
     calling `jacobian` and `hessian` separately, which would calculate jacobian twice.
 
     Args:
-
+        outputs (Sequence[torch.Tensor]): input sequence of tensors.
         wrt (Sequence[torch.Tensor]): sequence of tensors to differentiate w.r.t.
         create_graph (bool, optional):
             pytorch option, if True, graph of the derivative will be constructed,

@@ -87,7 +87,7 @@ def jacobian_and_hessian_wrt(output: Sequence[torch.Tensor], wrt: Sequence[torch
     Returns:
         tuple with jacobians sequence and hessians sequence.
     """
-    jac = jacobian_wrt(
+    jac = jacobian_wrt(outputs, wrt, create_graph=True, batched = batched)
     return jac, jacobian_wrt(jac, wrt, batched = batched, create_graph=create_graph)
 
 

@@ -96,13 +96,13 @@ def jacobian_and_hessian_wrt(output: Sequence[torch.Tensor], wrt: Sequence[torch
     # Note - I only tested this for cases where input is a scalar."""
     # return torch.cat([h.reshape(h.size(0), h[1].numel()) for h in hessians], 1)
 
-def jacobian_and_hessian_mat_wrt(
+def jacobian_and_hessian_mat_wrt(outputs: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False, batched=True):
     """Calculate jacobian and hessian of a sequence of tensors w.r.t another sequence of tensors.
     Calculating hessian requires calculating the jacobian. So this function is more efficient than
     calling `jacobian` and `hessian` separately, which would calculate jacobian twice.
 
     Args:
-
+        outputs (Sequence[torch.Tensor]): input sequence of tensors.
         wrt (Sequence[torch.Tensor]): sequence of tensors to differentiate w.r.t.
         create_graph (bool, optional):
             pytorch option, if True, graph of the derivative will be constructed,

@@ -112,7 +112,7 @@ def jacobian_and_hessian_mat_wrt(output: Sequence[torch.Tensor], wrt: Sequence[t
     Returns:
         tuple with jacobians sequence and hessians sequence.
     """
-    jac = jacobian_wrt(
+    jac = jacobian_wrt(outputs, wrt, create_graph=True, batched = batched)
     H_list = jacobian_wrt(jac, wrt, batched = batched, create_graph=create_graph)
     return flatten_jacobian(jac), flatten_jacobian(H_list)
 
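The refactor above renames ``output`` to ``outputs`` and flattens them before differentiating; the underlying trick is standard autograd: pass an identity matrix as ``grad_outputs`` so each row yields one Jacobian row. A self-contained illustration in plain PyTorch, independent of the torchzero helpers (the ``is_grads_batched`` flag is my assumption of how the batched variant vectorizes over rows, it is not copied from the package):

```python
import torch

x = torch.randn(3, requires_grad=True)
y = torch.stack([x[0] * x[1], x[1] ** 2, x.sum()])   # flat outputs, shape (3,)

jac = torch.autograd.grad(
    y, x,
    grad_outputs=torch.eye(3),   # one VJP per identity row == one Jacobian row
    is_grads_batched=True,
    retain_graph=True,
)[0]                              # shape (3, 3): jac[i] = dy[i]/dx

# cross-check against the reference implementation
ref = torch.autograd.functional.jacobian(
    lambda t: torch.stack([t[0] * t[1], t[1] ** 2, t.sum()]), x
)
assert torch.allclose(jac, ref)
```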
torchzero/utils/linalg/linear_operator.py  CHANGED

@@ -35,8 +35,8 @@ class LinearOperator(ABC):
         """solve with a norm bound on x"""
         raise NotImplementedError(f"{self.__class__.__name__} doesn't implement solve_bounded")
 
-    def update(self, *args, **kwargs) -> None:
-
+    # def update(self, *args, **kwargs) -> None:
+    #     raise NotImplementedError(f"{self.__class__.__name__} doesn't implement update")
 
     def add(self, x: torch.Tensor) -> "LinearOperator":
         raise NotImplementedError(f"{self.__class__.__name__} doesn't implement add")

@@ -298,6 +298,7 @@ class AtA(LinearOperator):
 class AAT(LinearOperator):
     def __init__(self, A: torch.Tensor):
         self.A = A
+        self.device = self.A.device; self.dtype = self.A.dtype
 
     def matvec(self, x): return self.A.mv(self.A.mH.mv(x))
     def rmatvec(self, x): return self.matvec(x)

@@ -327,3 +328,50 @@
         n = self.A.size(1)
         return (n,n)
 
+
+class Sketched(LinearOperator):
+    """A projected by sketching matrix S, representing the operator S @ A_proj @ S.T.
+
+    Where A is (n, n) and S is (n, sketch_size).
+    """
+    def __init__(self, S: torch.Tensor, A_proj: torch.Tensor):
+        self.S = S
+        self.A_proj = A_proj
+        self.device = self.A_proj.device; self.dtype = self.A_proj.dtype
+
+
+    def matvec(self, x):
+        x_proj = self.S.T @ x
+        Ax_proj = self.A_proj @ x_proj
+        return self.S @ Ax_proj
+
+    def rmatvec(self, x):
+        x_proj = self.S.T @ x
+        ATx_proj = self.A_proj.mH @ x_proj
+        return self.S @ ATx_proj
+
+
+    def matmat(self, x): return Dense(torch.linalg.multi_dot([self.S, self.A_proj, self.S.T, x])) # pylint:disable=not-callable
+    def rmatmat(self, x): return Dense(torch.linalg.multi_dot([self.S, self.A_proj.mH, self.S.T, x])) # pylint:disable=not-callable
+
+
+    def is_dense(self): return False
+    def to_tensor(self): return self.S @ self.A_proj @ self.S.T
+    def transpose(self): return Sketched(self.S, self.A_proj.mH)
+
+    def add_diagonal(self, x):
+        """this doesn't correspond to adding diagonal to A, however it still works for LM etc."""
+        if isinstance(x, torch.Tensor) and x.numel() <= 1: x = x.item()
+        if isinstance(x, (int,float)): x = torch.full((self.A_proj.shape[0],), fill_value=x, device=self.A_proj.device, dtype=self.A_proj.dtype)
+        return Sketched(S=self.S, A_proj=self.A_proj + x.diag_embed())
+
+    def solve(self, b):
+        return self.S @ torch.linalg.lstsq(self.A_proj, self.S.T @ b).solution # pylint:disable=not-callable
+
+    def inv(self):
+        return Sketched(S=self.S, A_proj=torch.linalg.pinv(self.A_proj)) # pylint:disable=not-callable
+
+    def size(self):
+        n = self.S.size(0)
+        return (n,n)
+
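Since ``RSN.get_H`` returns this operator, the new class can be sanity-checked against the equivalent dense matrix. This is only an illustrative sketch; it assumes torchzero 0.3.15 is installed and uses the import path shown in ``rsn.py`` above.

```python
# Illustrative check: Sketched(S, A_proj) should act like the dense matrix S @ A_proj @ S.T
import torch
from torchzero.utils.linalg.linear_operator import Sketched  # path taken from the rsn.py import above

n, k = 8, 3
S, _ = torch.linalg.qr(torch.randn(n, k))   # orthonormal sketch basis, shape (n, k)
A_proj = torch.randn(k, k)                  # projected operator, shape (k, k)
op = Sketched(S, A_proj)

x = torch.randn(n)
dense = S @ A_proj @ S.T
assert torch.allclose(op.matvec(x), dense @ x, atol=1e-6)
assert torch.allclose(op.to_tensor(), dense, atol=1e-6)
```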