torchzero 0.3.10__py3-none-any.whl → 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. tests/test_identical.py +2 -3
  2. tests/test_opts.py +140 -100
  3. tests/test_tensorlist.py +8 -7
  4. tests/test_vars.py +1 -0
  5. torchzero/__init__.py +1 -1
  6. torchzero/core/__init__.py +2 -2
  7. torchzero/core/module.py +335 -50
  8. torchzero/core/reformulation.py +65 -0
  9. torchzero/core/transform.py +197 -70
  10. torchzero/modules/__init__.py +13 -4
  11. torchzero/modules/adaptive/__init__.py +30 -0
  12. torchzero/modules/adaptive/adagrad.py +356 -0
  13. torchzero/modules/adaptive/adahessian.py +224 -0
  14. torchzero/modules/{optimizers → adaptive}/adam.py +6 -8
  15. torchzero/modules/adaptive/adan.py +96 -0
  16. torchzero/modules/adaptive/adaptive_heavyball.py +54 -0
  17. torchzero/modules/adaptive/aegd.py +54 -0
  18. torchzero/modules/adaptive/esgd.py +171 -0
  19. torchzero/modules/{optimizers → adaptive}/lion.py +1 -1
  20. torchzero/modules/{experimental/spectral.py → adaptive/lmadagrad.py} +94 -71
  21. torchzero/modules/adaptive/mars.py +79 -0
  22. torchzero/modules/adaptive/matrix_momentum.py +146 -0
  23. torchzero/modules/adaptive/msam.py +188 -0
  24. torchzero/modules/{optimizers → adaptive}/muon.py +29 -5
  25. torchzero/modules/adaptive/natural_gradient.py +175 -0
  26. torchzero/modules/{optimizers → adaptive}/orthograd.py +1 -1
  27. torchzero/modules/{optimizers → adaptive}/rmsprop.py +7 -4
  28. torchzero/modules/{optimizers → adaptive}/rprop.py +42 -10
  29. torchzero/modules/adaptive/sam.py +163 -0
  30. torchzero/modules/{optimizers → adaptive}/shampoo.py +47 -9
  31. torchzero/modules/{optimizers → adaptive}/soap.py +52 -65
  32. torchzero/modules/adaptive/sophia_h.py +185 -0
  33. torchzero/modules/clipping/clipping.py +115 -25
  34. torchzero/modules/clipping/ema_clipping.py +31 -17
  35. torchzero/modules/clipping/growth_clipping.py +8 -7
  36. torchzero/modules/conjugate_gradient/__init__.py +11 -0
  37. torchzero/modules/conjugate_gradient/cg.py +355 -0
  38. torchzero/modules/experimental/__init__.py +13 -19
  39. torchzero/modules/{projections → experimental}/dct.py +11 -11
  40. torchzero/modules/{projections → experimental}/fft.py +10 -10
  41. torchzero/modules/experimental/gradmin.py +4 -3
  42. torchzero/modules/experimental/l_infinity.py +111 -0
  43. torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +5 -42
  44. torchzero/modules/experimental/newton_solver.py +79 -17
  45. torchzero/modules/experimental/newtonnewton.py +32 -15
  46. torchzero/modules/experimental/reduce_outward_lr.py +4 -4
  47. torchzero/modules/experimental/scipy_newton_cg.py +105 -0
  48. torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +13 -55
  49. torchzero/modules/functional.py +52 -6
  50. torchzero/modules/grad_approximation/fdm.py +30 -4
  51. torchzero/modules/grad_approximation/forward_gradient.py +16 -4
  52. torchzero/modules/grad_approximation/grad_approximator.py +51 -10
  53. torchzero/modules/grad_approximation/rfdm.py +321 -52
  54. torchzero/modules/higher_order/__init__.py +1 -1
  55. torchzero/modules/higher_order/higher_order_newton.py +164 -93
  56. torchzero/modules/least_squares/__init__.py +1 -0
  57. torchzero/modules/least_squares/gn.py +161 -0
  58. torchzero/modules/line_search/__init__.py +4 -4
  59. torchzero/modules/line_search/_polyinterp.py +289 -0
  60. torchzero/modules/line_search/adaptive.py +124 -0
  61. torchzero/modules/line_search/backtracking.py +95 -57
  62. torchzero/modules/line_search/line_search.py +171 -22
  63. torchzero/modules/line_search/scipy.py +3 -3
  64. torchzero/modules/line_search/strong_wolfe.py +327 -199
  65. torchzero/modules/misc/__init__.py +35 -0
  66. torchzero/modules/misc/debug.py +48 -0
  67. torchzero/modules/misc/escape.py +62 -0
  68. torchzero/modules/misc/gradient_accumulation.py +136 -0
  69. torchzero/modules/misc/homotopy.py +59 -0
  70. torchzero/modules/misc/misc.py +383 -0
  71. torchzero/modules/misc/multistep.py +194 -0
  72. torchzero/modules/misc/regularization.py +167 -0
  73. torchzero/modules/misc/split.py +123 -0
  74. torchzero/modules/{ops → misc}/switch.py +45 -4
  75. torchzero/modules/momentum/__init__.py +1 -5
  76. torchzero/modules/momentum/averaging.py +9 -9
  77. torchzero/modules/momentum/cautious.py +51 -19
  78. torchzero/modules/momentum/momentum.py +37 -2
  79. torchzero/modules/ops/__init__.py +11 -31
  80. torchzero/modules/ops/accumulate.py +6 -10
  81. torchzero/modules/ops/binary.py +81 -34
  82. torchzero/modules/{momentum/ema.py → ops/higher_level.py} +16 -39
  83. torchzero/modules/ops/multi.py +82 -21
  84. torchzero/modules/ops/reduce.py +16 -8
  85. torchzero/modules/ops/unary.py +29 -13
  86. torchzero/modules/ops/utility.py +30 -18
  87. torchzero/modules/projections/__init__.py +2 -4
  88. torchzero/modules/projections/cast.py +51 -0
  89. torchzero/modules/projections/galore.py +3 -1
  90. torchzero/modules/projections/projection.py +190 -96
  91. torchzero/modules/quasi_newton/__init__.py +9 -14
  92. torchzero/modules/quasi_newton/damping.py +105 -0
  93. torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -0
  94. torchzero/modules/quasi_newton/lbfgs.py +286 -173
  95. torchzero/modules/quasi_newton/lsr1.py +185 -106
  96. torchzero/modules/quasi_newton/quasi_newton.py +816 -268
  97. torchzero/modules/restarts/__init__.py +7 -0
  98. torchzero/modules/restarts/restars.py +252 -0
  99. torchzero/modules/second_order/__init__.py +3 -2
  100. torchzero/modules/second_order/multipoint.py +238 -0
  101. torchzero/modules/second_order/newton.py +292 -68
  102. torchzero/modules/second_order/newton_cg.py +365 -15
  103. torchzero/modules/second_order/nystrom.py +104 -1
  104. torchzero/modules/smoothing/__init__.py +1 -1
  105. torchzero/modules/smoothing/laplacian.py +14 -4
  106. torchzero/modules/smoothing/sampling.py +300 -0
  107. torchzero/modules/step_size/__init__.py +2 -0
  108. torchzero/modules/step_size/adaptive.py +387 -0
  109. torchzero/modules/step_size/lr.py +154 -0
  110. torchzero/modules/termination/__init__.py +14 -0
  111. torchzero/modules/termination/termination.py +207 -0
  112. torchzero/modules/trust_region/__init__.py +5 -0
  113. torchzero/modules/trust_region/cubic_regularization.py +170 -0
  114. torchzero/modules/trust_region/dogleg.py +92 -0
  115. torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
  116. torchzero/modules/trust_region/trust_cg.py +97 -0
  117. torchzero/modules/trust_region/trust_region.py +350 -0
  118. torchzero/modules/variance_reduction/__init__.py +1 -0
  119. torchzero/modules/variance_reduction/svrg.py +208 -0
  120. torchzero/modules/weight_decay/__init__.py +1 -1
  121. torchzero/modules/weight_decay/weight_decay.py +94 -11
  122. torchzero/modules/wrappers/optim_wrapper.py +29 -1
  123. torchzero/modules/zeroth_order/__init__.py +1 -0
  124. torchzero/modules/zeroth_order/cd.py +359 -0
  125. torchzero/optim/root.py +65 -0
  126. torchzero/optim/utility/split.py +8 -8
  127. torchzero/optim/wrappers/directsearch.py +39 -3
  128. torchzero/optim/wrappers/fcmaes.py +24 -15
  129. torchzero/optim/wrappers/mads.py +5 -6
  130. torchzero/optim/wrappers/nevergrad.py +16 -1
  131. torchzero/optim/wrappers/nlopt.py +0 -2
  132. torchzero/optim/wrappers/optuna.py +3 -3
  133. torchzero/optim/wrappers/scipy.py +86 -25
  134. torchzero/utils/__init__.py +40 -4
  135. torchzero/utils/compile.py +1 -1
  136. torchzero/utils/derivatives.py +126 -114
  137. torchzero/utils/linalg/__init__.py +9 -2
  138. torchzero/utils/linalg/linear_operator.py +329 -0
  139. torchzero/utils/linalg/matrix_funcs.py +2 -2
  140. torchzero/utils/linalg/orthogonalize.py +2 -1
  141. torchzero/utils/linalg/qr.py +2 -2
  142. torchzero/utils/linalg/solve.py +369 -58
  143. torchzero/utils/metrics.py +83 -0
  144. torchzero/utils/numberlist.py +2 -0
  145. torchzero/utils/python_tools.py +16 -0
  146. torchzero/utils/tensorlist.py +134 -51
  147. torchzero/utils/torch_tools.py +9 -4
  148. torchzero-0.3.13.dist-info/METADATA +14 -0
  149. torchzero-0.3.13.dist-info/RECORD +166 -0
  150. {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
  151. docs/source/conf.py +0 -57
  152. torchzero/modules/experimental/absoap.py +0 -250
  153. torchzero/modules/experimental/adadam.py +0 -112
  154. torchzero/modules/experimental/adamY.py +0 -125
  155. torchzero/modules/experimental/adasoap.py +0 -172
  156. torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
  157. torchzero/modules/experimental/eigendescent.py +0 -117
  158. torchzero/modules/experimental/etf.py +0 -172
  159. torchzero/modules/experimental/soapy.py +0 -163
  160. torchzero/modules/experimental/structured_newton.py +0 -111
  161. torchzero/modules/experimental/subspace_preconditioners.py +0 -138
  162. torchzero/modules/experimental/tada.py +0 -38
  163. torchzero/modules/line_search/trust_region.py +0 -73
  164. torchzero/modules/lr/__init__.py +0 -2
  165. torchzero/modules/lr/adaptive.py +0 -93
  166. torchzero/modules/lr/lr.py +0 -63
  167. torchzero/modules/momentum/matrix_momentum.py +0 -166
  168. torchzero/modules/ops/debug.py +0 -25
  169. torchzero/modules/ops/misc.py +0 -418
  170. torchzero/modules/ops/split.py +0 -75
  171. torchzero/modules/optimizers/__init__.py +0 -18
  172. torchzero/modules/optimizers/adagrad.py +0 -155
  173. torchzero/modules/optimizers/sophia_h.py +0 -129
  174. torchzero/modules/quasi_newton/cg.py +0 -268
  175. torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
  176. torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +0 -266
  177. torchzero/modules/quasi_newton/olbfgs.py +0 -196
  178. torchzero/modules/smoothing/gaussian.py +0 -164
  179. torchzero-0.3.10.dist-info/METADATA +0 -379
  180. torchzero-0.3.10.dist-info/RECORD +0 -139
  181. torchzero-0.3.10.dist-info/licenses/LICENSE +0 -21
  182. {torchzero-0.3.10.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
torchzero/optim/wrappers/optuna.py
@@ -6,7 +6,7 @@ import torch
 
 import optuna
 
-from ...utils import Optimizer
+from ...utils import Optimizer, totensor, tofloat
 
 def silence_optuna():
     optuna.logging.set_verbosity(optuna.logging.WARNING)
@@ -23,7 +23,7 @@ class OptunaSampler(Optimizer):
    Note - optuna is surprisingly scalable to large number of parameters (up to 10,000), despite literally requiring a for-loop because it only supports scalars. Default TPESampler is good for BBO. Maybe not for NNs...
 
    Args:
-        params (_type_): parameters
+        params: iterable of parameters to optimize or dicts defining parameter groups.
        lb (float): lower bounds.
        ub (float): upper bounds.
        sampler (optuna.samplers.BaseSampler | type[optuna.samplers.BaseSampler] | None, optional): sampler. Defaults to None.
@@ -65,6 +65,6 @@ class OptunaSampler(Optimizer):
        params.from_vec_(vec)
 
        loss = closure()
-       with torch.enable_grad(): self.study.tell(trial, loss)
+       with torch.enable_grad(): self.study.tell(trial, tofloat(torch.nan_to_num(totensor(loss), 1e32)))
 
        return loss
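The new `tell` call keeps optuna from choking on non-finite losses: the loss is converted to a tensor, NaN is replaced with a large finite penalty, and a plain float is reported to the study. A minimal sketch of that sanitization step, assuming `totensor`/`tofloat` behave like `torch.as_tensor` and `float()` (the `sanitize` helper below is illustrative, not part of the package):

```python
import torch

def sanitize(loss) -> float:
    # mirrors tofloat(torch.nan_to_num(totensor(loss), 1e32)):
    # convert to a tensor, swap NaN for 1e32, report a plain float
    return float(torch.nan_to_num(torch.as_tensor(loss), nan=1e32))

print(sanitize(torch.tensor(0.5)))  # 0.5
print(sanitize(float("nan")))       # ~1e32, a finite stand-in optuna can accept
```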
torchzero/optim/wrappers/scipy.py
@@ -4,12 +4,17 @@ from functools import partial
 from typing import Any, Literal
 
 import numpy as np
-import scipy.optimize
 import torch
 
+import scipy.optimize
+
 from ...utils import Optimizer, TensorList
-from ...utils.derivatives import jacobian_and_hessian_mat_wrt, jacobian_wrt
-from ...modules.second_order.newton import tikhonov_
+from ...utils.derivatives import (
+    flatten_jacobian,
+    jacobian_and_hessian_mat_wrt,
+    jacobian_wrt,
+)
+
 
 def _ensure_float(x) -> float:
     if isinstance(x, torch.Tensor): return x.detach().cpu().item()
@@ -21,14 +26,6 @@ def _ensure_numpy(x):
     if isinstance(x, np.ndarray): return x
     return np.array(x)
 
-def matrix_clamp(H: torch.Tensor, reg: float):
-    try:
-        eigvals, eigvecs = torch.linalg.eigh(H) # pylint:disable=not-callable
-        eigvals.clamp_(min=reg)
-        return eigvecs @ torch.diag(eigvals) @ eigvecs.mH
-    except Exception:
-        return H
-
 Closure = Callable[[bool], Any]
 
 class ScipyMinimize(Optimizer):
@@ -76,8 +73,6 @@ class ScipyMinimize(Optimizer):
        options = None,
        jac: Literal['2-point', '3-point', 'cs', 'autograd'] = 'autograd',
        hess: Literal['2-point', '3-point', 'cs', 'autograd'] | scipy.optimize.HessianUpdateStrategy = 'autograd',
-       tikhonov: float | None = 0,
-       min_eigval: float | None = None,
    ):
        defaults = dict(lb=lb, ub=ub)
        super().__init__(params, defaults)
@@ -85,12 +80,10 @@ class ScipyMinimize(Optimizer):
        self.constraints = constraints
        self.tol = tol
        self.callback = callback
-       self.min_eigval = min_eigval
        self.options = options
 
        self.jac = jac
        self.hess = hess
-       self.tikhonov: float | None = tikhonov
 
        self.use_jac_autograd = jac.lower() == 'autograd' and (method is None or method.lower() in [
            'cg', 'bfgs', 'newton-cg', 'l-bfgs-b', 'tnc', 'slsqp', 'dogleg',
@@ -111,9 +104,7 @@ class ScipyMinimize(Optimizer):
        with torch.enable_grad():
            value = closure(False)
            _, H = jacobian_and_hessian_mat_wrt([value], wrt = params)
-       if self.tikhonov is not None: H = tikhonov_(H, self.tikhonov)
-       if self.min_eigval is not None: H = matrix_clamp(H, self.min_eigval)
-       return H.detach().cpu().numpy()
+       return H.numpy(force=True)
 
    def _objective(self, x: np.ndarray, params: TensorList, closure):
        # set params to x
@@ -122,7 +113,10 @@ class ScipyMinimize(Optimizer):
        # return value and maybe gradients
        if self.use_jac_autograd:
            with torch.enable_grad(): value = _ensure_float(closure())
-           return value, params.ensure_grad_().grad.to_vec().detach().cpu().numpy()
+           grad = params.ensure_grad_().grad.to_vec().numpy(force=True)
+           # slsqp requires float64
+           if self.method.lower() == 'slsqp': grad = grad.astype(np.float64)
+           return value, grad
        return _ensure_float(closure(False))
 
    @torch.no_grad
@@ -135,13 +129,15 @@ class ScipyMinimize(Optimizer):
            else: hess = None
        else: hess = self.hess
 
-       x0 = params.to_vec().detach().cpu().numpy()
+       x0 = params.to_vec().numpy(force=True)
 
        # make bounds
        lb, ub = self.group_vals('lb', 'ub', cls=list)
-       bounds = []
-       for p, l, u in zip(params, lb, ub):
-           bounds.extend([(l, u)] * p.numel())
+       bounds = None
+       if any(b is not None for b in lb) or any(b is not None for b in ub):
+           bounds = []
+           for p, l, u in zip(params, lb, ub):
+               bounds.extend([(l, u)] * p.numel())
 
        if self.method is not None and (self.method.lower() == 'tnc' or self.method.lower() == 'slsqp'):
            x0 = x0.astype(np.float64) # those methods error without this
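For reference, the bounds handling above now only builds a bounds list when at least one group actually sets `lb`/`ub`, and it expands each per-group bound into one `(low, high)` pair per scalar element, which is what `scipy.optimize.minimize` expects. A standalone sketch with plain tensors standing in for the parameter groups (names here are illustrative):

```python
import torch

params = [torch.zeros(2, 3), torch.zeros(4)]  # two parameter tensors
lb = [-1.0, None]                             # per-group lower bounds
ub = [1.0, None]                              # per-group upper bounds

bounds = None
if any(b is not None for b in lb) or any(b is not None for b in ub):
    bounds = []
    for p, l, u in zip(params, lb, ub):
        # one (low, high) pair per scalar element of the parameter
        bounds.extend([(l, u)] * p.numel())

print(len(bounds))  # 10 pairs: 6 for the (2, 3) tensor and 4 for the (4,) tensor
```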
@@ -165,7 +161,7 @@ class ScipyMinimize(Optimizer):
 
 
 class ScipyRootOptimization(Optimizer):
-    """Optimization via using scipy.root on gradients, mainly for experimenting!
+    """Optimization via using scipy.optimize.root on gradients, mainly for experimenting!
 
    Args:
        params: iterable of parameters to optimize or dicts defining parameter groups.
@@ -246,6 +242,72 @@ class ScipyRootOptimization(Optimizer):
        return res.fun
 
 
+class ScipyLeastSquaresOptimization(Optimizer):
+    """Optimization via using scipy.optimize.least_squares on gradients, mainly for experimenting!
+
+    Args:
+        params: iterable of parameters to optimize or dicts defining parameter groups.
+        method (str | None, optional): _description_. Defaults to None.
+        tol (float | None, optional): _description_. Defaults to None.
+        callback (_type_, optional): _description_. Defaults to None.
+        options (_type_, optional): _description_. Defaults to None.
+        jac (T.Literal['2, optional): _description_. Defaults to 'autograd'.
+    """
+    def __init__(
+        self,
+        params,
+        method='trf',
+        jac='autograd',
+        bounds=(-np.inf, np.inf),
+        ftol=1e-8, xtol=1e-8, gtol=1e-8, x_scale=1.0, loss='linear',
+        f_scale=1.0, diff_step=None, tr_solver=None, tr_options=None,
+        jac_sparsity=None, max_nfev=None, verbose=0
+    ):
+        super().__init__(params, {})
+        kwargs = locals().copy()
+        del kwargs['self'], kwargs['params'], kwargs['__class__'], kwargs['jac']
+        self._kwargs = kwargs
+
+        self.jac = jac
+
+
+    def _objective(self, x: np.ndarray, params: TensorList, closure):
+        # set params to x
+        params.from_vec_(torch.from_numpy(x).to(device = params[0].device, dtype=params[0].dtype, copy=False))
+
+        # return the gradients
+        with torch.enable_grad(): self.value = closure()
+        jac = params.ensure_grad_().grad.to_vec()
+        return jac.numpy(force=True)
+
+    def _hess(self, x: np.ndarray, params: TensorList, closure):
+        params.from_vec_(torch.from_numpy(x).to(device = params[0].device, dtype=params[0].dtype, copy=False))
+        with torch.enable_grad():
+            value = closure(False)
+            _, H = jacobian_and_hessian_mat_wrt([value], wrt = params)
+        return H.numpy(force=True)
+
+    @torch.no_grad
+    def step(self, closure: Closure): # pylint:disable = signature-differs # pyright:ignore[reportIncompatibleMethodOverride]
+        params = self.get_params()
+
+        x0 = params.to_vec().detach().cpu().numpy()
+
+        if self.jac == 'autograd': jac = partial(self._hess, params = params, closure = closure)
+        else: jac = self.jac
+
+        res = scipy.optimize.least_squares(
+            partial(self._objective, params = params, closure = closure),
+            x0 = x0,
+            jac=jac, # type:ignore
+            **self._kwargs
+        )
+
+        params.from_vec_(torch.from_numpy(res.x).to(device = params[0].device, dtype=params[0].dtype, copy=False))
+        return res.fun
+
+
+
 
 class ScipyDE(Optimizer):
    """Use scipy.minimize.differential_evolution as pytorch optimizer. Note that this performs full minimization on each step,
@@ -508,4 +570,3 @@ class ScipyBrute(Optimizer):
            **self._kwargs
        )
        params.from_vec_(torch.from_numpy(x0).to(device = params[0].device, dtype=params[0].dtype, copy=False))
-       return None
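A hedged usage sketch for the new `ScipyLeastSquaresOptimization` wrapper, assuming the closure contract visible in `_objective`/`_hess` above (the closure returns the loss and populates `.grad` when called with no argument, and only evaluates the loss when called with `False`):

```python
import torch
import torch.nn.functional as F
from torchzero.optim.wrappers.scipy import ScipyLeastSquaresOptimization

model = torch.nn.Linear(4, 2)
X, y = torch.randn(10, 4), torch.randn(10, 2)

opt = ScipyLeastSquaresOptimization(model.parameters(), method="trf")

def closure(backward=True):
    # torchzero-style closure: return the loss, populate .grad when backward=True
    loss = F.mse_loss(model(X), y)
    if backward:
        model.zero_grad()
        loss.backward()
    return loss

opt.step(closure)  # runs scipy.optimize.least_squares with the gradient as the residual vector
```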
torchzero/utils/__init__.py
@@ -1,5 +1,11 @@
 from . import tensorlist as tl
-from .compile import _optional_compiler, benchmark_compile_cpu, benchmark_compile_cuda, set_compilation, enable_compilation
+from .compile import (
+    _optional_compiler,
+    benchmark_compile_cpu,
+    benchmark_compile_cuda,
+    enable_compilation,
+    set_compilation,
+)
 from .numberlist import NumberList
 from .optimizer import (
     Init,
@@ -18,6 +24,36 @@ from .params import (
     _copy_param_groups,
     _make_param_groups,
 )
-from .python_tools import flatten, generic_eq, reduce_dim, unpack_dicts
-from .tensorlist import TensorList, as_tensorlist, Distributions, generic_clamp, generic_numel, generic_vector_norm, generic_zeros_like, generic_randn_like
-from .torch_tools import tofloat, tolist, tonumpy, totensor, vec_to_tensors, vec_to_tensors_, set_storage_
+from .python_tools import (
+    flatten,
+    generic_eq,
+    generic_ne,
+    reduce_dim,
+    safe_dict_update_,
+    unpack_dicts,
+)
+from .tensorlist import (
+    Distributions,
+    Metrics,
+    TensorList,
+    as_tensorlist,
+    generic_clamp,
+    generic_finfo,
+    generic_finfo_eps,
+    generic_finfo_tiny,
+    generic_max,
+    generic_numel,
+    generic_randn_like,
+    generic_sum,
+    generic_vector_norm,
+    generic_zeros_like,
+)
+from .torch_tools import (
+    set_storage_,
+    tofloat,
+    tolist,
+    tonumpy,
+    totensor,
+    vec_to_tensors,
+    vec_to_tensors_,
+)
torchzero/utils/compile.py
@@ -38,7 +38,7 @@ class _MaybeCompiledFunc:
 _optional_compiler = _OptionalCompiler()
 """this holds .enable attribute, set to True to enable compiling for a few functions that benefit from it."""
 
-def set_compilation(enable: bool):
+def set_compilation(enable: bool=True):
     """`enable` is False by default. When True, certain functions will be compiled, which may not work on some systems like Windows, but it usually improves performance."""
     _optional_compiler.enable = enable
 
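With the new default argument, opting in to the compiled code paths no longer needs an explicit `True`; `set_compilation` is re-exported from `torchzero.utils` (see the `__init__.py` hunk above). A short usage note:

```python
from torchzero.utils import set_compilation

set_compilation()       # now equivalent to set_compilation(True)
set_compilation(False)  # restore the default, uncompiled behaviour
```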
torchzero/utils/derivatives.py
@@ -2,7 +2,6 @@ from collections.abc import Iterable, Sequence
 
 import torch
 import torch.autograd.forward_ad as fwAD
-from typing import Literal
 
 from .torch_tools import swap_tensors_no_use_count_check, vec_to_tensors
 
@@ -35,10 +34,27 @@ def _jacobian_batched(output: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor
         is_grads_batched=True,
     )
 
+def flatten_jacobian(jacs: Sequence[torch.Tensor]) -> torch.Tensor:
+    """Converts the output of jacobian_wrt (a list of tensors) into a single 2D matrix.
+
+    Args:
+        jacs (Sequence[torch.Tensor]):
+            output from jacobian_wrt where ach tensor has the shape `(*output.shape, *wrt[i].shape)`.
+
+    Returns:
+        torch.Tensor: has the shape `(output.ndim, wrt.ndim)`.
+    """
+    if not jacs:
+        return torch.empty(0, 0)
+
+    n_out = jacs[0].shape[0]
+    return torch.cat([j.reshape(n_out, -1) for j in jacs], dim=1)
+
+
 def jacobian_wrt(output: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False, batched=True) -> Sequence[torch.Tensor]:
     """Calculate jacobian of a sequence of tensors w.r.t another sequence of tensors.
     Returns a sequence of tensors with the length as `wrt`.
-    Each tensor will have the shape `(*input.shape, *wrt[i].shape)`.
+    Each tensor will have the shape `(*output.shape, *wrt[i].shape)`.
 
     Args:
         input (Sequence[torch.Tensor]): input sequence of tensors.
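A quick shape check of the new helper; the function body is copied verbatim from the hunk above so the snippet runs on its own:

```python
from collections.abc import Sequence
import torch

def flatten_jacobian(jacs: Sequence[torch.Tensor]) -> torch.Tensor:
    # identical to the helper added to torchzero/utils/derivatives.py
    if not jacs:
        return torch.empty(0, 0)
    n_out = jacs[0].shape[0]
    return torch.cat([j.reshape(n_out, -1) for j in jacs], dim=1)

# per-parameter jacobians of a 5-element output w.r.t. a (2, 3) weight and a (4,) bias
jacs = [torch.randn(5, 2, 3), torch.randn(5, 4)]
print(flatten_jacobian(jacs).shape)  # torch.Size([5, 10])
```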
@@ -75,10 +91,10 @@ def jacobian_and_hessian_wrt(output: Sequence[torch.Tensor], wrt: Sequence[torch
     return jac, jacobian_wrt(jac, wrt, batched = batched, create_graph=create_graph)
 
 
-def hessian_list_to_mat(hessians: Sequence[torch.Tensor]):
-    """takes output of `hessian` and returns the 2D hessian matrix.
-    Note - I only tested this for cases where input is a scalar."""
-    return torch.cat([h.reshape(h.size(0), h[1].numel()) for h in hessians], 1)
+# def hessian_list_to_mat(hessians: Sequence[torch.Tensor]):
+#     """takes output of `hessian` and returns the 2D hessian matrix.
+#     Note - I only tested this for cases where input is a scalar."""
+#     return torch.cat([h.reshape(h.size(0), h[1].numel()) for h in hessians], 1)
 
 def jacobian_and_hessian_mat_wrt(output: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False, batched=True):
     """Calculate jacobian and hessian of a sequence of tensors w.r.t another sequence of tensors.
@@ -98,7 +114,7 @@ def jacobian_and_hessian_mat_wrt(output: Sequence[torch.Tensor], wrt: Sequence[t
     """
     jac = jacobian_wrt(output, wrt, create_graph=True, batched = batched)
     H_list = jacobian_wrt(jac, wrt, batched = batched, create_graph=create_graph)
-    return torch.cat([j.view(-1) for j in jac]), hessian_list_to_mat(H_list)
+    return flatten_jacobian(jac), flatten_jacobian(H_list)
 
 def hessian(
     fn,
@@ -115,19 +131,18 @@ def hessian(
     `vectorize` and `outer_jacobian_strategy` are only for `method = "torch.autograd"`, refer to its documentation.
 
     Example:
-    .. code:: py
-
-        model = nn.Linear(4, 2) # (2, 4) weight and (2, ) bias
-        X = torch.randn(10, 4)
-        y = torch.randn(10, 2)
+    ```python
+    model = nn.Linear(4, 2) # (2, 4) weight and (2, ) bias
+    X = torch.randn(10, 4)
+    y = torch.randn(10, 2)
 
-        def fn():
-            y_hat = model(X)
-            loss = F.mse_loss(y_hat, y)
-            return loss
-
-        hessian_mat(fn, model.parameters()) # list of two lists of two lists of 3D and 4D tensors
+    def fn():
+        y_hat = model(X)
+        loss = F.mse_loss(y_hat, y)
+        return loss
 
+    hessian_mat(fn, model.parameters()) # list of two lists of two lists of 3D and 4D tensors
+    ```
 
     """
     params = list(params)
@@ -158,26 +173,25 @@ def hessian_mat(
     method="func",
     vectorize=False,
     outer_jacobian_strategy="reverse-mode",
-    ):
+    ) -> torch.Tensor:
     """
     returns hessian matrix for parameters (as if they were flattened and concatenated into a vector).
 
     `vectorize` and `outer_jacobian_strategy` are only for `method = "torch.autograd"`, refer to its documentation.
 
     Example:
-    .. code:: py
-
-        model = nn.Linear(4, 2) # 10 parameters in total
-        X = torch.randn(10, 4)
-        y = torch.randn(10, 2)
+    ```python
+    model = nn.Linear(4, 2) # 10 parameters in total
+    X = torch.randn(10, 4)
+    y = torch.randn(10, 2)
 
-        def fn():
-            y_hat = model(X)
-            loss = F.mse_loss(y_hat, y)
-            return loss
-
-        hessian_mat(fn, model.parameters()) # 10x10 tensor
+    def fn():
+        y_hat = model(X)
+        loss = F.mse_loss(y_hat, y)
+        return loss
 
+    hessian_mat(fn, model.parameters()) # 10x10 tensor
+    ```
 
     """
     params = list(params)
@@ -190,7 +204,7 @@ def hessian_mat(
         return loss
 
     if method == 'func':
-        return torch.func.hessian(func)(torch.cat([p.view(-1) for p in params]).detach().requires_grad_(create_graph))
+        return torch.func.hessian(func)(torch.cat([p.view(-1) for p in params]).detach().requires_grad_(create_graph)) # pyright:ignore[reportReturnType]
 
     if method == 'autograd.functional':
         return torch.autograd.functional.hessian(
@@ -199,28 +213,27 @@
             create_graph=create_graph,
             vectorize=vectorize,
             outer_jacobian_strategy=outer_jacobian_strategy,
-        )
+        ) # pyright:ignore[reportReturnType]
     raise ValueError(method)
 
 def jvp(fn, params: Iterable[torch.Tensor], tangent: Iterable[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
     """Jacobian vector product.
 
     Example:
-    .. code:: py
-
-        model = nn.Linear(4, 2)
-        X = torch.randn(10, 4)
-        y = torch.randn(10, 2)
+    ```python
+    model = nn.Linear(4, 2)
+    X = torch.randn(10, 4)
+    y = torch.randn(10, 2)
 
-        tangent = [torch.randn_like(p) for p in model.parameters()]
+    tangent = [torch.randn_like(p) for p in model.parameters()]
 
-        def fn():
-            y_hat = model(X)
-            loss = F.mse_loss(y_hat, y)
-            return loss
-
-        jvp(fn, model.parameters(), tangent) # scalar
+    def fn():
+        y_hat = model(X)
+        loss = F.mse_loss(y_hat, y)
+        return loss
 
+    jvp(fn, model.parameters(), tangent) # scalar
+    ```
     """
     params = list(params)
     tangent = list(tangent)
@@ -253,21 +266,20 @@ def jvp_fd_central(
     """Jacobian vector product using central finite difference formula.
 
     Example:
-    .. code:: py
-
-        model = nn.Linear(4, 2)
-        X = torch.randn(10, 4)
-        y = torch.randn(10, 2)
+    ```python
+    model = nn.Linear(4, 2)
+    X = torch.randn(10, 4)
+    y = torch.randn(10, 2)
 
-        tangent = [torch.randn_like(p) for p in model.parameters()]
+    tangent = [torch.randn_like(p) for p in model.parameters()]
 
-        def fn():
-            y_hat = model(X)
-            loss = F.mse_loss(y_hat, y)
-            return loss
-
-        jvp_fd_central(fn, model.parameters(), tangent) # scalar
+    def fn():
+        y_hat = model(X)
+        loss = F.mse_loss(y_hat, y)
+        return loss
 
+    jvp_fd_central(fn, model.parameters(), tangent) # scalar
+    ```
     """
     params = list(params)
     tangent = list(tangent)
@@ -304,24 +316,24 @@ def jvp_fd_forward(
     Loss at initial point can be specified in the `v_0` argument.
 
     Example:
-    .. code:: py
-
-        model = nn.Linear(4, 2)
-        X = torch.randn(10, 4)
-        y = torch.randn(10, 2)
+    ```python
+    model = nn.Linear(4, 2)
+    X = torch.randn(10, 4)
+    y = torch.randn(10, 2)
 
-        tangent1 = [torch.randn_like(p) for p in model.parameters()]
-        tangent2 = [torch.randn_like(p) for p in model.parameters()]
+    tangent1 = [torch.randn_like(p) for p in model.parameters()]
+    tangent2 = [torch.randn_like(p) for p in model.parameters()]
 
-        def fn():
-            y_hat = model(X)
-            loss = F.mse_loss(y_hat, y)
-            return loss
+    def fn():
+        y_hat = model(X)
+        loss = F.mse_loss(y_hat, y)
+        return loss
 
-        v_0 = fn() # pre-calculate loss at initial point
+    v_0 = fn() # pre-calculate loss at initial point
 
-        jvp1 = jvp_fd_forward(fn, model.parameters(), tangent1, v_0=v_0) # scalar
-        jvp2 = jvp_fd_forward(fn, model.parameters(), tangent2, v_0=v_0) # scalar
+    jvp1 = jvp_fd_forward(fn, model.parameters(), tangent1, v_0=v_0) # scalar
+    jvp2 = jvp_fd_forward(fn, model.parameters(), tangent2, v_0=v_0) # scalar
+    ```
 
     """
     params = list(params)
@@ -356,21 +368,21 @@ def hvp(
     """Hessian-vector product
 
     Example:
-    .. code:: py
+    ```python
+    model = nn.Linear(4, 2)
+    X = torch.randn(10, 4)
+    y = torch.randn(10, 2)
 
-        model = nn.Linear(4, 2)
-        X = torch.randn(10, 4)
-        y = torch.randn(10, 2)
+    y_hat = model(X)
+    loss = F.mse_loss(y_hat, y)
+    loss.backward(create_graph=True)
 
-        y_hat = model(X)
-        loss = F.mse_loss(y_hat, y)
-        loss.backward(create_graph=True)
-
-        grads = [p.grad for p in model.parameters()]
-        vec = [torch.randn_like(p) for p in model.parameters()]
+    grads = [p.grad for p in model.parameters()]
+    vec = [torch.randn_like(p) for p in model.parameters()]
 
-        # list of tensors, same layout as model.parameters()
-        hvp(model.parameters(), grads, vec=vec)
+    # list of tensors, same layout as model.parameters()
+    hvp(model.parameters(), grads, vec=vec)
+    ```
     """
     params = list(params)
     g = list(grads)
@@ -393,23 +405,23 @@ def hvp_fd_central(
     Please note that this will clear :code:`grad` attributes in params.
 
     Example:
-    .. code:: py
-
-        model = nn.Linear(4, 2)
-        X = torch.randn(10, 4)
-        y = torch.randn(10, 2)
+    ```python
+    model = nn.Linear(4, 2)
+    X = torch.randn(10, 4)
+    y = torch.randn(10, 2)
 
-        def closure():
-            y_hat = model(X)
-            loss = F.mse_loss(y_hat, y)
-            model.zero_grad()
-            loss.backward()
-            return loss
+    def closure():
+        y_hat = model(X)
+        loss = F.mse_loss(y_hat, y)
+        model.zero_grad()
+        loss.backward()
+        return loss
 
-        vec = [torch.randn_like(p) for p in model.parameters()]
+    vec = [torch.randn_like(p) for p in model.parameters()]
 
-        # list of tensors, same layout as model.parameters()
-        hvp_fd_central(closure, model.parameters(), vec=vec)
+    # list of tensors, same layout as model.parameters()
+    hvp_fd_central(closure, model.parameters(), vec=vec)
+    ```
     """
     params = list(params)
     vec = list(vec)
@@ -456,27 +468,27 @@ def hvp_fd_forward(
     Please note that this will clear :code:`grad` attributes in params.
 
     Example:
-    .. code:: py
+    ```python
+    model = nn.Linear(4, 2)
+    X = torch.randn(10, 4)
+    y = torch.randn(10, 2)
 
-        model = nn.Linear(4, 2)
-        X = torch.randn(10, 4)
-        y = torch.randn(10, 2)
-
-        def closure():
-            y_hat = model(X)
-            loss = F.mse_loss(y_hat, y)
-            model.zero_grad()
-            loss.backward()
-            return loss
+    def closure():
+        y_hat = model(X)
+        loss = F.mse_loss(y_hat, y)
+        model.zero_grad()
+        loss.backward()
+        return loss
 
-        vec = [torch.randn_like(p) for p in model.parameters()]
+    vec = [torch.randn_like(p) for p in model.parameters()]
 
-        # pre-compute gradient at initial point
-        closure()
-        g_0 = [p.grad for p in model.parameters()]
+    # pre-compute gradient at initial point
+    closure()
+    g_0 = [p.grad for p in model.parameters()]
 
-        # list of tensors, same layout as model.parameters()
-        hvp_fd_forward(closure, model.parameters(), vec=vec, g_0=g_0)
+    # list of tensors, same layout as model.parameters()
+    hvp_fd_forward(closure, model.parameters(), vec=vec, g_0=g_0)
+    ```
     """
 
     params = list(params)
@@ -485,7 +497,7 @@ def hvp_fd_forward(
 
     vec_norm = None
     if normalize:
-        vec_norm = torch.linalg.vector_norm(torch.cat([t.view(-1) for t in vec])) # pylint:disable=not-callable
+        vec_norm = torch.linalg.vector_norm(torch.cat([t.ravel() for t in vec])) # pylint:disable=not-callable
         if vec_norm == 0: return None, [torch.zeros_like(p) for p in params]
         vec = torch._foreach_div(vec, vec_norm)
 
torchzero/utils/linalg/__init__.py
@@ -1,5 +1,12 @@
-from .matrix_funcs import inv_sqrt_2x2, eigvals_func, singular_vals_func, matrix_power_eigh, x_inv
+from . import linear_operator
+from .matrix_funcs import (
+    eigvals_func,
+    inv_sqrt_2x2,
+    matrix_power_eigh,
+    singular_vals_func,
+    x_inv,
+)
 from .orthogonalize import gram_schmidt
 from .qr import qr_householder
+from .solve import cg, nystrom_approximation, nystrom_sketch_and_solve
 from .svd import randomized_svd
-from .solve import cg, nystrom_approximation, nystrom_sketch_and_solve