torchzero 0.1.8__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200)
  1. docs/source/conf.py +57 -0
  2. tests/test_identical.py +230 -0
  3. tests/test_module.py +50 -0
  4. tests/test_opts.py +884 -0
  5. tests/test_tensorlist.py +1787 -0
  6. tests/test_utils_optimizer.py +170 -0
  7. tests/test_vars.py +184 -0
  8. torchzero/__init__.py +4 -4
  9. torchzero/core/__init__.py +3 -13
  10. torchzero/core/module.py +629 -510
  11. torchzero/core/preconditioner.py +137 -0
  12. torchzero/core/transform.py +252 -0
  13. torchzero/modules/__init__.py +13 -21
  14. torchzero/modules/clipping/__init__.py +3 -0
  15. torchzero/modules/clipping/clipping.py +320 -0
  16. torchzero/modules/clipping/ema_clipping.py +135 -0
  17. torchzero/modules/clipping/growth_clipping.py +187 -0
  18. torchzero/modules/experimental/__init__.py +13 -18
  19. torchzero/modules/experimental/absoap.py +350 -0
  20. torchzero/modules/experimental/adadam.py +111 -0
  21. torchzero/modules/experimental/adamY.py +135 -0
  22. torchzero/modules/experimental/adasoap.py +282 -0
  23. torchzero/modules/experimental/algebraic_newton.py +145 -0
  24. torchzero/modules/experimental/curveball.py +89 -0
  25. torchzero/modules/experimental/dsoap.py +290 -0
  26. torchzero/modules/experimental/gradmin.py +85 -0
  27. torchzero/modules/experimental/reduce_outward_lr.py +35 -0
  28. torchzero/modules/experimental/spectral.py +286 -0
  29. torchzero/modules/experimental/subspace_preconditioners.py +128 -0
  30. torchzero/modules/experimental/tropical_newton.py +136 -0
  31. torchzero/modules/functional.py +209 -0
  32. torchzero/modules/grad_approximation/__init__.py +4 -0
  33. torchzero/modules/grad_approximation/fdm.py +120 -0
  34. torchzero/modules/grad_approximation/forward_gradient.py +81 -0
  35. torchzero/modules/grad_approximation/grad_approximator.py +66 -0
  36. torchzero/modules/grad_approximation/rfdm.py +259 -0
  37. torchzero/modules/line_search/__init__.py +5 -30
  38. torchzero/modules/line_search/backtracking.py +186 -0
  39. torchzero/modules/line_search/line_search.py +181 -0
  40. torchzero/modules/line_search/scipy.py +37 -0
  41. torchzero/modules/line_search/strong_wolfe.py +260 -0
  42. torchzero/modules/line_search/trust_region.py +61 -0
  43. torchzero/modules/lr/__init__.py +2 -0
  44. torchzero/modules/lr/lr.py +59 -0
  45. torchzero/modules/lr/step_size.py +97 -0
  46. torchzero/modules/momentum/__init__.py +14 -4
  47. torchzero/modules/momentum/averaging.py +78 -0
  48. torchzero/modules/momentum/cautious.py +181 -0
  49. torchzero/modules/momentum/ema.py +173 -0
  50. torchzero/modules/momentum/experimental.py +189 -0
  51. torchzero/modules/momentum/matrix_momentum.py +124 -0
  52. torchzero/modules/momentum/momentum.py +43 -106
  53. torchzero/modules/ops/__init__.py +103 -0
  54. torchzero/modules/ops/accumulate.py +65 -0
  55. torchzero/modules/ops/binary.py +240 -0
  56. torchzero/modules/ops/debug.py +25 -0
  57. torchzero/modules/ops/misc.py +419 -0
  58. torchzero/modules/ops/multi.py +137 -0
  59. torchzero/modules/ops/reduce.py +149 -0
  60. torchzero/modules/ops/split.py +75 -0
  61. torchzero/modules/ops/switch.py +68 -0
  62. torchzero/modules/ops/unary.py +115 -0
  63. torchzero/modules/ops/utility.py +112 -0
  64. torchzero/modules/optimizers/__init__.py +18 -10
  65. torchzero/modules/optimizers/adagrad.py +146 -49
  66. torchzero/modules/optimizers/adam.py +112 -118
  67. torchzero/modules/optimizers/lion.py +18 -11
  68. torchzero/modules/optimizers/muon.py +222 -0
  69. torchzero/modules/optimizers/orthograd.py +55 -0
  70. torchzero/modules/optimizers/rmsprop.py +103 -51
  71. torchzero/modules/optimizers/rprop.py +342 -99
  72. torchzero/modules/optimizers/shampoo.py +197 -0
  73. torchzero/modules/optimizers/soap.py +286 -0
  74. torchzero/modules/optimizers/sophia_h.py +129 -0
  75. torchzero/modules/projections/__init__.py +5 -0
  76. torchzero/modules/projections/dct.py +73 -0
  77. torchzero/modules/projections/fft.py +73 -0
  78. torchzero/modules/projections/galore.py +10 -0
  79. torchzero/modules/projections/projection.py +218 -0
  80. torchzero/modules/projections/structural.py +151 -0
  81. torchzero/modules/quasi_newton/__init__.py +7 -4
  82. torchzero/modules/quasi_newton/cg.py +218 -0
  83. torchzero/modules/quasi_newton/experimental/__init__.py +1 -0
  84. torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +265 -0
  85. torchzero/modules/quasi_newton/lbfgs.py +228 -0
  86. torchzero/modules/quasi_newton/lsr1.py +170 -0
  87. torchzero/modules/quasi_newton/olbfgs.py +196 -0
  88. torchzero/modules/quasi_newton/quasi_newton.py +475 -0
  89. torchzero/modules/second_order/__init__.py +3 -4
  90. torchzero/modules/second_order/newton.py +142 -165
  91. torchzero/modules/second_order/newton_cg.py +84 -0
  92. torchzero/modules/second_order/nystrom.py +168 -0
  93. torchzero/modules/smoothing/__init__.py +2 -5
  94. torchzero/modules/smoothing/gaussian.py +164 -0
  95. torchzero/modules/smoothing/{laplacian_smoothing.py → laplacian.py} +115 -128
  96. torchzero/modules/weight_decay/__init__.py +1 -0
  97. torchzero/modules/weight_decay/weight_decay.py +52 -0
  98. torchzero/modules/wrappers/__init__.py +1 -0
  99. torchzero/modules/wrappers/optim_wrapper.py +91 -0
  100. torchzero/optim/__init__.py +2 -10
  101. torchzero/optim/utility/__init__.py +1 -0
  102. torchzero/optim/utility/split.py +45 -0
  103. torchzero/optim/wrappers/nevergrad.py +2 -28
  104. torchzero/optim/wrappers/nlopt.py +31 -16
  105. torchzero/optim/wrappers/scipy.py +79 -156
  106. torchzero/utils/__init__.py +27 -0
  107. torchzero/utils/compile.py +175 -37
  108. torchzero/utils/derivatives.py +513 -99
  109. torchzero/utils/linalg/__init__.py +5 -0
  110. torchzero/utils/linalg/matrix_funcs.py +87 -0
  111. torchzero/utils/linalg/orthogonalize.py +11 -0
  112. torchzero/utils/linalg/qr.py +71 -0
  113. torchzero/utils/linalg/solve.py +168 -0
  114. torchzero/utils/linalg/svd.py +20 -0
  115. torchzero/utils/numberlist.py +132 -0
  116. torchzero/utils/ops.py +10 -0
  117. torchzero/utils/optimizer.py +284 -0
  118. torchzero/utils/optuna_tools.py +40 -0
  119. torchzero/utils/params.py +149 -0
  120. torchzero/utils/python_tools.py +40 -25
  121. torchzero/utils/tensorlist.py +1081 -0
  122. torchzero/utils/torch_tools.py +48 -12
  123. torchzero-0.3.1.dist-info/METADATA +379 -0
  124. torchzero-0.3.1.dist-info/RECORD +128 -0
  125. {torchzero-0.1.8.dist-info → torchzero-0.3.1.dist-info}/WHEEL +1 -1
  126. {torchzero-0.1.8.dist-info → torchzero-0.3.1.dist-info/licenses}/LICENSE +0 -0
  127. torchzero-0.3.1.dist-info/top_level.txt +3 -0
  128. torchzero/core/tensorlist_optimizer.py +0 -219
  129. torchzero/modules/adaptive/__init__.py +0 -4
  130. torchzero/modules/adaptive/adaptive.py +0 -192
  131. torchzero/modules/experimental/experimental.py +0 -294
  132. torchzero/modules/experimental/quad_interp.py +0 -104
  133. torchzero/modules/experimental/subspace.py +0 -259
  134. torchzero/modules/gradient_approximation/__init__.py +0 -7
  135. torchzero/modules/gradient_approximation/_fd_formulas.py +0 -3
  136. torchzero/modules/gradient_approximation/base_approximator.py +0 -105
  137. torchzero/modules/gradient_approximation/fdm.py +0 -125
  138. torchzero/modules/gradient_approximation/forward_gradient.py +0 -163
  139. torchzero/modules/gradient_approximation/newton_fdm.py +0 -198
  140. torchzero/modules/gradient_approximation/rfdm.py +0 -125
  141. torchzero/modules/line_search/armijo.py +0 -56
  142. torchzero/modules/line_search/base_ls.py +0 -139
  143. torchzero/modules/line_search/directional_newton.py +0 -217
  144. torchzero/modules/line_search/grid_ls.py +0 -158
  145. torchzero/modules/line_search/scipy_minimize_scalar.py +0 -62
  146. torchzero/modules/meta/__init__.py +0 -12
  147. torchzero/modules/meta/alternate.py +0 -65
  148. torchzero/modules/meta/grafting.py +0 -195
  149. torchzero/modules/meta/optimizer_wrapper.py +0 -173
  150. torchzero/modules/meta/return_overrides.py +0 -46
  151. torchzero/modules/misc/__init__.py +0 -10
  152. torchzero/modules/misc/accumulate.py +0 -43
  153. torchzero/modules/misc/basic.py +0 -115
  154. torchzero/modules/misc/lr.py +0 -96
  155. torchzero/modules/misc/multistep.py +0 -51
  156. torchzero/modules/misc/on_increase.py +0 -53
  157. torchzero/modules/operations/__init__.py +0 -29
  158. torchzero/modules/operations/multi.py +0 -298
  159. torchzero/modules/operations/reduction.py +0 -134
  160. torchzero/modules/operations/singular.py +0 -113
  161. torchzero/modules/optimizers/sgd.py +0 -54
  162. torchzero/modules/orthogonalization/__init__.py +0 -2
  163. torchzero/modules/orthogonalization/newtonschulz.py +0 -159
  164. torchzero/modules/orthogonalization/svd.py +0 -86
  165. torchzero/modules/regularization/__init__.py +0 -22
  166. torchzero/modules/regularization/dropout.py +0 -34
  167. torchzero/modules/regularization/noise.py +0 -77
  168. torchzero/modules/regularization/normalization.py +0 -328
  169. torchzero/modules/regularization/ortho_grad.py +0 -78
  170. torchzero/modules/regularization/weight_decay.py +0 -92
  171. torchzero/modules/scheduling/__init__.py +0 -2
  172. torchzero/modules/scheduling/lr_schedulers.py +0 -131
  173. torchzero/modules/scheduling/step_size.py +0 -80
  174. torchzero/modules/smoothing/gaussian_smoothing.py +0 -90
  175. torchzero/modules/weight_averaging/__init__.py +0 -2
  176. torchzero/modules/weight_averaging/ema.py +0 -72
  177. torchzero/modules/weight_averaging/swa.py +0 -171
  178. torchzero/optim/experimental/__init__.py +0 -20
  179. torchzero/optim/experimental/experimental.py +0 -343
  180. torchzero/optim/experimental/ray_search.py +0 -83
  181. torchzero/optim/first_order/__init__.py +0 -18
  182. torchzero/optim/first_order/cautious.py +0 -158
  183. torchzero/optim/first_order/forward_gradient.py +0 -70
  184. torchzero/optim/first_order/optimizers.py +0 -570
  185. torchzero/optim/modular.py +0 -148
  186. torchzero/optim/quasi_newton/__init__.py +0 -1
  187. torchzero/optim/quasi_newton/directional_newton.py +0 -58
  188. torchzero/optim/second_order/__init__.py +0 -1
  189. torchzero/optim/second_order/newton.py +0 -94
  190. torchzero/optim/zeroth_order/__init__.py +0 -4
  191. torchzero/optim/zeroth_order/fdm.py +0 -87
  192. torchzero/optim/zeroth_order/newton_fdm.py +0 -146
  193. torchzero/optim/zeroth_order/rfdm.py +0 -217
  194. torchzero/optim/zeroth_order/rs.py +0 -85
  195. torchzero/random/__init__.py +0 -1
  196. torchzero/random/random.py +0 -46
  197. torchzero/tensorlist.py +0 -826
  198. torchzero-0.1.8.dist-info/METADATA +0 -130
  199. torchzero-0.1.8.dist-info/RECORD +0 -104
  200. torchzero-0.1.8.dist-info/top_level.txt +0 -1
torchzero/utils/derivatives.py
@@ -1,99 +1,513 @@
- from collections.abc import Sequence, Iterable
-
- import torch
-
- def _jacobian(input: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False):
-     flat_input = torch.cat([i.reshape(-1) for i in input])
-     grad_ouputs = torch.eye(len(flat_input), device=input[0].device, dtype=input[0].dtype)
-     jac = []
-     for i in range(flat_input.numel()):
-         jac.append(torch.autograd.grad(
-             flat_input,
-             wrt,
-             grad_ouputs[i],
-             retain_graph=True,
-             create_graph=create_graph,
-             allow_unused=True,
-             is_grads_batched=False,
-         ))
-     return [torch.stack(z) for z in zip(*jac)]
-
-
-
- def _jacobian_batched(input: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False):
-     flat_input = torch.cat([i.reshape(-1) for i in input])
-     return torch.autograd.grad(
-         flat_input,
-         wrt,
-         torch.eye(len(flat_input), device=input[0].device, dtype=input[0].dtype),
-         retain_graph=True,
-         create_graph=create_graph,
-         allow_unused=True,
-         is_grads_batched=True,
-     )
-
- def jacobian(input: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False, batched=True) -> Sequence[torch.Tensor]:
-     """Calculate jacobian of a sequence of tensors w.r.t another sequence of tensors.
-     Returns a sequence of tensors with the length as `wrt`.
-     Each tensor will have the shape `(*input.shape, *wrt[i].shape)`.
-
-     Args:
-         input (Sequence[torch.Tensor]): input sequence of tensors.
-         wrt (Sequence[torch.Tensor]): sequence of tensors to differentiate w.r.t.
-         create_graph (bool, optional):
-             pytorch option, if True, graph of the derivative will be constructed,
-             allowing to compute higher order derivative products. Default: False.
-         batched (bool, optional): use faster but experimental pytorch batched jacobian
-             This only has effect when `input` has more than 1 element. Defaults to True.
-
-     Returns:
-         sequence of tensors with the length as `wrt`.
-     """
-     if batched: return _jacobian_batched(input, wrt, create_graph)
-     return _jacobian(input, wrt, create_graph)
-
- def hessian(input: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False, batched=True):
-     """Calculate hessian of a sequence of tensors w.r.t another sequence of tensors.
-     Returns a sequence of tensors with the length as `wrt`.
-     If you need a hessian matrix out of that sequence, pass it to `hessian_list_to_mat`.
-
-     Args:
-         input (Sequence[torch.Tensor]): input sequence of tensors.
-         wrt (Sequence[torch.Tensor]): sequence of tensors to differentiate w.r.t.
-         create_graph (bool, optional):
-             pytorch option, if True, graph of the derivative will be constructed,
-             allowing to compute higher order derivative products. Default: False.
-         batched (bool, optional): use faster but experimental pytorch batched grad. Defaults to True.
-
-     Returns:
-         sequence of tensors with the length as `wrt`.
-     """
-     return jacobian(jacobian(input, wrt, create_graph=True, batched=batched), wrt, create_graph=create_graph, batched=batched)
-
- def jacobian_and_hessian(input: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False, batched=True):
-     """Calculate jacobian and hessian of a sequence of tensors w.r.t another sequence of tensors.
-     Calculating hessian requires calculating the jacobian. So this function is more efficient than
-     calling `jacobian` and `hessian` separately, which would calculate jacobian twice.
-
-     Args:
-         input (Sequence[torch.Tensor]): input sequence of tensors.
-         wrt (Sequence[torch.Tensor]): sequence of tensors to differentiate w.r.t.
-         create_graph (bool, optional):
-             pytorch option, if True, graph of the derivative will be constructed,
-             allowing to compute higher order derivative products. Default: False.
-         batched (bool, optional): use faster but experimental pytorch batched grad. Defaults to True.
-
-     Returns:
-         tuple with jacobians sequence and hessians sequence.
-     """
-     jac = jacobian(input, wrt, create_graph=True, batched = batched)
-     return jac, jacobian(jac, wrt, batched = batched, create_graph=create_graph)
-
- def jacobian_list_to_vec(jacobians: Iterable[torch.Tensor]):
-     """flattens and concatenates a sequence of tensors."""
-     return torch.cat([i.ravel() for i in jacobians], 0)
-
- def hessian_list_to_mat(hessians: Sequence[torch.Tensor]):
-     """takes output of `hessian` and returns the 2D hessian matrix.
-     Note - I only tested this for cases where input is a scalar."""
-     return torch.cat([h.reshape(h.size(0), h[1].numel()) for h in hessians], 1)
+ from collections.abc import Iterable, Sequence
+
+ import torch
+ import torch.autograd.forward_ad as fwAD
+
+ from .torch_tools import swap_tensors_no_use_count_check, vec_to_tensors
+
+ def _jacobian(output: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False):
+     flat_input = torch.cat([i.reshape(-1) for i in output])
+     grad_ouputs = torch.eye(len(flat_input), device=output[0].device, dtype=output[0].dtype)
+     jac = []
+     for i in range(flat_input.numel()):
+         jac.append(torch.autograd.grad(
+             flat_input,
+             wrt,
+             grad_ouputs[i],
+             retain_graph=True,
+             create_graph=create_graph,
+             allow_unused=True,
+             is_grads_batched=False,
+         ))
+     return [torch.stack(z) for z in zip(*jac)]
+
+
+ def _jacobian_batched(output: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False):
+     flat_input = torch.cat([i.reshape(-1) for i in output])
+     return torch.autograd.grad(
+         flat_input,
+         wrt,
+         torch.eye(len(flat_input), device=output[0].device, dtype=output[0].dtype),
+         retain_graph=True,
+         create_graph=create_graph,
+         allow_unused=True,
+         is_grads_batched=True,
+     )
+
+ def jacobian_wrt(output: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False, batched=True) -> Sequence[torch.Tensor]:
+     """Calculate jacobian of a sequence of tensors w.r.t another sequence of tensors.
+     Returns a sequence of tensors with the length as `wrt`.
+     Each tensor will have the shape `(*input.shape, *wrt[i].shape)`.
+
+     Args:
+         input (Sequence[torch.Tensor]): input sequence of tensors.
+         wrt (Sequence[torch.Tensor]): sequence of tensors to differentiate w.r.t.
+         create_graph (bool, optional):
+             pytorch option, if True, graph of the derivative will be constructed,
+             allowing to compute higher order derivative products. Default: False.
+         batched (bool, optional): use faster but experimental pytorch batched jacobian
+             This only has effect when `input` has more than 1 element. Defaults to True.
+
+     Returns:
+         sequence of tensors with the length as `wrt`.
+     """
+     if batched: return _jacobian_batched(output, wrt, create_graph)
+     return _jacobian(output, wrt, create_graph)
+
+ def jacobian_and_hessian_wrt(output: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False, batched=True):
+     """Calculate jacobian and hessian of a sequence of tensors w.r.t another sequence of tensors.
+     Calculating hessian requires calculating the jacobian. So this function is more efficient than
+     calling `jacobian` and `hessian` separately, which would calculate jacobian twice.
+
+     Args:
+         input (Sequence[torch.Tensor]): input sequence of tensors.
+         wrt (Sequence[torch.Tensor]): sequence of tensors to differentiate w.r.t.
+         create_graph (bool, optional):
+             pytorch option, if True, graph of the derivative will be constructed,
+             allowing to compute higher order derivative products. Default: False.
+         batched (bool, optional): use faster but experimental pytorch batched grad. Defaults to True.
+
+     Returns:
+         tuple with jacobians sequence and hessians sequence.
+     """
+     jac = jacobian_wrt(output, wrt, create_graph=True, batched = batched)
+     return jac, jacobian_wrt(jac, wrt, batched = batched, create_graph=create_graph)
+
+
+ def hessian_list_to_mat(hessians: Sequence[torch.Tensor]):
+     """takes output of `hessian` and returns the 2D hessian matrix.
+     Note - I only tested this for cases where input is a scalar."""
+     return torch.cat([h.reshape(h.size(0), h[1].numel()) for h in hessians], 1)
+
+ def jacobian_and_hessian_mat_wrt(output: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False, batched=True):
+     """Calculate jacobian and hessian of a sequence of tensors w.r.t another sequence of tensors.
+     Calculating hessian requires calculating the jacobian. So this function is more efficient than
+     calling `jacobian` and `hessian` separately, which would calculate jacobian twice.
+
+     Args:
+         input (Sequence[torch.Tensor]): input sequence of tensors.
+         wrt (Sequence[torch.Tensor]): sequence of tensors to differentiate w.r.t.
+         create_graph (bool, optional):
+             pytorch option, if True, graph of the derivative will be constructed,
+             allowing to compute higher order derivative products. Default: False.
+         batched (bool, optional): use faster but experimental pytorch batched grad. Defaults to True.
+
+     Returns:
+         tuple with jacobians sequence and hessians sequence.
+     """
+     jac = jacobian_wrt(output, wrt, create_graph=True, batched = batched)
+     H_list = jacobian_wrt(jac, wrt, batched = batched, create_graph=create_graph)
+     return torch.cat([j.view(-1) for j in jac]), hessian_list_to_mat(H_list)
+
+ def hessian(
+     fn,
+     params: Iterable[torch.Tensor],
+     create_graph=False,
+     method="func",
+     vectorize=False,
+     outer_jacobian_strategy="reverse-mode",
+ ):
+     """
+     returns list of lists of lists of values of hessian matrix of each param wrt each param.
+     To just get a single matrix use the :code:`hessian_mat` function.
+
+     `vectorize` and `outer_jacobian_strategy` are only for `method = "torch.autograd"`, refer to its documentation.
+
+     Example:
+         .. code:: py
+
+             model = nn.Linear(4, 2) # (2, 4) weight and (2, ) bias
+             X = torch.randn(10, 4)
+             y = torch.randn(10, 2)
+
+             def fn():
+                 y_hat = model(X)
+                 loss = F.mse_loss(y_hat, y)
+                 return loss
+
+             hessian_mat(fn, model.parameters()) # list of two lists of two lists of 3D and 4D tensors
+
+
+     """
+     params = list(params)
+
+     def func(x: list[torch.Tensor]):
+         for p, x_i in zip(params, x): swap_tensors_no_use_count_check(p, x_i)
+         loss = fn()
+         for p, x_i in zip(params, x): swap_tensors_no_use_count_check(p, x_i)
+         return loss
+
+     if method == 'func':
+         return torch.func.hessian(func)([p.detach().requires_grad_(create_graph) for p in params])
+
+     if method == 'autograd.functional':
+         return torch.autograd.functional.hessian(
+             func,
+             [p.detach() for p in params],
+             create_graph=create_graph,
+             vectorize=vectorize,
+             outer_jacobian_strategy=outer_jacobian_strategy,
+         )
+     raise ValueError(method)
+
+ def hessian_mat(
+     fn,
+     params: Iterable[torch.Tensor],
+     create_graph=False,
+     method="func",
+     vectorize=False,
+     outer_jacobian_strategy="reverse-mode",
+ ):
+     """
+     returns hessian matrix for parameters (as if they were flattened and concatenated into a vector).
+
+     `vectorize` and `outer_jacobian_strategy` are only for `method = "torch.autograd"`, refer to its documentation.
+
+     Example:
+         .. code:: py
+
+             model = nn.Linear(4, 2) # 10 parameters in total
+             X = torch.randn(10, 4)
+             y = torch.randn(10, 2)
+
+             def fn():
+                 y_hat = model(X)
+                 loss = F.mse_loss(y_hat, y)
+                 return loss
+
+             hessian_mat(fn, model.parameters()) # 10x10 tensor
+
+
+     """
+     params = list(params)
+
+     def func(x: torch.Tensor):
+         x_params = vec_to_tensors(x, params)
+         for p, x_i in zip(params, x_params): swap_tensors_no_use_count_check(p, x_i)
+         loss = fn()
+         for p, x_i in zip(params, x_params): swap_tensors_no_use_count_check(p, x_i)
+         return loss
+
+     if method == 'func':
+         return torch.func.hessian(func)(torch.cat([p.view(-1) for p in params]).detach().requires_grad_(create_graph))
+
+     if method == 'autograd.functional':
+         return torch.autograd.functional.hessian(
+             func,
+             torch.cat([p.view(-1) for p in params]).detach(),
+             create_graph=create_graph,
+             vectorize=vectorize,
+             outer_jacobian_strategy=outer_jacobian_strategy,
+         )
+     raise ValueError(method)
+
+ def jvp(fn, params: Iterable[torch.Tensor], tangent: Iterable[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
+     """Jacobian vector product.
+
+     Example:
+         .. code:: py
+
+             model = nn.Linear(4, 2)
+             X = torch.randn(10, 4)
+             y = torch.randn(10, 2)
+
+             tangent = [torch.randn_like(p) for p in model.parameters()]
+
+             def fn():
+                 y_hat = model(X)
+                 loss = F.mse_loss(y_hat, y)
+                 return loss
+
+             jvp(fn, model.parameters(), tangent) # scalar
+
+     """
+     params = list(params)
+     tangent = list(tangent)
+     detached_params = [p.detach() for p in params]
+
+     duals = []
+     with fwAD.dual_level():
+         for p, d, t in zip(params, detached_params, tangent):
+             dual = fwAD.make_dual(d, t).requires_grad_(p.requires_grad)
+             duals.append(dual)
+             swap_tensors_no_use_count_check(p, dual)
+
+         loss = fn()
+         res = fwAD.unpack_dual(loss).tangent
+
+     for p, d in zip(params, duals):
+         swap_tensors_no_use_count_check(p, d)
+     return loss, res
+
+
+
+ @torch.no_grad
+ def jvp_fd_central(
+     fn,
+     params: Iterable[torch.Tensor],
+     tangent: Iterable[torch.Tensor],
+     h=1e-3,
+     normalize=False,
+ ) -> tuple[torch.Tensor | None, torch.Tensor]:
+     """Jacobian vector product using central finite difference formula.
+
+     Example:
+         .. code:: py
+
+             model = nn.Linear(4, 2)
+             X = torch.randn(10, 4)
+             y = torch.randn(10, 2)
+
+             tangent = [torch.randn_like(p) for p in model.parameters()]
+
+             def fn():
+                 y_hat = model(X)
+                 loss = F.mse_loss(y_hat, y)
+                 return loss
+
+             jvp_fd_central(fn, model.parameters(), tangent) # scalar
+
+     """
+     params = list(params)
+     tangent = list(tangent)
+
+     tangent_norm = None
+     if normalize:
+         tangent_norm = torch.linalg.vector_norm(torch.cat([t.view(-1) for t in tangent])) # pylint:disable=not-callable
+         if tangent_norm == 0: return None, torch.tensor(0., device=tangent[0].device, dtype=tangent[0].dtype)
+         tangent = torch._foreach_div(tangent, tangent_norm)
+
+     tangent_h= torch._foreach_mul(tangent, h)
+
+     torch._foreach_add_(params, tangent_h)
+     v_plus = fn()
+     torch._foreach_sub_(params, tangent_h)
+     torch._foreach_sub_(params, tangent_h)
+     v_minus = fn()
+     torch._foreach_add_(params, tangent_h)
+
+     res = (v_plus - v_minus) / (2 * h)
+     if normalize: res = res * tangent_norm
+     return v_plus, res
+
+ @torch.no_grad
+ def jvp_fd_forward(
+     fn,
+     params: Iterable[torch.Tensor],
+     tangent: Iterable[torch.Tensor],
+     h=1e-3,
+     v_0=None,
+     normalize=False,
+ ) -> tuple[torch.Tensor | None, torch.Tensor]:
+     """Jacobian vector product using forward finite difference formula.
+     Loss at initial point can be specified in the `v_0` argument.
+
+     Example:
+         .. code:: py
+
+             model = nn.Linear(4, 2)
+             X = torch.randn(10, 4)
+             y = torch.randn(10, 2)
+
+             tangent1 = [torch.randn_like(p) for p in model.parameters()]
+             tangent2 = [torch.randn_like(p) for p in model.parameters()]
+
+             def fn():
+                 y_hat = model(X)
+                 loss = F.mse_loss(y_hat, y)
+                 return loss
+
+             v_0 = fn() # pre-calculate loss at initial point
+
+             jvp1 = jvp_fd_forward(fn, model.parameters(), tangent1, v_0=v_0) # scalar
+             jvp2 = jvp_fd_forward(fn, model.parameters(), tangent2, v_0=v_0) # scalar
+
+     """
+     params = list(params)
+     tangent = list(tangent)
+
+     tangent_norm = None
+     if normalize:
+         tangent_norm = torch.linalg.vector_norm(torch.cat([t.view(-1) for t in tangent])) # pylint:disable=not-callable
+         if tangent_norm == 0: return None, torch.tensor(0., device=tangent[0].device, dtype=tangent[0].dtype)
+         tangent = torch._foreach_div(tangent, tangent_norm)
+
+     tangent_h= torch._foreach_mul(tangent, h)
+
+     if v_0 is None: v_0 = fn()
+
+     torch._foreach_add_(params, tangent_h)
+     v_plus = fn()
+     torch._foreach_sub_(params, tangent_h)
+
+     res = (v_plus - v_0) / h
+     if normalize: res = res * tangent_norm
+     return v_0, res
+
+ def hvp(
+     params: Iterable[torch.Tensor],
+     grads: Iterable[torch.Tensor],
+     vec: Iterable[torch.Tensor],
+     retain_graph=None,
+     create_graph=False,
+     allow_unused=None,
+ ):
+     """Hessian-vector product
+
+     Example:
+         .. code:: py
+
+             model = nn.Linear(4, 2)
+             X = torch.randn(10, 4)
+             y = torch.randn(10, 2)
+
+             y_hat = model(X)
+             loss = F.mse_loss(y_hat, y)
+             loss.backward(create_graph=True)
+
+             grads = [p.grad for p in model.parameters()]
+             vec = [torch.randn_like(p) for p in model.parameters()]
+
+             # list of tensors, same layout as model.parameters()
+             hvp(model.parameters(), grads, vec=vec)
+     """
+     params = list(params)
+     g = list(grads)
+     vec = list(vec)
+
+     with torch.enable_grad():
+         return torch.autograd.grad(g, params, vec, create_graph=create_graph, retain_graph=retain_graph, allow_unused=allow_unused)
+
+
+ @torch.no_grad
+ def hvp_fd_central(
+     closure,
+     params: Iterable[torch.Tensor],
+     vec: Iterable[torch.Tensor],
+     h=1e-3,
+     normalize=False,
+ ) -> tuple[torch.Tensor | None, list[torch.Tensor]]:
+     """Hessian-vector product using central finite difference formula.
+
+     Please note that this will clear :code:`grad` attributes in params.
+
+     Example:
+         .. code:: py
+
+             model = nn.Linear(4, 2)
+             X = torch.randn(10, 4)
+             y = torch.randn(10, 2)
+
+             def closure():
+                 y_hat = model(X)
+                 loss = F.mse_loss(y_hat, y)
+                 model.zero_grad()
+                 loss.backward()
+                 return loss
+
+             vec = [torch.randn_like(p) for p in model.parameters()]
+
+             # list of tensors, same layout as model.parameters()
+             hvp_fd_central(closure, model.parameters(), vec=vec)
+     """
+     params = list(params)
+     vec = list(vec)
+
+     vec_norm = None
+     if normalize:
+         vec_norm = torch.linalg.vector_norm(torch.cat([t.view(-1) for t in vec])) # pylint:disable=not-callable
+         if vec_norm == 0: return None, [torch.zeros_like(p) for p in params]
+         vec = torch._foreach_div(vec, vec_norm)
+
+     vec_h = torch._foreach_mul(vec, h)
+     torch._foreach_add_(params, vec_h)
+     with torch.enable_grad(): loss = closure()
+     g_plus = [p.grad if p.grad is not None else torch.zeros_like(p) for p in params]
+
+     torch._foreach_sub_(params, vec_h)
+     torch._foreach_sub_(params, vec_h)
+     with torch.enable_grad(): loss = closure()
+     g_minus = [p.grad if p.grad is not None else torch.zeros_like(p) for p in params]
+
+     torch._foreach_add_(params, vec_h)
+     for p in params: p.grad = None
+
+     hvp_ = g_plus
+     torch._foreach_sub_(hvp_, g_minus)
+     torch._foreach_div_(hvp_, 2*h)
+
+     if normalize: torch._foreach_mul_(hvp_, vec_norm)
+     return loss, hvp_
+
+ @torch.no_grad
+ def hvp_fd_forward(
+     closure,
+     params: Iterable[torch.Tensor],
+     vec: Iterable[torch.Tensor],
+     h=1e-3,
+     g_0=None,
+     normalize=False,
+ ) -> tuple[torch.Tensor | None, list[torch.Tensor]]:
+     """Hessian-vector product using forward finite difference formula.
+
+     Gradient at initial point can be specified in the `g_0` argument.
+
+     Please note that this will clear :code:`grad` attributes in params.
+
+     Example:
+         .. code:: py
+
+             model = nn.Linear(4, 2)
+             X = torch.randn(10, 4)
+             y = torch.randn(10, 2)
+
+             def closure():
+                 y_hat = model(X)
+                 loss = F.mse_loss(y_hat, y)
+                 model.zero_grad()
+                 loss.backward()
+                 return loss
+
+             vec = [torch.randn_like(p) for p in model.parameters()]
+
+             # pre-compute gradient at initial point
+             closure()
+             g_0 = [p.grad for p in model.parameters()]
+
+             # list of tensors, same layout as model.parameters()
+             hvp_fd_forward(closure, model.parameters(), vec=vec, g_0=g_0)
+     """
+
+     params = list(params)
+     vec = list(vec)
+     loss = None
+
+     vec_norm = None
+     if normalize:
+         vec_norm = torch.linalg.vector_norm(torch.cat([t.view(-1) for t in vec])) # pylint:disable=not-callable
+         if vec_norm == 0: return None, [torch.zeros_like(p) for p in params]
+         vec = torch._foreach_div(vec, vec_norm)
+
+     vec_h = torch._foreach_mul(vec, h)
+
+     if g_0 is None:
+         with torch.enable_grad(): loss = closure()
+         g_0 = [p.grad if p.grad is not None else torch.zeros_like(p) for p in params]
+     else:
+         g_0 = list(g_0)
+
+     torch._foreach_add_(params, vec_h)
+     with torch.enable_grad():
+         l = closure()
+         if loss is None: loss = l
+     g_plus = [p.grad if p.grad is not None else torch.zeros_like(p) for p in params]
+
+     torch._foreach_sub_(params, vec_h)
+     for p in params: p.grad = None
+
+     hvp_ = g_plus
+     torch._foreach_sub_(hvp_, g_0)
+     torch._foreach_div_(hvp_, h)
+
+     if normalize: torch._foreach_mul_(hvp_, vec_norm)
+     return loss, hvp_
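
Note (not part of the diff): the hunk above corresponds to torchzero/utils/derivatives.py from the file list. It renames jacobian/hessian/jacobian_and_hessian to *_wrt variants and adds JVP/HVP helpers. The snippet below is a minimal usage sketch based only on the signatures and docstring examples visible in this hunk; it assumes these names are importable from torchzero.utils.derivatives and that a scalar loss is passed as a one-element sequence.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchzero.utils.derivatives import jacobian_and_hessian_mat_wrt, hvp_fd_forward

model = nn.Linear(4, 2)
X, y = torch.randn(10, 4), torch.randn(10, 2)
params = list(model.parameters())

# flat gradient and dense Hessian via the renamed *_wrt helpers
loss = F.mse_loss(model(X), y)
g, H = jacobian_and_hessian_mat_wrt([loss], wrt=params)

# finite-difference Hessian-vector product driven by a gradient closure
def closure():
    loss = F.mse_loss(model(X), y)
    model.zero_grad()
    loss.backward()
    return loss

vec = [torch.randn_like(p) for p in params]
loss_0, Hv = hvp_fd_forward(closure, params, vec=vec, h=1e-3)  # Hv has the same layout as params
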
torchzero/utils/linalg/__init__.py
@@ -0,0 +1,5 @@
+ from .matrix_funcs import inv_sqrt_2x2, eigvals_func, singular_vals_func, matrix_power_eigh, x_inv
+ from .orthogonalize import gram_schmidt
+ from .qr import qr_householder
+ from .svd import randomized_svd
+ from .solve import cg, nystrom_approximation, nystrom_sketch_and_solve
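
Note (not part of the diff): this five-line hunk matches the new torchzero/utils/linalg/__init__.py from the file list, which exposes matrix helpers such as gram_schmidt, qr_householder, randomized_svd and cg. Their signatures are not shown in this diff, so the snippet below is a generic, hypothetical Gram-Schmidt sketch in PyTorch, included only to illustrate the kind of routine the subpackage bundles; it is not torchzero's implementation.

import torch

def gram_schmidt_sketch(A: torch.Tensor, eps: float = 1e-12) -> torch.Tensor:
    """Hypothetical illustration: orthonormalize the columns of A (classical Gram-Schmidt)."""
    Q = torch.zeros_like(A)
    for j in range(A.shape[1]):
        v = A[:, j].clone()
        for i in range(j):
            v -= (Q[:, i] @ A[:, j]) * Q[:, i]  # remove the component along each previous column
        norm = v.norm()
        if norm > eps:
            Q[:, j] = v / norm  # keep a unit-norm column; skip (near-)dependent columns
    return Q

Q = gram_schmidt_sketch(torch.randn(6, 3))
print(torch.allclose(Q.T @ Q, torch.eye(3), atol=1e-5))  # columns are approximately orthonormal
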