torchzero 0.3.14__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. tests/test_identical.py +2 -2
  2. tests/test_module_autograd.py +586 -0
  3. tests/test_objective.py +188 -0
  4. tests/test_opts.py +47 -36
  5. tests/test_tensorlist.py +0 -8
  6. tests/test_utils_optimizer.py +0 -1
  7. torchzero/__init__.py +1 -1
  8. torchzero/core/__init__.py +8 -2
  9. torchzero/core/chain.py +47 -0
  10. torchzero/core/functional.py +103 -0
  11. torchzero/core/modular.py +233 -0
  12. torchzero/core/module.py +132 -643
  13. torchzero/core/objective.py +948 -0
  14. torchzero/core/reformulation.py +56 -23
  15. torchzero/core/transform.py +261 -365
  16. torchzero/linalg/__init__.py +10 -0
  17. torchzero/linalg/eigh.py +34 -0
  18. torchzero/linalg/linalg_utils.py +14 -0
  19. torchzero/{utils/linalg → linalg}/linear_operator.py +132 -34
  20. torchzero/linalg/matrix_power.py +28 -0
  21. torchzero/linalg/orthogonalize.py +95 -0
  22. torchzero/{utils/linalg → linalg}/qr.py +4 -2
  23. torchzero/{utils/linalg → linalg}/solve.py +76 -88
  24. torchzero/linalg/svd.py +20 -0
  25. torchzero/linalg/torch_linalg.py +168 -0
  26. torchzero/modules/__init__.py +0 -1
  27. torchzero/modules/adaptive/__init__.py +1 -1
  28. torchzero/modules/adaptive/adagrad.py +163 -213
  29. torchzero/modules/adaptive/adahessian.py +74 -103
  30. torchzero/modules/adaptive/adam.py +53 -76
  31. torchzero/modules/adaptive/adan.py +49 -30
  32. torchzero/modules/adaptive/adaptive_heavyball.py +11 -6
  33. torchzero/modules/adaptive/aegd.py +12 -12
  34. torchzero/modules/adaptive/esgd.py +98 -119
  35. torchzero/modules/adaptive/lion.py +5 -10
  36. torchzero/modules/adaptive/lmadagrad.py +87 -32
  37. torchzero/modules/adaptive/mars.py +5 -5
  38. torchzero/modules/adaptive/matrix_momentum.py +47 -51
  39. torchzero/modules/adaptive/msam.py +70 -52
  40. torchzero/modules/adaptive/muon.py +59 -124
  41. torchzero/modules/adaptive/natural_gradient.py +33 -28
  42. torchzero/modules/adaptive/orthograd.py +11 -15
  43. torchzero/modules/adaptive/rmsprop.py +83 -75
  44. torchzero/modules/adaptive/rprop.py +48 -47
  45. torchzero/modules/adaptive/sam.py +55 -45
  46. torchzero/modules/adaptive/shampoo.py +123 -129
  47. torchzero/modules/adaptive/soap.py +207 -143
  48. torchzero/modules/adaptive/sophia_h.py +106 -130
  49. torchzero/modules/clipping/clipping.py +15 -18
  50. torchzero/modules/clipping/ema_clipping.py +31 -25
  51. torchzero/modules/clipping/growth_clipping.py +14 -17
  52. torchzero/modules/conjugate_gradient/cg.py +26 -37
  53. torchzero/modules/experimental/__init__.py +3 -6
  54. torchzero/modules/experimental/coordinate_momentum.py +36 -0
  55. torchzero/modules/experimental/curveball.py +25 -41
  56. torchzero/modules/experimental/gradmin.py +2 -2
  57. torchzero/modules/{higher_order → experimental}/higher_order_newton.py +14 -40
  58. torchzero/modules/experimental/newton_solver.py +22 -53
  59. torchzero/modules/experimental/newtonnewton.py +20 -17
  60. torchzero/modules/experimental/reduce_outward_lr.py +7 -7
  61. torchzero/modules/experimental/scipy_newton_cg.py +21 -24
  62. torchzero/modules/experimental/spsa1.py +5 -5
  63. torchzero/modules/experimental/structural_projections.py +1 -4
  64. torchzero/modules/functional.py +8 -1
  65. torchzero/modules/grad_approximation/forward_gradient.py +7 -7
  66. torchzero/modules/grad_approximation/grad_approximator.py +23 -16
  67. torchzero/modules/grad_approximation/rfdm.py +20 -17
  68. torchzero/modules/least_squares/gn.py +90 -42
  69. torchzero/modules/line_search/__init__.py +1 -1
  70. torchzero/modules/line_search/_polyinterp.py +3 -1
  71. torchzero/modules/line_search/adaptive.py +3 -3
  72. torchzero/modules/line_search/backtracking.py +3 -3
  73. torchzero/modules/line_search/interpolation.py +160 -0
  74. torchzero/modules/line_search/line_search.py +42 -51
  75. torchzero/modules/line_search/strong_wolfe.py +5 -5
  76. torchzero/modules/misc/debug.py +12 -12
  77. torchzero/modules/misc/escape.py +10 -10
  78. torchzero/modules/misc/gradient_accumulation.py +10 -78
  79. torchzero/modules/misc/homotopy.py +16 -8
  80. torchzero/modules/misc/misc.py +120 -122
  81. torchzero/modules/misc/multistep.py +63 -61
  82. torchzero/modules/misc/regularization.py +49 -44
  83. torchzero/modules/misc/split.py +30 -28
  84. torchzero/modules/misc/switch.py +37 -32
  85. torchzero/modules/momentum/averaging.py +14 -14
  86. torchzero/modules/momentum/cautious.py +34 -28
  87. torchzero/modules/momentum/momentum.py +11 -11
  88. torchzero/modules/ops/__init__.py +4 -4
  89. torchzero/modules/ops/accumulate.py +21 -21
  90. torchzero/modules/ops/binary.py +67 -66
  91. torchzero/modules/ops/higher_level.py +19 -19
  92. torchzero/modules/ops/multi.py +44 -41
  93. torchzero/modules/ops/reduce.py +26 -23
  94. torchzero/modules/ops/unary.py +53 -53
  95. torchzero/modules/ops/utility.py +47 -46
  96. torchzero/modules/projections/galore.py +1 -1
  97. torchzero/modules/projections/projection.py +43 -43
  98. torchzero/modules/quasi_newton/__init__.py +2 -0
  99. torchzero/modules/quasi_newton/damping.py +1 -1
  100. torchzero/modules/quasi_newton/lbfgs.py +7 -7
  101. torchzero/modules/quasi_newton/lsr1.py +7 -7
  102. torchzero/modules/quasi_newton/quasi_newton.py +25 -16
  103. torchzero/modules/quasi_newton/sg2.py +292 -0
  104. torchzero/modules/restarts/restars.py +26 -24
  105. torchzero/modules/second_order/__init__.py +6 -3
  106. torchzero/modules/second_order/ifn.py +58 -0
  107. torchzero/modules/second_order/inm.py +101 -0
  108. torchzero/modules/second_order/multipoint.py +40 -80
  109. torchzero/modules/second_order/newton.py +105 -228
  110. torchzero/modules/second_order/newton_cg.py +102 -154
  111. torchzero/modules/second_order/nystrom.py +158 -178
  112. torchzero/modules/second_order/rsn.py +237 -0
  113. torchzero/modules/smoothing/laplacian.py +13 -12
  114. torchzero/modules/smoothing/sampling.py +11 -10
  115. torchzero/modules/step_size/adaptive.py +23 -23
  116. torchzero/modules/step_size/lr.py +15 -15
  117. torchzero/modules/termination/termination.py +32 -30
  118. torchzero/modules/trust_region/cubic_regularization.py +2 -2
  119. torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
  120. torchzero/modules/trust_region/trust_cg.py +1 -1
  121. torchzero/modules/trust_region/trust_region.py +27 -22
  122. torchzero/modules/variance_reduction/svrg.py +21 -18
  123. torchzero/modules/weight_decay/__init__.py +2 -1
  124. torchzero/modules/weight_decay/reinit.py +83 -0
  125. torchzero/modules/weight_decay/weight_decay.py +12 -13
  126. torchzero/modules/wrappers/optim_wrapper.py +57 -50
  127. torchzero/modules/zeroth_order/cd.py +9 -6
  128. torchzero/optim/root.py +3 -3
  129. torchzero/optim/utility/split.py +2 -1
  130. torchzero/optim/wrappers/directsearch.py +27 -63
  131. torchzero/optim/wrappers/fcmaes.py +14 -35
  132. torchzero/optim/wrappers/mads.py +11 -31
  133. torchzero/optim/wrappers/moors.py +66 -0
  134. torchzero/optim/wrappers/nevergrad.py +4 -4
  135. torchzero/optim/wrappers/nlopt.py +31 -25
  136. torchzero/optim/wrappers/optuna.py +6 -13
  137. torchzero/optim/wrappers/pybobyqa.py +124 -0
  138. torchzero/optim/wrappers/scipy/__init__.py +7 -0
  139. torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
  140. torchzero/optim/wrappers/scipy/brute.py +48 -0
  141. torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
  142. torchzero/optim/wrappers/scipy/direct.py +69 -0
  143. torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
  144. torchzero/optim/wrappers/scipy/experimental.py +141 -0
  145. torchzero/optim/wrappers/scipy/minimize.py +151 -0
  146. torchzero/optim/wrappers/scipy/sgho.py +111 -0
  147. torchzero/optim/wrappers/wrapper.py +121 -0
  148. torchzero/utils/__init__.py +7 -25
  149. torchzero/utils/compile.py +2 -2
  150. torchzero/utils/derivatives.py +112 -88
  151. torchzero/utils/optimizer.py +4 -77
  152. torchzero/utils/python_tools.py +31 -0
  153. torchzero/utils/tensorlist.py +11 -5
  154. torchzero/utils/thoad_tools.py +68 -0
  155. {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/METADATA +1 -1
  156. torchzero-0.4.0.dist-info/RECORD +191 -0
  157. tests/test_vars.py +0 -185
  158. torchzero/modules/experimental/momentum.py +0 -160
  159. torchzero/modules/higher_order/__init__.py +0 -1
  160. torchzero/optim/wrappers/scipy.py +0 -572
  161. torchzero/utils/linalg/__init__.py +0 -12
  162. torchzero/utils/linalg/matrix_funcs.py +0 -87
  163. torchzero/utils/linalg/orthogonalize.py +0 -12
  164. torchzero/utils/linalg/svd.py +0 -20
  165. torchzero/utils/ops.py +0 -10
  166. torchzero-0.3.14.dist-info/RECORD +0 -167
  167. /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
  168. {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/WHEEL +0 -0
  169. {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/top_level.txt +0 -0
torchzero/utils/derivatives.py

@@ -4,14 +4,15 @@ import torch
  import torch.autograd.forward_ad as fwAD
  
  from .torch_tools import swap_tensors_no_use_count_check, vec_to_tensors
+ from .tensorlist import TensorList
  
- def _jacobian(output: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False):
-     flat_input = torch.cat([i.reshape(-1) for i in output])
-     grad_ouputs = torch.eye(len(flat_input), device=output[0].device, dtype=output[0].dtype)
+ def _jacobian(outputs: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False):
+     flat_outputs = torch.cat([i.reshape(-1) for i in outputs])
+     grad_ouputs = torch.eye(len(flat_outputs), device=outputs[0].device, dtype=outputs[0].dtype)
      jac = []
-     for i in range(flat_input.numel()):
+     for i in range(flat_outputs.numel()):
          jac.append(torch.autograd.grad(
-             flat_input,
+             flat_outputs,
              wrt,
              grad_ouputs[i],
              retain_graph=True,
@@ -22,12 +23,12 @@ def _jacobian(output: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], creat
      return [torch.stack(z) for z in zip(*jac)]
  
  
- def _jacobian_batched(output: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False):
-     flat_input = torch.cat([i.reshape(-1) for i in output])
+ def _jacobian_batched(outputs: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False):
+     flat_outputs = torch.cat([i.reshape(-1) for i in outputs])
      return torch.autograd.grad(
-         flat_input,
+         flat_outputs,
          wrt,
-         torch.eye(len(flat_input), device=output[0].device, dtype=output[0].dtype),
+         torch.eye(len(flat_outputs), device=outputs[0].device, dtype=outputs[0].dtype),
          retain_graph=True,
          create_graph=create_graph,
          allow_unused=True,
@@ -51,13 +52,13 @@ def flatten_jacobian(jacs: Sequence[torch.Tensor]) -> torch.Tensor:
      return torch.cat([j.reshape(n_out, -1) for j in jacs], dim=1)
  
  
- def jacobian_wrt(output: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False, batched=True) -> Sequence[torch.Tensor]:
+ def jacobian_wrt(outputs: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False, batched=True) -> Sequence[torch.Tensor]:
      """Calculate jacobian of a sequence of tensors w.r.t another sequence of tensors.
      Returns a sequence of tensors with the length as `wrt`.
      Each tensor will have the shape `(*output.shape, *wrt[i].shape)`.
  
      Args:
-         input (Sequence[torch.Tensor]): input sequence of tensors.
+         outputs (Sequence[torch.Tensor]): input sequence of tensors.
          wrt (Sequence[torch.Tensor]): sequence of tensors to differentiate w.r.t.
          create_graph (bool, optional):
              pytorch option, if True, graph of the derivative will be constructed,
@@ -68,16 +69,16 @@ def jacobian_wrt(output: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], cr
      Returns:
          sequence of tensors with the length as `wrt`.
      """
-     if batched: return _jacobian_batched(output, wrt, create_graph)
-     return _jacobian(output, wrt, create_graph)
+     if batched: return _jacobian_batched(outputs, wrt, create_graph)
+     return _jacobian(outputs, wrt, create_graph)
  
- def jacobian_and_hessian_wrt(output: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False, batched=True):
+ def jacobian_and_hessian_wrt(outputs: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False, batched=True):
      """Calculate jacobian and hessian of a sequence of tensors w.r.t another sequence of tensors.
      Calculating hessian requires calculating the jacobian. So this function is more efficient than
      calling `jacobian` and `hessian` separately, which would calculate jacobian twice.
  
      Args:
-         input (Sequence[torch.Tensor]): input sequence of tensors.
+         outputs (Sequence[torch.Tensor]): input sequence of tensors.
          wrt (Sequence[torch.Tensor]): sequence of tensors to differentiate w.r.t.
          create_graph (bool, optional):
              pytorch option, if True, graph of the derivative will be constructed,
@@ -87,7 +88,7 @@ def jacobian_and_hessian_wrt(output: Sequence[torch.Tensor], wrt: Sequence[torch
      Returns:
          tuple with jacobians sequence and hessians sequence.
      """
-     jac = jacobian_wrt(output, wrt, create_graph=True, batched = batched)
+     jac = jacobian_wrt(outputs, wrt, create_graph=True, batched = batched)
      return jac, jacobian_wrt(jac, wrt, batched = batched, create_graph=create_graph)
  
  
@@ -96,13 +97,13 @@ def jacobian_and_hessian_wrt(output: Sequence[torch.Tensor], wrt: Sequence[torch
  # Note - I only tested this for cases where input is a scalar."""
  # return torch.cat([h.reshape(h.size(0), h[1].numel()) for h in hessians], 1)
  
- def jacobian_and_hessian_mat_wrt(output: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False, batched=True):
+ def jacobian_and_hessian_mat_wrt(outputs: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False, batched=True):
      """Calculate jacobian and hessian of a sequence of tensors w.r.t another sequence of tensors.
      Calculating hessian requires calculating the jacobian. So this function is more efficient than
      calling `jacobian` and `hessian` separately, which would calculate jacobian twice.
  
      Args:
-         input (Sequence[torch.Tensor]): input sequence of tensors.
+         outputs (Sequence[torch.Tensor]): input sequence of tensors.
          wrt (Sequence[torch.Tensor]): sequence of tensors to differentiate w.r.t.
          create_graph (bool, optional):
              pytorch option, if True, graph of the derivative will be constructed,
@@ -112,7 +113,7 @@ def jacobian_and_hessian_mat_wrt(output: Sequence[torch.Tensor], wrt: Sequence[t
      Returns:
          tuple with jacobians sequence and hessians sequence.
      """
-     jac = jacobian_wrt(output, wrt, create_graph=True, batched = batched)
+     jac = jacobian_wrt(outputs, wrt, create_graph=True, batched = batched)
      H_list = jacobian_wrt(jac, wrt, batched = batched, create_graph=create_graph)
      return flatten_jacobian(jac), flatten_jacobian(H_list)
  
@@ -261,7 +262,7 @@ def jvp_fd_central(
      params: Iterable[torch.Tensor],
      tangent: Iterable[torch.Tensor],
      h=1e-3,
-     normalize=False,
+     normalize=True,
  ) -> tuple[torch.Tensor | None, torch.Tensor]:
      """Jacobian vector product using central finite difference formula.
  
@@ -310,7 +311,7 @@ def jvp_fd_forward(
      tangent: Iterable[torch.Tensor],
      h=1e-3,
      v_0=None,
-     normalize=False,
+     normalize=True,
  ) -> tuple[torch.Tensor | None, torch.Tensor]:
      """Jacobian vector product using forward finite difference formula.
      Loss at initial point can be specified in the `v_0` argument.
@@ -357,52 +358,18 @@ def jvp_fd_forward(
      if normalize: res = res * tangent_norm
      return v_0, res
  
- def hvp(
-     params: Iterable[torch.Tensor],
-     grads: Iterable[torch.Tensor],
-     vec: Iterable[torch.Tensor],
-     retain_graph=None,
-     create_graph=False,
-     allow_unused=None,
- ):
-     """Hessian-vector product
-
-     Example:
-     ```python
-     model = nn.Linear(4, 2)
-     X = torch.randn(10, 4)
-     y = torch.randn(10, 2)
-
-     y_hat = model(X)
-     loss = F.mse_loss(y_hat, y)
-     loss.backward(create_graph=True)
-
-     grads = [p.grad for p in model.parameters()]
-     vec = [torch.randn_like(p) for p in model.parameters()]
-
-     # list of tensors, same layout as model.parameters()
-     hvp(model.parameters(), grads, vec=vec)
-     ```
-     """
-     params = list(params)
-     g = list(grads)
-     vec = list(vec)
-
-     with torch.enable_grad():
-         return torch.autograd.grad(g, params, vec, create_graph=create_graph, retain_graph=retain_graph, allow_unused=allow_unused)
-
  
  @torch.no_grad
  def hvp_fd_central(
      closure,
      params: Iterable[torch.Tensor],
-     vec: Iterable[torch.Tensor],
+     x: Iterable[torch.Tensor],
      h=1e-3,
-     normalize=False,
+     normalize=True,
  ) -> tuple[torch.Tensor | None, list[torch.Tensor]]:
-     """Hessian-vector product using central finite difference formula.
+     """Returns ``(loss_approx, Hx)``.
  
-     Please note that this will clear :code:`grad` attributes in params.
+     Please note that this will clear ``grad`` attributes in params.
  
      Example:
      ```python
@@ -424,48 +391,48 @@ def hvp_fd_central(
      ```
      """
      params = list(params)
-     vec = list(vec)
+     x = list(x)
  
      vec_norm = None
      if normalize:
-         vec_norm = torch.linalg.vector_norm(torch.cat([t.view(-1) for t in vec])) # pylint:disable=not-callable
+         vec_norm = torch.linalg.vector_norm(torch.cat([t.view(-1) for t in x])) # pylint:disable=not-callable
          if vec_norm == 0: return None, [torch.zeros_like(p) for p in params]
-         vec = torch._foreach_div(vec, vec_norm)
+         x = torch._foreach_div(x, vec_norm)
  
-     vec_h = torch._foreach_mul(vec, h)
-     torch._foreach_add_(params, vec_h)
+     xh = torch._foreach_mul(x, h)
+     torch._foreach_add_(params, xh)
      with torch.enable_grad(): loss = closure()
      g_plus = [p.grad if p.grad is not None else torch.zeros_like(p) for p in params]
  
-     torch._foreach_sub_(params, vec_h)
-     torch._foreach_sub_(params, vec_h)
+     torch._foreach_sub_(params, xh)
+     torch._foreach_sub_(params, xh)
      with torch.enable_grad(): loss = closure()
      g_minus = [p.grad if p.grad is not None else torch.zeros_like(p) for p in params]
  
-     torch._foreach_add_(params, vec_h)
+     torch._foreach_add_(params, xh)
      for p in params: p.grad = None
  
-     hvp_ = g_plus
-     torch._foreach_sub_(hvp_, g_minus)
-     torch._foreach_div_(hvp_, 2*h)
+     hx = g_plus
+     torch._foreach_sub_(hx, g_minus)
+     torch._foreach_div_(hx, 2*h)
  
-     if normalize: torch._foreach_mul_(hvp_, vec_norm)
-     return loss, hvp_
+     if normalize: torch._foreach_mul_(hx, vec_norm)
+     return loss, hx
  
  @torch.no_grad
  def hvp_fd_forward(
      closure,
      params: Iterable[torch.Tensor],
-     vec: Iterable[torch.Tensor],
+     x: Iterable[torch.Tensor],
      h=1e-3,
      g_0=None,
-     normalize=False,
+     normalize=True,
  ) -> tuple[torch.Tensor | None, list[torch.Tensor]]:
-     """Hessian-vector product using forward finite difference formula.
+     """Returns ``(loss_approx, Hx)``.
  
-     Gradient at initial point can be specified in the `g_0` argument.
+     Gradient at initial point can be specified in the ``g_0`` argument.
  
-     Please note that this will clear :code:`grad` attributes in params.
+     Please note that this will clear ``grad`` attributes in params.
  
      Example:
      ```python
@@ -492,16 +459,16 @@ def hvp_fd_forward(
      """
  
      params = list(params)
-     vec = list(vec)
+     x = list(x)
      loss = None
  
      vec_norm = None
      if normalize:
-         vec_norm = torch.linalg.vector_norm(torch.cat([t.ravel() for t in vec])) # pylint:disable=not-callable
+         vec_norm = torch.linalg.vector_norm(torch.cat([t.ravel() for t in x])) # pylint:disable=not-callable
          if vec_norm == 0: return None, [torch.zeros_like(p) for p in params]
-         vec = torch._foreach_div(vec, vec_norm)
+         x = torch._foreach_div(x, vec_norm)
  
-     vec_h = torch._foreach_mul(vec, h)
+     xh = torch._foreach_mul(x, h)
  
      if g_0 is None:
          with torch.enable_grad(): loss = closure()
@@ -509,18 +476,75 @@
      else:
          g_0 = list(g_0)
  
-     torch._foreach_add_(params, vec_h)
+     torch._foreach_add_(params, xh)
      with torch.enable_grad():
          l = closure()
          if loss is None: loss = l
      g_plus = [p.grad if p.grad is not None else torch.zeros_like(p) for p in params]
  
-     torch._foreach_sub_(params, vec_h)
+     torch._foreach_sub_(params, xh)
      for p in params: p.grad = None
  
-     hvp_ = g_plus
-     torch._foreach_sub_(hvp_, g_0)
-     torch._foreach_div_(hvp_, h)
+     hx = g_plus
+     torch._foreach_sub_(hx, g_0)
+     torch._foreach_div_(hx, h)
+
+     if normalize: torch._foreach_mul_(hx, vec_norm)
+     return loss, hx
+
+ @torch.no_grad
+ def hessian_fd(fn, params: Sequence[torch.Tensor], eps: float = 1e-4, full: bool = True):
+     """returns ``f(x), g(x), H(x)``, where ``g(x)`` is a tensor list.
+
+     Number of evals for full is: 4n^2 - 2n + 1
+
+     Number of evals for upper is: 2n^2 + 1.
+     """
+     params = TensorList(params)
+     p_0 = params.clone()
+     n = sum(t.numel() for t in params)
+     device = params[0].device; dtype = params[0].dtype
+     fx = fn()
+     g = params.zeros_like()
+     H = torch.zeros((n, n), device=device, dtype=dtype)
+
+     for i in range(n):
+         for j in (range(n) if full else range(i, n)):
+             if i == j:
+                 params.flat_set_lambda_(i, lambda x: x + eps)
+                 f_plus = fn()
+
+                 params.flat_set_lambda_(i, lambda x: x - 2 * eps)
+                 f_minus = fn()
+
+                 # params.flat_set_lambda_(i, lambda x: x + eps)
+                 g.flat_set_(i, (f_plus - f_minus) / (2*eps))
+                 H[i, i] = (f_plus - 2 * fx + f_minus) / (eps ** 2)
+
+             else:
+                 params.flat_set_lambda_(i, lambda x: x + eps)
+                 params.flat_set_lambda_(j, lambda x: x + eps)
+                 f_pp = fn()
+
+                 params.flat_set_lambda_(i, lambda x: x - 2 * eps)
+                 f_np = fn()
+
+                 params.flat_set_lambda_(j, lambda x: x - 2 * eps)
+                 f_nn = fn()
+
+                 params.flat_set_lambda_(i, lambda x: x + 2 * eps)
+                 f_pn = fn()
+
+                 # params.flat_set_lambda_(i, lambda x: x - eps)
+                 # params.flat_set_lambda_(j, lambda x: x + eps)
+
+                 H[i, j] = (f_pp - f_np - f_pn + f_nn) / (4 * eps ** 2)
+                 if not full: H[j, i] = H[i, j]
+
+             params.copy_(p_0) # otherwise inaccuracy builds up
+
+     if full:
+         H = H + H.T
+         H /= 2
  
-     if normalize: torch._foreach_mul_(hvp_, vec_norm)
-     return loss, hvp_
+     return fx, g, H
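
The new `hessian_fd` helper approximates the full Hessian with central differences over the flattened parameter vector, restoring the parameters after every entry. A minimal usage sketch (the quadratic closure and the `torchzero.utils.derivatives` import path are illustrative assumptions, not taken from the package docs):

```python
import torch
from torchzero.utils.derivatives import hessian_fd  # assumed import path

# two parameter tensors, flattened length n = 3
a = torch.tensor([1.0, -2.0])
b = torch.tensor([0.5])

def f():
    # quadratic with Hessian [[2, 0, 1], [0, 6, 0], [1, 0, 4]] w.r.t. (a0, a1, b0)
    return a[0]**2 + 3 * a[1]**2 + 2 * b[0]**2 + a[0] * b[0]

fx, g, H = hessian_fd(f, [a, b], eps=1e-4, full=True)
print(H)  # approximately the matrix above; g is a TensorList shaped like [a, b]
```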
torchzero/utils/optimizer.py

@@ -64,22 +64,15 @@ def get_group_vals(param_groups: Iterable[Mapping[str, Any]],
          values[i].extend(group_value for _ in range(num_params))
      return values
  
- _InitLiterals = Literal['param', 'grad']
- Init = _InitLiterals | Any | list[_InitLiterals | Any] | tuple[_InitLiterals | Any]
+ Init = Any
  
- def _make_initial_state_value(param: torch.Tensor, init: Init, i: int | None):
-     if callable(init): return init(param)
+ def _make_initial_state_value(tensor: torch.Tensor, init: Init, i: int | None):
+     if callable(init): return init(tensor)
      if isinstance(init, torch.Tensor): return init.detach().clone()
  
-     if isinstance(init, str):
-         if init in ('param','params'): return param.detach().clone()
-         if init in ('grad', 'grads'):
-             if param.grad is None: raise RuntimeError('init is set to "grad, but param.grad is None"')
-             return param.grad.detach().clone()
-
      if isinstance(init, (list,tuple)):
          if i is None: raise RuntimeError(f'init is per-parameter ({type(init)}) but parameter index i is None')
-         return _make_initial_state_value(param, init[i], None)
+         return _make_initial_state_value(tensor, init[i], None)
  
      return init
  
@@ -133,72 +126,6 @@ def get_state_vals(state: Mapping[torch.Tensor, MutableMapping[str, Any]], param
      return values
  
  
- class Optimizer(torch.optim.Optimizer, ABC):
-     """subclass of torch.optim.Optimizer with some helper methods for fast experimentation, it's not used anywhere in torchzero.
-
-     Args:
-         params (iterable): an iterable of :class:`torch.Tensor` s or
-             :class:`dict` s. Specifies what Tensors should be optimized.
-         defaults (dict | None): a dict containing default values of optimization
-             options (used when a parameter group doesn't specify them).
-     """
-     def __init__(self, params, defaults: dict[str, Any] | None = None, **_defaults):
-         if defaults is None: defaults = {}
-         defaults.update(_defaults)
-
-         super().__init__(params, defaults)
-         self.global_state = self.state[self.param_groups[0]['params'][0]]
-         """state of 1st parameter, can be used as global state which is how L-BFGS uses it in pytorch, and there is some kind of good reason to do it like that"""
-
-     def get_params(self, mode: ParamFilter = 'requires_grad', cls: type[ListLike] = TensorList) -> ListLike:
-         return get_params(self.param_groups, mode, cls)
-
-     @overload
-     def group_vals(self, key: str, *,
-                    mode: ParamFilter = 'requires_grad', cls: type[ListLike] = NumberList) -> ListLike: ...
-     @overload
-     def group_vals(self, key: list[str] | tuple[str,...], *,
-                    mode: ParamFilter = 'requires_grad', cls: type[ListLike] = NumberList) -> list[ListLike]: ...
-     @overload
-     def group_vals(self, key: str, key2: str, *keys: str,
-                    mode: ParamFilter = 'requires_grad', cls: type[ListLike] = NumberList) -> list[ListLike]: ...
-
-     def group_vals(self, key: str | list[str] | tuple[str,...], key2: str | None = None, *keys: str,
-                    mode: ParamFilter = 'requires_grad', cls: type[ListLike] = NumberList) -> ListLike | list[ListLike]:
-         return get_group_vals(self.param_groups, key, key2, *keys, mode = mode, cls = cls) # pyright:ignore[reportArgumentType]
-
-
-     @overload
-     def state_vals(self, key: str, *,
-                    init: Init = torch.zeros_like,
-                    mode: ParamFilter | list[torch.Tensor] | tuple[torch.Tensor, ...] = 'requires_grad',
-                    cls: type[ListLike] = TensorList) -> ListLike: ...
-     @overload
-     def state_vals(self, key: list[str] | tuple[str,...], *,
-                    init: Init | Sequence[Init] = torch.zeros_like,
-                    mode: ParamFilter | list[torch.Tensor] | tuple[torch.Tensor, ...] = 'requires_grad',
-                    cls: type[ListLike] = TensorList) -> list[ListLike]: ...
-     @overload
-     def state_vals(self, key: str, key2: str, *keys: str,
-                    init: Init | Sequence[Init] = torch.zeros_like,
-                    mode: ParamFilter | list[torch.Tensor] | tuple[torch.Tensor, ...] = 'requires_grad',
-                    cls: type[ListLike] = TensorList) -> list[ListLike]: ...
-
-     def state_vals(self, key: str | list[str] | tuple[str,...], key2: str | None = None, *keys: str,
-                    init: Init | Sequence[Init] = torch.zeros_like,
-                    mode: ParamFilter | list[torch.Tensor] | tuple[torch.Tensor, ...] = 'requires_grad',
-                    cls: type[ListLike] = TensorList) -> ListLike | list[ListLike]:
-
-         if isinstance(mode, (list,tuple)): params = mode
-         else: params = self.get_params(mode)
-
-         return get_state_vals(self.state, params, key, key2, *keys, init = init, cls = cls) # type:ignore[reportArgumentType]
-
-
-     # shut up pylance
-     @abstractmethod
-     def step(self, closure) -> Any: ... # pylint:disable=signature-differs # pyright:ignore[reportIncompatibleMethodOverride]
-
  def zero_grad_(params: Iterable[torch.Tensor], set_to_none):
      if set_to_none:
          for p in params:
torchzero/utils/python_tools.py

@@ -1,3 +1,4 @@
+ import importlib
  import functools
  import operator
  from typing import Any, TypeVar, overload
@@ -40,6 +41,11 @@ def generic_ne(x: int | float | Iterable[int | float], y: int | float | Iterable
          return any(i!=y for i in x)
      return any(i!=j for i,j in zip(x,y))
  
+ def generic_is_none(x: Any | Iterable[Any]):
+     """returns True if x is None or iterable with all elements set to None"""
+     if x is None: return True
+     if isinstance(x, Iterable): return all(i is None for i in x)
+     return False
  
  def zipmap(self, fn: Callable, other: Any | list | tuple, *args, **kwargs):
      """If `other` is list/tuple, applies `fn` to self zipped with `other`.
@@ -68,3 +74,28 @@ def safe_dict_update_(d1_:dict, d2:dict):
      if len(inter) > 0: raise RuntimeError(f"Duplicate keys {inter}")
      d1_.update(d2)
  
+ # lazy loader from https://stackoverflow.com/a/78312674/15673832
+ class LazyLoader:
+     'thin shell class to wrap modules. load real module on first access and pass thru'
+
+     def __init__(self, modname):
+         self._modname = modname
+         self._mod = None
+
+     def __getattr__(self, attr):
+         'import module on first attribute access'
+
+         try:
+             return getattr(self._mod, attr)
+
+         except Exception as e :
+             if self._mod is None :
+                 # module is unset, load it
+                 self._mod = importlib.import_module (self._modname)
+             else :
+                 # module is set, got different exception from getattr (). reraise it
+                 raise e
+
+         # retry getattr if module was just loaded for first time
+         # call this outside exception handler in case it raises new exception
+         return getattr (self._mod, attr)
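
`LazyLoader` defers `importlib.import_module` until the first attribute access, which is how the new `thoad_tools.py` shown below keeps `thoad` an optional dependency. A small illustration (using `math` purely as a stand-in target module):

```python
from torchzero.utils.python_tools import LazyLoader  # assumed import path

lazy_math = LazyLoader("math")   # nothing imported yet, _mod is still None
print(lazy_math.sqrt(2.0))       # first attribute access triggers the real import
print(lazy_math.pi)              # later accesses pass straight through to the module
```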
torchzero/utils/tensorlist.py

@@ -22,7 +22,6 @@ from typing_extensions import Self, TypeAlias, Unpack
  
  from .metrics import Metrics, evaluate_metric, calculate_metric_list
  from .numberlist import NumberList, as_numberlist, maybe_numberlist
- from .ops import where_
  from .python_tools import generic_ne, zipmap
  
  _Scalar = int | float | bool | complex
@@ -346,6 +345,10 @@ class TensorList(list[torch.Tensor | Any]):
      def global_all(self): return builtins.all(self.all())
      def global_numel(self) -> int: return builtins.sum(self.numel())
  
+     def global_allclose(self, other: _TensorSeq, rtol: float = 0.00001, atol: float = 1e-8, equal_nan: bool = False) -> bool:
+         bools = self.zipmap_args(torch.allclose, other, rtol, atol, equal_nan)
+         return all(bools)
+
      def empty_like(self, **kwargs: Unpack[_NewTensorKwargs]): return self.__class__(torch.empty_like(i, **kwargs) for i in self)
      def zeros_like(self, **kwargs: Unpack[_NewTensorKwargs]): return self.__class__(torch.zeros_like(i, **kwargs) for i in self)
      def ones_like(self, **kwargs: Unpack[_NewTensorKwargs]): return self.__class__(torch.ones_like(i, **kwargs) for i in self)
@@ -509,7 +512,6 @@ class TensorList(list[torch.Tensor | Any]):
          torch._foreach_mul_(self, other)
          return self
  
-     # TODO: benchmark
      def lazy_mul(self, other: int | float | list[int | float] | tuple[int | float], clone=False):
          if generic_ne(other, 1):
              return self * other
@@ -536,6 +538,13 @@ class TensorList(list[torch.Tensor | Any]):
          torch._foreach_pow_(self, exponent)
          return self
  
+     def lazy_pow(self, other: int | float | list[int | float] | tuple[int | float]):
+         if generic_ne(other, 1): return self.pow(other)
+         return self
+     def lazy_pow_(self, other: int | float | list[int | float] | tuple[int | float]):
+         if generic_ne(other, 1): return self.pow_(other)
+         return self
+
      def rpow(self, input: _Scalar | _TensorSeq): return self.__class__(torch._foreach_pow(input, self))
      def rpow_(self, input: _TensorSeq):
          torch._foreach_pow_(input, self)
@@ -984,9 +993,6 @@ class TensorList(list[torch.Tensor | Any]):
      def where(self, condition: "torch.Tensor | _TensorSeq", other: _STOrSTSeq):
          """self where condition is true other otherwise"""
          return self.zipmap_args(_MethodCallerWithArgs('where'), condition, other)
-     def where_(self, condition: "torch.Tensor | _TensorSeq", other: "torch.Tensor | _TensorSeq"):
-         """self where condition is true other otherwise"""
-         return self.zipmap_args_inplace_(where_, condition, other)
  
      def masked_fill(self, mask: "torch.Tensor | _TensorSeq", fill_value: "_Scalar | _ScalarSeq"):
          """Same as tensor[mask] = value (not in-place), where value must be scalar/scalars"""
torchzero/utils/thoad_tools.py (new file)

@@ -0,0 +1,68 @@
+ import itertools
+ from collections.abc import Callable
+ from importlib.util import find_spec
+ from typing import TYPE_CHECKING, cast
+
+ import torch
+
+ from .python_tools import LazyLoader
+
+ lazy_thoad = LazyLoader("thoad")
+ if TYPE_CHECKING:
+     import thoad
+     lazy_thoad = cast(thoad, lazy_thoad)
+
+ def thoad_single_tensor(
+     ctrl: "thoad.Controller",
+     params: list[torch.Tensor],
+     order: int
+ ) -> torch.Tensor:
+     """treats params as if they were concatenated into a vector."""
+
+     if not all(p.requires_grad for p in params):
+         raise ValueError("All parameters must have requires_grad=True")
+
+     if order < 1:
+         raise ValueError("Order must be at least 1")
+
+     # we need parameter sizes and total size N
+     # final tensor is (N, N, ..., N) with `order` dimensions.
+     param_numels = [p.numel() for p in params]
+     total_params = sum(param_numels)
+
+     final_shape = (total_params,) * order
+     p = params[0]
+     T = torch.zeros(final_shape, device=p.device, dtype=p.dtype)
+
+     # start/end indices for each parameter in the flattened vector.
+     offsets = torch.cumsum(torch.tensor([0] + param_numels), dim=0)
+
+     # for order=2 this iterates through (p0,p0), (p0,p1), (p1,p0), (p1,p1), etc.
+     param_indices = range(len(params))
+     for block_indices in itertools.product(param_indices, repeat=order):
+
+         block_params = tuple(params[i] for i in block_indices)
+         block_tensor, _ = ctrl.fetch_hgrad(variables=block_params) # (1, *p1.shape, *p2.shape, ...).
+         block_tensor = block_tensor.squeeze(0) # (*p1.shape, *p2.shape, ...)
+
+         # convert (*p1.shape, *p2.shape) to (p1.numel(), p2.numel())
+         block_flat_shape = tuple(param_numels[i] for i in block_indices)
+         block_tensor_flat = block_tensor.reshape(block_flat_shape)
+
+         # place the flattened block into T
+         slicing = tuple(
+             slice(offsets[i], offsets[i+1]) for i in block_indices
+         )
+         T[slicing] = block_tensor_flat
+
+     ctrl.clear()
+     return T
+
+ def thoad_derivatives(
+     ctrl: "thoad.Controller",
+     params: list[torch.Tensor],
+     order: int,
+ ):
+     """returns all derivatives up to ``order`` in ascending order, all as single tensors
+     as if parameters were concatenated to a vector"""
+     return [thoad_single_tensor(ctrl, params, o) for o in range(1, order+1)]
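
The indexing scheme in `thoad_single_tensor` can be checked in isolation: cumulative offsets map each parameter onto a slice of the flattened vector, and `itertools.product` enumerates every block of the order-`k` tensor. The sketch below reuses that logic with dummy blocks in place of `ctrl.fetch_hgrad` output, so it runs without `thoad` (all values are hypothetical, for illustration only):

```python
import itertools
import torch

param_numels = [2, 3]          # pretend parameters with 2 and 3 elements, N = 5
order = 2
N = sum(param_numels)
offsets = torch.cumsum(torch.tensor([0] + param_numels), dim=0)

T = torch.zeros((N,) * order)
for block_indices in itertools.product(range(len(param_numels)), repeat=order):
    # stand-in for the flattened derivative block of this parameter pair
    shape = tuple(param_numels[i] for i in block_indices)
    block = torch.full(shape, float(sum(block_indices)))
    slicing = tuple(slice(offsets[i], offsets[i + 1]) for i in block_indices)
    T[slicing] = block

print(T)  # 5x5 tensor assembled from the 2x2, 2x3, 3x2 and 3x3 blocks
```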
{torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: torchzero
- Version: 0.3.14
+ Version: 0.4.0
  Summary: Modular optimization library for PyTorch.
  Author-email: Ivan Nikishev <nkshv2@gmail.com>
  Project-URL: Homepage, https://github.com/inikishev/torchzero