torchzero 0.3.15__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +22 -22
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +225 -214
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +2 -2
- torchzero/core/__init__.py +7 -4
- torchzero/core/chain.py +20 -23
- torchzero/core/functional.py +90 -24
- torchzero/core/modular.py +53 -57
- torchzero/core/module.py +132 -52
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +55 -24
- torchzero/core/transform.py +261 -367
- torchzero/linalg/__init__.py +11 -0
- torchzero/linalg/eigh.py +253 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +93 -0
- torchzero/{utils/linalg → linalg}/qr.py +16 -2
- torchzero/{utils/linalg → linalg}/solve.py +74 -88
- torchzero/linalg/svd.py +47 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/__init__.py +4 -3
- torchzero/modules/adaptive/__init__.py +11 -3
- torchzero/modules/adaptive/adagrad.py +167 -217
- torchzero/modules/adaptive/adahessian.py +76 -105
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +50 -31
- torchzero/modules/adaptive/adaptive_heavyball.py +12 -7
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/ggt.py +186 -0
- torchzero/modules/adaptive/lion.py +7 -11
- torchzero/modules/adaptive/lre_optimizers.py +299 -0
- torchzero/modules/adaptive/mars.py +7 -7
- torchzero/modules/adaptive/matrix_momentum.py +48 -52
- torchzero/modules/adaptive/msam.py +71 -53
- torchzero/modules/adaptive/muon.py +67 -129
- torchzero/modules/adaptive/natural_gradient.py +63 -41
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/psgd/__init__.py +5 -0
- torchzero/modules/adaptive/psgd/_psgd_utils.py +37 -0
- torchzero/modules/adaptive/psgd/psgd.py +1390 -0
- torchzero/modules/adaptive/psgd/psgd_dense_newton.py +174 -0
- torchzero/modules/adaptive/psgd/psgd_kron_newton.py +203 -0
- torchzero/modules/adaptive/psgd/psgd_kron_whiten.py +185 -0
- torchzero/modules/adaptive/psgd/psgd_lra_newton.py +118 -0
- torchzero/modules/adaptive/psgd/psgd_lra_whiten.py +116 -0
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +149 -130
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +22 -25
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +27 -38
- torchzero/modules/experimental/__init__.py +7 -6
- torchzero/modules/experimental/adanystrom.py +258 -0
- torchzero/modules/experimental/common_directions_whiten.py +142 -0
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/cubic_adam.py +160 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/eigen_sr1.py +182 -0
- torchzero/modules/experimental/eigengrad.py +207 -0
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/higher_order_newton.py +14 -40
- torchzero/modules/experimental/l_infinity.py +1 -1
- torchzero/modules/experimental/matrix_nag.py +122 -0
- torchzero/modules/experimental/newton_solver.py +23 -54
- torchzero/modules/experimental/newtonnewton.py +45 -48
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +3 -3
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/grad_approximation/fdm.py +2 -2
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +24 -21
- torchzero/modules/least_squares/gn.py +121 -50
- torchzero/modules/line_search/backtracking.py +4 -4
- torchzero/modules/line_search/line_search.py +33 -33
- torchzero/modules/line_search/strong_wolfe.py +4 -4
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +11 -79
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +121 -123
- torchzero/modules/misc/multistep.py +52 -53
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +31 -29
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +37 -31
- torchzero/modules/momentum/momentum.py +12 -12
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +20 -20
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/{functional.py → opt_utils.py} +1 -1
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +46 -43
- torchzero/modules/quasi_newton/__init__.py +1 -1
- torchzero/modules/quasi_newton/damping.py +2 -2
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +10 -10
- torchzero/modules/quasi_newton/lsr1.py +10 -10
- torchzero/modules/quasi_newton/quasi_newton.py +54 -39
- torchzero/modules/quasi_newton/sg2.py +69 -205
- torchzero/modules/restarts/restars.py +39 -37
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/ifn.py +31 -62
- torchzero/modules/second_order/inm.py +57 -53
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +165 -196
- torchzero/modules/second_order/newton_cg.py +105 -157
- torchzero/modules/second_order/nystrom.py +216 -185
- torchzero/modules/second_order/rsn.py +132 -125
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +10 -10
- torchzero/modules/step_size/adaptive.py +24 -24
- torchzero/modules/step_size/lr.py +17 -17
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +3 -3
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +2 -2
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +23 -21
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +17 -18
- torchzero/modules/wrappers/optim_wrapper.py +14 -14
- torchzero/modules/zeroth_order/cd.py +10 -7
- torchzero/optim/mbs.py +291 -0
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -13
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +8 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/benchmarks/__init__.py +0 -0
- torchzero/utils/benchmarks/logistic.py +122 -0
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +97 -73
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/METADATA +1 -1
- torchzero-0.4.1.dist-info/RECORD +209 -0
- tests/test_vars.py +0 -185
- torchzero/core/var.py +0 -376
- torchzero/modules/adaptive/lmadagrad.py +0 -186
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.15.dist-info/RECORD +0 -175
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/WHEEL +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/top_level.txt +0 -0
torchzero/utils/compile.py
CHANGED
@@ -38,11 +38,11 @@ class _MaybeCompiledFunc:
 _optional_compiler = _OptionalCompiler()
 """this holds .enable attribute, set to True to enable compiling for a few functions that benefit from it."""

-def
+def enable_compilation(enable: bool=True):
     """`enable` is False by default. When True, certain functions will be compiled, which may not work on some systems like Windows, but it usually improves performance."""
     _optional_compiler.enable = enable

-def
+def allow_compile(fn): return _optional_compiler.enable_compilation(fn)

 def benchmark_compile_cuda(fn, n: int, **kwargs):
     # warmup
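The renamed helpers above are thin toggles around the module-level `_OptionalCompiler`. A minimal usage sketch, assuming the functions are imported straight from `torchzero.utils.compile` and that `allow_compile` returns a callable wrapper as its one-liner suggests; the decorated function is a made-up example:

```python
from torchzero.utils.compile import allow_compile, enable_compilation

enable_compilation(True)   # opt in; off by default, and torch.compile may not work on e.g. Windows

@allow_compile             # compiled only because compilation was enabled above
def lerp_sq(a, b, t):      # hypothetical toy function
    return (a + (b - a) * t) ** 2
```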
torchzero/utils/derivatives.py
CHANGED
@@ -4,9 +4,10 @@ import torch
 import torch.autograd.forward_ad as fwAD

 from .torch_tools import swap_tensors_no_use_count_check, vec_to_tensors
+from .tensorlist import TensorList

 def _jacobian(outputs: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False):
-    flat_outputs = torch.cat([i.
+    flat_outputs = torch.cat([i.ravel() for i in outputs])
     grad_ouputs = torch.eye(len(flat_outputs), device=outputs[0].device, dtype=outputs[0].dtype)
     jac = []
     for i in range(flat_outputs.numel()):
@@ -23,7 +24,7 @@ def _jacobian(outputs: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], crea


 def _jacobian_batched(outputs: Sequence[torch.Tensor], wrt: Sequence[torch.Tensor], create_graph=False):
-    flat_outputs = torch.cat([i.
+    flat_outputs = torch.cat([i.ravel() for i in outputs])
     return torch.autograd.grad(
         flat_outputs,
         wrt,
@@ -39,10 +40,10 @@ def flatten_jacobian(jacs: Sequence[torch.Tensor]) -> torch.Tensor:

     Args:
         jacs (Sequence[torch.Tensor]):
-            output from jacobian_wrt where ach tensor has the shape
+            output from jacobian_wrt where ach tensor has the shape ``(*output.shape, *wrt[i].shape)``.

     Returns:
-        torch.Tensor: has the shape
+        torch.Tensor: has the shape ``(output.ndim, wrt.ndim)``.
     """
     if not jacs:
         return torch.empty(0, 0)
@@ -261,7 +262,7 @@ def jvp_fd_central(
     params: Iterable[torch.Tensor],
     tangent: Iterable[torch.Tensor],
     h=1e-3,
-    normalize=
+    normalize=True,
 ) -> tuple[torch.Tensor | None, torch.Tensor]:
     """Jacobian vector product using central finite difference formula.

@@ -310,7 +311,7 @@ def jvp_fd_forward(
     tangent: Iterable[torch.Tensor],
     h=1e-3,
     v_0=None,
-    normalize=
+    normalize=True,
 ) -> tuple[torch.Tensor | None, torch.Tensor]:
     """Jacobian vector product using forward finite difference formula.
     Loss at initial point can be specified in the `v_0` argument.
@@ -357,52 +358,18 @@
     if normalize: res = res * tangent_norm
     return v_0, res

-def hvp(
-    params: Iterable[torch.Tensor],
-    grads: Iterable[torch.Tensor],
-    vec: Iterable[torch.Tensor],
-    retain_graph=None,
-    create_graph=False,
-    allow_unused=None,
-):
-    """Hessian-vector product
-
-    Example:
-    ```python
-    model = nn.Linear(4, 2)
-    X = torch.randn(10, 4)
-    y = torch.randn(10, 2)
-
-    y_hat = model(X)
-    loss = F.mse_loss(y_hat, y)
-    loss.backward(create_graph=True)
-
-    grads = [p.grad for p in model.parameters()]
-    vec = [torch.randn_like(p) for p in model.parameters()]
-
-    # list of tensors, same layout as model.parameters()
-    hvp(model.parameters(), grads, vec=vec)
-    ```
-    """
-    params = list(params)
-    g = list(grads)
-    vec = list(vec)
-
-    with torch.enable_grad():
-        return torch.autograd.grad(g, params, vec, create_graph=create_graph, retain_graph=retain_graph, allow_unused=allow_unused)
-

 @torch.no_grad
 def hvp_fd_central(
     closure,
     params: Iterable[torch.Tensor],
-
+    x: Iterable[torch.Tensor],
     h=1e-3,
-    normalize=
+    normalize=True,
 ) -> tuple[torch.Tensor | None, list[torch.Tensor]]:
-    """
+    """Returns ``(loss_approx, Hx)``.

-    Please note that this will clear
+    Please note that this will clear ``grad`` attributes in params.

     Example:
     ```python
@@ -424,48 +391,48 @@ def hvp_fd_central(
     ```
     """
     params = list(params)
-
+    x = list(x)

     vec_norm = None
     if normalize:
-        vec_norm = torch.linalg.vector_norm(torch.cat([t.view(-1) for t in
+        vec_norm = torch.linalg.vector_norm(torch.cat([t.view(-1) for t in x])) # pylint:disable=not-callable
         if vec_norm == 0: return None, [torch.zeros_like(p) for p in params]
-
+        x = torch._foreach_div(x, vec_norm)

-
-    torch._foreach_add_(params,
+    xh = torch._foreach_mul(x, h)
+    torch._foreach_add_(params, xh)
     with torch.enable_grad(): loss = closure()
     g_plus = [p.grad if p.grad is not None else torch.zeros_like(p) for p in params]

-    torch._foreach_sub_(params,
-    torch._foreach_sub_(params,
+    torch._foreach_sub_(params, xh)
+    torch._foreach_sub_(params, xh)
     with torch.enable_grad(): loss = closure()
     g_minus = [p.grad if p.grad is not None else torch.zeros_like(p) for p in params]

-    torch._foreach_add_(params,
+    torch._foreach_add_(params, xh)
     for p in params: p.grad = None

-
-    torch._foreach_sub_(
-    torch._foreach_div_(
+    hx = g_plus
+    torch._foreach_sub_(hx, g_minus)
+    torch._foreach_div_(hx, 2*h)

-    if normalize: torch._foreach_mul_(
-    return loss,
+    if normalize: torch._foreach_mul_(hx, vec_norm)
+    return loss, hx

 @torch.no_grad
 def hvp_fd_forward(
     closure,
     params: Iterable[torch.Tensor],
-
+    x: Iterable[torch.Tensor],
     h=1e-3,
     g_0=None,
-    normalize=
+    normalize=True,
 ) -> tuple[torch.Tensor | None, list[torch.Tensor]]:
-    """
+    """Returns ``(loss_approx, Hx)``.

-    Gradient at initial point can be specified in the
+    Gradient at initial point can be specified in the ``g_0`` argument.

-    Please note that this will clear
+    Please note that this will clear ``grad`` attributes in params.

     Example:
     ```python
@@ -492,16 +459,16 @@ def hvp_fd_forward(
     """

     params = list(params)
-
+    x = list(x)
     loss = None

     vec_norm = None
     if normalize:
-        vec_norm = torch.linalg.vector_norm(torch.cat([t.ravel() for t in
+        vec_norm = torch.linalg.vector_norm(torch.cat([t.ravel() for t in x])) # pylint:disable=not-callable
         if vec_norm == 0: return None, [torch.zeros_like(p) for p in params]
-
+        x = torch._foreach_div(x, vec_norm)

-
+    xh = torch._foreach_mul(x, h)

     if g_0 is None:
         with torch.enable_grad(): loss = closure()
@@ -509,18 +476,75 @@
     else:
         g_0 = list(g_0)

-    torch._foreach_add_(params,
+    torch._foreach_add_(params, xh)
     with torch.enable_grad():
         l = closure()
         if loss is None: loss = l
     g_plus = [p.grad if p.grad is not None else torch.zeros_like(p) for p in params]

-    torch._foreach_sub_(params,
+    torch._foreach_sub_(params, xh)
     for p in params: p.grad = None

-
-    torch._foreach_sub_(
-    torch._foreach_div_(
+    hx = g_plus
+    torch._foreach_sub_(hx, g_0)
+    torch._foreach_div_(hx, h)
+
+    if normalize: torch._foreach_mul_(hx, vec_norm)
+    return loss, hx
+
+@torch.no_grad
+def hessian_fd(fn, params: Sequence[torch.Tensor], eps: float = 1e-4, full: bool = True):
+    """returns ``f(x), g(x), H(x)``, where ``g(x)`` is a tensor list.
+
+    Number of evals for full is: 4n^2 - 2n + 1
+
+    Number of evals for upper is: 2n^2 + 1.
+    """
+    params = TensorList(params)
+    p_0 = params.clone()
+    n = sum(t.numel() for t in params)
+    device = params[0].device; dtype = params[0].dtype
+    fx = fn()
+    g = params.zeros_like()
+    H = torch.zeros((n, n), device=device, dtype=dtype)
+
+    for i in range(n):
+        for j in (range(n) if full else range(i, n)):
+            if i == j:
+                params.flat_set_lambda_(i, lambda x: x + eps)
+                f_plus = fn()
+
+                params.flat_set_lambda_(i, lambda x: x - 2 * eps)
+                f_minus = fn()
+
+                # params.flat_set_lambda_(i, lambda x: x + eps)
+                g.flat_set_(i, (f_plus - f_minus) / (2*eps))
+                H[i, i] = (f_plus - 2 * fx + f_minus) / (eps ** 2)
+
+            else:
+                params.flat_set_lambda_(i, lambda x: x + eps)
+                params.flat_set_lambda_(j, lambda x: x + eps)
+                f_pp = fn()
+
+                params.flat_set_lambda_(i, lambda x: x - 2 * eps)
+                f_np = fn()
+
+                params.flat_set_lambda_(j, lambda x: x - 2 * eps)
+                f_nn = fn()
+
+                params.flat_set_lambda_(i, lambda x: x + 2 * eps)
+                f_pn = fn()
+
+                # params.flat_set_lambda_(i, lambda x: x - eps)
+                # params.flat_set_lambda_(j, lambda x: x + eps)
+
+                H[i, j] = (f_pp - f_np - f_pn + f_nn) / (4 * eps ** 2)
+                if not full: H[j, i] = H[i, j]
+
+            params.copy_(p_0) # otherwise inaccuracy builds up
+
+    if full:
+        H = H + H.T
+        H /= 2

-
-    return loss, hvp_
+    return fx, g, H
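The reworked `hvp_fd_central` implements the standard central-difference Hessian-vector product, H·x ≈ (∇f(θ + h·x) − ∇f(θ − h·x)) / (2h), with the two gradients coming from the closure and combined via `_foreach` ops. A standalone sketch of the same formula on a toy quadratic, using plain autograd instead of torchzero's closure machinery (all names below are for illustration only):

```python
import torch

def hvp_fd_central_sketch(f, theta: torch.Tensor, x: torch.Tensor, h: float = 1e-3) -> torch.Tensor:
    """Central-difference HVP: (grad f(theta + h*x) - grad f(theta - h*x)) / (2h)."""
    def grad_at(point):
        point = point.detach().requires_grad_(True)
        return torch.autograd.grad(f(point), point)[0]
    return (grad_at(theta + h * x) - grad_at(theta - h * x)) / (2 * h)

A = torch.randn(5, 5); A = A @ A.T                 # symmetric, so the Hessian of f is exactly A
f = lambda p: 0.5 * p @ A @ p
theta, x = torch.randn(5), torch.randn(5)
print(torch.allclose(hvp_fd_central_sketch(f, theta, x), A @ x, atol=1e-2))
```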
torchzero/utils/optimizer.py
CHANGED
@@ -64,22 +64,15 @@ def get_group_vals(param_groups: Iterable[Mapping[str, Any]],
         values[i].extend(group_value for _ in range(num_params))
     return values

-
-Init = _InitLiterals | Any | list[_InitLiterals | Any] | tuple[_InitLiterals | Any]
+Init = Any

-def _make_initial_state_value(
-    if callable(init): return init(
+def _make_initial_state_value(tensor: torch.Tensor, init: Init, i: int | None):
+    if callable(init): return init(tensor)
     if isinstance(init, torch.Tensor): return init.detach().clone()

-    if isinstance(init, str):
-        if init in ('param','params'): return param.detach().clone()
-        if init in ('grad', 'grads'):
-            if param.grad is None: raise RuntimeError('init is set to "grad, but param.grad is None"')
-            return param.grad.detach().clone()
-
     if isinstance(init, (list,tuple)):
         if i is None: raise RuntimeError(f'init is per-parameter ({type(init)}) but parameter index i is None')
-        return _make_initial_state_value(
+        return _make_initial_state_value(tensor, init[i], None)

     return init

@@ -133,72 +126,6 @@ def get_state_vals(state: Mapping[torch.Tensor, MutableMapping[str, Any]], param
     return values


-class Optimizer(torch.optim.Optimizer, ABC):
-    """subclass of torch.optim.Optimizer with some helper methods for fast experimentation, it's not used anywhere in torchzero.
-
-    Args:
-        params (iterable): an iterable of :class:`torch.Tensor` s or
-            :class:`dict` s. Specifies what Tensors should be optimized.
-        defaults (dict | None): a dict containing default values of optimization
-            options (used when a parameter group doesn't specify them).
-    """
-    def __init__(self, params, defaults: dict[str, Any] | None = None, **_defaults):
-        if defaults is None: defaults = {}
-        defaults.update(_defaults)
-
-        super().__init__(params, defaults)
-        self.global_state = self.state[self.param_groups[0]['params'][0]]
-        """state of 1st parameter, can be used as global state which is how L-BFGS uses it in pytorch, and there is some kind of good reason to do it like that"""
-
-    def get_params(self, mode: ParamFilter = 'requires_grad', cls: type[ListLike] = TensorList) -> ListLike:
-        return get_params(self.param_groups, mode, cls)
-
-    @overload
-    def group_vals(self, key: str, *,
-                   mode: ParamFilter = 'requires_grad', cls: type[ListLike] = NumberList) -> ListLike: ...
-    @overload
-    def group_vals(self, key: list[str] | tuple[str,...], *,
-                   mode: ParamFilter = 'requires_grad', cls: type[ListLike] = NumberList) -> list[ListLike]: ...
-    @overload
-    def group_vals(self, key: str, key2: str, *keys: str,
-                   mode: ParamFilter = 'requires_grad', cls: type[ListLike] = NumberList) -> list[ListLike]: ...
-
-    def group_vals(self, key: str | list[str] | tuple[str,...], key2: str | None = None, *keys: str,
-                   mode: ParamFilter = 'requires_grad', cls: type[ListLike] = NumberList) -> ListLike | list[ListLike]:
-        return get_group_vals(self.param_groups, key, key2, *keys, mode = mode, cls = cls) # pyright:ignore[reportArgumentType]
-
-
-    @overload
-    def state_vals(self, key: str, *,
-                   init: Init = torch.zeros_like,
-                   mode: ParamFilter | list[torch.Tensor] | tuple[torch.Tensor, ...] = 'requires_grad',
-                   cls: type[ListLike] = TensorList) -> ListLike: ...
-    @overload
-    def state_vals(self, key: list[str] | tuple[str,...], *,
-                   init: Init | Sequence[Init] = torch.zeros_like,
-                   mode: ParamFilter | list[torch.Tensor] | tuple[torch.Tensor, ...] = 'requires_grad',
-                   cls: type[ListLike] = TensorList) -> list[ListLike]: ...
-    @overload
-    def state_vals(self, key: str, key2: str, *keys: str,
-                   init: Init | Sequence[Init] = torch.zeros_like,
-                   mode: ParamFilter | list[torch.Tensor] | tuple[torch.Tensor, ...] = 'requires_grad',
-                   cls: type[ListLike] = TensorList) -> list[ListLike]: ...
-
-    def state_vals(self, key: str | list[str] | tuple[str,...], key2: str | None = None, *keys: str,
-                   init: Init | Sequence[Init] = torch.zeros_like,
-                   mode: ParamFilter | list[torch.Tensor] | tuple[torch.Tensor, ...] = 'requires_grad',
-                   cls: type[ListLike] = TensorList) -> ListLike | list[ListLike]:
-
-        if isinstance(mode, (list,tuple)): params = mode
-        else: params = self.get_params(mode)
-
-        return get_state_vals(self.state, params, key, key2, *keys, init = init, cls = cls) # type:ignore[reportArgumentType]
-
-
-    # shut up pylance
-    @abstractmethod
-    def step(self, closure) -> Any: ... # pylint:disable=signature-differs # pyright:ignore[reportIncompatibleMethodOverride]
-
 def zero_grad_(params: Iterable[torch.Tensor], set_to_none):
     if set_to_none:
         for p in params:
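The simplified `_make_initial_state_value` above resolves an `init` spec that may be a callable, a tensor, a per-parameter list/tuple, or a plain value; the old string forms (`'param'`, `'grad'`) are gone. A standalone sketch of that resolution order (not torchzero's function, just the same idea under a hypothetical name):

```python
import torch

def resolve_init(tensor: torch.Tensor, init, i: int | None = None):
    if callable(init): return init(tensor)                          # e.g. torch.zeros_like
    if isinstance(init, torch.Tensor): return init.detach().clone() # detached copy of a given tensor
    if isinstance(init, (list, tuple)):                             # per-parameter specs, pick entry i
        return resolve_init(tensor, init[i], None)
    return init                                                     # plain value, used as-is

p = torch.randn(3)
print(resolve_init(p, torch.zeros_like))              # zeros shaped like p
print(resolve_init(p, 0.9))                           # 0.9
print(resolve_init(p, [torch.ones_like, 0.5], i=0))   # ones shaped like p
```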
torchzero/utils/python_tools.py
CHANGED
@@ -1,3 +1,4 @@
+import importlib
 import functools
 import operator
 from typing import Any, TypeVar, overload
@@ -40,6 +41,11 @@ def generic_ne(x: int | float | Iterable[int | float], y: int | float | Iterable
         return any(i!=y for i in x)
     return any(i!=j for i,j in zip(x,y))

+def generic_is_none(x: Any | Iterable[Any]):
+    """returns True if x is None or iterable with all elements set to None"""
+    if x is None: return True
+    if isinstance(x, Iterable): return all(i is None for i in x)
+    return False

 def zipmap(self, fn: Callable, other: Any | list | tuple, *args, **kwargs):
     """If `other` is list/tuple, applies `fn` to self zipped with `other`.
@@ -68,3 +74,28 @@ def safe_dict_update_(d1_:dict, d2:dict):
     if len(inter) > 0: raise RuntimeError(f"Duplicate keys {inter}")
     d1_.update(d2)

+# lazy loader from https://stackoverflow.com/a/78312674/15673832
+class LazyLoader:
+    'thin shell class to wrap modules. load real module on first access and pass thru'
+
+    def __init__(self, modname):
+        self._modname = modname
+        self._mod = None
+
+    def __getattr__(self, attr):
+        'import module on first attribute access'
+
+        try:
+            return getattr(self._mod, attr)
+
+        except Exception as e :
+            if self._mod is None :
+                # module is unset, load it
+                self._mod = importlib.import_module (self._modname)
+            else :
+                # module is set, got different exception from getattr (). reraise it
+                raise e
+
+        # retry getattr if module was just loaded for first time
+        # call this outside exception handler in case it raises new exception
+        return getattr (self._mod, attr)
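A short usage sketch for the new `LazyLoader`: the wrapped module is imported only on first attribute access, so optional backends stay uninstalled until they are actually used. `scipy.optimize` is just an example target here, not something this diff itself wires up:

```python
from torchzero.utils.python_tools import LazyLoader

lazy_opt = LazyLoader("scipy.optimize")   # nothing imported yet

def minimize_rosenbrock():
    # first attribute access triggers importlib.import_module("scipy.optimize")
    rosen = lambda x: (1 - x[0]) ** 2 + 100 * (x[1] - x[0] ** 2) ** 2
    return lazy_opt.minimize(rosen, x0=[0.0, 0.0])
```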
torchzero/utils/tensorlist.py
CHANGED
@@ -22,7 +22,6 @@ from typing_extensions import Self, TypeAlias, Unpack

 from .metrics import Metrics, evaluate_metric, calculate_metric_list
 from .numberlist import NumberList, as_numberlist, maybe_numberlist
-from .ops import where_
 from .python_tools import generic_ne, zipmap

 _Scalar = int | float | bool | complex
@@ -346,6 +345,10 @@ class TensorList(list[torch.Tensor | Any]):
     def global_all(self): return builtins.all(self.all())
     def global_numel(self) -> int: return builtins.sum(self.numel())

+    def global_allclose(self, other: _TensorSeq, rtol: float = 0.00001, atol: float = 1e-8, equal_nan: bool = False) -> bool:
+        bools = self.zipmap_args(torch.allclose, other, rtol, atol, equal_nan)
+        return all(bools)
+
     def empty_like(self, **kwargs: Unpack[_NewTensorKwargs]): return self.__class__(torch.empty_like(i, **kwargs) for i in self)
     def zeros_like(self, **kwargs: Unpack[_NewTensorKwargs]): return self.__class__(torch.zeros_like(i, **kwargs) for i in self)
     def ones_like(self, **kwargs: Unpack[_NewTensorKwargs]): return self.__class__(torch.ones_like(i, **kwargs) for i in self)
@@ -509,7 +512,6 @@ class TensorList(list[torch.Tensor | Any]):
         torch._foreach_mul_(self, other)
         return self

-    # TODO: benchmark
     def lazy_mul(self, other: int | float | list[int | float] | tuple[int | float], clone=False):
         if generic_ne(other, 1):
             return self * other
@@ -536,6 +538,13 @@
         torch._foreach_pow_(self, exponent)
         return self

+    def lazy_pow(self, other: int | float | list[int | float] | tuple[int | float]):
+        if generic_ne(other, 1): return self.pow(other)
+        return self
+    def lazy_pow_(self, other: int | float | list[int | float] | tuple[int | float]):
+        if generic_ne(other, 1): return self.pow_(other)
+        return self
+
     def rpow(self, input: _Scalar | _TensorSeq): return self.__class__(torch._foreach_pow(input, self))
     def rpow_(self, input: _TensorSeq):
         torch._foreach_pow_(input, self)
@@ -984,9 +993,6 @@
     def where(self, condition: "torch.Tensor | _TensorSeq", other: _STOrSTSeq):
         """self where condition is true other otherwise"""
         return self.zipmap_args(_MethodCallerWithArgs('where'), condition, other)
-    def where_(self, condition: "torch.Tensor | _TensorSeq", other: "torch.Tensor | _TensorSeq"):
-        """self where condition is true other otherwise"""
-        return self.zipmap_args_inplace_(where_, condition, other)

     def masked_fill(self, mask: "torch.Tensor | _TensorSeq", fill_value: "_Scalar | _ScalarSeq"):
         """Same as tensor[mask] = value (not in-place), where value must be scalar/scalars"""
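The new `lazy_pow`/`lazy_pow_` follow the same pattern as the existing `lazy_mul`: `generic_ne(other, 1)` guards the `_foreach` call so that an exponent of 1 (or a list of all-ones) is a no-op. A minimal sketch of that pattern outside the `TensorList` class, with hypothetical `_sketch` names:

```python
import torch

def generic_ne_sketch(x, y) -> bool:
    # True if x (a scalar or an iterable of scalars) differs from y anywhere
    if isinstance(x, (list, tuple)): return any(i != y for i in x)
    return x != y

def lazy_pow_sketch(tensors: list[torch.Tensor], exponent):
    if generic_ne_sketch(exponent, 1):
        return list(torch._foreach_pow(tensors, exponent))  # fused foreach kernel
    return tensors                                          # exponent is 1 everywhere: skip the work

ts = [torch.randn(3), torch.randn(2, 2)]
assert lazy_pow_sketch(ts, 1) is ts      # skipped, same list object returned
squared = lazy_pow_sketch(ts, 2)         # actually computed
```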
torchzero/utils/thoad_tools.py
ADDED
@@ -0,0 +1,68 @@
+import itertools
+from collections.abc import Callable
+from importlib.util import find_spec
+from typing import TYPE_CHECKING, cast
+
+import torch
+
+from .python_tools import LazyLoader
+
+lazy_thoad = LazyLoader("thoad")
+if TYPE_CHECKING:
+    import thoad
+    lazy_thoad = cast(thoad, lazy_thoad)
+
+def thoad_single_tensor(
+    ctrl: "thoad.Controller",
+    params: list[torch.Tensor],
+    order: int
+) -> torch.Tensor:
+    """treats params as if they were concatenated into a vector."""
+
+    if not all(p.requires_grad for p in params):
+        raise ValueError("All parameters must have requires_grad=True")
+
+    if order < 1:
+        raise ValueError("Order must be at least 1")
+
+    # we need parameter sizes and total size N
+    # final tensor is (N, N, ..., N) with `order` dimensions.
+    param_numels = [p.numel() for p in params]
+    total_params = sum(param_numels)
+
+    final_shape = (total_params,) * order
+    p = params[0]
+    T = torch.zeros(final_shape, device=p.device, dtype=p.dtype)
+
+    # start/end indices for each parameter in the flattened vector.
+    offsets = torch.cumsum(torch.tensor([0] + param_numels), dim=0)
+
+    # for order=2 this iterates through (p0,p0), (p0,p1), (p1,p0), (p1,p1), etc.
+    param_indices = range(len(params))
+    for block_indices in itertools.product(param_indices, repeat=order):
+
+        block_params = tuple(params[i] for i in block_indices)
+        block_tensor, _ = ctrl.fetch_hgrad(variables=block_params) # (1, *p1.shape, *p2.shape, ...).
+        block_tensor = block_tensor.squeeze(0) # (*p1.shape, *p2.shape, ...)
+
+        # convert (*p1.shape, *p2.shape) to (p1.numel(), p2.numel())
+        block_flat_shape = tuple(param_numels[i] for i in block_indices)
+        block_tensor_flat = block_tensor.reshape(block_flat_shape)
+
+        # place the flattened block into T
+        slicing = tuple(
+            slice(offsets[i], offsets[i+1]) for i in block_indices
+        )
+        T[slicing] = block_tensor_flat
+
+    ctrl.clear()
+    return T
+
+def thoad_derivatives(
+    ctrl: "thoad.Controller",
+    params: list[torch.Tensor],
+    order: int,
+):
+    """returns all derivatives up to ``order`` in ascending order, all as single tensors
+    as if parameters were concatenated to a vector"""
+    return [thoad_single_tensor(ctrl, params, o) for o in range(1, order+1)]
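The block-assembly loop in `thoad_single_tensor` is independent of thoad itself: each derivative block of shape `(*p_i.shape, ..., *p_k.shape)` is flattened and written into the `(N, ..., N)` result at offsets given by cumulative parameter sizes. A sketch of that placement for `order=2`, with random dummy blocks standing in for `ctrl.fetch_hgrad`:

```python
import itertools
import torch

params = [torch.randn(2, 3), torch.randn(4)]     # numels 6 and 4, so N = 10
numels = [p.numel() for p in params]
offsets = [0]
for n in numels: offsets.append(offsets[-1] + n) # cumulative start indices: [0, 6, 10]
N = sum(numels)

H = torch.zeros(N, N)
for i, j in itertools.product(range(len(params)), repeat=2):
    # stand-in for ctrl.fetch_hgrad(...): a block shaped (*params[i].shape, *params[j].shape)
    block = torch.randn(*params[i].shape, *params[j].shape)
    H[offsets[i]:offsets[i + 1], offsets[j]:offsets[j + 1]] = block.reshape(numels[i], numels[j])

print(H.shape)  # torch.Size([10, 10])
```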