torchzero 0.3.14__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +2 -2
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +47 -36
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +8 -2
- torchzero/core/chain.py +47 -0
- torchzero/core/functional.py +103 -0
- torchzero/core/modular.py +233 -0
- torchzero/core/module.py +132 -643
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +56 -23
- torchzero/core/transform.py +261 -365
- torchzero/linalg/__init__.py +10 -0
- torchzero/linalg/eigh.py +34 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +132 -34
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +95 -0
- torchzero/{utils/linalg → linalg}/qr.py +4 -2
- torchzero/{utils/linalg → linalg}/solve.py +76 -88
- torchzero/linalg/svd.py +20 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/__init__.py +0 -1
- torchzero/modules/adaptive/__init__.py +1 -1
- torchzero/modules/adaptive/adagrad.py +163 -213
- torchzero/modules/adaptive/adahessian.py +74 -103
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +49 -30
- torchzero/modules/adaptive/adaptive_heavyball.py +11 -6
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/lion.py +5 -10
- torchzero/modules/adaptive/lmadagrad.py +87 -32
- torchzero/modules/adaptive/mars.py +5 -5
- torchzero/modules/adaptive/matrix_momentum.py +47 -51
- torchzero/modules/adaptive/msam.py +70 -52
- torchzero/modules/adaptive/muon.py +59 -124
- torchzero/modules/adaptive/natural_gradient.py +33 -28
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +123 -129
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +15 -18
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +26 -37
- torchzero/modules/experimental/__init__.py +3 -6
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/{higher_order → experimental}/higher_order_newton.py +14 -40
- torchzero/modules/experimental/newton_solver.py +22 -53
- torchzero/modules/experimental/newtonnewton.py +20 -17
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +5 -5
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/functional.py +8 -1
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +20 -17
- torchzero/modules/least_squares/gn.py +90 -42
- torchzero/modules/line_search/__init__.py +1 -1
- torchzero/modules/line_search/_polyinterp.py +3 -1
- torchzero/modules/line_search/adaptive.py +3 -3
- torchzero/modules/line_search/backtracking.py +3 -3
- torchzero/modules/line_search/interpolation.py +160 -0
- torchzero/modules/line_search/line_search.py +42 -51
- torchzero/modules/line_search/strong_wolfe.py +5 -5
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +10 -78
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +120 -122
- torchzero/modules/misc/multistep.py +63 -61
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +30 -28
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +34 -28
- torchzero/modules/momentum/momentum.py +11 -11
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +19 -19
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +43 -43
- torchzero/modules/quasi_newton/__init__.py +2 -0
- torchzero/modules/quasi_newton/damping.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +7 -7
- torchzero/modules/quasi_newton/lsr1.py +7 -7
- torchzero/modules/quasi_newton/quasi_newton.py +25 -16
- torchzero/modules/quasi_newton/sg2.py +292 -0
- torchzero/modules/restarts/restars.py +26 -24
- torchzero/modules/second_order/__init__.py +6 -3
- torchzero/modules/second_order/ifn.py +58 -0
- torchzero/modules/second_order/inm.py +101 -0
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +105 -228
- torchzero/modules/second_order/newton_cg.py +102 -154
- torchzero/modules/second_order/nystrom.py +158 -178
- torchzero/modules/second_order/rsn.py +237 -0
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +11 -10
- torchzero/modules/step_size/adaptive.py +23 -23
- torchzero/modules/step_size/lr.py +15 -15
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +2 -2
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +1 -1
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +21 -18
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +12 -13
- torchzero/modules/wrappers/optim_wrapper.py +57 -50
- torchzero/modules/zeroth_order/cd.py +9 -6
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -4
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +6 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +112 -88
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/METADATA +1 -1
- torchzero-0.4.0.dist-info/RECORD +191 -0
- tests/test_vars.py +0 -185
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/modules/higher_order/__init__.py +0 -1
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.14.dist-info/RECORD +0 -167
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/WHEEL +0 -0
- {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/top_level.txt +0 -0
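The rename entries above move the linear-algebra helpers out of `torchzero/utils/linalg` into a new top-level `torchzero/linalg` package. A minimal sketch of what that import migration looks like for downstream code, assuming the public module paths follow the file moves (only names that actually appear in the hunks below are shown):

```python
# torchzero 0.3.14 (removed): helpers lived under torchzero.utils.linalg
#   from torchzero.utils.linalg.linear_operator import LinearOperator

# torchzero 0.4.0: linalg is a top-level subpackage
# (see the renamed linear_operator.py, qr.py and solve.py entries above)
from torchzero.linalg.linear_operator import LinearOperator, ScaledIdentity
from torchzero.linalg import linear_operator
```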
torchzero/modules/smoothing/laplacian.py

@@ -4,7 +4,7 @@ from collections.abc import Iterable
 import torch
 
 from ...utils.tensorlist import TensorList
-from ...core import …
+from ...core import TensorTransform
 
 
 def vector_laplacian_smoothing(input: torch.Tensor, sigma: float = 1) -> torch.Tensor:

@@ -55,7 +55,7 @@ def _precompute_denominator(tensor: torch.Tensor, sigma) -> torch.Tensor:
     v[-1] = 1
     return 1 - sigma * torch.fft.fft(v) # pylint: disable = not-callable
 
-class LaplacianSmoothing(Transform):
+class LaplacianSmoothing(TensorTransform):
     """Applies laplacian smoothing via a fast Fourier transform solver which can improve generalization.
 
     Args:

@@ -70,29 +70,30 @@ class LaplacianSmoothing(Transform):
             what to set on var.
 
     Examples:
-…
+        Laplacian Smoothing Gradient Descent optimizer as in the paper
 
-…
+        ```python
 
-…
-…
-…
-…
-…
+        opt = tz.Modular(
+            model.parameters(),
+            tz.m.LaplacianSmoothing(),
+            tz.m.LR(1e-2),
+        )
+        ```
 
     Reference:
        Osher, S., Wang, B., Yin, P., Luo, X., Barekat, F., Pham, M., & Lin, A. (2022). Laplacian smoothing gradient descent. Research in the Mathematical Sciences, 9(3), 55.
 
    """
-    def __init__(self, sigma:float = 1, layerwise=True, min_numel = 4…
+    def __init__(self, sigma:float = 1, layerwise=True, min_numel = 4):
         defaults = dict(sigma = sigma, layerwise=layerwise, min_numel=min_numel)
-        super().__init__(defaults…
+        super().__init__(defaults)
         # precomputed denominator for when layerwise=False
         self.global_state['full_denominator'] = None
 
 
     @torch.no_grad
-    def …
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         layerwise = settings[0]['layerwise']
 
         # layerwise laplacian smoothing
torchzero/modules/smoothing/sampling.py

@@ -7,14 +7,15 @@ from typing import Literal, cast
 
 import torch
 
-from ...core import Chainable, Modular, Module, …
+from ...core import Chainable, Modular, Module, Objective
 from ...core.reformulation import Reformulation
 from ...utils import Distributions, NumberList, TensorList
 from ..termination import TerminationCriteriaBase, make_termination_criteria
 
 
-def _reset_except_self(…
-…
+def _reset_except_self(objective: Objective, modules, self: Module):
+    assert objective.modular is not None
+    for m in objective.modular.flat_modules:
         if m is not self:
             m.reset()
 

@@ -98,15 +99,15 @@ class GradientSampling(Reformulation):
         self.set_child('termination', make_termination_criteria(extra=termination))
 
     @torch.no_grad
-    def pre_step(self, …
-        params = TensorList(…
+    def pre_step(self, objective):
+        params = TensorList(objective.params)
 
         fixed = self.defaults['fixed']
 
         # check termination criteria
         if 'termination' in self.children:
             termination = cast(TerminationCriteriaBase, self.children['termination'])
-            if termination.should_terminate(…
+            if termination.should_terminate(objective):
 
                 # decay sigmas
                 states = [self.state[p] for p in params]

@@ -118,7 +119,7 @@ class GradientSampling(Reformulation):
 
                 # reset on sigmas decay
                 if self.defaults['reset_on_termination']:
-…
+                    objective.post_step_hooks.append(partial(_reset_except_self, self=self))
 
                 # clear perturbations
                 self.global_state.pop('perts', None)

@@ -136,7 +137,7 @@ class GradientSampling(Reformulation):
         self.global_state['perts'] = perts
 
     @torch.no_grad
-    def closure(self, backward, closure, params, …
+    def closure(self, backward, closure, params, objective):
         params = TensorList(params)
         loss_agg = None
         grad_agg = None

@@ -160,7 +161,7 @@ class GradientSampling(Reformulation):
 
         # evaluate at x_0
         if include_x0:
-            f_0 = …
+            f_0 = objective.get_loss(backward=backward)
 
             isfinite = math.isfinite(f_0)
             if isfinite:

@@ -168,7 +169,7 @@ class GradientSampling(Reformulation):
                 loss_agg = f_0
 
             if backward:
-                g_0 = …
+                g_0 = objective.get_grads()
                 if isfinite: grad_agg = g_0
 
         # evaluate at x_0 + p for each perturbation
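The hunks above thread an `Objective` through `GradientSampling` in place of the old positional arguments; the loss and gradients are pulled from it with `get_loss` and `get_grads`. A rough sketch of that access pattern, using only the accessors visible in this diff (the helper function itself is hypothetical):

```python
def evaluate_objective(objective, backward=True):
    """Hypothetical helper mirroring GradientSampling.closure above."""
    loss = objective.get_loss(backward=backward)          # evaluates the closure at the current parameters
    grads = objective.get_grads() if backward else None   # per-parameter gradient list
    return loss, grads
```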
torchzero/modules/step_size/adaptive.py

@@ -5,9 +5,9 @@ from typing import Any, Literal
 
 import torch
 
-from ...core import Chainable, …
+from ...core import Chainable, TensorTransform
 from ...utils import NumberList, TensorList, tofloat, unpack_dicts, unpack_states
-from ...…
+from ...linalg.linear_operator import ScaledIdentity
 from ..functional import epsilon_step_size
 
 def _acceptable_alpha(alpha, param:torch.Tensor):

@@ -16,7 +16,7 @@ def _acceptable_alpha(alpha, param:torch.Tensor):
         return False
     return True
 
-def _get_H(self: Transform, var):
+def _get_H(self: TensorTransform, var):
     n = sum(p.numel() for p in var.params)
     p = var.params[0]
     alpha = self.global_state.get('alpha', 1)

@@ -25,7 +25,7 @@ def _get_H(self: Transform, var):
     return ScaledIdentity(1 / alpha, shape=(n,n), device=p.device, dtype=p.dtype)
 
 
-class PolyakStepSize(Transform):
+class PolyakStepSize(TensorTransform):
     """Polyak's subgradient method with known or unknown f*.
 
     Args:

@@ -47,7 +47,7 @@ class PolyakStepSize(Transform):
         super().__init__(defaults, uses_grad=use_grad, uses_loss=True, inner=inner)
 
     @torch.no_grad
-    def …
+    def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
         assert grads is not None and loss is not None
         tensors = TensorList(tensors)
         grads = TensorList(grads)

@@ -79,15 +79,15 @@ class PolyakStepSize(Transform):
         self.global_state['alpha'] = alpha
 
     @torch.no_grad
-    def …
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         alpha = self.global_state.get('alpha', 1)
         if not _acceptable_alpha(alpha, tensors[0]): alpha = epsilon_step_size(TensorList(tensors))
 
         torch._foreach_mul_(tensors, alpha * unpack_dicts(settings, 'alpha', cls=NumberList))
         return tensors
 
-    def get_H(self, …
-        return _get_H(self, …
+    def get_H(self, objective):
+        return _get_H(self, objective)
 
 
 def _bb_short(s: TensorList, y: TensorList, sy, eps):

@@ -116,7 +116,7 @@ def _bb_geom(s: TensorList, y: TensorList, sy, eps, fallback:bool):
         return None
     return (short * long) ** 0.5
 
-class BarzilaiBorwein(Transform):
+class BarzilaiBorwein(TensorTransform):
     """Barzilai-Borwein step size method.
 
     Args:

@@ -144,7 +144,7 @@ class BarzilaiBorwein(Transform):
         self.global_state['reset'] = True
 
     @torch.no_grad
-    def …
+    def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1
 

@@ -175,11 +175,11 @@ class BarzilaiBorwein(Transform):
         prev_p.copy_(params)
         prev_g.copy_(g)
 
-    def get_H(self, …
-        return _get_H(self, …
+    def get_H(self, objective):
+        return _get_H(self, objective)
 
     @torch.no_grad
-    def …
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         alpha = self.global_state.get('alpha', None)
 
         if not _acceptable_alpha(alpha, tensors[0]):

@@ -189,7 +189,7 @@ class BarzilaiBorwein(Transform):
         return tensors
 
 
-class BBStab(Transform):
+class BBStab(TensorTransform):
     """Stabilized Barzilai-Borwein method (https://arxiv.org/abs/1907.06409).
 
     This clips the norm of the Barzilai-Borwein update by ``delta``, where ``delta`` can be adaptive if ``c`` is specified.

@@ -228,7 +228,7 @@ class BBStab(Transform):
         self.global_state['reset'] = True
 
     @torch.no_grad
-    def …
+    def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1
 

@@ -287,11 +287,11 @@ class BBStab(Transform):
         prev_p.copy_(params)
         prev_g.copy_(g)
 
-    def get_H(self, …
-        return _get_H(self, …
+    def get_H(self, objective):
+        return _get_H(self, objective)
 
     @torch.no_grad
-    def …
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         alpha = self.global_state.get('alpha', None)
 
         if not _acceptable_alpha(alpha, tensors[0]):

@@ -301,7 +301,7 @@ class BBStab(Transform):
         return tensors
 
 
-class AdGD(Transform):
+class AdGD(TensorTransform):
     """AdGD and AdGD-2 (https://arxiv.org/abs/2308.02261)"""
     def __init__(self, variant:Literal[1,2]=2, alpha_0:float = 1e-7, sqrt:bool=True, use_grad=True, inner: Chainable | None = None,):
         defaults = dict(variant=variant, alpha_0=alpha_0, sqrt=sqrt)

@@ -313,7 +313,7 @@ class AdGD(Transform):
         self.global_state['reset'] = True
 
     @torch.no_grad
-    def …
+    def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
         variant = settings[0]['variant']
         theta_0 = 0 if variant == 1 else 1/3
         theta = self.global_state.get('theta', theta_0)

@@ -371,7 +371,7 @@ class AdGD(Transform):
         prev_g.copy_(g)
 
     @torch.no_grad
-    def …
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         alpha = self.global_state.get('alpha', None)
 
         if not _acceptable_alpha(alpha, tensors[0]):

@@ -383,5 +383,5 @@ class AdGD(Transform):
         torch._foreach_mul_(tensors, alpha)
         return tensors
 
-    def get_H(self, …
-        return _get_H(self, …
+    def get_H(self, objective):
+        return _get_H(self, objective)
torchzero/modules/step_size/lr.py

@@ -2,7 +2,7 @@
 import torch
 import random
 
-from ...core import …
+from ...core import TensorTransform
 from ...utils import NumberList, TensorList, generic_ne, unpack_dicts
 
 def lazy_lr(tensors: TensorList, lr: float | list, inplace:bool):

@@ -12,24 +12,24 @@ def lazy_lr(tensors: TensorList, lr: float | list, inplace:bool):
         return tensors * lr
     return tensors
 
-class LR(Transform):
+class LR(TensorTransform):
     """Learning rate. Adding this module also adds support for LR schedulers."""
     def __init__(self, lr: float):
         defaults=dict(lr=lr)
-        super().__init__(defaults…
+        super().__init__(defaults)
 
     @torch.no_grad
-    def …
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         return lazy_lr(TensorList(tensors), lr=[s['lr'] for s in settings], inplace=True)
 
-class StepSize(Transform):
+class StepSize(TensorTransform):
     """this is exactly the same as LR, except the `lr` parameter can be renamed to any other name to avoid clashes"""
     def __init__(self, step_size: float, key = 'step_size'):
         defaults={"key": key, key: step_size}
-        super().__init__(defaults…
+        super().__init__(defaults)
 
     @torch.no_grad
-    def …
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         return lazy_lr(TensorList(tensors), lr=[s[s['key']] for s in settings], inplace=True)
 
 

@@ -38,8 +38,8 @@ def _warmup_lr(step: int, start_lr: float | NumberList, end_lr: float | NumberLi
     if step > steps: return end_lr
     return start_lr + (end_lr - start_lr) * (step / steps)
 
-class Warmup(Transform):
-    """Learning rate warmup, linearly increases learning rate multiplier from …
+class Warmup(TensorTransform):
+    """Learning rate warmup, linearly increases learning rate multiplier from ``start_lr`` to ``end_lr`` over ``steps`` steps.
 
     Args:
         steps (int, optional): number of steps to perform warmup for. Defaults to 100.

@@ -64,7 +64,7 @@ class Warmup(Transform):
         super().__init__(defaults, uses_grad=False)
 
     @torch.no_grad
-    def …
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         start_lr, end_lr = unpack_dicts(settings, 'start_lr', 'end_lr', cls = NumberList)
         num_steps = settings[0]['steps']
         step = self.global_state.get('step', 0)

@@ -77,7 +77,7 @@ class Warmup(Transform):
         self.global_state['step'] = step + 1
         return tensors
 
-class WarmupNormClip(Transform):
+class WarmupNormClip(TensorTransform):
     """Warmup via clipping of the update norm.
 
     Args:

@@ -102,7 +102,7 @@ class WarmupNormClip(Transform):
         super().__init__(defaults, uses_grad=False)
 
     @torch.no_grad
-    def …
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         start_norm, end_norm = unpack_dicts(settings, 'start_norm', 'end_norm', cls = NumberList)
         num_steps = settings[0]['steps']
         step = self.global_state.get('step', 0)

@@ -118,8 +118,8 @@ class WarmupNormClip(Transform):
         return tensors
 
 
-class RandomStepSize(Transform):
-    """Uses random global or layer-wise step size from …
+class RandomStepSize(TensorTransform):
+    """Uses random global or layer-wise step size from ``low`` to ``high``.
 
     Args:
         low (float, optional): minimum learning rate. Defaults to 0.

@@ -133,7 +133,7 @@ class RandomStepSize(Transform):
         super().__init__(defaults, uses_grad=False)
 
     @torch.no_grad
-    def …
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         s = settings[0]
         parameterwise = s['parameterwise']
 
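Every step-size module in the two files above now subclasses `TensorTransform` and overrides `multi_tensor_update` / `multi_tensor_apply` instead of the old `Transform` hooks. A minimal sketch of a custom transform in the same shape as `LR`; only the base class and the method signature come from the diff, the class itself is illustrative:

```python
import torch
from torchzero.core import TensorTransform

class NegateUpdate(TensorTransform):
    """Illustrative transform: flips the sign of the update in place."""
    def __init__(self):
        defaults = dict()
        super().__init__(defaults)

    @torch.no_grad
    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
        torch._foreach_neg_(tensors)
        return tensors
```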
torchzero/modules/termination/termination.py

@@ -1,11 +1,11 @@
 import time
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
-from typing import cast
+from typing import cast, final
 
 import torch
 
-from ...core import Module, …
+from ...core import Module, Objective
 from ...utils import Metrics, TensorList, safe_dict_update_, tofloat
 
 

@@ -16,14 +16,15 @@ class TerminationCriteriaBase(Module):
         super().__init__(defaults)
 
     @abstractmethod
-    def termination_criteria(self, …
+    def termination_criteria(self, objective: Objective) -> bool:
         ...
 
-…
+    @final
+    def should_terminate(self, objective: Objective) -> bool:
         n_bad = self.global_state.get('_n_bad', 0)
         n = self.defaults['_n']
 
-        if self.termination_criteria(…
+        if self.termination_criteria(objective):
             n_bad += 1
             if n_bad >= n:
                 self.global_state['_n_bad'] = 0

@@ -36,12 +37,12 @@ class TerminationCriteriaBase(Module):
         return False
 
 
-    def update(self, …
-…
-        if …
+    def update(self, objective):
+        objective.should_terminate = self.should_terminate(objective)
+        if objective.should_terminate: self.global_state['_n_bad'] = 0
 
-    def apply(self, …
-        return …
+    def apply(self, objective):
+        return objective
 
 
 class TerminateAfterNSteps(TerminationCriteriaBase):

@@ -49,7 +50,7 @@ class TerminateAfterNSteps(TerminationCriteriaBase):
         defaults = dict(steps=steps)
         super().__init__(defaults)
 
-    def termination_criteria(self, …
+    def termination_criteria(self, objective):
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1
 

@@ -61,16 +62,17 @@ class TerminateAfterNEvaluations(TerminationCriteriaBase):
         defaults = dict(maxevals=maxevals)
         super().__init__(defaults)
 
-    def termination_criteria(self, …
+    def termination_criteria(self, objective):
         maxevals = self.defaults['maxevals']
-…
+        assert objective.modular is not None
+        return objective.modular.num_evaluations >= maxevals
 
 class TerminateAfterNSeconds(TerminationCriteriaBase):
     def __init__(self, seconds:float, sec_fn = time.time):
         defaults = dict(seconds=seconds, sec_fn=sec_fn)
         super().__init__(defaults)
 
-    def termination_criteria(self, …
+    def termination_criteria(self, objective):
         max_seconds = self.defaults['seconds']
         sec_fn = self.defaults['sec_fn']
 

@@ -88,10 +90,10 @@ class TerminateByGradientNorm(TerminationCriteriaBase):
         defaults = dict(tol=tol, ord=ord)
         super().__init__(defaults, n=n)
 
-    def termination_criteria(self, …
+    def termination_criteria(self, objective):
         tol = self.defaults['tol']
         ord = self.defaults['ord']
-        return TensorList(…
+        return TensorList(objective.get_grads()).global_metric(ord) <= tol
 
 
 class TerminateByUpdateNorm(TerminationCriteriaBase):

@@ -100,20 +102,20 @@ class TerminateByUpdateNorm(TerminationCriteriaBase):
         defaults = dict(tol=tol, ord=ord)
         super().__init__(defaults, n=n)
 
-    def termination_criteria(self, …
+    def termination_criteria(self, objective):
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1
 
         tol = self.defaults['tol']
         ord = self.defaults['ord']
 
-        p_prev = self.get_state(…
+        p_prev = self.get_state(objective.params, 'p_prev', cls=TensorList)
         if step == 0:
-            p_prev.copy_(…
+            p_prev.copy_(objective.params)
             return False
 
-        should_terminate = (p_prev - …
-        p_prev.copy_(…
+        should_terminate = (p_prev - objective.params).global_metric(ord) <= tol
+        p_prev.copy_(objective.params)
         return should_terminate
 
 

@@ -122,10 +124,10 @@ class TerminateOnNoImprovement(TerminationCriteriaBase):
         defaults = dict(tol=tol)
         super().__init__(defaults, n=n)
 
-    def termination_criteria(self, …
+    def termination_criteria(self, objective):
         tol = self.defaults['tol']
 
-        f = tofloat(…
+        f = tofloat(objective.get_loss(False))
         if 'f_min' not in self.global_state:
             self.global_state['f_min'] = f
             return False

@@ -141,9 +143,9 @@ class TerminateOnLossReached(TerminationCriteriaBase):
         defaults = dict(value=value)
         super().__init__(defaults)
 
-    def termination_criteria(self, …
+    def termination_criteria(self, objective):
         value = self.defaults['value']
-        return …
+        return objective.get_loss(False) <= value
 
 class TerminateAny(TerminationCriteriaBase):
     def __init__(self, *criteria: TerminationCriteriaBase):

@@ -151,9 +153,9 @@ class TerminateAny(TerminationCriteriaBase):
 
         self.set_children_sequence(criteria)
 
-    def termination_criteria(self, …
+    def termination_criteria(self, objective: Objective) -> bool:
         for c in self.get_children_sequence():
-            if cast(TerminationCriteriaBase, c).termination_criteria(…
+            if cast(TerminationCriteriaBase, c).termination_criteria(objective): return True
 
         return False
 

@@ -163,9 +165,9 @@ class TerminateAll(TerminationCriteriaBase):
 
         self.set_children_sequence(criteria)
 
-    def termination_criteria(self, …
+    def termination_criteria(self, objective: Objective) -> bool:
         for c in self.get_children_sequence():
-            if not cast(TerminationCriteriaBase, c).termination_criteria(…
+            if not cast(TerminationCriteriaBase, c).termination_criteria(objective): return False
 
         return True
 

@@ -173,7 +175,7 @@ class TerminateNever(TerminationCriteriaBase):
     def __init__(self):
         super().__init__()
 
-    def termination_criteria(self, …
+    def termination_criteria(self, objective): return False
 
 def make_termination_criteria(
     ftol: float | None = None,
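Custom criteria now implement `termination_criteria(self, objective)`; the `@final` `should_terminate` wrapper above adds the n-consecutive-hits logic, and `update` writes the result to `objective.should_terminate`. A hedged sketch modelled on `TerminateOnLossReached` (the class is illustrative, only the base class and hook signature come from the diff):

```python
import math
from torchzero.modules.termination import TerminationCriteriaBase

class TerminateOnNonFiniteLoss(TerminationCriteriaBase):
    """Illustrative criterion: stop once the loss becomes non-finite."""
    def __init__(self):
        super().__init__()

    def termination_criteria(self, objective):
        return not math.isfinite(float(objective.get_loss(False)))
```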
torchzero/modules/trust_region/cubic_regularization.py

@@ -5,7 +5,7 @@ import torch
 
 from ...core import Chainable, Module
 from ...utils import TensorList, vec_to_tensors
-from ...…
+from ...linalg.linear_operator import LinearOperator
 from .trust_region import _RADIUS_KEYS, TrustRegionBase, _RadiusStrategy
 
 

@@ -58,7 +58,7 @@ def ls_cubic_solver(f, g:torch.Tensor, H:LinearOperator, M: float, loss_at_param
     for _ in range(it_max):
         r_try = (r_min + r_max) / 2
         lam = r_try * M
-        s_lam = H.…
+        s_lam = H.solve_plus_diag(g, lam).neg()
         # s_lam = -torch.linalg.solve(B + lam*id_matrix, g)
         solver_it += 1
         crit = conv_criterion(s_lam, r_try)
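The new `solve_plus_diag(g, lam)` call replaces the explicit dense solve that is still visible in the comment above: it solves `(B + lam*I) x = g` through the `LinearOperator` interface. A small dense-tensor sketch of that equivalence (plain tensors here, not the package's operator class):

```python
import torch

B = torch.tensor([[2.0, 0.0], [0.0, 4.0]])   # stand-in for the Hessian operator H
g = torch.tensor([1.0, 1.0])
lam = 0.5

# what H.solve_plus_diag(g, lam).neg() computes, written as a dense solve
s_lam = -torch.linalg.solve(B + lam * torch.eye(2), g)
```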
torchzero/modules/trust_region/levenberg_marquardt.py

@@ -2,7 +2,7 @@
 import torch
 
 from ...core import Chainable, Module
-from ...…
+from ...linalg import linear_operator
 from .trust_region import _RADIUS_KEYS, TrustRegionBase, _RadiusStrategy
 
 

@@ -32,38 +32,31 @@ class LevenbergMarquardt(TrustRegionBase):
         max_attempts (max_attempts, optional):
             maximum number of trust region size size reductions per step. A zero update vector is returned when
             this limit is exceeded. Defaults to 10.
+        adaptive (bool, optional):
+            if True, trust radius is multiplied by square root of gradient norm.
         fallback (bool, optional):
             if ``True``, when ``hess_module`` maintains hessian inverse which can't be inverted efficiently, it will
             be inverted anyway. When ``False`` (default), a ``RuntimeError`` will be raised instead.
         inner (Chainable | None, optional): preconditioning is applied to output of thise module. Defaults to None.
 
-    Examples:
-        Gauss-Newton with Levenberg-Marquardt trust-region
+    ### Examples:
 
-…
+    Gauss-Newton with Levenberg-Marquardt trust-region
 
-…
-…
-…
-…
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.LevenbergMarquardt(tz.m.GaussNewton()),
+    )
+    ```
 
-…
-…
-…
-…
-…
-…
-…
-            )
-…
-        First order trust region (hessian is assumed to be identity)
-…
-        .. code-block:: python
-…
-            opt = tz.Modular(
-                model.parameters(),
-                tz.m.LevenbergMarquardt(tz.m.Identity()),
-            )
+    LM-SR1
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.LevenbergMarquardt(tz.m.SR1(inverse=False)),
+    )
+    ```
 
     """
     def __init__(

@@ -78,11 +71,12 @@ class LevenbergMarquardt(TrustRegionBase):
         max_attempts: int = 10,
         radius_strategy: _RadiusStrategy | _RADIUS_KEYS = 'default',
         y: float = 0,
+        adaptive: bool = False,
         fallback: bool = False,
         update_freq: int = 1,
         inner: Chainable | None = None,
     ):
-        defaults = dict(y=y, fallback=fallback)
+        defaults = dict(y=y, fallback=fallback, adaptive=adaptive)
         super().__init__(
             defaults=defaults,
             hess_module=hess_module,

@@ -103,6 +97,7 @@ class LevenbergMarquardt(TrustRegionBase):
 
     def trust_solve(self, f, g, H, radius, params, closure, settings):
         y = settings['y']
+        adaptive = settings["adaptive"]
 
         if isinstance(H, linear_operator.DenseInverse):
             if settings['fallback']:

@@ -117,12 +112,14 @@ class LevenbergMarquardt(TrustRegionBase):
             )
 
         reg = 1/radius
+        if adaptive: reg = reg * torch.linalg.vector_norm(g).sqrt()
+
         if y == 0:
-            return H.…
+            return H.solve_plus_diag(g, reg) # pyright:ignore[reportAttributeAccessIssue]
 
         diag = H.diagonal()
         diag = torch.where(diag < torch.finfo(diag.dtype).tiny * 2, 1, diag)
         if y != 1: diag = (diag*y) + (1-y)
-        return H.…
+        return H.solve_plus_diag(g, diag*reg)
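With `adaptive=True`, the regularizer `reg = 1/radius` is additionally scaled by the square root of the gradient norm before the `solve_plus_diag` call. A usage sketch combining the docstring example above with the new keyword (the model is a placeholder):

```python
import torch
import torchzero as tz

model = torch.nn.Linear(4, 1)  # placeholder model

# Gauss-Newton wrapped in a Levenberg-Marquardt trust region,
# using the `adaptive` flag added in this release.
opt = tz.Modular(
    model.parameters(),
    tz.m.LevenbergMarquardt(tz.m.GaussNewton(), adaptive=True),
)
```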