torchzero 0.3.14__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169)
  1. tests/test_identical.py +2 -2
  2. tests/test_module_autograd.py +586 -0
  3. tests/test_objective.py +188 -0
  4. tests/test_opts.py +47 -36
  5. tests/test_tensorlist.py +0 -8
  6. tests/test_utils_optimizer.py +0 -1
  7. torchzero/__init__.py +1 -1
  8. torchzero/core/__init__.py +8 -2
  9. torchzero/core/chain.py +47 -0
  10. torchzero/core/functional.py +103 -0
  11. torchzero/core/modular.py +233 -0
  12. torchzero/core/module.py +132 -643
  13. torchzero/core/objective.py +948 -0
  14. torchzero/core/reformulation.py +56 -23
  15. torchzero/core/transform.py +261 -365
  16. torchzero/linalg/__init__.py +10 -0
  17. torchzero/linalg/eigh.py +34 -0
  18. torchzero/linalg/linalg_utils.py +14 -0
  19. torchzero/{utils/linalg → linalg}/linear_operator.py +132 -34
  20. torchzero/linalg/matrix_power.py +28 -0
  21. torchzero/linalg/orthogonalize.py +95 -0
  22. torchzero/{utils/linalg → linalg}/qr.py +4 -2
  23. torchzero/{utils/linalg → linalg}/solve.py +76 -88
  24. torchzero/linalg/svd.py +20 -0
  25. torchzero/linalg/torch_linalg.py +168 -0
  26. torchzero/modules/__init__.py +0 -1
  27. torchzero/modules/adaptive/__init__.py +1 -1
  28. torchzero/modules/adaptive/adagrad.py +163 -213
  29. torchzero/modules/adaptive/adahessian.py +74 -103
  30. torchzero/modules/adaptive/adam.py +53 -76
  31. torchzero/modules/adaptive/adan.py +49 -30
  32. torchzero/modules/adaptive/adaptive_heavyball.py +11 -6
  33. torchzero/modules/adaptive/aegd.py +12 -12
  34. torchzero/modules/adaptive/esgd.py +98 -119
  35. torchzero/modules/adaptive/lion.py +5 -10
  36. torchzero/modules/adaptive/lmadagrad.py +87 -32
  37. torchzero/modules/adaptive/mars.py +5 -5
  38. torchzero/modules/adaptive/matrix_momentum.py +47 -51
  39. torchzero/modules/adaptive/msam.py +70 -52
  40. torchzero/modules/adaptive/muon.py +59 -124
  41. torchzero/modules/adaptive/natural_gradient.py +33 -28
  42. torchzero/modules/adaptive/orthograd.py +11 -15
  43. torchzero/modules/adaptive/rmsprop.py +83 -75
  44. torchzero/modules/adaptive/rprop.py +48 -47
  45. torchzero/modules/adaptive/sam.py +55 -45
  46. torchzero/modules/adaptive/shampoo.py +123 -129
  47. torchzero/modules/adaptive/soap.py +207 -143
  48. torchzero/modules/adaptive/sophia_h.py +106 -130
  49. torchzero/modules/clipping/clipping.py +15 -18
  50. torchzero/modules/clipping/ema_clipping.py +31 -25
  51. torchzero/modules/clipping/growth_clipping.py +14 -17
  52. torchzero/modules/conjugate_gradient/cg.py +26 -37
  53. torchzero/modules/experimental/__init__.py +3 -6
  54. torchzero/modules/experimental/coordinate_momentum.py +36 -0
  55. torchzero/modules/experimental/curveball.py +25 -41
  56. torchzero/modules/experimental/gradmin.py +2 -2
  57. torchzero/modules/{higher_order → experimental}/higher_order_newton.py +14 -40
  58. torchzero/modules/experimental/newton_solver.py +22 -53
  59. torchzero/modules/experimental/newtonnewton.py +20 -17
  60. torchzero/modules/experimental/reduce_outward_lr.py +7 -7
  61. torchzero/modules/experimental/scipy_newton_cg.py +21 -24
  62. torchzero/modules/experimental/spsa1.py +5 -5
  63. torchzero/modules/experimental/structural_projections.py +1 -4
  64. torchzero/modules/functional.py +8 -1
  65. torchzero/modules/grad_approximation/forward_gradient.py +7 -7
  66. torchzero/modules/grad_approximation/grad_approximator.py +23 -16
  67. torchzero/modules/grad_approximation/rfdm.py +20 -17
  68. torchzero/modules/least_squares/gn.py +90 -42
  69. torchzero/modules/line_search/__init__.py +1 -1
  70. torchzero/modules/line_search/_polyinterp.py +3 -1
  71. torchzero/modules/line_search/adaptive.py +3 -3
  72. torchzero/modules/line_search/backtracking.py +3 -3
  73. torchzero/modules/line_search/interpolation.py +160 -0
  74. torchzero/modules/line_search/line_search.py +42 -51
  75. torchzero/modules/line_search/strong_wolfe.py +5 -5
  76. torchzero/modules/misc/debug.py +12 -12
  77. torchzero/modules/misc/escape.py +10 -10
  78. torchzero/modules/misc/gradient_accumulation.py +10 -78
  79. torchzero/modules/misc/homotopy.py +16 -8
  80. torchzero/modules/misc/misc.py +120 -122
  81. torchzero/modules/misc/multistep.py +63 -61
  82. torchzero/modules/misc/regularization.py +49 -44
  83. torchzero/modules/misc/split.py +30 -28
  84. torchzero/modules/misc/switch.py +37 -32
  85. torchzero/modules/momentum/averaging.py +14 -14
  86. torchzero/modules/momentum/cautious.py +34 -28
  87. torchzero/modules/momentum/momentum.py +11 -11
  88. torchzero/modules/ops/__init__.py +4 -4
  89. torchzero/modules/ops/accumulate.py +21 -21
  90. torchzero/modules/ops/binary.py +67 -66
  91. torchzero/modules/ops/higher_level.py +19 -19
  92. torchzero/modules/ops/multi.py +44 -41
  93. torchzero/modules/ops/reduce.py +26 -23
  94. torchzero/modules/ops/unary.py +53 -53
  95. torchzero/modules/ops/utility.py +47 -46
  96. torchzero/modules/projections/galore.py +1 -1
  97. torchzero/modules/projections/projection.py +43 -43
  98. torchzero/modules/quasi_newton/__init__.py +2 -0
  99. torchzero/modules/quasi_newton/damping.py +1 -1
  100. torchzero/modules/quasi_newton/lbfgs.py +7 -7
  101. torchzero/modules/quasi_newton/lsr1.py +7 -7
  102. torchzero/modules/quasi_newton/quasi_newton.py +25 -16
  103. torchzero/modules/quasi_newton/sg2.py +292 -0
  104. torchzero/modules/restarts/restars.py +26 -24
  105. torchzero/modules/second_order/__init__.py +6 -3
  106. torchzero/modules/second_order/ifn.py +58 -0
  107. torchzero/modules/second_order/inm.py +101 -0
  108. torchzero/modules/second_order/multipoint.py +40 -80
  109. torchzero/modules/second_order/newton.py +105 -228
  110. torchzero/modules/second_order/newton_cg.py +102 -154
  111. torchzero/modules/second_order/nystrom.py +158 -178
  112. torchzero/modules/second_order/rsn.py +237 -0
  113. torchzero/modules/smoothing/laplacian.py +13 -12
  114. torchzero/modules/smoothing/sampling.py +11 -10
  115. torchzero/modules/step_size/adaptive.py +23 -23
  116. torchzero/modules/step_size/lr.py +15 -15
  117. torchzero/modules/termination/termination.py +32 -30
  118. torchzero/modules/trust_region/cubic_regularization.py +2 -2
  119. torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
  120. torchzero/modules/trust_region/trust_cg.py +1 -1
  121. torchzero/modules/trust_region/trust_region.py +27 -22
  122. torchzero/modules/variance_reduction/svrg.py +21 -18
  123. torchzero/modules/weight_decay/__init__.py +2 -1
  124. torchzero/modules/weight_decay/reinit.py +83 -0
  125. torchzero/modules/weight_decay/weight_decay.py +12 -13
  126. torchzero/modules/wrappers/optim_wrapper.py +57 -50
  127. torchzero/modules/zeroth_order/cd.py +9 -6
  128. torchzero/optim/root.py +3 -3
  129. torchzero/optim/utility/split.py +2 -1
  130. torchzero/optim/wrappers/directsearch.py +27 -63
  131. torchzero/optim/wrappers/fcmaes.py +14 -35
  132. torchzero/optim/wrappers/mads.py +11 -31
  133. torchzero/optim/wrappers/moors.py +66 -0
  134. torchzero/optim/wrappers/nevergrad.py +4 -4
  135. torchzero/optim/wrappers/nlopt.py +31 -25
  136. torchzero/optim/wrappers/optuna.py +6 -13
  137. torchzero/optim/wrappers/pybobyqa.py +124 -0
  138. torchzero/optim/wrappers/scipy/__init__.py +7 -0
  139. torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
  140. torchzero/optim/wrappers/scipy/brute.py +48 -0
  141. torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
  142. torchzero/optim/wrappers/scipy/direct.py +69 -0
  143. torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
  144. torchzero/optim/wrappers/scipy/experimental.py +141 -0
  145. torchzero/optim/wrappers/scipy/minimize.py +151 -0
  146. torchzero/optim/wrappers/scipy/sgho.py +111 -0
  147. torchzero/optim/wrappers/wrapper.py +121 -0
  148. torchzero/utils/__init__.py +7 -25
  149. torchzero/utils/compile.py +2 -2
  150. torchzero/utils/derivatives.py +112 -88
  151. torchzero/utils/optimizer.py +4 -77
  152. torchzero/utils/python_tools.py +31 -0
  153. torchzero/utils/tensorlist.py +11 -5
  154. torchzero/utils/thoad_tools.py +68 -0
  155. {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/METADATA +1 -1
  156. torchzero-0.4.0.dist-info/RECORD +191 -0
  157. tests/test_vars.py +0 -185
  158. torchzero/modules/experimental/momentum.py +0 -160
  159. torchzero/modules/higher_order/__init__.py +0 -1
  160. torchzero/optim/wrappers/scipy.py +0 -572
  161. torchzero/utils/linalg/__init__.py +0 -12
  162. torchzero/utils/linalg/matrix_funcs.py +0 -87
  163. torchzero/utils/linalg/orthogonalize.py +0 -12
  164. torchzero/utils/linalg/svd.py +0 -20
  165. torchzero/utils/ops.py +0 -10
  166. torchzero-0.3.14.dist-info/RECORD +0 -167
  167. /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
  168. {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/WHEEL +0 -0
  169. {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/top_level.txt +0 -0
@@ -2,78 +2,78 @@ from collections import deque
 
 import torch
 
-from ...core import Module, Target, Transform
+from ...core import Module, Transform
 from ...utils.tensorlist import Distributions, TensorList
-from ...utils.linalg.linear_operator import ScaledIdentity
+from ...linalg.linear_operator import ScaledIdentity
 
 class Clone(Module):
     """Clones input. May be useful to store some intermediate result and make sure it doesn't get affected by in-place operations"""
     def __init__(self):
         super().__init__({})
     @torch.no_grad
-    def step(self, var):
-        var.update = [u.clone() for u in var.get_update()]
-        return var
+    def apply(self, objective):
+        objective.updates = [u.clone() for u in objective.get_updates()]
+        return objective
 
 class Grad(Module):
     """Outputs the gradient"""
     def __init__(self):
         super().__init__({})
     @torch.no_grad
-    def step(self, var):
-        var.update = [g.clone() for g in var.get_grad()]
-        return var
+    def apply(self, objective):
+        objective.updates = [g.clone() for g in objective.get_grads()]
+        return objective
 
 class Params(Module):
     """Outputs parameters"""
     def __init__(self):
         super().__init__({})
     @torch.no_grad
-    def step(self, var):
-        var.update = [p.clone() for p in var.params]
-        return var
+    def apply(self, objective):
+        objective.updates = [p.clone() for p in objective.params]
+        return objective
 
 class Zeros(Module):
     """Outputs zeros"""
     def __init__(self):
         super().__init__({})
     @torch.no_grad
-    def step(self, var):
-        var.update = [torch.zeros_like(p) for p in var.params]
-        return var
+    def apply(self, objective):
+        objective.updates = [torch.zeros_like(p) for p in objective.params]
+        return objective
 
 class Ones(Module):
     """Outputs ones"""
     def __init__(self):
         super().__init__({})
     @torch.no_grad
-    def step(self, var):
-        var.update = [torch.ones_like(p) for p in var.params]
-        return var
+    def apply(self, objective):
+        objective.updates = [torch.ones_like(p) for p in objective.params]
+        return objective
 
 class Fill(Module):
-    """Outputs tensors filled with :code:`value`"""
+    """Outputs tensors filled with ``value``"""
     def __init__(self, value: float):
         defaults = dict(value=value)
         super().__init__(defaults)
 
     @torch.no_grad
-    def step(self, var):
-        var.update = [torch.full_like(p, self.settings[p]['value']) for p in var.params]
-        return var
+    def apply(self, objective):
+        objective.updates = [torch.full_like(p, self.settings[p]['value']) for p in objective.params]
+        return objective
 
 class RandomSample(Module):
-    """Outputs tensors filled with random numbers from distribution depending on value of :code:`distribution`."""
+    """Outputs tensors filled with random numbers from distribution depending on value of ``distribution``."""
     def __init__(self, distribution: Distributions = 'normal', variance:float | None = None):
         defaults = dict(distribution=distribution, variance=variance)
         super().__init__(defaults)
 
     @torch.no_grad
-    def step(self, var):
+    def apply(self, objective):
         distribution = self.defaults['distribution']
-        variance = self.get_settings(var.params, 'variance')
-        var.update = TensorList(var.params).sample_like(distribution=distribution, variance=variance)
-        return var
+        variance = self.get_settings(objective.params, 'variance')
+        objective.updates = TensorList(objective.params).sample_like(distribution=distribution, variance=variance)
+        return objective
 
 class Randn(Module):
     """Outputs tensors filled with random numbers from a normal distribution with mean 0 and variance 1."""
@@ -81,43 +81,44 @@ class Randn(Module):
         super().__init__({})
 
     @torch.no_grad
-    def step(self, var):
-        var.update = [torch.randn_like(p) for p in var.params]
-        return var
+    def apply(self, objective):
+        objective.updates = [torch.randn_like(p) for p in objective.params]
+        return objective
 
 class Uniform(Module):
-    """Outputs tensors filled with random numbers from uniform distribution between :code:`low` and :code:`high`."""
+    """Outputs tensors filled with random numbers from uniform distribution between ``low`` and ``high``."""
     def __init__(self, low: float, high: float):
         defaults = dict(low=low, high=high)
         super().__init__(defaults)
 
     @torch.no_grad
-    def step(self, var):
-        low,high = self.get_settings(var.params, 'low','high')
-        var.update = [torch.empty_like(t).uniform_(l,h) for t,l,h in zip(var.params, low, high)]
-        return var
+    def apply(self, objective):
+        low,high = self.get_settings(objective.params, 'low','high')
+        objective.updates = [torch.empty_like(t).uniform_(l,h) for t,l,h in zip(objective.params, low, high)]
+        return objective
 
 class GradToNone(Module):
-    """Sets :code:`grad` attribute to None on :code:`var`."""
+    """Sets ``grad`` attribute to None on ``objective``."""
     def __init__(self): super().__init__()
-    def step(self, var):
-        var.grad = None
-        return var
+    def apply(self, objective):
+        objective.grads = None
+        return objective
 
 class UpdateToNone(Module):
-    """Sets :code:`update` attribute to None on :code:`var`."""
+    """Sets ``update`` attribute to None on ``var``."""
     def __init__(self): super().__init__()
-    def step(self, var):
-        var.update = None
-        return var
+    def apply(self, objective):
+        objective.updates = None
+        return objective
 
 class Identity(Module):
     """Identity operator that is argument-insensitive. This also can be used as identity hessian for trust region methods."""
     def __init__(self, *args, **kwargs): super().__init__()
-    def step(self, var): return var
-    def get_H(self, var):
-        n = sum(p.numel() for p in var.params)
-        p = var.params[0]
+    def update(self, objective): pass
+    def apply(self, objective): return objective
+    def get_H(self, objective):
+        n = sum(p.numel() for p in objective.params)
+        p = objective.params[0]
         return ScaledIdentity(shape=(n,n), device=p.device, dtype=p.dtype)
 
 Noop = Identity
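
The pattern in the two hunks above, ``step(self, var)`` becoming ``apply(self, objective)`` and ``var.update`` becoming ``objective.updates``, repeats across most modules in this release. The sketch below is illustrative only, written against the 0.4.0-style API exactly as it appears in these hunks; the ``NegateUpdate`` class is hypothetical and not part of torchzero.

    import torch
    from torchzero.core import Module

    class NegateUpdate(Module):
        """Hypothetical module: flips the sign of the current update."""
        def __init__(self):
            super().__init__({})  # no per-parameter settings, mirroring Clone/Grad above

        @torch.no_grad
        def apply(self, objective):
            # 0.3.x modules wrote to var.update inside step(self, var);
            # 0.4.0 modules write to objective.updates inside apply(self, objective)
            objective.updates = [-u for u in objective.get_updates()]
            return objective
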
@@ -6,7 +6,7 @@ from typing import Any, Literal
 
 import torch
 
-from ...core import Chainable, Module, Var
+from ...core import Chainable, Module, Objective
 from .projection import ProjectionBase
 
 # TODO
@@ -8,7 +8,7 @@ from typing import Any, Literal
 
 import torch
 
-from ...core import Chainable, Module, Var
+from ...core import Chainable, Module, Objective
 from ...utils import set_storage_, vec_to_tensors
 
 
@@ -80,7 +80,7 @@ class _FakeProjectedClosure:
 class ProjectionBase(Module, ABC):
     """
     Base class for projections.
-    This is an abstract class, to use it, subclass it and override `project` and `unproject`.
+    This is an abstract class, to use it, subclass it and override ``project`` and ``unproject``.
 
     Args:
         modules (Chainable): modules that will be applied in the projected domain.
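
As the ``ProjectionBase`` docstring above says, a concrete projection subclasses it and overrides ``project`` and ``unproject``. The toy subclass below is a hypothetical sketch built only from the ``self.project(...)`` and ``self.unproject(...)`` call sites visible in the hunks that follow; constructor arguments and registration details are omitted, and none of this code is from the package itself.

    from torchzero.modules.projections.projection import ProjectionBase

    class ScaleProjection(ProjectionBase):
        """Hypothetical projection: optimize in a domain where every tensor is doubled."""

        def project(self, tensors, params, grads, loss, states, settings, current):
            # 'current' tells the projection whether it is looking at
            # 'params', 'grads' or 'update' (see the _project helper below)
            return [t * 2 for t in tensors]

        def unproject(self, projected_tensors, params, grads, loss, states, settings, current):
            # exact inverse of project, so unproject(project(x)) == x
            return [t / 2 for t in projected_tensors]
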
@@ -150,8 +150,8 @@ class ProjectionBase(Module, ABC):
     """
 
     @torch.no_grad
-    def step(self, var: Var):
-        params = var.params
+    def apply(self, objective: Objective):
+        params = objective.params
         settings = [self.settings[p] for p in params]
 
         def _project(tensors: list[torch.Tensor], current: Literal['params', 'grads', 'update']):
@@ -159,16 +159,16 @@ class ProjectionBase(Module, ABC):
             return list(self.project(
                 tensors=tensors,
                 params=params,
-                grads=var.grad,
-                loss=var.loss,
+                grads=objective.grads,
+                loss=objective.loss,
                 states=states,
                 settings=settings,
                 current=current,
             ))
 
-        projected_var = var.clone(clone_update=False, parent=var)
+        projected_obj = objective.clone(clone_updates=False, parent=objective)
 
-        closure = var.closure
+        closure = objective.closure
 
         # if this is True, update and grad were projected simultaneously under current="grads"
         # so update will have to be unprojected with current="grads"
@@ -179,9 +179,9 @@ class ProjectionBase(Module, ABC):
         # but if it has already been computed, it should be projected
         if self._project_params and closure is not None:
 
-            if self._project_update and var.update is not None:
+            if self._project_update and objective.updates is not None:
                 # project update only if it already exists
-                projected_var.update = _project(var.update, current='update')
+                projected_obj.updates = _project(objective.updates, current='update')
 
             else:
                 # update will be set to gradients on var.get_grad()
@@ -189,43 +189,43 @@ class ProjectionBase(Module, ABC):
                 update_is_grad = True
 
             # project grad only if it already exists
-            if self._project_grad and var.grad is not None:
-                projected_var.grad = _project(var.grad, current='grads')
+            if self._project_grad and objective.grads is not None:
+                projected_obj.grads = _project(objective.grads, current='grads')
 
         # otherwise update/grad needs to be calculated and projected here
         else:
             if self._project_update:
-                if var.update is None:
+                if objective.updates is None:
                     # update is None, meaning it will be set to `grad`.
                     # we can project grad and use it for update
-                    grad = var.get_grad()
-                    projected_var.grad = _project(grad, current='grads')
-                    projected_var.update = [g.clone() for g in projected_var.grad]
-                    del var.update
+                    grad = objective.get_grads()
+                    projected_obj.grads = _project(grad, current='grads')
+                    projected_obj.updates = [g.clone() for g in projected_obj.grads]
+                    del objective.updates
                     update_is_grad = True
 
                 else:
                     # update exists so it needs to be projected
-                    update = var.get_update()
-                    projected_var.update = _project(update, current='update')
-                    del update, var.update
+                    update = objective.get_updates()
+                    projected_obj.updates = _project(update, current='update')
+                    del update, objective.updates
 
-            if self._project_grad and projected_var.grad is None:
+            if self._project_grad and projected_obj.grads is None:
                 # projected_vars.grad may have been projected simultaneously with update
                 # but if that didn't happen, it is projected here
-                grad = var.get_grad()
-                projected_var.grad = _project(grad, current='grads')
+                grad = objective.get_grads()
+                projected_obj.grads = _project(grad, current='grads')
 
 
         original_params = None
         if self._project_params:
-            original_params = [p.clone() for p in var.params]
-            projected_params = _project(var.params, current='params')
+            original_params = [p.clone() for p in objective.params]
+            projected_params = _project(objective.params, current='params')
 
         else:
             # make fake params for correct shapes and state storage
             # they reuse update or grad storage for memory efficiency
-            projected_params = projected_var.update if projected_var.update is not None else projected_var.grad
+            projected_params = projected_obj.updates if projected_obj.updates is not None else projected_obj.grads
             assert projected_params is not None
 
         if self._projected_params is None:
@@ -245,8 +245,8 @@ class ProjectionBase(Module, ABC):
             return list(self.unproject(
                 projected_tensors=projected_tensors,
                 params=params,
-                grads=var.grad,
-                loss=var.loss,
+                grads=objective.grads,
+                loss=objective.loss,
                 states=states,
                 settings=settings,
                 current=current,
@@ -254,19 +254,19 @@ class ProjectionBase(Module, ABC):
 
         # project closure
         if self._project_params:
-            projected_var.closure = _make_projected_closure(closure, project_fn=_project, unproject_fn=_unproject,
+            projected_obj.closure = _make_projected_closure(closure, project_fn=_project, unproject_fn=_unproject,
                                                              params=params, projected_params=projected_params)
 
         elif closure is not None:
-            projected_var.closure = _FakeProjectedClosure(closure, project_fn=_project,
+            projected_obj.closure = _FakeProjectedClosure(closure, project_fn=_project,
                                                           params=params, fake_params=projected_params)
 
         else:
-            projected_var.closure = None
+            projected_obj.closure = None
 
         # ----------------------------------- step ----------------------------------- #
-        projected_var.params = projected_params
-        projected_var = self.children['modules'].step(projected_var)
+        projected_obj.params = projected_params
+        projected_obj = self.children['modules'].apply(projected_obj)
 
         # empty fake params storage
         # this doesn't affect update/grad because it is a different python object, set_ changes storage on an object
@@ -275,24 +275,24 @@ class ProjectionBase(Module, ABC):
             set_storage_(p, torch.empty(0, device=p.device, dtype=p.dtype))
 
         # --------------------------------- unproject -------------------------------- #
-        unprojected_var = projected_var.clone(clone_update=False)
-        unprojected_var.closure = var.closure
-        unprojected_var.params = var.params
-        unprojected_var.grad = var.grad # this may also be set by projected_var since it has var as parent
+        unprojected_obj = projected_obj.clone(clone_updates=False)
+        unprojected_obj.closure = objective.closure
+        unprojected_obj.params = objective.params
+        unprojected_obj.grads = objective.grads # this may also be set by projected_var since it has var as parent
 
         if self._project_update:
-            assert projected_var.update is not None
-            unprojected_var.update = _unproject(projected_var.update, current='grads' if update_is_grad else 'update')
-            del projected_var.update
+            assert projected_obj.updates is not None
+            unprojected_obj.updates = _unproject(projected_obj.updates, current='grads' if update_is_grad else 'update')
+            del projected_obj.updates
 
-        del projected_var
+        del projected_obj
 
         # original params are stored if params are projected
         if original_params is not None:
-            for p, o in zip(unprojected_var.params, original_params):
+            for p, o in zip(unprojected_obj.params, original_params):
                 p.set_(o) # pyright: ignore[reportArgumentType]
 
-        return unprojected_var
+        return unprojected_obj
 
 
 
@@ -29,3 +29,5 @@ from .quasi_newton import (
     ShorR,
     ThomasOptimalMethod,
 )
+
+from .sg2 import SG2, SPSA2
@@ -4,7 +4,7 @@ from typing import Literal, Protocol, overload
 import torch
 
 from ...utils import TensorList
-from ...utils.linalg.linear_operator import DenseInverse, LinearOperator
+from ...linalg.linear_operator import DenseInverse, LinearOperator
 from ..functional import safe_clip
 
 
@@ -4,9 +4,9 @@ from typing import overload
 
 import torch
 
-from ...core import Chainable, Transform
+from ...core import Chainable, TensorTransform
 from ...utils import TensorList, as_tensorlist, unpack_states
-from ...utils.linalg.linear_operator import LinearOperator
+from ...linalg.linear_operator import LinearOperator
 from ..functional import initial_step_size
 from .damping import DampingStrategyType, apply_damping
 
@@ -154,7 +154,7 @@ class LBFGSLinearOperator(LinearOperator):
         return (n, n)
 
 
-class LBFGS(Transform):
+class LBFGS(TensorTransform):
     """Limited-memory BFGS algorithm. A line search or trust region is recommended.
 
     Args:
@@ -226,7 +226,7 @@ class LBFGS(Transform):
             sy_tol=sy_tol,
             damping = damping,
         )
-        super().__init__(defaults, uses_grad=False, inner=inner, update_freq=update_freq)
+        super().__init__(defaults, inner=inner, update_freq=update_freq)
 
         self.global_state['s_history'] = deque(maxlen=history_size)
         self.global_state['y_history'] = deque(maxlen=history_size)
@@ -249,7 +249,7 @@ class LBFGS(Transform):
        self.global_state.pop('step', None)
 
    @torch.no_grad
-    def update_tensors(self, tensors, params, grads, loss, states, settings):
+    def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
        p = as_tensorlist(params)
        g = as_tensorlist(tensors)
        step = self.global_state.get('step', 0)
@@ -311,14 +311,14 @@
             y_history.append(y)
             sy_history.append(sy)
 
-    def get_H(self, var=...):
+    def get_H(self, objective=...):
         s_history = [tl.to_vec() for tl in self.global_state['s_history']]
         y_history = [tl.to_vec() for tl in self.global_state['y_history']]
         sy_history = self.global_state['sy_history']
         return LBFGSLinearOperator(s_history, y_history, sy_history)
 
     @torch.no_grad
-    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         scale_first = self.defaults['scale_first']
 
         tensors = as_tensorlist(tensors)
@@ -4,9 +4,9 @@ from operator import itemgetter
 
 import torch
 
-from ...core import Chainable, Module, Transform, Var, apply_transform
+from ...core import Chainable, Module, TensorTransform, Objective, step
 from ...utils import NumberList, TensorList, as_tensorlist, generic_finfo_tiny, unpack_states, vec_to_tensors_
-from ...utils.linalg.linear_operator import LinearOperator
+from ...linalg.linear_operator import LinearOperator
 from ..functional import initial_step_size
 from .damping import DampingStrategyType, apply_damping
 
@@ -76,7 +76,7 @@ class LSR1LinearOperator(LinearOperator):
         return (n, n)
 
 
-class LSR1(Transform):
+class LSR1(TensorTransform):
     """Limited-memory SR1 algorithm. A line search or trust region is recommended.
 
     Args:
@@ -146,7 +146,7 @@ class LSR1(Transform):
             gtol_restart=gtol_restart,
             damping = damping,
         )
-        super().__init__(defaults, uses_grad=False, inner=inner, update_freq=update_freq)
+        super().__init__(defaults, inner=inner, update_freq=update_freq)
 
         self.global_state['s_history'] = deque(maxlen=history_size)
         self.global_state['y_history'] = deque(maxlen=history_size)
@@ -167,7 +167,7 @@ class LSR1(Transform):
         self.global_state.pop('step', None)
 
     @torch.no_grad
-    def update_tensors(self, tensors, params, grads, loss, states, settings):
+    def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
         p = as_tensorlist(params)
         g = as_tensorlist(tensors)
         step = self.global_state.get('step', 0)
@@ -225,13 +225,13 @@ class LSR1(Transform):
             s_history.append(s)
             y_history.append(y)
 
-    def get_H(self, var=...):
+    def get_H(self, objective=...):
         s_history = [tl.to_vec() for tl in self.global_state['s_history']]
         y_history = [tl.to_vec() for tl in self.global_state['y_history']]
         return LSR1LinearOperator(s_history, y_history)
 
     @torch.no_grad
-    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         scale_first = self.defaults['scale_first']
 
         tensors = as_tensorlist(tensors)
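
The LBFGS and LSR1 hunks above show the other recurring rename in 0.4.0: ``Transform`` subclasses become ``TensorTransform``, the ``update_tensors`` / ``apply_tensors`` pair becomes ``multi_tensor_update`` / ``multi_tensor_apply``, and ``uses_grad=False`` disappears from ``super().__init__``. A hypothetical minimal transform patterned on those hunks, shown only as a sketch of the new naming, would look roughly like this:

    import torch
    from torchzero.core import TensorTransform

    class GradNormLogger(TensorTransform):
        """Hypothetical transform: records the global norm of the incoming tensors."""
        def __init__(self):
            super().__init__()  # no defaults, mirroring GradientCorrection further down

        @torch.no_grad
        def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
            # update internal state from the incoming tensors
            self.global_state['norm'] = torch.cat([t.reshape(-1) for t in tensors]).norm()

        @torch.no_grad
        def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
            # produce the transformed tensors; here a plain pass-through
            return tensors
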
@@ -5,9 +5,9 @@ from typing import Any, Literal
 
 import torch
 
-from ...core import Chainable, Module, TensorwiseTransform, Transform
+from ...core import Chainable, Module, TensorTransform, Transform
 from ...utils import TensorList, set_storage_, unpack_states, safe_dict_update_
-from ...utils.linalg import linear_operator
+from ...linalg import linear_operator
 from ..functional import initial_step_size, safe_clip
 
 
@@ -17,7 +17,7 @@ def _maybe_lerp_(state, key, value: torch.Tensor, beta: float | None):
     elif state[key].shape != value.shape: state[key] = value
     else: state[key].lerp_(value, 1-beta)
 
-class HessianUpdateStrategy(TensorwiseTransform, ABC):
+class HessianUpdateStrategy(TensorTransform, ABC):
     """Base class for quasi-newton methods that store and update hessian approximation H or inverse B.
 
     This is an abstract class, to use it, subclass it and override ``update_H`` and/or ``update_B``,
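
``HessianUpdateStrategy`` keeps its role here, only its base class changes to ``TensorTransform``: as its docstring says, a concrete method overrides ``update_H`` and/or ``update_B``. The toy subclass below is a hypothetical sketch using the ``update_H`` signature visible in the ShorR hunk further down; the SR1-style formula is a standard textbook update, and treating ``H``, ``s`` and ``y`` as dense tensors is my assumption, not the package's code.

    import torch
    from torchzero.modules.quasi_newton.quasi_newton import HessianUpdateStrategy

    class ToySR1(HessianUpdateStrategy):
        """Hypothetical quasi-newton method: plain SR1 update with a safeguard."""

        def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
            # SR1-style update: H <- H + v v^T / (v^T y)  with  v = s - H y
            v = s - H @ y
            denom = v.dot(y)
            if denom.abs() < 1e-12:   # skip the update when it is numerically unsafe
                return H
            return H + torch.outer(v, v) / denom
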
@@ -157,7 +157,7 @@ class HessianUpdateStrategy(TensorwiseTransform, ABC):
         else: P *= init_scale
 
     @torch.no_grad
-    def update_tensor(self, tensor, param, grad, loss, state, setting):
+    def single_tensor_update(self, tensor, param, grad, loss, state, setting):
         p = param.view(-1); g = tensor.view(-1)
         inverse = setting['inverse']
         M_key = 'H' if inverse else 'B'
@@ -223,7 +223,7 @@ class HessianUpdateStrategy(TensorwiseTransform, ABC):
         state['f_prev'] = loss
 
     @torch.no_grad
-    def apply_tensor(self, tensor, param, grad, loss, state, setting):
+    def single_tensor_apply(self, tensor, param, grad, loss, state, setting):
         step = state['step']
 
         if setting['scale_first'] and step == 1:
@@ -250,8 +250,8 @@ class HessianUpdateStrategy(TensorwiseTransform, ABC):
             self.global_state.clear()
             return tensor.mul_(initial_step_size(tensor))
 
-    def get_H(self, var):
-        param = var.params[0]
+    def get_H(self, objective):
+        param = objective.params[0]
         state = self.state[param]
         settings = self.settings[param]
         if "B" in state:
@@ -1005,7 +1005,7 @@ def gradient_correction(g: TensorList, s: TensorList, y: TensorList):
     return g - (y * (s.dot(g) / sy))
 
 
-class GradientCorrection(Transform):
+class GradientCorrection(TensorTransform):
     """
     Estimates gradient at minima along search direction assuming function is quadratic.
 
@@ -1027,9 +1027,9 @@ class GradientCorrection(Transform):
 
     """
     def __init__(self):
-        super().__init__(None, uses_grad=False)
+        super().__init__()
 
-    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         if 'p_prev' not in states[0]:
             p_prev = unpack_states(states, tensors, 'p_prev', init=params)
             g_prev = unpack_states(states, tensors, 'g_prev', init=tensors)
@@ -1182,16 +1182,19 @@ class ShorR(HessianUpdateStrategy):
     """Shor’s r-algorithm.
 
     Note:
-        A line search such as ``tz.m.StrongWolfe(a_init="quadratic", fallback=True)`` is required.
-        Similarly to conjugate gradient, ShorR doesn't have an automatic step size scaling,
-        so setting ``a_init`` in the line search is recommended.
+        - A line search such as ``[tz.m.StrongWolfe(a_init="quadratic", fallback=True), tz.m.Mul(1.2)]`` is required. Similarly to conjugate gradient, ShorR doesn't have an automatic step size scaling, so setting ``a_init`` in the line search is recommended.
+
+        - The line search should try to overstep by a little, therefore it can help to multiply direction given by a line search by some value slightly larger than 1 such as 1.2.
 
     References:
-        Shor, N. Z. (1985) Minimization Methods for Non-differentiable Functions. New York: Springer.
+        Those are the original references, but neither seem to be available online:
+        - Shor, N. Z., Utilization of the Operation of Space Dilatation in the Minimization of Convex Functions, Kibernetika, No. 1, pp. 6-12, 1970.
+
+        - Skokov, V. A., Note on Minimization Methods Employing Space Stretching, Kibernetika, No. 4, pp. 115-117, 1974.
 
-        Burke, James V., Adrian S. Lewis, and Michael L. Overton. "The Speed of Shor's R-algorithm." IMA Journal of numerical analysis 28.4 (2008): 711-720. - good overview.
+        An overview is available in [Burke, James V., Adrian S. Lewis, and Michael L. Overton. "The Speed of Shor's R-algorithm." IMA Journal of numerical analysis 28.4 (2008): 711-720](https://sites.math.washington.edu/~burke/papers/reprints/60-speed-Shor-R.pdf).
 
-        Ansari, Zafar A. Limited Memory Space Dilation and Reduction Algorithms. Diss. Virginia Tech, 1998. - this is where a more efficient formula is described.
+        Reference by Skokov, V. A. describes a more efficient formula which can be found here [Ansari, Zafar A. Limited Memory Space Dilation and Reduction Algorithms. Diss. Virginia Tech, 1998.](https://camo.ici.ro/books/thesis/th.pdf)
     """
 
     def __init__(
@@ -1229,3 +1232,9 @@
 
     def update_H(self, H, s, y, p, g, p_prev, g_prev, state, setting):
         return shor_r_(H=H, y=y, alpha=setting['alpha'])
+
+
+# Todd, Michael J. "The symmetric rank-one quasi-Newton method is a space-dilation subgradient algorithm." Operations research letters 5.5 (1986): 217-219.
+# TODO
+
+# Sorensen, D. C. "The q-superlinear convergence of a collinear scaling algorithm for unconstrained optimization." SIAM Journal on Numerical Analysis 17.1 (1980): 84-114.
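
Finally, a usage sketch for the line-search advice in the new ``ShorR`` docstring above. Everything outside ``tz.m.ShorR``, ``tz.m.StrongWolfe`` and ``tz.m.Mul`` is an assumption on my part: the ``tz.Modular`` wrapper name and the ``closure(backward=True)`` convention do not appear anywhere in this diff, so check the package documentation before copying this.

    import torch
    import torchzero as tz

    model = torch.nn.Linear(10, 1)
    opt = tz.Modular(                      # assumed entry point for chaining modules
        model.parameters(),
        tz.m.ShorR(),
        tz.m.StrongWolfe(a_init="quadratic", fallback=True),  # from the docstring note
        tz.m.Mul(1.2),                     # slight overstep, as the note suggests
    )

    X, y = torch.randn(64, 10), torch.randn(64, 1)

    def closure(backward=True):            # closure convention assumed
        loss = (model(X) - y).square().mean()
        if backward:
            opt.zero_grad()
            loss.backward()
        return loss

    for _ in range(20):
        opt.step(closure)
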