torchzero 0.3.15__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +22 -22
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +225 -214
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +2 -2
- torchzero/core/__init__.py +7 -4
- torchzero/core/chain.py +20 -23
- torchzero/core/functional.py +90 -24
- torchzero/core/modular.py +53 -57
- torchzero/core/module.py +132 -52
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +55 -24
- torchzero/core/transform.py +261 -367
- torchzero/linalg/__init__.py +11 -0
- torchzero/linalg/eigh.py +253 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +93 -0
- torchzero/{utils/linalg → linalg}/qr.py +16 -2
- torchzero/{utils/linalg → linalg}/solve.py +74 -88
- torchzero/linalg/svd.py +47 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/__init__.py +4 -3
- torchzero/modules/adaptive/__init__.py +11 -3
- torchzero/modules/adaptive/adagrad.py +167 -217
- torchzero/modules/adaptive/adahessian.py +76 -105
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +50 -31
- torchzero/modules/adaptive/adaptive_heavyball.py +12 -7
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/ggt.py +186 -0
- torchzero/modules/adaptive/lion.py +7 -11
- torchzero/modules/adaptive/lre_optimizers.py +299 -0
- torchzero/modules/adaptive/mars.py +7 -7
- torchzero/modules/adaptive/matrix_momentum.py +48 -52
- torchzero/modules/adaptive/msam.py +71 -53
- torchzero/modules/adaptive/muon.py +67 -129
- torchzero/modules/adaptive/natural_gradient.py +63 -41
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/psgd/__init__.py +5 -0
- torchzero/modules/adaptive/psgd/_psgd_utils.py +37 -0
- torchzero/modules/adaptive/psgd/psgd.py +1390 -0
- torchzero/modules/adaptive/psgd/psgd_dense_newton.py +174 -0
- torchzero/modules/adaptive/psgd/psgd_kron_newton.py +203 -0
- torchzero/modules/adaptive/psgd/psgd_kron_whiten.py +185 -0
- torchzero/modules/adaptive/psgd/psgd_lra_newton.py +118 -0
- torchzero/modules/adaptive/psgd/psgd_lra_whiten.py +116 -0
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +149 -130
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +22 -25
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +27 -38
- torchzero/modules/experimental/__init__.py +7 -6
- torchzero/modules/experimental/adanystrom.py +258 -0
- torchzero/modules/experimental/common_directions_whiten.py +142 -0
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/cubic_adam.py +160 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/eigen_sr1.py +182 -0
- torchzero/modules/experimental/eigengrad.py +207 -0
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/experimental/higher_order_newton.py +14 -40
- torchzero/modules/experimental/l_infinity.py +1 -1
- torchzero/modules/experimental/matrix_nag.py +122 -0
- torchzero/modules/experimental/newton_solver.py +23 -54
- torchzero/modules/experimental/newtonnewton.py +45 -48
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +3 -3
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/grad_approximation/fdm.py +2 -2
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +24 -21
- torchzero/modules/least_squares/gn.py +121 -50
- torchzero/modules/line_search/backtracking.py +4 -4
- torchzero/modules/line_search/line_search.py +33 -33
- torchzero/modules/line_search/strong_wolfe.py +4 -4
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +11 -79
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +121 -123
- torchzero/modules/misc/multistep.py +52 -53
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +31 -29
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +37 -31
- torchzero/modules/momentum/momentum.py +12 -12
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +20 -20
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/{functional.py → opt_utils.py} +1 -1
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +46 -43
- torchzero/modules/quasi_newton/__init__.py +1 -1
- torchzero/modules/quasi_newton/damping.py +2 -2
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +10 -10
- torchzero/modules/quasi_newton/lsr1.py +10 -10
- torchzero/modules/quasi_newton/quasi_newton.py +54 -39
- torchzero/modules/quasi_newton/sg2.py +69 -205
- torchzero/modules/restarts/restars.py +39 -37
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/ifn.py +31 -62
- torchzero/modules/second_order/inm.py +57 -53
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +165 -196
- torchzero/modules/second_order/newton_cg.py +105 -157
- torchzero/modules/second_order/nystrom.py +216 -185
- torchzero/modules/second_order/rsn.py +132 -125
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +10 -10
- torchzero/modules/step_size/adaptive.py +24 -24
- torchzero/modules/step_size/lr.py +17 -17
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +3 -3
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +2 -2
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +23 -21
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +17 -18
- torchzero/modules/wrappers/optim_wrapper.py +14 -14
- torchzero/modules/zeroth_order/cd.py +10 -7
- torchzero/optim/mbs.py +291 -0
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -13
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +8 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/benchmarks/__init__.py +0 -0
- torchzero/utils/benchmarks/logistic.py +122 -0
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +97 -73
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/METADATA +1 -1
- torchzero-0.4.1.dist-info/RECORD +209 -0
- tests/test_vars.py +0 -185
- torchzero/core/var.py +0 -376
- torchzero/modules/adaptive/lmadagrad.py +0 -186
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.15.dist-info/RECORD +0 -175
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/WHEEL +0 -0
- {torchzero-0.3.15.dist-info → torchzero-0.4.1.dist-info}/top_level.txt +0 -0
--- torchzero/modules/step_size/lr.py (0.3.15)
+++ torchzero/modules/step_size/lr.py (0.4.1)
@@ -2,7 +2,7 @@
 import torch
 import random

-from ...core import
+from ...core import TensorTransform
 from ...utils import NumberList, TensorList, generic_ne, unpack_dicts

 def lazy_lr(tensors: TensorList, lr: float | list, inplace:bool):
@@ -12,24 +12,24 @@ def lazy_lr(tensors: TensorList, lr: float | list, inplace:bool):
         return tensors * lr
     return tensors

-class LR(
+class LR(TensorTransform):
     """Learning rate. Adding this module also adds support for LR schedulers."""
     def __init__(self, lr: float):
         defaults=dict(lr=lr)
-        super().__init__(defaults
+        super().__init__(defaults)

     @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         return lazy_lr(TensorList(tensors), lr=[s['lr'] for s in settings], inplace=True)

-class StepSize(
+class StepSize(TensorTransform):
     """this is exactly the same as LR, except the `lr` parameter can be renamed to any other name to avoid clashes"""
     def __init__(self, step_size: float, key = 'step_size'):
         defaults={"key": key, key: step_size}
-        super().__init__(defaults
+        super().__init__(defaults)

     @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         return lazy_lr(TensorList(tensors), lr=[s[s['key']] for s in settings], inplace=True)


@@ -38,8 +38,8 @@ def _warmup_lr(step: int, start_lr: float | NumberList, end_lr: float | NumberLi
     if step > steps: return end_lr
     return start_lr + (end_lr - start_lr) * (step / steps)

-class Warmup(
-    """Learning rate warmup, linearly increases learning rate multiplier from
+class Warmup(TensorTransform):
+    """Learning rate warmup, linearly increases learning rate multiplier from ``start_lr`` to ``end_lr`` over ``steps`` steps.

     Args:
         steps (int, optional): number of steps to perform warmup for. Defaults to 100.
@@ -51,7 +51,7 @@ class Warmup(Transform):

     .. code-block:: python

-        opt = tz.
+        opt = tz.Optimizer(
             model.parameters(),
             tz.m.Adam(),
             tz.m.LR(1e-2),
@@ -64,7 +64,7 @@ class Warmup(Transform):
         super().__init__(defaults, uses_grad=False)

     @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         start_lr, end_lr = unpack_dicts(settings, 'start_lr', 'end_lr', cls = NumberList)
         num_steps = settings[0]['steps']
         step = self.global_state.get('step', 0)
@@ -77,7 +77,7 @@ class Warmup(Transform):
         self.global_state['step'] = step + 1
         return tensors

-class WarmupNormClip(
+class WarmupNormClip(TensorTransform):
     """Warmup via clipping of the update norm.

     Args:
@@ -90,7 +90,7 @@ class WarmupNormClip(Transform):

     .. code-block:: python

-        opt = tz.
+        opt = tz.Optimizer(
             model.parameters(),
             tz.m.Adam(),
             tz.m.WarmupNormClip(steps=1000)
@@ -102,7 +102,7 @@ class WarmupNormClip(Transform):
         super().__init__(defaults, uses_grad=False)

     @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         start_norm, end_norm = unpack_dicts(settings, 'start_norm', 'end_norm', cls = NumberList)
         num_steps = settings[0]['steps']
         step = self.global_state.get('step', 0)
@@ -118,8 +118,8 @@ class WarmupNormClip(Transform):
         return tensors


-class RandomStepSize(
-    """Uses random global or layer-wise step size from
+class RandomStepSize(TensorTransform):
+    """Uses random global or layer-wise step size from ``low`` to ``high``.

     Args:
         low (float, optional): minimum learning rate. Defaults to 0.
@@ -133,7 +133,7 @@ class RandomStepSize(Transform):
         super().__init__(defaults, uses_grad=False)

     @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         s = settings[0]
         parameterwise = s['parameterwise']
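Note: the lr.py hunks above rebase `LR`, `StepSize`, `Warmup`, `WarmupNormClip`, and `RandomStepSize` onto `TensorTransform`, with the per-step hook renamed to `multi_tensor_apply(self, tensors, params, grads, loss, states, settings)`. The sketch below is illustrative only: a hypothetical custom module written against that hook, assuming nothing beyond the constructor and hook signature visible in this diff.

```python
import torch
from torchzero.core import TensorTransform


class HalvedLR(TensorTransform):
    """Hypothetical example module: applies half of the configured learning rate."""

    def __init__(self, lr: float):
        defaults = dict(lr=lr)
        super().__init__(defaults)

    @torch.no_grad
    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
        # `settings` holds one dict per parameter, mirroring how LR reads s['lr'] above.
        torch._foreach_mul_(tensors, [0.5 * s['lr'] for s in settings])
        return tensors
```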
--- torchzero/modules/termination/termination.py (0.3.15)
+++ torchzero/modules/termination/termination.py (0.4.1)
@@ -1,11 +1,11 @@
 import time
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
-from typing import cast
+from typing import cast, final

 import torch

-from ...core import Module,
+from ...core import Module, Objective
 from ...utils import Metrics, TensorList, safe_dict_update_, tofloat


@@ -16,14 +16,15 @@ class TerminationCriteriaBase(Module):
         super().__init__(defaults)

     @abstractmethod
-    def termination_criteria(self,
+    def termination_criteria(self, objective: Objective) -> bool:
         ...

-
+    @final
+    def should_terminate(self, objective: Objective) -> bool:
         n_bad = self.global_state.get('_n_bad', 0)
         n = self.defaults['_n']

-        if self.termination_criteria(
+        if self.termination_criteria(objective):
             n_bad += 1
             if n_bad >= n:
                 self.global_state['_n_bad'] = 0
@@ -36,12 +37,12 @@ class TerminationCriteriaBase(Module):
         return False


-    def update(self,
-
-        if
+    def update(self, objective):
+        objective.should_terminate = self.should_terminate(objective)
+        if objective.should_terminate: self.global_state['_n_bad'] = 0

-    def apply(self,
-        return
+    def apply(self, objective):
+        return objective


 class TerminateAfterNSteps(TerminationCriteriaBase):
@@ -49,7 +50,7 @@ class TerminateAfterNSteps(TerminationCriteriaBase):
         defaults = dict(steps=steps)
         super().__init__(defaults)

-    def termination_criteria(self,
+    def termination_criteria(self, objective):
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1

@@ -61,16 +62,17 @@ class TerminateAfterNEvaluations(TerminationCriteriaBase):
         defaults = dict(maxevals=maxevals)
         super().__init__(defaults)

-    def termination_criteria(self,
+    def termination_criteria(self, objective):
         maxevals = self.defaults['maxevals']
-
+        assert objective.modular is not None
+        return objective.modular.num_evaluations >= maxevals

 class TerminateAfterNSeconds(TerminationCriteriaBase):
     def __init__(self, seconds:float, sec_fn = time.time):
         defaults = dict(seconds=seconds, sec_fn=sec_fn)
         super().__init__(defaults)

-    def termination_criteria(self,
+    def termination_criteria(self, objective):
         max_seconds = self.defaults['seconds']
         sec_fn = self.defaults['sec_fn']

@@ -88,10 +90,10 @@ class TerminateByGradientNorm(TerminationCriteriaBase):
         defaults = dict(tol=tol, ord=ord)
         super().__init__(defaults, n=n)

-    def termination_criteria(self,
+    def termination_criteria(self, objective):
         tol = self.defaults['tol']
         ord = self.defaults['ord']
-        return TensorList(
+        return TensorList(objective.get_grads()).global_metric(ord) <= tol


 class TerminateByUpdateNorm(TerminationCriteriaBase):
@@ -100,20 +102,20 @@ class TerminateByUpdateNorm(TerminationCriteriaBase):
         defaults = dict(tol=tol, ord=ord)
         super().__init__(defaults, n=n)

-    def termination_criteria(self,
+    def termination_criteria(self, objective):
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1

         tol = self.defaults['tol']
         ord = self.defaults['ord']

-        p_prev = self.get_state(
+        p_prev = self.get_state(objective.params, 'p_prev', cls=TensorList)
         if step == 0:
-            p_prev.copy_(
+            p_prev.copy_(objective.params)
             return False

-        should_terminate = (p_prev -
-        p_prev.copy_(
+        should_terminate = (p_prev - objective.params).global_metric(ord) <= tol
+        p_prev.copy_(objective.params)
         return should_terminate


@@ -122,10 +124,10 @@ class TerminateOnNoImprovement(TerminationCriteriaBase):
         defaults = dict(tol=tol)
         super().__init__(defaults, n=n)

-    def termination_criteria(self,
+    def termination_criteria(self, objective):
         tol = self.defaults['tol']

-        f = tofloat(
+        f = tofloat(objective.get_loss(False))
         if 'f_min' not in self.global_state:
             self.global_state['f_min'] = f
             return False
@@ -141,9 +143,9 @@ class TerminateOnLossReached(TerminationCriteriaBase):
         defaults = dict(value=value)
         super().__init__(defaults)

-    def termination_criteria(self,
+    def termination_criteria(self, objective):
         value = self.defaults['value']
-        return
+        return objective.get_loss(False) <= value

 class TerminateAny(TerminationCriteriaBase):
     def __init__(self, *criteria: TerminationCriteriaBase):
@@ -151,9 +153,9 @@ class TerminateAny(TerminationCriteriaBase):

         self.set_children_sequence(criteria)

-    def termination_criteria(self,
+    def termination_criteria(self, objective: Objective) -> bool:
         for c in self.get_children_sequence():
-            if cast(TerminationCriteriaBase, c).termination_criteria(
+            if cast(TerminationCriteriaBase, c).termination_criteria(objective): return True

         return False

@@ -163,9 +165,9 @@ class TerminateAll(TerminationCriteriaBase):

         self.set_children_sequence(criteria)

-    def termination_criteria(self,
+    def termination_criteria(self, objective: Objective) -> bool:
         for c in self.get_children_sequence():
-            if not cast(TerminationCriteriaBase, c).termination_criteria(
+            if not cast(TerminationCriteriaBase, c).termination_criteria(objective): return False

         return True

@@ -173,7 +175,7 @@ class TerminateNever(TerminationCriteriaBase):
     def __init__(self):
         super().__init__()

-    def termination_criteria(self,
+    def termination_criteria(self, objective): return False

 def make_termination_criteria(
     ftol: float | None = None,
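Note: after this rework a concrete criterion only implements `termination_criteria(objective)`; the base class counts consecutive positive checks in `should_terminate` and writes the result into `objective.should_terminate` during `update`. Below is an illustrative sketch of a custom criterion in that style, modeled on `TerminateOnNoImprovement` above; the class name and the import path are assumptions based solely on this diff.

```python
from torchzero.core import Objective
from torchzero.modules.termination.termination import TerminationCriteriaBase


class TerminateOnSmallLoss(TerminationCriteriaBase):
    """Hypothetical criterion: stop once the loss stays below `tol` on `n` consecutive checks."""

    def __init__(self, tol: float = 1e-8, n: int = 2):
        defaults = dict(tol=tol)
        super().__init__(defaults, n=n)

    def termination_criteria(self, objective: Objective) -> bool:
        # get_loss(False) evaluates the closure without a backward pass,
        # the same call TerminateOnNoImprovement uses above.
        return objective.get_loss(False) <= self.defaults['tol']
```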
--- torchzero/modules/trust_region/cubic_regularization.py (0.3.15)
+++ torchzero/modules/trust_region/cubic_regularization.py (0.4.1)
@@ -5,7 +5,7 @@ import torch

 from ...core import Chainable, Module
 from ...utils import TensorList, vec_to_tensors
-from ...
+from ...linalg.linear_operator import LinearOperator
 from .trust_region import _RADIUS_KEYS, TrustRegionBase, _RadiusStrategy


@@ -58,7 +58,7 @@ def ls_cubic_solver(f, g:torch.Tensor, H:LinearOperator, M: float, loss_at_param
     for _ in range(it_max):
         r_try = (r_min + r_max) / 2
         lam = r_try * M
-        s_lam = H.
+        s_lam = H.solve_plus_diag(g, lam).neg()
         # s_lam = -torch.linalg.solve(B + lam*id_matrix, g)
         solver_it += 1
         crit = conv_criterion(s_lam, r_try)
@@ -109,7 +109,7 @@ class CubicRegularization(TrustRegionBase):

     .. code-block:: python

-        opt = tz.
+        opt = tz.Optimizer(
             model.parameters(),
             tz.m.CubicRegularization(tz.m.Newton()),
         )
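Note: `ls_cubic_solver` above bisects on a trial radius `r_try`, solving a shifted system with `lam = r_try * M` at each iteration (now through `H.solve_plus_diag`). The sketch below is a simplified illustration of that bisection on a dense Hessian, with plain `torch.linalg` calls standing in for the package's `LinearOperator` and a cruder convergence test than the real `conv_criterion`.

```python
import torch


def cubic_step(H: torch.Tensor, g: torch.Tensor, M: float,
               r_max: float = 1e8, it_max: int = 100) -> torch.Tensor:
    """Approximate cubic-model step: find r with ||s(r)|| ~= r, where s(r) solves (H + M*r*I) s = -g."""
    eye = torch.eye(H.shape[0], dtype=H.dtype, device=H.device)
    r_min = 0.0
    s = torch.zeros_like(g)
    for _ in range(it_max):
        r_try = (r_min + r_max) / 2
        lam = r_try * M
        s = torch.linalg.solve(H + lam * eye, g).neg()
        norm = torch.linalg.vector_norm(s).item()
        if abs(norm - r_try) <= 1e-8 * max(1.0, r_try):
            break
        # step longer than the trial radius -> more damping (larger r) is needed
        if norm > r_try: r_min = r_try
        else: r_max = r_try
    return s
```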
--- torchzero/modules/trust_region/levenberg_marquardt.py (0.3.15)
+++ torchzero/modules/trust_region/levenberg_marquardt.py (0.4.1)
@@ -2,7 +2,7 @@
 import torch

 from ...core import Chainable, Module
-from ...
+from ...linalg import linear_operator
 from .trust_region import _RADIUS_KEYS, TrustRegionBase, _RadiusStrategy


@@ -32,38 +32,31 @@ class LevenbergMarquardt(TrustRegionBase):
         max_attempts (max_attempts, optional):
             maximum number of trust region size size reductions per step. A zero update vector is returned when
             this limit is exceeded. Defaults to 10.
+        adaptive (bool, optional):
+            if True, trust radius is multiplied by square root of gradient norm.
         fallback (bool, optional):
             if ``True``, when ``hess_module`` maintains hessian inverse which can't be inverted efficiently, it will
             be inverted anyway. When ``False`` (default), a ``RuntimeError`` will be raised instead.
         inner (Chainable | None, optional): preconditioning is applied to output of thise module. Defaults to None.

-    Examples:
-        Gauss-Newton with Levenberg-Marquardt trust-region
+    ### Examples:

-
+    Gauss-Newton with Levenberg-Marquardt trust-region

-
-
-
-
+    ```python
+    opt = tz.Optimizer(
+        model.parameters(),
+        tz.m.LevenbergMarquardt(tz.m.GaussNewton()),
+    )
+    ```

-
-
-
-
-
-
-
-        )
-
-        First order trust region (hessian is assumed to be identity)
-
-        .. code-block:: python
-
-            opt = tz.Modular(
-                model.parameters(),
-                tz.m.LevenbergMarquardt(tz.m.Identity()),
-            )
+    LM-SR1
+    ```python
+    opt = tz.Optimizer(
+        model.parameters(),
+        tz.m.LevenbergMarquardt(tz.m.SR1(inverse=False)),
+    )
+    ```

     """
     def __init__(
@@ -78,11 +71,12 @@ class LevenbergMarquardt(TrustRegionBase):
         max_attempts: int = 10,
         radius_strategy: _RadiusStrategy | _RADIUS_KEYS = 'default',
         y: float = 0,
+        adaptive: bool = False,
         fallback: bool = False,
         update_freq: int = 1,
         inner: Chainable | None = None,
     ):
-        defaults = dict(y=y, fallback=fallback)
+        defaults = dict(y=y, fallback=fallback, adaptive=adaptive)
         super().__init__(
             defaults=defaults,
             hess_module=hess_module,
@@ -103,6 +97,7 @@ class LevenbergMarquardt(TrustRegionBase):

     def trust_solve(self, f, g, H, radius, params, closure, settings):
         y = settings['y']
+        adaptive = settings["adaptive"]

         if isinstance(H, linear_operator.DenseInverse):
             if settings['fallback']:
@@ -117,12 +112,14 @@ class LevenbergMarquardt(TrustRegionBase):
             )

         reg = 1/radius
+        if adaptive: reg = reg * torch.linalg.vector_norm(g).sqrt()
+
         if y == 0:
-            return H.
+            return H.solve_plus_diag(g, reg) # pyright:ignore[reportAttributeAccessIssue]

         diag = H.diagonal()
         diag = torch.where(diag < torch.finfo(diag.dtype).tiny * 2, 1, diag)
         if y != 1: diag = (diag*y) + (1-y)
-        return H.
+        return H.solve_plus_diag(g, diag*reg)
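Note: the new `adaptive` flag only changes the damping strength: instead of `reg = 1/radius`, the diff uses `reg = sqrt(||g||)/radius` before calling `H.solve_plus_diag(g, reg)`. Below is a standalone sketch of that computation on a dense matrix; `torch.linalg.solve` stands in for `solve_plus_diag`, and the sign handling and radius updates performed by `TrustRegionBase` are omitted.

```python
import torch


def lm_solve(H: torch.Tensor, g: torch.Tensor, radius: float, adaptive: bool = False) -> torch.Tensor:
    """Solve the damped system (H + reg*I) d = g, mirroring the y == 0 branch above."""
    reg = 1.0 / radius
    if adaptive:
        # adaptive=True: damping grows with the square root of the gradient norm
        reg = reg * torch.linalg.vector_norm(g).sqrt()
    eye = torch.eye(H.shape[0], dtype=H.dtype, device=H.device)
    return torch.linalg.solve(H + reg * eye, g)
```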
--- torchzero/modules/trust_region/trust_cg.py (0.3.15)
+++ torchzero/modules/trust_region/trust_cg.py (0.4.1)
@@ -1,7 +1,7 @@
 import torch

 from ...core import Chainable, Module
-from ...
+from ...linalg import cg, linear_operator
 from .trust_region import _RADIUS_KEYS, TrustRegionBase, _RadiusStrategy


@@ -47,7 +47,7 @@ class TrustCG(TrustRegionBase):

     .. code-block:: python

-        opt = tz.
+        opt = tz.Optimizer(
             model.parameters(),
             tz.m.TrustCG(hess_module=tz.m.SR1(inverse=False)),
         )
--- torchzero/modules/trust_region/trust_region.py (0.3.15)
+++ torchzero/modules/trust_region/trust_region.py (0.4.1)
@@ -7,9 +7,16 @@ from typing import Any, Literal, Protocol, cast, final, overload

 import torch

-from ...core import Chainable, Module,
-from ...
-from ...utils
+from ...core import Chainable, Module, Objective
+from ...linalg.linear_operator import LinearOperator
+from ...utils import (
+    TensorList,
+    generic_finfo,
+    generic_vector_norm,
+    safe_dict_update_,
+    tofloat,
+    vec_to_tensors,
+)


 def _flatten_tensors(tensors: list[torch.Tensor]):
@@ -256,24 +263,24 @@ class TrustRegionBase(Module, ABC):
         """Solve Hx=g with a trust region penalty/bound defined by `radius`"""
         ... # pylint:disable=unnecessary-ellipsis

-    def trust_region_update(self,
+    def trust_region_update(self, objective: Objective, H: LinearOperator | None) -> None:
         """updates the state of this module after H or B have been updated, if necessary"""

-    def trust_region_apply(self,
-        """Solves the trust region subproblem and outputs ``
+    def trust_region_apply(self, objective: Objective, tensors:list[torch.Tensor], H: LinearOperator | None) -> Objective:
+        """Solves the trust region subproblem and outputs ``Objective`` with the solution direction."""
         assert H is not None

-        params = TensorList(
+        params = TensorList(objective.params)
         settings = self.settings[params[0]]
         g = _flatten_tensors(tensors)

         max_attempts = settings['max_attempts']

         # loss at x_0
-        loss =
-        closure =
+        loss = objective.loss
+        closure = objective.closure
         if closure is None: raise RuntimeError("Trust region requires closure")
-        if loss is None: loss =
+        if loss is None: loss = objective.get_loss(False)
         loss = tofloat(loss)

         # trust region step and update
@@ -313,38 +320,36 @@ class TrustRegionBase(Module, ABC):
         )

         assert d is not None
-        if success:
-        else:
+        if success: objective.updates = vec_to_tensors(d, params)
+        else: objective.updates = params.zeros_like()

-        return
+        return objective


     @final
     @torch.no_grad
-    def update(self,
+    def update(self, objective):
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1

         if step % self.defaults["update_freq"] == 0:

             hessian_module = self.children['hess_module']
-            hessian_module.update(
-            H = hessian_module.get_H(
+            hessian_module.update(objective)
+            H = hessian_module.get_H(objective)
             self.global_state["H"] = H

-            self.trust_region_update(
+            self.trust_region_update(objective, H=H)


     @final
     @torch.no_grad
-    def apply(self,
+    def apply(self, objective):
         H = self.global_state.get('H', None)

         # -------------------------------- inner step -------------------------------- #
-
-        if 'inner' in self.children:
-            update = apply_transform(self.children['inner'], update, params=var.params, grads=var.grad, var=var)
+        objective = self.inner_step("inner", objective, must_exist=False)

         # ----------------------------------- apply ---------------------------------- #
-        return self.trust_region_apply(
+        return self.trust_region_apply(objective=objective, tensors=objective.get_updates(), H=H)
--- torchzero/modules/variance_reduction/svrg.py (0.3.15)
+++ torchzero/modules/variance_reduction/svrg.py (0.4.1)
@@ -3,15 +3,16 @@ from functools import partial

 import torch

-from ...core
+from ...core import Module, Objective
 from ...utils import tofloat


-def _reset_except_self(
-    for m in
+def _reset_except_self(objective: Objective, modules, self: Module):
+    for m in modules:
         if m is not self:
             m.reset()

+
 class SVRG(Module):
     """Stochastic variance reduced gradient method (SVRG).

@@ -43,7 +44,7 @@ class SVRG(Module):
     ## Examples:
     SVRG-LBFGS
     ```python
-    opt = tz.
+    opt = tz.Optimizer(
         model.parameters(),
         tz.m.SVRG(len(dataloader)),
         tz.m.LBFGS(),
@@ -53,7 +54,7 @@ class SVRG(Module):

     For extra variance reduction one can use Online versions of algorithms, although it won't always help.
     ```python
-    opt = tz.
+    opt = tz.Optimizer(
         model.parameters(),
         tz.m.SVRG(len(dataloader)),
         tz.m.Online(tz.m.LBFGS()),
@@ -62,7 +63,7 @@ class SVRG(Module):

     Variance reduction can also be applied to gradient estimators.
     ```python
-    opt = tz.
+    opt = tz.Optimizer(
         model.parameters(),
         tz.m.SPSA(),
         tz.m.SVRG(100),
@@ -71,7 +72,7 @@ class SVRG(Module):
     ```
     ## Notes

-    The SVRG gradient is computed as ``g_b(x) - alpha * g_b(x_0) - g_f(
+    The SVRG gradient is computed as ``g_b(x) - alpha * (g_b(x_0) - g_f(x_0))``, where:
     - ``x`` is current parameters
     - ``x_0`` is initial parameters, where full gradient was computed
     - ``g_b`` refers to mini-batch gradient at ``x`` or ``x_0``
@@ -83,17 +84,18 @@ class SVRG(Module):
         defaults = dict(svrg_steps = svrg_steps, accum_steps=accum_steps, reset_before_accum=reset_before_accum, svrg_loss=svrg_loss, alpha=alpha)
         super().__init__(defaults)

+
     @torch.no_grad
-    def
-        params =
-        closure =
+    def update(self, objective):
+        params = objective.params
+        closure = objective.closure
         assert closure is not None

         if "full_grad" not in self.global_state:

             # -------------------------- calculate full gradient ------------------------- #
-            if "full_closure" in
-                full_closure =
+            if "full_closure" in objective.storage:
+                full_closure = objective.storage['full_closure']
                 with torch.enable_grad():
                     full_loss = full_closure()
                     if all(p.grad is None for p in params):
@@ -116,12 +118,12 @@ class SVRG(Module):

             # accumulate grads
             accumulator = self.get_state(params, 'accumulator')
-            grad =
+            grad = objective.get_grads()
             torch._foreach_add_(accumulator, grad)

             # accumulate loss
             loss_accumulator = self.global_state.get('loss_accumulator', 0)
-            loss_accumulator += tofloat(
+            loss_accumulator += tofloat(objective.loss)
             self.global_state['loss_accumulator'] = loss_accumulator

             # on nth step, use the accumulated gradient
@@ -136,10 +138,10 @@ class SVRG(Module):

             # otherwise skip update until enough grads are accumulated
             else:
-
-
-
-                return
+                objective.updates = None
+                objective.stop = True
+                objective.skip_update = True
+                return


         svrg_steps = self.defaults['svrg_steps']
@@ -194,7 +196,7 @@ class SVRG(Module):

             return closure(False)

-
+        objective.closure = svrg_closure

         # --- after svrg_steps steps reset so that new full gradient is calculated on next step --- #
         if current_svrg_step >= svrg_steps:
@@ -203,6 +205,6 @@ class SVRG(Module):
             del self.global_state['full_loss']
             del self.global_state['x_0']
             if self.defaults['reset_before_accum']:
-
+                objective.post_step_hooks.append(partial(_reset_except_self, self=self))

-
+    def apply(self, objective): return objective

--- torchzero/modules/weight_decay/__init__.py (0.3.15)
+++ torchzero/modules/weight_decay/__init__.py (0.4.1)
@@ -1 +1,2 @@
-from .weight_decay import WeightDecay, DirectWeightDecay, decay_weights_, RelativeWeightDecay
+from .weight_decay import WeightDecay, DirectWeightDecay, decay_weights_, RelativeWeightDecay
+from .reinit import RandomReinitialize