torchzero 0.3.15__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163)
  1. tests/test_identical.py +2 -2
  2. tests/test_module_autograd.py +586 -0
  3. tests/test_objective.py +188 -0
  4. tests/test_opts.py +43 -33
  5. tests/test_tensorlist.py +0 -8
  6. tests/test_utils_optimizer.py +0 -1
  7. torchzero/__init__.py +1 -1
  8. torchzero/core/__init__.py +7 -4
  9. torchzero/core/chain.py +20 -23
  10. torchzero/core/functional.py +90 -24
  11. torchzero/core/modular.py +48 -52
  12. torchzero/core/module.py +130 -50
  13. torchzero/core/objective.py +948 -0
  14. torchzero/core/reformulation.py +55 -24
  15. torchzero/core/transform.py +261 -367
  16. torchzero/linalg/__init__.py +10 -0
  17. torchzero/linalg/eigh.py +34 -0
  18. torchzero/linalg/linalg_utils.py +14 -0
  19. torchzero/{utils/linalg → linalg}/linear_operator.py +99 -49
  20. torchzero/linalg/matrix_power.py +28 -0
  21. torchzero/linalg/orthogonalize.py +95 -0
  22. torchzero/{utils/linalg → linalg}/qr.py +4 -2
  23. torchzero/{utils/linalg → linalg}/solve.py +76 -88
  24. torchzero/linalg/svd.py +20 -0
  25. torchzero/linalg/torch_linalg.py +168 -0
  26. torchzero/modules/adaptive/__init__.py +1 -1
  27. torchzero/modules/adaptive/adagrad.py +163 -213
  28. torchzero/modules/adaptive/adahessian.py +74 -103
  29. torchzero/modules/adaptive/adam.py +53 -76
  30. torchzero/modules/adaptive/adan.py +49 -30
  31. torchzero/modules/adaptive/adaptive_heavyball.py +11 -6
  32. torchzero/modules/adaptive/aegd.py +12 -12
  33. torchzero/modules/adaptive/esgd.py +98 -119
  34. torchzero/modules/adaptive/lion.py +5 -10
  35. torchzero/modules/adaptive/lmadagrad.py +87 -32
  36. torchzero/modules/adaptive/mars.py +5 -5
  37. torchzero/modules/adaptive/matrix_momentum.py +47 -51
  38. torchzero/modules/adaptive/msam.py +70 -52
  39. torchzero/modules/adaptive/muon.py +59 -124
  40. torchzero/modules/adaptive/natural_gradient.py +33 -28
  41. torchzero/modules/adaptive/orthograd.py +11 -15
  42. torchzero/modules/adaptive/rmsprop.py +83 -75
  43. torchzero/modules/adaptive/rprop.py +48 -47
  44. torchzero/modules/adaptive/sam.py +55 -45
  45. torchzero/modules/adaptive/shampoo.py +123 -129
  46. torchzero/modules/adaptive/soap.py +207 -143
  47. torchzero/modules/adaptive/sophia_h.py +106 -130
  48. torchzero/modules/clipping/clipping.py +15 -18
  49. torchzero/modules/clipping/ema_clipping.py +31 -25
  50. torchzero/modules/clipping/growth_clipping.py +14 -17
  51. torchzero/modules/conjugate_gradient/cg.py +26 -37
  52. torchzero/modules/experimental/__init__.py +2 -6
  53. torchzero/modules/experimental/coordinate_momentum.py +36 -0
  54. torchzero/modules/experimental/curveball.py +25 -41
  55. torchzero/modules/experimental/gradmin.py +2 -2
  56. torchzero/modules/experimental/higher_order_newton.py +14 -40
  57. torchzero/modules/experimental/newton_solver.py +22 -53
  58. torchzero/modules/experimental/newtonnewton.py +15 -12
  59. torchzero/modules/experimental/reduce_outward_lr.py +7 -7
  60. torchzero/modules/experimental/scipy_newton_cg.py +21 -24
  61. torchzero/modules/experimental/spsa1.py +3 -3
  62. torchzero/modules/experimental/structural_projections.py +1 -4
  63. torchzero/modules/functional.py +1 -1
  64. torchzero/modules/grad_approximation/forward_gradient.py +7 -7
  65. torchzero/modules/grad_approximation/grad_approximator.py +23 -16
  66. torchzero/modules/grad_approximation/rfdm.py +20 -17
  67. torchzero/modules/least_squares/gn.py +90 -42
  68. torchzero/modules/line_search/backtracking.py +2 -2
  69. torchzero/modules/line_search/line_search.py +32 -32
  70. torchzero/modules/line_search/strong_wolfe.py +2 -2
  71. torchzero/modules/misc/debug.py +12 -12
  72. torchzero/modules/misc/escape.py +10 -10
  73. torchzero/modules/misc/gradient_accumulation.py +10 -78
  74. torchzero/modules/misc/homotopy.py +16 -8
  75. torchzero/modules/misc/misc.py +120 -122
  76. torchzero/modules/misc/multistep.py +50 -48
  77. torchzero/modules/misc/regularization.py +49 -44
  78. torchzero/modules/misc/split.py +30 -28
  79. torchzero/modules/misc/switch.py +37 -32
  80. torchzero/modules/momentum/averaging.py +14 -14
  81. torchzero/modules/momentum/cautious.py +34 -28
  82. torchzero/modules/momentum/momentum.py +11 -11
  83. torchzero/modules/ops/__init__.py +4 -4
  84. torchzero/modules/ops/accumulate.py +21 -21
  85. torchzero/modules/ops/binary.py +67 -66
  86. torchzero/modules/ops/higher_level.py +19 -19
  87. torchzero/modules/ops/multi.py +44 -41
  88. torchzero/modules/ops/reduce.py +26 -23
  89. torchzero/modules/ops/unary.py +53 -53
  90. torchzero/modules/ops/utility.py +47 -46
  91. torchzero/modules/projections/galore.py +1 -1
  92. torchzero/modules/projections/projection.py +43 -43
  93. torchzero/modules/quasi_newton/damping.py +1 -1
  94. torchzero/modules/quasi_newton/lbfgs.py +7 -7
  95. torchzero/modules/quasi_newton/lsr1.py +7 -7
  96. torchzero/modules/quasi_newton/quasi_newton.py +10 -10
  97. torchzero/modules/quasi_newton/sg2.py +19 -19
  98. torchzero/modules/restarts/restars.py +26 -24
  99. torchzero/modules/second_order/__init__.py +2 -2
  100. torchzero/modules/second_order/ifn.py +31 -62
  101. torchzero/modules/second_order/inm.py +49 -53
  102. torchzero/modules/second_order/multipoint.py +40 -80
  103. torchzero/modules/second_order/newton.py +57 -90
  104. torchzero/modules/second_order/newton_cg.py +102 -154
  105. torchzero/modules/second_order/nystrom.py +157 -177
  106. torchzero/modules/second_order/rsn.py +106 -96
  107. torchzero/modules/smoothing/laplacian.py +13 -12
  108. torchzero/modules/smoothing/sampling.py +11 -10
  109. torchzero/modules/step_size/adaptive.py +23 -23
  110. torchzero/modules/step_size/lr.py +15 -15
  111. torchzero/modules/termination/termination.py +32 -30
  112. torchzero/modules/trust_region/cubic_regularization.py +2 -2
  113. torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
  114. torchzero/modules/trust_region/trust_cg.py +1 -1
  115. torchzero/modules/trust_region/trust_region.py +27 -22
  116. torchzero/modules/variance_reduction/svrg.py +21 -18
  117. torchzero/modules/weight_decay/__init__.py +2 -1
  118. torchzero/modules/weight_decay/reinit.py +83 -0
  119. torchzero/modules/weight_decay/weight_decay.py +12 -13
  120. torchzero/modules/wrappers/optim_wrapper.py +10 -10
  121. torchzero/modules/zeroth_order/cd.py +9 -6
  122. torchzero/optim/root.py +3 -3
  123. torchzero/optim/utility/split.py +2 -1
  124. torchzero/optim/wrappers/directsearch.py +27 -63
  125. torchzero/optim/wrappers/fcmaes.py +14 -35
  126. torchzero/optim/wrappers/mads.py +11 -31
  127. torchzero/optim/wrappers/moors.py +66 -0
  128. torchzero/optim/wrappers/nevergrad.py +4 -4
  129. torchzero/optim/wrappers/nlopt.py +31 -25
  130. torchzero/optim/wrappers/optuna.py +6 -13
  131. torchzero/optim/wrappers/pybobyqa.py +124 -0
  132. torchzero/optim/wrappers/scipy/__init__.py +7 -0
  133. torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
  134. torchzero/optim/wrappers/scipy/brute.py +48 -0
  135. torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
  136. torchzero/optim/wrappers/scipy/direct.py +69 -0
  137. torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
  138. torchzero/optim/wrappers/scipy/experimental.py +141 -0
  139. torchzero/optim/wrappers/scipy/minimize.py +151 -0
  140. torchzero/optim/wrappers/scipy/sgho.py +111 -0
  141. torchzero/optim/wrappers/wrapper.py +121 -0
  142. torchzero/utils/__init__.py +7 -25
  143. torchzero/utils/compile.py +2 -2
  144. torchzero/utils/derivatives.py +93 -69
  145. torchzero/utils/optimizer.py +4 -77
  146. torchzero/utils/python_tools.py +31 -0
  147. torchzero/utils/tensorlist.py +11 -5
  148. torchzero/utils/thoad_tools.py +68 -0
  149. {torchzero-0.3.15.dist-info → torchzero-0.4.0.dist-info}/METADATA +1 -1
  150. torchzero-0.4.0.dist-info/RECORD +191 -0
  151. tests/test_vars.py +0 -185
  152. torchzero/core/var.py +0 -376
  153. torchzero/modules/experimental/momentum.py +0 -160
  154. torchzero/optim/wrappers/scipy.py +0 -572
  155. torchzero/utils/linalg/__init__.py +0 -12
  156. torchzero/utils/linalg/matrix_funcs.py +0 -87
  157. torchzero/utils/linalg/orthogonalize.py +0 -12
  158. torchzero/utils/linalg/svd.py +0 -20
  159. torchzero/utils/ops.py +0 -10
  160. torchzero-0.3.15.dist-info/RECORD +0 -175
  161. /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
  162. {torchzero-0.3.15.dist-info → torchzero-0.4.0.dist-info}/WHEEL +0 -0
  163. {torchzero-0.3.15.dist-info → torchzero-0.4.0.dist-info}/top_level.txt +0 -0
torchzero/modules/termination/termination.py (+32 -30)
@@ -1,11 +1,11 @@
 import time
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
-from typing import cast
+from typing import cast, final
 
 import torch
 
-from ...core import Module, Var
+from ...core import Module, Objective
 from ...utils import Metrics, TensorList, safe_dict_update_, tofloat
 
 
@@ -16,14 +16,15 @@ class TerminationCriteriaBase(Module):
         super().__init__(defaults)
 
     @abstractmethod
-    def termination_criteria(self, var: Var) -> bool:
+    def termination_criteria(self, objective: Objective) -> bool:
         ...
 
-    def should_terminate(self, var: Var) -> bool:
+    @final
+    def should_terminate(self, objective: Objective) -> bool:
         n_bad = self.global_state.get('_n_bad', 0)
         n = self.defaults['_n']
 
-        if self.termination_criteria(var):
+        if self.termination_criteria(objective):
             n_bad += 1
             if n_bad >= n:
                 self.global_state['_n_bad'] = 0
@@ -36,12 +37,12 @@ class TerminationCriteriaBase(Module):
         return False
 
 
-    def update(self, var):
-        var.should_terminate = self.should_terminate(var)
-        if var.should_terminate: self.global_state['_n_bad'] = 0
+    def update(self, objective):
+        objective.should_terminate = self.should_terminate(objective)
+        if objective.should_terminate: self.global_state['_n_bad'] = 0
 
-    def apply(self, var):
-        return var
+    def apply(self, objective):
+        return objective
 
 
 class TerminateAfterNSteps(TerminationCriteriaBase):
@@ -49,7 +50,7 @@ class TerminateAfterNSteps(TerminationCriteriaBase):
         defaults = dict(steps=steps)
         super().__init__(defaults)
 
-    def termination_criteria(self, var):
+    def termination_criteria(self, objective):
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1
 
@@ -61,16 +62,17 @@ class TerminateAfterNEvaluations(TerminationCriteriaBase):
         defaults = dict(maxevals=maxevals)
         super().__init__(defaults)
 
-    def termination_criteria(self, var):
+    def termination_criteria(self, objective):
         maxevals = self.defaults['maxevals']
-        return var.modular.num_evaluations >= maxevals
+        assert objective.modular is not None
+        return objective.modular.num_evaluations >= maxevals
 
 class TerminateAfterNSeconds(TerminationCriteriaBase):
     def __init__(self, seconds:float, sec_fn = time.time):
         defaults = dict(seconds=seconds, sec_fn=sec_fn)
         super().__init__(defaults)
 
-    def termination_criteria(self, var):
+    def termination_criteria(self, objective):
         max_seconds = self.defaults['seconds']
         sec_fn = self.defaults['sec_fn']
 
@@ -88,10 +90,10 @@ class TerminateByGradientNorm(TerminationCriteriaBase):
         defaults = dict(tol=tol, ord=ord)
         super().__init__(defaults, n=n)
 
-    def termination_criteria(self, var):
+    def termination_criteria(self, objective):
         tol = self.defaults['tol']
         ord = self.defaults['ord']
-        return TensorList(var.get_grad()).global_metric(ord) <= tol
+        return TensorList(objective.get_grads()).global_metric(ord) <= tol
 
 
 class TerminateByUpdateNorm(TerminationCriteriaBase):
@@ -100,20 +102,20 @@ class TerminateByUpdateNorm(TerminationCriteriaBase):
         defaults = dict(tol=tol, ord=ord)
         super().__init__(defaults, n=n)
 
-    def termination_criteria(self, var):
+    def termination_criteria(self, objective):
         step = self.global_state.get('step', 0)
         self.global_state['step'] = step + 1
 
         tol = self.defaults['tol']
         ord = self.defaults['ord']
 
-        p_prev = self.get_state(var.params, 'p_prev', cls=TensorList)
+        p_prev = self.get_state(objective.params, 'p_prev', cls=TensorList)
         if step == 0:
-            p_prev.copy_(var.params)
+            p_prev.copy_(objective.params)
             return False
 
-        should_terminate = (p_prev - var.params).global_metric(ord) <= tol
-        p_prev.copy_(var.params)
+        should_terminate = (p_prev - objective.params).global_metric(ord) <= tol
+        p_prev.copy_(objective.params)
         return should_terminate
 
 
@@ -122,10 +124,10 @@ class TerminateOnNoImprovement(TerminationCriteriaBase):
         defaults = dict(tol=tol)
         super().__init__(defaults, n=n)
 
-    def termination_criteria(self, var):
+    def termination_criteria(self, objective):
         tol = self.defaults['tol']
 
-        f = tofloat(var.get_loss(False))
+        f = tofloat(objective.get_loss(False))
         if 'f_min' not in self.global_state:
             self.global_state['f_min'] = f
             return False
@@ -141,9 +143,9 @@ class TerminateOnLossReached(TerminationCriteriaBase):
         defaults = dict(value=value)
         super().__init__(defaults)
 
-    def termination_criteria(self, var):
+    def termination_criteria(self, objective):
         value = self.defaults['value']
-        return var.get_loss(False) <= value
+        return objective.get_loss(False) <= value
 
 class TerminateAny(TerminationCriteriaBase):
     def __init__(self, *criteria: TerminationCriteriaBase):
 
         self.set_children_sequence(criteria)
 
-    def termination_criteria(self, var: Var) -> bool:
+    def termination_criteria(self, objective: Objective) -> bool:
         for c in self.get_children_sequence():
-            if cast(TerminationCriteriaBase, c).termination_criteria(var): return True
+            if cast(TerminationCriteriaBase, c).termination_criteria(objective): return True
 
         return False
 
@@ -163,9 +165,9 @@ class TerminateAll(TerminationCriteriaBase):
 
         self.set_children_sequence(criteria)
 
-    def termination_criteria(self, var: Var) -> bool:
+    def termination_criteria(self, objective: Objective) -> bool:
         for c in self.get_children_sequence():
-            if not cast(TerminationCriteriaBase, c).termination_criteria(var): return False
+            if not cast(TerminationCriteriaBase, c).termination_criteria(objective): return False
 
         return True
 
@@ -173,7 +175,7 @@ class TerminateNever(TerminationCriteriaBase):
     def __init__(self):
         super().__init__()
 
-    def termination_criteria(self, var): return False
+    def termination_criteria(self, objective): return False
 
 def make_termination_criteria(
     ftol: float | None = None,
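The rename is mechanical, but the contract is now tighter: subclasses override only ``termination_criteria(objective)``, while the ``@final`` ``should_terminate`` keeps the patience counter (``_n``/``_n_bad``). A minimal sketch of a custom criterion against the 0.4.0 API; the import path is read off the file list above and is an assumption, only the class and method names come from this diff:

```python
# Sketch only: a custom termination criterion under the new Objective-based API.
from torchzero.core import Objective
from torchzero.modules.termination.termination import TerminationCriteriaBase


class TerminateOnSmallLoss(TerminationCriteriaBase):
    def __init__(self, value: float):
        # defaults dict is stored by the base Module, as in TerminateOnLossReached
        super().__init__(dict(value=value))

    def termination_criteria(self, objective: Objective) -> bool:
        # get_loss(False) evaluates the closure without requiring a backward pass
        return objective.get_loss(False) <= self.defaults['value']
```
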
torchzero/modules/trust_region/cubic_regularization.py (+2 -2)
@@ -5,7 +5,7 @@ import torch
 
 from ...core import Chainable, Module
 from ...utils import TensorList, vec_to_tensors
-from ...utils.linalg.linear_operator import LinearOperator
+from ...linalg.linear_operator import LinearOperator
 from .trust_region import _RADIUS_KEYS, TrustRegionBase, _RadiusStrategy
 
 
@@ -58,7 +58,7 @@ def ls_cubic_solver(f, g:torch.Tensor, H:LinearOperator, M: float, loss_at_param
     for _ in range(it_max):
         r_try = (r_min + r_max) / 2
         lam = r_try * M
-        s_lam = H.add_diagonal(lam).solve(g).neg()
+        s_lam = H.solve_plus_diag(g, lam).neg()
         # s_lam = -torch.linalg.solve(B + lam*id_matrix, g)
         solver_it += 1
         crit = conv_criterion(s_lam, r_try)
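Here and in the Levenberg-Marquardt and trust-region hunks below, the old two-step ``H.add_diagonal(lam).solve(g)`` chain becomes a single ``H.solve_plus_diag(g, lam)`` call. The intended semantics, read off the two call sites rather than the ``LinearOperator`` implementation itself, amount to solving the diagonally shifted system, illustrated here with a plain dense matrix:

```python
# Dense stand-in for what both call styles compute: x = (H + lam*I)^{-1} g.
import torch

H = torch.tensor([[4.0, 1.0],
                  [1.0, 3.0]])          # stand-in for the Hessian operator
g = torch.tensor([1.0, 2.0])
lam = 0.5

x = torch.linalg.solve(H + lam * torch.eye(2), g)  # old: H.add_diagonal(lam).solve(g)
                                                   # new: H.solve_plus_diag(g, lam)
```

In the Levenberg-Marquardt module below the same call additionally scales the shift by ``sqrt(||g||)`` when the new ``adaptive`` flag is set.
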
torchzero/modules/trust_region/levenberg_marquardt.py (+25 -28)
@@ -2,7 +2,7 @@
 import torch
 
 from ...core import Chainable, Module
-from ...utils.linalg import linear_operator
+from ...linalg import linear_operator
 from .trust_region import _RADIUS_KEYS, TrustRegionBase, _RadiusStrategy
 
 
@@ -32,38 +32,31 @@ class LevenbergMarquardt(TrustRegionBase):
         max_attempts (max_attempts, optional):
             maximum number of trust region size size reductions per step. A zero update vector is returned when
             this limit is exceeded. Defaults to 10.
+        adaptive (bool, optional):
+            if True, trust radius is multiplied by square root of gradient norm.
         fallback (bool, optional):
             if ``True``, when ``hess_module`` maintains hessian inverse which can't be inverted efficiently, it will
             be inverted anyway. When ``False`` (default), a ``RuntimeError`` will be raised instead.
         inner (Chainable | None, optional): preconditioning is applied to output of thise module. Defaults to None.
 
-    Examples:
-        Gauss-Newton with Levenberg-Marquardt trust-region
+    ### Examples:
 
-        .. code-block:: python
+    Gauss-Newton with Levenberg-Marquardt trust-region
 
-            opt = tz.Modular(
-                model.parameters(),
-                tz.m.LevenbergMarquardt(tz.m.GaussNewton()),
-            )
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.LevenbergMarquardt(tz.m.GaussNewton()),
+    )
+    ```
 
-        LM-SR1
-
-        .. code-block:: python
-
-            opt = tz.Modular(
-                model.parameters(),
-                tz.m.LevenbergMarquardt(tz.m.SR1(inverse=False)),
-            )
-
-        First order trust region (hessian is assumed to be identity)
-
-        .. code-block:: python
-
-            opt = tz.Modular(
-                model.parameters(),
-                tz.m.LevenbergMarquardt(tz.m.Identity()),
-            )
+    LM-SR1
+    ```python
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.LevenbergMarquardt(tz.m.SR1(inverse=False)),
+    )
+    ```
 
     """
     def __init__(
@@ -78,11 +71,12 @@ class LevenbergMarquardt(TrustRegionBase):
         max_attempts: int = 10,
         radius_strategy: _RadiusStrategy | _RADIUS_KEYS = 'default',
         y: float = 0,
+        adaptive: bool = False,
         fallback: bool = False,
         update_freq: int = 1,
         inner: Chainable | None = None,
     ):
-        defaults = dict(y=y, fallback=fallback)
+        defaults = dict(y=y, fallback=fallback, adaptive=adaptive)
         super().__init__(
             defaults=defaults,
             hess_module=hess_module,
@@ -103,6 +97,7 @@ class LevenbergMarquardt(TrustRegionBase):
 
     def trust_solve(self, f, g, H, radius, params, closure, settings):
         y = settings['y']
+        adaptive = settings["adaptive"]
 
         if isinstance(H, linear_operator.DenseInverse):
             if settings['fallback']:
 
@@ -117,12 +112,14 @@ class LevenbergMarquardt(TrustRegionBase):
             )
 
         reg = 1/radius
+        if adaptive: reg = reg * torch.linalg.vector_norm(g).sqrt()
+
         if y == 0:
-            return H.add_diagonal(reg).solve(g)
+            return H.solve_plus_diag(g, reg) # pyright:ignore[reportAttributeAccessIssue]
 
         diag = H.diagonal()
         diag = torch.where(diag < torch.finfo(diag.dtype).tiny * 2, 1, diag)
         if y != 1: diag = (diag*y) + (1-y)
-        return H.add_diagonal(diag*reg).solve(g)
+        return H.solve_plus_diag(g, diag*reg)
 
torchzero/modules/trust_region/trust_cg.py (+1 -1)
@@ -1,7 +1,7 @@
 import torch
 
 from ...core import Chainable, Module
-from ...utils.linalg import cg, linear_operator
+from ...linalg import cg, linear_operator
 from .trust_region import _RADIUS_KEYS, TrustRegionBase, _RadiusStrategy
 
 
torchzero/modules/trust_region/trust_region.py (+27 -22)
@@ -7,9 +7,16 @@ from typing import Any, Literal, Protocol, cast, final, overload
 
 import torch
 
-from ...core import Chainable, Module, Var, apply_transform
-from ...utils import TensorList, safe_dict_update_, tofloat, vec_to_tensors, generic_finfo, generic_vector_norm
-from ...utils.linalg.linear_operator import LinearOperator
+from ...core import Chainable, Module, Objective
+from ...linalg.linear_operator import LinearOperator
+from ...utils import (
+    TensorList,
+    generic_finfo,
+    generic_vector_norm,
+    safe_dict_update_,
+    tofloat,
+    vec_to_tensors,
+)
 
 
 def _flatten_tensors(tensors: list[torch.Tensor]):
@@ -256,24 +263,24 @@ class TrustRegionBase(Module, ABC):
         """Solve Hx=g with a trust region penalty/bound defined by `radius`"""
         ... # pylint:disable=unnecessary-ellipsis
 
-    def trust_region_update(self, var: Var, H: LinearOperator | None) -> None:
+    def trust_region_update(self, objective: Objective, H: LinearOperator | None) -> None:
         """updates the state of this module after H or B have been updated, if necessary"""
 
-    def trust_region_apply(self, var: Var, tensors:list[torch.Tensor], H: LinearOperator | None) -> Var:
-        """Solves the trust region subproblem and outputs ``Var`` with the solution direction."""
+    def trust_region_apply(self, objective: Objective, tensors:list[torch.Tensor], H: LinearOperator | None) -> Objective:
+        """Solves the trust region subproblem and outputs ``Objective`` with the solution direction."""
         assert H is not None
 
-        params = TensorList(var.params)
+        params = TensorList(objective.params)
         settings = self.settings[params[0]]
         g = _flatten_tensors(tensors)
 
         max_attempts = settings['max_attempts']
 
         # loss at x_0
-        loss = var.loss
-        closure = var.closure
+        loss = objective.loss
+        closure = objective.closure
         if closure is None: raise RuntimeError("Trust region requires closure")
-        if loss is None: loss = var.get_loss(False)
+        if loss is None: loss = objective.get_loss(False)
         loss = tofloat(loss)
 
         # trust region step and update
@@ -313,38 +320,36 @@ class TrustRegionBase(Module, ABC):
         )
 
         assert d is not None
-        if success: var.update = vec_to_tensors(d, params)
-        else: var.update = params.zeros_like()
+        if success: objective.updates = vec_to_tensors(d, params)
+        else: objective.updates = params.zeros_like()
 
-        return var
+        return objective
 
 
    @final
    @torch.no_grad
-    def update(self, var):
+    def update(self, objective):
        step = self.global_state.get('step', 0)
        self.global_state['step'] = step + 1
 
        if step % self.defaults["update_freq"] == 0:
 
            hessian_module = self.children['hess_module']
-            hessian_module.update(var)
-            H = hessian_module.get_H(var)
+            hessian_module.update(objective)
+            H = hessian_module.get_H(objective)
            self.global_state["H"] = H
 
-            self.trust_region_update(var, H=H)
+            self.trust_region_update(objective, H=H)
 
 
    @final
    @torch.no_grad
-    def apply(self, var):
+    def apply(self, objective):
        H = self.global_state.get('H', None)
 
        # -------------------------------- inner step -------------------------------- #
-        update = var.get_update()
-        if 'inner' in self.children:
-            update = apply_transform(self.children['inner'], update, params=var.params, grads=var.grad, var=var)
+        objective = self.inner_step("inner", objective, must_exist=False)
 
        # ----------------------------------- apply ---------------------------------- #
-        return self.trust_region_apply(var=var, tensors=update, H=H)
+        return self.trust_region_apply(objective=objective, tensors=objective.get_updates(), H=H)
 
torchzero/modules/variance_reduction/svrg.py (+21 -18)
@@ -3,15 +3,17 @@ from functools import partial
 
 import torch
 
-from ...core.module import Module
+from ...core import Module, Objective
 from ...utils import tofloat
 
 
-def _reset_except_self(optimizer, var, self: Module):
-    for m in optimizer.unrolled_modules:
+def _reset_except_self(objective: Objective, modules, self: Module):
+    assert objective.modular is not None
+    for m in objective.modular.flat_modules:
         if m is not self:
             m.reset()
 
+
 class SVRG(Module):
     """Stochastic variance reduced gradient method (SVRG).
 
@@ -71,7 +73,7 @@ class SVRG(Module):
     ```
     ## Notes
 
-    The SVRG gradient is computed as ``g_b(x) - alpha * g_b(x_0) - g_f(x0.)``, where:
+    The SVRG gradient is computed as ``g_b(x) - alpha * (g_b(x_0) - g_f(x_0))``, where:
     - ``x`` is current parameters
     - ``x_0`` is initial parameters, where full gradient was computed
     - ``g_b`` refers to mini-batch gradient at ``x`` or ``x_0``
@@ -83,17 +85,18 @@ class SVRG(Module):
         defaults = dict(svrg_steps = svrg_steps, accum_steps=accum_steps, reset_before_accum=reset_before_accum, svrg_loss=svrg_loss, alpha=alpha)
         super().__init__(defaults)
 
+
     @torch.no_grad
-    def step(self, var):
-        params = var.params
-        closure = var.closure
+    def update(self, objective):
+        params = objective.params
+        closure = objective.closure
         assert closure is not None
 
         if "full_grad" not in self.global_state:
 
             # -------------------------- calculate full gradient ------------------------- #
-            if "full_closure" in var.storage:
-                full_closure = var.storage['full_closure']
+            if "full_closure" in objective.storage:
+                full_closure = objective.storage['full_closure']
                 with torch.enable_grad():
                     full_loss = full_closure()
                     if all(p.grad is None for p in params):
@@ -116,12 +119,12 @@ class SVRG(Module):
 
             # accumulate grads
             accumulator = self.get_state(params, 'accumulator')
-            grad = var.get_grad()
+            grad = objective.get_grads()
             torch._foreach_add_(accumulator, grad)
 
             # accumulate loss
             loss_accumulator = self.global_state.get('loss_accumulator', 0)
-            loss_accumulator += tofloat(var.loss)
+            loss_accumulator += tofloat(objective.loss)
             self.global_state['loss_accumulator'] = loss_accumulator
 
             # on nth step, use the accumulated gradient
@@ -136,10 +139,10 @@ class SVRG(Module):
 
             # otherwise skip update until enough grads are accumulated
             else:
-                var.update = None
-                var.stop = True
-                var.skip_update = True
-                return var
+                objective.updates = None
+                objective.stop = True
+                objective.skip_update = True
+                return
 
 
         svrg_steps = self.defaults['svrg_steps']
@@ -194,7 +197,7 @@ class SVRG(Module):
 
             return closure(False)
 
-        var.closure = svrg_closure
+        objective.closure = svrg_closure
 
         # --- after svrg_steps steps reset so that new full gradient is calculated on next step --- #
         if current_svrg_step >= svrg_steps:
@@ -203,6 +206,6 @@ class SVRG(Module):
             del self.global_state['full_loss']
             del self.global_state['x_0']
             if self.defaults['reset_before_accum']:
-                var.post_step_hooks.append(partial(_reset_except_self, self=self))
+                objective.post_step_hooks.append(partial(_reset_except_self, self=self))
 
-        return var
+    def apply(self, objective): return objective
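The docstring correction above is the substantive part: the control variate is applied to the whole difference between the snapshot mini-batch gradient and the snapshot full gradient. Spelled out with throwaway tensors (illustration only, not library code):

```python
# v = g_b(x) - alpha * (g_b(x_0) - g_f(x_0))
import torch

g_b_x  = torch.tensor([0.9, -0.2])   # mini-batch gradient at current params x
g_b_x0 = torch.tensor([1.1, -0.1])   # mini-batch gradient at snapshot x_0
g_f_x0 = torch.tensor([1.0,  0.0])   # full gradient at snapshot x_0
alpha  = 1.0

v = g_b_x - alpha * (g_b_x0 - g_f_x0)  # variance-reduced gradient estimate
```
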
torchzero/modules/weight_decay/__init__.py (+2 -1)
@@ -1 +1,2 @@
-from .weight_decay import WeightDecay, DirectWeightDecay, decay_weights_, RelativeWeightDecay
+from .weight_decay import WeightDecay, DirectWeightDecay, decay_weights_, RelativeWeightDecay
+from .reinit import RandomReinitialize
torchzero/modules/weight_decay/reinit.py (new file, +83)
@@ -0,0 +1,83 @@
+from functools import partial
+
+import torch
+
+from ...core import Module
+from ...utils import NumberList, TensorList
+
+
+def _reset_except_self(optimizer, var, self: Module):
+    for m in optimizer.unrolled_modules:
+        if m is not self:
+            m.reset()
+
+class RandomReinitialize(Module):
+    """On each step with probability ``p_reinit`` trigger reinitialization,
+    whereby ``p_weights`` weights are reset to their initial values.
+
+    This modifies the parameters directly. Place it as the first module.
+
+    Args:
+        p_reinit (float, optional): probability to trigger reinitialization on each step. Defaults to 0.01.
+        p_weights (float, optional): probability for each weight to be set to initial value when reinitialization is triggered. Defaults to 0.1.
+        store_every (int | None, optional): if set, stores new initial values every this many steps. Defaults to None.
+        beta (float, optional):
+            whenever ``store_every`` is triggered, uses linear interpolation with this beta.
+            If ``store_every=1``, this can be set to some value close to 1 such as 0.999
+            to reinitialize to slow parameter EMA. Defaults to 0.
+        reset (bool, optional): whether to reset states of other modules on reinitialization. Defaults to False.
+        seed (int | None, optional): random seed.
+    """
+
+    def __init__(
+        self,
+        p_reinit: float = 0.01,
+        p_weights: float = 0.1,
+        store_every: int | None = None,
+        beta: float = 0,
+        reset: bool = False,
+        seed: int | None = None,
+    ):
+        defaults = dict(p_weights=p_weights, p_reinit=p_reinit, store_every=store_every, beta=beta, reset=reset, seed=seed)
+        super().__init__(defaults)
+
+    def update(self, objective):
+        # this stores initial values to per-parameter states
+        p_init = self.get_state(objective.params, "p_init", init="params", cls=TensorList)
+
+        # store new params every store_every steps
+        step = self.global_state.get("step", 0)
+        self.global_state["step"] = step + 1
+
+        store_every = self.defaults["store_every"]
+        if (store_every is not None and step % store_every == 0):
+            beta = self.get_settings(objective.params, "beta", cls=NumberList)
+            p_init.lerp_(objective.params, weight=(1 - beta))
+
+    @torch.no_grad
+    def apply(self, objective):
+        p_reinit = self.defaults["p_reinit"]
+        device = objective.params[0].device
+        generator = self.get_generator(device, self.defaults["seed"])
+
+        # determine whether to trigger reinitialization
+        reinitialize = torch.rand(1, generator=generator, device=device) < p_reinit
+
+        # reinitialize
+        if reinitialize:
+            params = TensorList(objective.params)
+            p_init = self.get_state(params, "p_init", init=params)
+
+
+            # mask with p_weights entries being True
+            p_weights = self.get_settings(params, "p_weights")
+            mask = params.bernoulli_like(p_weights, generator=generator).as_bool()
+
+            # set weights at mask to their initialization
+            params.masked_set_(mask, p_init)
+
+            # reset
+            if self.defaults["reset"]:
+                objective.post_step_hooks.append(partial(_reset_except_self, self=self))
+
+        return objective
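Since the docstring asks for the module to be placed first, a plausible way to wire it into a chain looks like the following; ``RandomReinitialize`` and ``tz.Modular`` come from this diff, while pairing it with ``tz.m.Adam``/``tz.m.LR`` (and the ``model``) is assumed for illustration:

```python
# Hypothetical usage: reinitialization runs before the actual update modules.
import torchzero as tz

opt = tz.Modular(
    model.parameters(),                                    # any nn.Module's parameters
    tz.m.RandomReinitialize(p_reinit=0.01, p_weights=0.1),
    tz.m.Adam(),                                           # assumed module name
    tz.m.LR(1e-3),                                         # assumed module name
)
```
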
torchzero/modules/weight_decay/weight_decay.py (+12 -13)
@@ -3,7 +3,7 @@ from typing import Literal
 
 import torch
 
-from ...core import Module, Target, Transform
+from ...core import Module, TensorTransform
 from ...utils import NumberList, TensorList, as_tensorlist, unpack_dicts, unpack_states, Metrics
 
 
@@ -21,7 +21,7 @@ def weight_decay_(
     return grad_.add_(params.pow(ord-1).copysign_(params).mul_(weight_decay))
 
 
-class WeightDecay(Transform):
+class WeightDecay(TensorTransform):
     """Weight decay.
 
     Args:
@@ -63,19 +63,19 @@ class WeightDecay(Transform):
    ```
 
    """
-    def __init__(self, weight_decay: float, ord: int = 2, target: Target = 'update'):
+    def __init__(self, weight_decay: float, ord: int = 2):
 
        defaults = dict(weight_decay=weight_decay, ord=ord)
-        super().__init__(defaults, uses_grad=False, target=target)
+        super().__init__(defaults)
 
    @torch.no_grad
-    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
        weight_decay = NumberList(s['weight_decay'] for s in settings)
        ord = settings[0]['ord']
 
        return weight_decay_(as_tensorlist(tensors), as_tensorlist(params), weight_decay, ord)
 
-class RelativeWeightDecay(Transform):
+class RelativeWeightDecay(TensorTransform):
    """Weight decay relative to the mean absolute value of update, gradient or parameters depending on value of ``norm_input`` argument.
 
    Args:
@@ -117,13 +117,12 @@ class RelativeWeightDecay(Transform):
        ord: int = 2,
        norm_input: Literal["update", "grad", "params"] = "update",
        metric: Metrics = 'mad',
-        target: Target = "update",
    ):
        defaults = dict(weight_decay=weight_decay, ord=ord, norm_input=norm_input, metric=metric)
-        super().__init__(defaults, uses_grad=norm_input == 'grad', target=target)
+        super().__init__(defaults, uses_grad=norm_input == 'grad')
 
    @torch.no_grad
-    def apply_tensors(self, tensors, params, grads, loss, states, settings):
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
        weight_decay = NumberList(s['weight_decay'] for s in settings)
 
        ord = settings[0]['ord']
@@ -161,9 +160,9 @@ class DirectWeightDecay(Module):
        super().__init__(defaults)
 
    @torch.no_grad
-    def step(self, var):
-        weight_decay = self.get_settings(var.params, 'weight_decay', cls=NumberList)
+    def apply(self, objective):
+        weight_decay = self.get_settings(objective.params, 'weight_decay', cls=NumberList)
        ord = self.defaults['ord']
 
-        decay_weights_(var.params, weight_decay, ord)
-        return var
+        decay_weights_(objective.params, weight_decay, ord)
+        return objective
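For reference, the penalty that ``weight_decay_`` adds to the incoming tensors, written with plain tensors; the general-``ord`` term mirrors ``params.pow(ord-1).copysign_(params)``:

```python
# u <- u + wd * |p|**(ord-1) * sign(p); for ord=2 this is the usual u + wd * p.
import torch

p  = torch.tensor([0.5, -2.0])   # parameters
u  = torch.tensor([0.1,  0.3])   # update being transformed
wd = 0.01

u_l2 = u + wd * p                           # ord = 2
u_l3 = u + wd * p.abs().pow(2) * p.sign()   # ord = 3
```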