torchzero 0.3.14__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +2 -2
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +47 -36
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +8 -2
- torchzero/core/chain.py +47 -0
- torchzero/core/functional.py +103 -0
- torchzero/core/modular.py +233 -0
- torchzero/core/module.py +132 -643
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +56 -23
- torchzero/core/transform.py +261 -365
- torchzero/linalg/__init__.py +10 -0
- torchzero/linalg/eigh.py +34 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +132 -34
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +95 -0
- torchzero/{utils/linalg → linalg}/qr.py +4 -2
- torchzero/{utils/linalg → linalg}/solve.py +76 -88
- torchzero/linalg/svd.py +20 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/__init__.py +0 -1
- torchzero/modules/adaptive/__init__.py +1 -1
- torchzero/modules/adaptive/adagrad.py +163 -213
- torchzero/modules/adaptive/adahessian.py +74 -103
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +49 -30
- torchzero/modules/adaptive/adaptive_heavyball.py +11 -6
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/lion.py +5 -10
- torchzero/modules/adaptive/lmadagrad.py +87 -32
- torchzero/modules/adaptive/mars.py +5 -5
- torchzero/modules/adaptive/matrix_momentum.py +47 -51
- torchzero/modules/adaptive/msam.py +70 -52
- torchzero/modules/adaptive/muon.py +59 -124
- torchzero/modules/adaptive/natural_gradient.py +33 -28
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +123 -129
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +15 -18
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +26 -37
- torchzero/modules/experimental/__init__.py +3 -6
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/{higher_order → experimental}/higher_order_newton.py +14 -40
- torchzero/modules/experimental/newton_solver.py +22 -53
- torchzero/modules/experimental/newtonnewton.py +20 -17
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +5 -5
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/functional.py +8 -1
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +20 -17
- torchzero/modules/least_squares/gn.py +90 -42
- torchzero/modules/line_search/__init__.py +1 -1
- torchzero/modules/line_search/_polyinterp.py +3 -1
- torchzero/modules/line_search/adaptive.py +3 -3
- torchzero/modules/line_search/backtracking.py +3 -3
- torchzero/modules/line_search/interpolation.py +160 -0
- torchzero/modules/line_search/line_search.py +42 -51
- torchzero/modules/line_search/strong_wolfe.py +5 -5
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +10 -78
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +120 -122
- torchzero/modules/misc/multistep.py +63 -61
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +30 -28
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +34 -28
- torchzero/modules/momentum/momentum.py +11 -11
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +19 -19
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +43 -43
- torchzero/modules/quasi_newton/__init__.py +2 -0
- torchzero/modules/quasi_newton/damping.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +7 -7
- torchzero/modules/quasi_newton/lsr1.py +7 -7
- torchzero/modules/quasi_newton/quasi_newton.py +25 -16
- torchzero/modules/quasi_newton/sg2.py +292 -0
- torchzero/modules/restarts/restars.py +26 -24
- torchzero/modules/second_order/__init__.py +6 -3
- torchzero/modules/second_order/ifn.py +58 -0
- torchzero/modules/second_order/inm.py +101 -0
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +105 -228
- torchzero/modules/second_order/newton_cg.py +102 -154
- torchzero/modules/second_order/nystrom.py +158 -178
- torchzero/modules/second_order/rsn.py +237 -0
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +11 -10
- torchzero/modules/step_size/adaptive.py +23 -23
- torchzero/modules/step_size/lr.py +15 -15
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +2 -2
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +1 -1
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +21 -18
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +12 -13
- torchzero/modules/wrappers/optim_wrapper.py +57 -50
- torchzero/modules/zeroth_order/cd.py +9 -6
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -4
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +6 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +112 -88
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/METADATA +1 -1
- torchzero-0.4.0.dist-info/RECORD +191 -0
- tests/test_vars.py +0 -185
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/modules/higher_order/__init__.py +0 -1
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.14.dist-info/RECORD +0 -167
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/WHEEL +0 -0
- {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/top_level.txt +0 -0
torchzero/modules/trust_region/trust_region.py
CHANGED

@@ -7,9 +7,16 @@ from typing import Any, Literal, Protocol, cast, final, overload

 import torch

-from ...core import Chainable, Module,
-from ...
-from ...utils
+from ...core import Chainable, Module, Objective
+from ...linalg.linear_operator import LinearOperator
+from ...utils import (
+    TensorList,
+    generic_finfo,
+    generic_vector_norm,
+    safe_dict_update_,
+    tofloat,
+    vec_to_tensors,
+)


 def _flatten_tensors(tensors: list[torch.Tensor]):

@@ -256,24 +263,24 @@ class TrustRegionBase(Module, ABC):
         """Solve Hx=g with a trust region penalty/bound defined by `radius`"""
         ... # pylint:disable=unnecessary-ellipsis

-    def trust_region_update(self,
+    def trust_region_update(self, objective: Objective, H: LinearOperator | None) -> None:
         """updates the state of this module after H or B have been updated, if necessary"""

-    def trust_region_apply(self,
-        """Solves the trust region subproblem and outputs ``
+    def trust_region_apply(self, objective: Objective, tensors: list[torch.Tensor], H: LinearOperator | None) -> Objective:
+        """Solves the trust region subproblem and outputs ``Objective`` with the solution direction."""
         assert H is not None

-        params = TensorList(
+        params = TensorList(objective.params)
         settings = self.settings[params[0]]
         g = _flatten_tensors(tensors)

         max_attempts = settings['max_attempts']

         # loss at x_0
-        loss =
-        closure =
+        loss = objective.loss
+        closure = objective.closure
         if closure is None: raise RuntimeError("Trust region requires closure")
-        if loss is None: loss =
+        if loss is None: loss = objective.get_loss(False)
         loss = tofloat(loss)

         # trust region step and update

@@ -313,38 +320,36 @@ class TrustRegionBase(Module, ABC):
         )

         assert d is not None
-        if success:
-        else:
+        if success: objective.updates = vec_to_tensors(d, params)
+        else: objective.updates = params.zeros_like()

-        return
+        return objective


    @final
    @torch.no_grad
-    def update(self,
+    def update(self, objective):
        step = self.global_state.get('step', 0)
        self.global_state['step'] = step + 1

        if step % self.defaults["update_freq"] == 0:

            hessian_module = self.children['hess_module']
-            hessian_module.update(
-            H = hessian_module.get_H(
+            hessian_module.update(objective)
+            H = hessian_module.get_H(objective)
            self.global_state["H"] = H

-            self.trust_region_update(
+            self.trust_region_update(objective, H=H)


    @final
    @torch.no_grad
-    def apply(self,
+    def apply(self, objective):
        H = self.global_state.get('H', None)

        # -------------------------------- inner step -------------------------------- #
-
-        if 'inner' in self.children:
-            update = apply_transform(self.children['inner'], update, params=var.params, grads=var.grad, var=var)
+        objective = self.inner_step("inner", objective, must_exist=False)

        # ----------------------------------- apply ---------------------------------- #
-        return self.trust_region_apply(
+        return self.trust_region_apply(objective=objective, tensors=objective.get_updates(), H=H)

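The hunks above migrate `TrustRegionBase` from the old `var`-based interface to the new `Objective`-based `update()`/`apply()` lifecycle. As a reading aid, here is a minimal sketch of that lifecycle for a custom module; it is inferred only from the signatures visible in this diff, and the import path plus the empty `defaults` dict are assumptions, not part of the package's documented API.

```python
# Hypothetical sketch of the 0.4.0 module lifecycle shown above: update() refreshes
# internal state, apply() consumes objective.get_updates() and writes objective.updates.
import torch
from torchzero.core import Module  # assumed public import path


class SignUpdate(Module):
    """Illustrative module that replaces the current update with its elementwise sign."""

    def __init__(self):
        super().__init__(dict())  # assumed: Module accepts an empty defaults dict

    def update(self, objective):
        # keep a step counter in global_state, mirroring TrustRegionBase.update above
        self.global_state["step"] = self.global_state.get("step", 0) + 1

    @torch.no_grad
    def apply(self, objective):
        objective.updates = [u.sign() for u in objective.get_updates()]
        return objective
```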
torchzero/modules/variance_reduction/svrg.py
CHANGED

@@ -3,15 +3,17 @@ from functools import partial

 import torch

-from ...core
+from ...core import Module, Objective
 from ...utils import tofloat


-def _reset_except_self(
-
+def _reset_except_self(objective: Objective, modules, self: Module):
+    assert objective.modular is not None
+    for m in objective.modular.flat_modules:
         if m is not self:
             m.reset()

+
 class SVRG(Module):
     """Stochastic variance reduced gradient method (SVRG).

@@ -71,7 +73,7 @@ class SVRG(Module):
     ```
     ## Notes

-    The SVRG gradient is computed as ``g_b(x) - alpha * g_b(x_0) - g_f(
+    The SVRG gradient is computed as ``g_b(x) - alpha * (g_b(x_0) - g_f(x_0))``, where:
     - ``x`` is current parameters
     - ``x_0`` is initial parameters, where full gradient was computed
     - ``g_b`` refers to mini-batch gradient at ``x`` or ``x_0``
@@ -83,17 +85,18 @@ class SVRG(Module):
         defaults = dict(svrg_steps = svrg_steps, accum_steps=accum_steps, reset_before_accum=reset_before_accum, svrg_loss=svrg_loss, alpha=alpha)
         super().__init__(defaults)

+
     @torch.no_grad
-    def
-        params =
-        closure =
+    def update(self, objective):
+        params = objective.params
+        closure = objective.closure
         assert closure is not None

         if "full_grad" not in self.global_state:

             # -------------------------- calculate full gradient ------------------------- #
-            if "full_closure" in
-                full_closure =
+            if "full_closure" in objective.storage:
+                full_closure = objective.storage['full_closure']
                 with torch.enable_grad():
                     full_loss = full_closure()
                     if all(p.grad is None for p in params):
@@ -116,12 +119,12 @@ class SVRG(Module):

         # accumulate grads
         accumulator = self.get_state(params, 'accumulator')
-        grad =
+        grad = objective.get_grads()
         torch._foreach_add_(accumulator, grad)

         # accumulate loss
         loss_accumulator = self.global_state.get('loss_accumulator', 0)
-        loss_accumulator += tofloat(
+        loss_accumulator += tofloat(objective.loss)
         self.global_state['loss_accumulator'] = loss_accumulator

         # on nth step, use the accumulated gradient
@@ -136,10 +139,10 @@ class SVRG(Module):

         # otherwise skip update until enough grads are accumulated
         else:
-
-
-
-            return
+            objective.updates = None
+            objective.stop = True
+            objective.skip_update = True
+            return


         svrg_steps = self.defaults['svrg_steps']
@@ -194,7 +197,7 @@ class SVRG(Module):

             return closure(False)

-
+        objective.closure = svrg_closure

         # --- after svrg_steps steps reset so that new full gradient is calculated on next step --- #
         if current_svrg_step >= svrg_steps:
@@ -203,6 +206,6 @@ class SVRG(Module):
             del self.global_state['full_loss']
             del self.global_state['x_0']
             if self.defaults['reset_before_accum']:
-
+                objective.post_step_hooks.append(partial(_reset_except_self, self=self))

-
+    def apply(self, objective): return objective
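The Notes hunk above corrects the SVRG control-variate formula in the docstring. A short plain-PyTorch illustration of that arithmetic (not torchzero API):

```python
# Illustration of the corrected formula from the SVRG docstring above:
# g = g_b(x) - alpha * (g_b(x_0) - g_f(x_0))
# g_b: mini-batch gradient, g_f: full gradient cached at the snapshot point x_0.
import torch

def svrg_gradient(g_b_x: torch.Tensor, g_b_x0: torch.Tensor,
                  g_f_x0: torch.Tensor, alpha: float = 1.0) -> torch.Tensor:
    # the correction term (g_b(x_0) - g_f(x_0)) has zero mean over mini-batches,
    # so the estimate stays unbiased while its variance shrinks near x_0
    return g_b_x - alpha * (g_b_x0 - g_f_x0)
```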
torchzero/modules/weight_decay/__init__.py
CHANGED

@@ -1 +1,2 @@
-from .weight_decay import WeightDecay, DirectWeightDecay, decay_weights_, RelativeWeightDecay
+from .weight_decay import WeightDecay, DirectWeightDecay, decay_weights_, RelativeWeightDecay
+from .reinit import RandomReinitialize
torchzero/modules/weight_decay/reinit.py
ADDED

@@ -0,0 +1,83 @@
+from functools import partial
+
+import torch
+
+from ...core import Module
+from ...utils import NumberList, TensorList
+
+
+def _reset_except_self(optimizer, var, self: Module):
+    for m in optimizer.unrolled_modules:
+        if m is not self:
+            m.reset()
+
+class RandomReinitialize(Module):
+    """On each step with probability ``p_reinit`` trigger reinitialization,
+    whereby ``p_weights`` weights are reset to their initial values.
+
+    This modifies the parameters directly. Place it as the first module.
+
+    Args:
+        p_reinit (float, optional): probability to trigger reinitialization on each step. Defaults to 0.01.
+        p_weights (float, optional): probability for each weight to be set to initial value when reinitialization is triggered. Defaults to 0.1.
+        store_every (int | None, optional): if set, stores new initial values every this many steps. Defaults to None.
+        beta (float, optional):
+            whenever ``store_every`` is triggered, uses linear interpolation with this beta.
+            If ``store_every=1``, this can be set to some value close to 1 such as 0.999
+            to reinitialize to slow parameter EMA. Defaults to 0.
+        reset (bool, optional): whether to reset states of other modules on reinitialization. Defaults to False.
+        seed (int | None, optional): random seed.
+    """
+
+    def __init__(
+        self,
+        p_reinit: float = 0.01,
+        p_weights: float = 0.1,
+        store_every: int | None = None,
+        beta: float = 0,
+        reset: bool = False,
+        seed: int | None = None,
+    ):
+        defaults = dict(p_weights=p_weights, p_reinit=p_reinit, store_every=store_every, beta=beta, reset=reset, seed=seed)
+        super().__init__(defaults)
+
+    def update(self, objective):
+        # this stores initial values to per-parameter states
+        p_init = self.get_state(objective.params, "p_init", init="params", cls=TensorList)
+
+        # store new params every store_every steps
+        step = self.global_state.get("step", 0)
+        self.global_state["step"] = step + 1
+
+        store_every = self.defaults["store_every"]
+        if (store_every is not None and step % store_every == 0):
+            beta = self.get_settings(objective.params, "beta", cls=NumberList)
+            p_init.lerp_(objective.params, weight=(1 - beta))
+
+    @torch.no_grad
+    def apply(self, objective):
+        p_reinit = self.defaults["p_reinit"]
+        device = objective.params[0].device
+        generator = self.get_generator(device, self.defaults["seed"])
+
+        # determine whether to trigger reinitialization
+        reinitialize = torch.rand(1, generator=generator, device=device) < p_reinit
+
+        # reinitialize
+        if reinitialize:
+            params = TensorList(objective.params)
+            p_init = self.get_state(params, "p_init", init=params)
+
+
+            # mask with p_weights entries being True
+            p_weights = self.get_settings(params, "p_weights")
+            mask = params.bernoulli_like(p_weights, generator=generator).as_bool()
+
+            # set weights at mask to their initialization
+            params.masked_set_(mask, p_init)
+
+            # reset
+            if self.defaults["reset"]:
+                objective.post_step_hooks.append(partial(_reset_except_self, self=self))
+
+        return objective
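For orientation, a hypothetical usage sketch of the new `RandomReinitialize` module based on the docstring above; `tz.m.Adam` and `tz.m.LR` are assumed module names and are not part of this diff.

```python
# Hypothetical usage of RandomReinitialize, following its docstring above.
# Only RandomReinitialize and its arguments (p_reinit, p_weights, reset, seed)
# come from this diff; the surrounding modules are assumptions.
import torchzero as tz

opt = tz.Modular(
    model.parameters(),
    # placed first because it modifies the parameters directly (see docstring)
    tz.m.RandomReinitialize(p_reinit=0.01, p_weights=0.1, reset=False, seed=0),
    tz.m.Adam(),
    tz.m.LR(1e-3),
)
```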
torchzero/modules/weight_decay/weight_decay.py
CHANGED

@@ -3,7 +3,7 @@ from typing import Literal

 import torch

-from ...core import Module,
+from ...core import Module, TensorTransform
 from ...utils import NumberList, TensorList, as_tensorlist, unpack_dicts, unpack_states, Metrics


@@ -21,7 +21,7 @@ def weight_decay_(
     return grad_.add_(params.pow(ord-1).copysign_(params).mul_(weight_decay))


-class WeightDecay(
+class WeightDecay(TensorTransform):
     """Weight decay.

     Args:
@@ -63,19 +63,19 @@ class WeightDecay(Transform):
     ```

     """
-    def __init__(self, weight_decay: float, ord: int = 2
+    def __init__(self, weight_decay: float, ord: int = 2):

         defaults = dict(weight_decay=weight_decay, ord=ord)
-        super().__init__(defaults
+        super().__init__(defaults)

     @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         weight_decay = NumberList(s['weight_decay'] for s in settings)
         ord = settings[0]['ord']

         return weight_decay_(as_tensorlist(tensors), as_tensorlist(params), weight_decay, ord)

-class RelativeWeightDecay(
+class RelativeWeightDecay(TensorTransform):
     """Weight decay relative to the mean absolute value of update, gradient or parameters depending on value of ``norm_input`` argument.

     Args:
@@ -117,13 +117,12 @@ class RelativeWeightDecay(Transform):
         ord: int = 2,
         norm_input: Literal["update", "grad", "params"] = "update",
         metric: Metrics = 'mad',
-        target: Target = "update",
     ):
         defaults = dict(weight_decay=weight_decay, ord=ord, norm_input=norm_input, metric=metric)
-        super().__init__(defaults, uses_grad=norm_input == 'grad'
+        super().__init__(defaults, uses_grad=norm_input == 'grad')

     @torch.no_grad
-    def
+    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
         weight_decay = NumberList(s['weight_decay'] for s in settings)

         ord = settings[0]['ord']
@@ -161,9 +160,9 @@ class DirectWeightDecay(Module):
         super().__init__(defaults)

     @torch.no_grad
-    def
-        weight_decay = self.get_settings(
+    def apply(self, objective):
+        weight_decay = self.get_settings(objective.params, 'weight_decay', cls=NumberList)
         ord = self.defaults['ord']

-        decay_weights_(
-        return
+        decay_weights_(objective.params, weight_decay, ord)
+        return objective
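A quick plain-PyTorch check of the penalty gradient used by `weight_decay_` above; for `ord=2` it reduces to the familiar `weight_decay * params` term:

```python
# sign(p) * |p|^(ord-1) * weight_decay, as computed by weight_decay_ above; for
# ord=2 this is the gradient of the usual (weight_decay / 2) * ||p||^2 penalty.
import torch

p = torch.tensor([-2.0, 0.5, 3.0])
wd, ord = 0.1, 2

penalty_grad = p.pow(ord - 1).copysign(p).mul(wd)
assert torch.allclose(penalty_grad, wd * p)  # ord=2 case: weight_decay * params
print(penalty_grad)  # tensor([-0.2000,  0.0500,  0.3000])
```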
torchzero/modules/wrappers/optim_wrapper.py
CHANGED

@@ -3,41 +3,55 @@ from typing import Any
 import torch

 from ...core.module import Module
-from ...utils import Params, _copy_param_groups, _make_param_groups
+from ...utils.params import Params, _copy_param_groups, _make_param_groups


 class Wrap(Module):
     """
     Wraps a pytorch optimizer to use it as a module.

-
-    Custom param groups are supported only by
+    Note:
+        Custom param groups are supported only by ``set_param_groups``, settings passed to Modular will be applied to all parameters.

     Args:
         opt_fn (Callable[..., torch.optim.Optimizer] | torch.optim.Optimizer):
-            function that takes in parameters and returns the optimizer, for example
-            or
+            function that takes in parameters and returns the optimizer, for example ``torch.optim.Adam``
+            or ``lambda parameters: torch.optim.Adam(parameters, lr=1e-3)``
         *args:
         **kwargs:
-            Extra args to be passed to opt_fn. The function is called as
+            Extra args to be passed to opt_fn. The function is called as ``opt_fn(parameters, *args, **kwargs)``.
+        use_param_groups:
+            Whether to pass settings passed to Modular to the wrapped optimizer.

-
-
+            Note that settings to the first parameter are used for all parameters,
+            so if you specified per-parameter settings, they will be ignored.

-
+    ### Example:
+    wrapping pytorch_optimizer.StableAdamW

-
-    opt = tz.Modular(
-        model.parameters(),
-        tz.m.Wrap(StableAdamW, lr=1),
-        tz.m.Cautious(),
-        tz.m.LR(1e-2)
-    )
+    ```python

+    from pytorch_optimizer import StableAdamW
+    opt = tz.Modular(
+        model.parameters(),
+        tz.m.Wrap(StableAdamW, lr=1),
+        tz.m.Cautious(),
+        tz.m.LR(1e-2)
+    )
+    ```

     """
-
-
+
+    def __init__(
+        self,
+        opt_fn: Callable[..., torch.optim.Optimizer] | torch.optim.Optimizer,
+        *args,
+        use_param_groups: bool = True,
+        **kwargs,
+    ):
+        defaults = dict(use_param_groups=use_param_groups)
+        super().__init__(defaults=defaults)
+
         self._opt_fn = opt_fn
         self._opt_args = args
         self._opt_kwargs = kwargs
@@ -48,12 +62,12 @@ class Wrap(Module):
         self.optimizer = self._opt_fn

     def set_param_groups(self, param_groups):
-        self._custom_param_groups = param_groups
+        self._custom_param_groups = _make_param_groups(param_groups, differentiable=False)
         return super().set_param_groups(param_groups)

     @torch.no_grad
-    def
-        params =
+    def apply(self, objective):
+        params = objective.params

         # initialize opt on 1st step
         if self.optimizer is None:
@@ -61,54 +75,47 @@ class Wrap(Module):
             param_groups = params if self._custom_param_groups is None else self._custom_param_groups
             self.optimizer = self._opt_fn(param_groups, *self._opt_args, **self._opt_kwargs)

+        # set optimizer per-parameter settings
+        if self.defaults["use_param_groups"] and objective.modular is not None:
+            for group in self.optimizer.param_groups:
+                first_param = group['params'][0]
+                setting = self.settings[first_param]
+
+                # settings passed in `set_param_groups` are the highest priority
+                # schedulers will override defaults but not settings passed in `set_param_groups`
+                # this is consistent with how Modular does it.
+                if self._custom_param_groups is not None:
+                    setting = {k:v for k,v in setting if k not in self._custom_param_groups[0]}
+
+                group.update(setting)
+
         # set grad to update
         orig_grad = [p.grad for p in params]
-        for p, u in zip(params,
+        for p, u in zip(params, objective.get_updates()):
             p.grad = u

-        # if this
-
-        # and if there are multiple different per-parameter lrs (would be annoying to support)
-        if var.is_last and (
-            (var.last_module_lrs is None)
-            or
-            (('lr' in self.optimizer.defaults) and (len(set(var.last_module_lrs)) == 1))
-        ):
-            lr = 1 if var.last_module_lrs is None else var.last_module_lrs[0]
-
-            # update optimizer lr with desired lr
-            if lr != 1:
-                self.optimizer.defaults['__original_lr__'] = self.optimizer.defaults['lr']
-                for g in self.optimizer.param_groups:
-                    g['__original_lr__'] = g['lr']
-                    g['lr'] = g['lr'] * lr
-
-            # step
+        # if this is last module, simply use optimizer to update parameters
+        if objective.modular is not None and self is objective.modular.modules[-1]:
             self.optimizer.step()

-            # restore original lr
-            if lr != 1:
-                self.optimizer.defaults['lr'] = self.optimizer.defaults.pop('__original_lr__')
-                for g in self.optimizer.param_groups:
-                    g['lr'] = g.pop('__original_lr__')
-
             # restore grad
             for p, g in zip(params, orig_grad):
                 p.grad = g

-
-            return
+            objective.stop = True; objective.skip_update = True
+            return objective

         # this is not the last module, meaning update is difference in parameters
+        # and passed to next module
         params_before_step = [p.clone() for p in params]
         self.optimizer.step() # step and update params
         for p, g in zip(params, orig_grad):
             p.grad = g
-
+        objective.updates = list(torch._foreach_sub(params_before_step, params)) # set update to difference between params
         for p, o in zip(params, params_before_step):
             p.set_(o) # pyright: ignore[reportArgumentType]

-        return
+        return objective

     def reset(self):
         super().reset()
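As a complement to the docstring example above, a hypothetical sketch of the "not the last module" path: the wrapped optimizer's parameter change becomes the update that later modules transform. `torch.optim.SGD` is standard PyTorch; `tz.m.Cautious` and `tz.m.LR` are taken from the docstring example.

```python
# Hypothetical sketch of using Wrap as an inner (non-final) module, per the code above:
# Wrap captures the wrapped optimizer's parameter delta as the update, restores the
# parameters, and hands the delta to the remaining modules.
import torch
import torchzero as tz

opt = tz.Modular(
    model.parameters(),
    tz.m.Wrap(torch.optim.SGD, lr=1.0, momentum=0.9),  # inner optimizer produces the update
    tz.m.Cautious(),                                   # later modules transform that update
    tz.m.LR(1e-2),                                     # final LR scaling before it is applied
)
```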
torchzero/modules/zeroth_order/cd.py
CHANGED

@@ -33,13 +33,16 @@ class CD(Module):
         defaults = dict(h=h, grad=grad, adaptive=adaptive, index=index, threepoint=threepoint)
         super().__init__(defaults)

+    def update(self, objective): raise RuntimeError
+    def apply(self, objective): raise RuntimeError
+
     @torch.no_grad
-    def step(self,
-        closure =
+    def step(self, objective):
+        closure = objective.closure
         if closure is None:
             raise RuntimeError("CD requires closure")

-        params = TensorList(
+        params = TensorList(objective.params)
         ndim = params.global_numel()

         grad_step_size = self.defaults['grad']
@@ -79,7 +82,7 @@ class CD(Module):
         else:
             warnings.warn("CD adaptive=True only works with threepoint=True")

-        f_0 =
+        f_0 = objective.get_loss(False)
         params.flat_set_lambda_(idx, lambda x: x + h)
         f_p = closure(False)

@@ -117,6 +120,6 @@ class CD(Module):
         # ----------------------------- create the update ---------------------------- #
         update = params.zeros_like()
         update.flat_set_(idx, alpha)
-
-        return
+        objective.updates = update
+        return objective

torchzero/optim/root.py
CHANGED

@@ -3,7 +3,7 @@ from collections.abc import Callable

 from abc import abstractmethod
 import torch
-from ..modules.
+from ..modules.second_order.multipoint import sixth_order_3p, sixth_order_5p, two_point_newton, sixth_order_3pm2, _solve

 def make_evaluate(f: Callable[[torch.Tensor], torch.Tensor]):
     def evaluate(x, order) -> tuple[torch.Tensor, ...]:
@@ -53,7 +53,7 @@ class Newton(RootBase):
     def one_iteration(self, x, evaluate): return newton(x, evaluate, self.lstsq)


-class
+class SixthOrder3P(RootBase):
     """sixth-order iterative method

     Abro, Hameer Akhtar, and Muhammad Mujtaba Shaikh. "A new time-efficient and convergent nonlinear solver." Applied Mathematics and Computation 355 (2019): 516-536.
@@ -62,4 +62,4 @@ class SixthOrderP6(RootBase):
     def one_iteration(self, x, evaluate):
         def f(x): return evaluate(x, 0)[0]
         def f_j(x): return evaluate(x, 1)
-        return
+        return sixth_order_3p(x, f, f_j, self.lstsq)
torchzero/optim/utility/split.py
CHANGED

@@ -3,7 +3,8 @@ from collections.abc import Callable, Iterable

 import torch

-from ...utils import flatten
+from ...utils import flatten
+from ...utils.optimizer import get_params

 class Split(torch.optim.Optimizer):
     """Steps will all `optimizers`, also has a check that they have no duplicate parameters.
|