torchzero 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registry.
- torchzero/core/__init__.py +1 -1
- torchzero/core/module.py +72 -49
- torchzero/core/tensorlist_optimizer.py +1 -1
- torchzero/modules/adaptive/adaptive.py +11 -11
- torchzero/modules/experimental/experimental.py +41 -41
- torchzero/modules/experimental/quad_interp.py +8 -8
- torchzero/modules/experimental/subspace.py +37 -37
- torchzero/modules/gradient_approximation/base_approximator.py +19 -24
- torchzero/modules/gradient_approximation/fdm.py +1 -1
- torchzero/modules/gradient_approximation/newton_fdm.py +13 -13
- torchzero/modules/gradient_approximation/rfdm.py +1 -1
- torchzero/modules/line_search/armijo.py +8 -8
- torchzero/modules/line_search/base_ls.py +8 -8
- torchzero/modules/line_search/directional_newton.py +14 -14
- torchzero/modules/line_search/grid_ls.py +7 -7
- torchzero/modules/line_search/scipy_minimize_scalar.py +3 -3
- torchzero/modules/meta/alternate.py +4 -4
- torchzero/modules/meta/grafting.py +23 -23
- torchzero/modules/meta/optimizer_wrapper.py +14 -14
- torchzero/modules/meta/return_overrides.py +8 -8
- torchzero/modules/misc/accumulate.py +6 -6
- torchzero/modules/misc/basic.py +16 -16
- torchzero/modules/misc/lr.py +2 -2
- torchzero/modules/misc/multistep.py +7 -7
- torchzero/modules/misc/on_increase.py +9 -9
- torchzero/modules/momentum/momentum.py +4 -4
- torchzero/modules/operations/multi.py +44 -44
- torchzero/modules/operations/reduction.py +28 -28
- torchzero/modules/operations/singular.py +9 -9
- torchzero/modules/optimizers/adagrad.py +1 -1
- torchzero/modules/optimizers/adam.py +8 -8
- torchzero/modules/optimizers/lion.py +1 -1
- torchzero/modules/optimizers/rmsprop.py +1 -1
- torchzero/modules/optimizers/rprop.py +1 -1
- torchzero/modules/optimizers/sgd.py +2 -2
- torchzero/modules/orthogonalization/newtonschulz.py +3 -3
- torchzero/modules/orthogonalization/svd.py +1 -1
- torchzero/modules/regularization/dropout.py +1 -1
- torchzero/modules/regularization/noise.py +3 -3
- torchzero/modules/regularization/normalization.py +5 -5
- torchzero/modules/regularization/ortho_grad.py +1 -1
- torchzero/modules/regularization/weight_decay.py +1 -1
- torchzero/modules/scheduling/lr_schedulers.py +2 -2
- torchzero/modules/scheduling/step_size.py +8 -8
- torchzero/modules/second_order/newton.py +12 -12
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/gaussian_smoothing.py +7 -7
- torchzero/modules/smoothing/laplacian_smoothing.py +1 -1
- torchzero/modules/weight_averaging/ema.py +3 -3
- torchzero/modules/weight_averaging/swa.py +8 -8
- torchzero/optim/first_order/forward_gradient.py +1 -1
- torchzero/optim/modular.py +4 -4
- torchzero/tensorlist.py +8 -1
- {torchzero-0.1.3.dist-info → torchzero-0.1.5.dist-info}/METADATA +1 -1
- torchzero-0.1.5.dist-info/RECORD +104 -0
- torchzero-0.1.3.dist-info/RECORD +0 -104
- {torchzero-0.1.3.dist-info → torchzero-0.1.5.dist-info}/LICENSE +0 -0
- {torchzero-0.1.3.dist-info → torchzero-0.1.5.dist-info}/WHEEL +0 -0
- {torchzero-0.1.3.dist-info → torchzero-0.1.5.dist-info}/top_level.txt +0 -0
torchzero/core/__init__.py
CHANGED
torchzero/core/module.py
CHANGED
```diff
@@ -23,8 +23,8 @@ def _get_loss(fx0, fx0_approx):
     return fx0
 
 
-class OptimizationState:
-    """Holds optimization
+class OptimizationVars:
+    """Holds optimization variables. This is usually automatically created by :any:`torchzero.optim.Modular`."""
     def __init__(self, closure: _ClosureType | None, model: torch.nn.Module | None):
 
         self.closure: _ClosureType | None = closure
@@ -121,23 +121,23 @@ class OptimizationState:
         Returns:
             A copy of this OptimizationState.
         """
-
-
-
-
+        vars = OptimizationVars(self.closure, self.model)
+        vars.fx0 = self.fx0
+        vars.fx0_approx = self.fx0_approx
+        vars.grad = self.grad
 
-        if clone_ascent and self.ascent is not None:
-        else:
+        if clone_ascent and self.ascent is not None: vars.ascent = self.ascent.clone()
+        else: vars.ascent = self.ascent
 
-        return
+        return vars
 
-    def update_attrs_(self,
+    def update_attrs_(self, vars: "OptimizationVars"):
         """Updates attributes of this state with attributes of another state.
 
         This updates `grad`, `fx0` and `fx0_approx`."""
-        if
-        if
-        if
+        if vars.grad is not None: self.grad = vars.grad
+        if vars.fx0 is not None: self.fx0 = vars.fx0
+        if vars.fx0_approx is not None: self.fx0_approx = vars.fx0_approx
 
 
     def add_post_step_hook(self, hook: Callable):
@@ -283,7 +283,7 @@ class OptimizerModule(TensorListOptimizer, ABC): # type:ignore
         for c in self.children.values():
             self._update_child_params_(c)
 
-    def _update_params_or_step_with_next(self,
+    def _update_params_or_step_with_next(self, vars: OptimizationVars, params: TensorList | None = None) -> _ScalarLoss | None:
         """If this has no children, update params and return loss. Otherwise step with the next module.
 
         Optionally pass params to not recreate them if you've already made them.
@@ -293,29 +293,29 @@ class OptimizerModule(TensorListOptimizer, ABC): # type:ignore
         """
         # if this has no children, update params and return loss.
         if self.next_module is None:
-            if
+            if vars.ascent is None: raise ValueError('Called _update_params_or_step_with_child but ascent_direction is None...')
             if params is None: params = self.get_params()
-            params -=
-            return
+            params -= vars.ascent # type:ignore
+            return vars.get_loss()
 
         # otherwise pass the updated ascent direction to the child
-        return self.next_module.step(
+        return self.next_module.step(vars)
 
     @torch.no_grad
-    def _step_update_closure(self,
+    def _step_update_closure(self, vars: OptimizationVars) -> _ScalarLoss | None:
         """Create a new closure which applies the `_update` method and passes it to the next module."""
-        if
+        if vars.closure is None: raise ValueError('If target == "closure", closure must be provided')
 
         params = self.get_params()
-        closure =
-        ascent_direction =
+        closure = vars.closure # closure shouldn't reference state attribute because it can be changed
+        ascent_direction = vars.ascent
 
         def update_closure(backward = True):
             loss = _maybe_pass_backward(closure, backward)
 
             # on backward, update the ascent direction
             if backward:
-                grad = self._update(
+                grad = self._update(vars, ascent_direction) # type:ignore
                 # set new ascent direction as gradients
                 # (accumulation doesn't make sense here as closure always calls zero_grad)
                 for p, g in zip(params,grad):
@@ -327,12 +327,12 @@ class OptimizerModule(TensorListOptimizer, ABC): # type:ignore
         # if self.next_module is None:
         # raise ValueError(f'{self.__class__.__name__} has no child to step with (maybe set "target" from "closure" to something else??).')
 
-
-        return self._update_params_or_step_with_next(
+        vars.closure = update_closure
+        return self._update_params_or_step_with_next(vars)
 
 
     @torch.no_grad
-    def _step_update_target(self,
+    def _step_update_target(self, vars: OptimizationVars) -> _ScalarLoss | None:
         """Apply _update method to the ascent direction and pass it to the child, or make a step if child is None."""
         # the following code by default uses `_update` method which simple modules can override.
         # But you can also just override the entire `step`.
@@ -342,50 +342,73 @@ class OptimizerModule(TensorListOptimizer, ABC): # type:ignore
         # update ascent direction
         if self._default_step_target == 'ascent':
             # if this is the first module, it uses the gradients
-            if
-            t =
-
+            if vars.grad is None: params = self.get_params()
+            t = vars.maybe_use_grad_(params)
+            vars.ascent = self._update(vars, t)
 
         # update gradients
         elif self._default_step_target == 'grad':
             if params is None: params = self.get_params()
-            g =
-            g = self._update(
-
+            g = vars.maybe_compute_grad_(params)
+            g = self._update(vars, g)
+            vars.set_grad_(g, params)
         else:
             raise ValueError(f"Invalid {self._default_step_target = }")
 
         # peform an update with the new state, or pass it to the child.
-        return self._update_params_or_step_with_next(
+        return self._update_params_or_step_with_next(vars, params=params)
 
     @torch.no_grad
     def step( # type:ignore # pylint:disable=signature-differs # pylint:disable = arguments-renamed
         self,
-
+        vars: OptimizationVars
     ) -> _ScalarLoss | None:
         """Perform a single optimization step to update parameter."""
 
-        if self._default_step_target == 'closure': return self._step_update_closure(
-        return self._step_update_target(
+        if self._default_step_target == 'closure': return self._step_update_closure(vars)
+        return self._step_update_target(vars)
 
     @torch.no_grad
-    def _update(self,
+    def _update(self, vars: OptimizationVars, ascent: TensorList) -> TensorList:
         """Update `ascent_direction` and return the new ascent direction (but it may update it in place).
-        Make sure it doesn't return anything from `state` to avoid future modules modifying that in-place.
+        Make sure it doesn't return anything from `self.state` to avoid future modules modifying that in-place.
 
         Before calling `_update`, if ascent direction was not provided to `step`, it will be set to the gradients.
 
         After generating a new ascent direction with this `_update` method,
         if this module has no child, ascent direction will be subtracted from params.
         Otherwise everything is passed to the child."""
+        params = self.get_params()
+        gradients = ascent.grad
+        if gradients is None: gradients = [None] * len(params)
+        settings = tuple(self.get_all_group_keys(list).items())
+
+        new_ascent = TensorList()
+        for i, (asc, param, grad) in enumerate(zip(ascent, params, gradients)):
+            kwargs = {"vars": vars, "ascent": asc, "param": param, "grad": grad}
+            kwargs.update({k:v[i] for k,v in settings})
+            new_ascent.append(self._single_tensor_update(**kwargs))
+        return new_ascent
+
+
+    def _single_tensor_update(self, vars: OptimizationVars, ascent: torch.Tensor, param: torch.Tensor, grad: torch.Tensor | None, **kwargs) -> torch.Tensor:
+        """Update function for a single tensor.
+
+        Args:
+            vars (OptimizationState): holds loss, gradients, etc.
+            ascent (torch.Tensor): update tensor.
+            param (torch.Tensor): parameter tensor.
+            grad (torch.Tensor | None): gradient tensor (may be None)
+            **kwargs: all per-parameter settings (stuff that you put in `defaults = dict(beta1=beta1, beta2=beta2, eps=eps)`).
+        """
         raise NotImplementedError()
 
-    def return_ascent(self,
+    def return_ascent(self, vars: OptimizationVars, params=None) -> TensorList:
         """step with this module and return the ascent as tensorlist"""
         if params is None: params = self.get_params()
         true_next = self.next_module
         self.next_module = _ReturnAscent(params) # type:ignore
-        ascent: TensorList = self.step(
+        ascent: TensorList = self.step(vars) # type:ignore
         self.next_module = true_next
         return ascent
 
@@ -412,8 +435,8 @@ class _ReturnAscent:
         self.next_module = None
 
     @torch.no_grad
-    def step(self,
-        update =
+    def step(self, vars: OptimizationVars) -> TensorList: # type:ignore
+        update = vars.maybe_use_grad_(self.params) # this will execute the closure which might be modified
         return update
 
 
@@ -424,13 +447,13 @@ class _MaybeReturnAscent(OptimizerModule):
         self._return_ascent = False
 
     @torch.no_grad
-    def step(self,
+    def step(self, vars: OptimizationVars):
         assert self.next_module is None, self.next_module
 
         if self._return_ascent:
-            return
+            return vars.ascent
 
-        return self._update_params_or_step_with_next(
+        return self._update_params_or_step_with_next(vars)
 
 _Chainable = OptimizerModule | Iterable[OptimizerModule]
 
@@ -456,16 +479,16 @@ class _Chain(OptimizerModule):
         self._chain_modules = flat_modules
 
     @torch.no_grad
-    def step(self,
+    def step(self, vars: OptimizationVars):
         # no next module, step with the child
         if self.next_module is None:
-            return self.children['first'].step(
+            return self.children['first'].step(vars)
 
         # return ascent and pass it to next module
         # we do this because updating parameters directly is often more efficient
         params = self.get_params()
         self._last_module.next_module = _ReturnAscent(params) # type:ignore
-
+        vars.ascent: TensorList = self.children['first'].step(vars) # type:ignore
        self._last_module.next_module = None
 
-        return self._update_params_or_step_with_next(
+        return self._update_params_or_step_with_next(vars)
```
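The new `_single_tensor_update` hook shown above lets a module define its update per tensor and leave the iteration, gradient handling, and per-parameter settings to the base `_update`. A minimal sketch of a subclass, assuming `OptimizerModule` is importable from `torchzero.core` and that defaults are forwarded as keyword arguments exactly as in the hunk above (the `MulByAlpha` module and its `alpha` setting are hypothetical):

```python
import torch
from torchzero.core import OptimizerModule  # import path assumed from the package layout above

class MulByAlpha(OptimizerModule):
    """Hypothetical module: scales each tensor of the ascent direction by a per-parameter `alpha`."""
    def __init__(self, alpha: float = 0.5):
        # settings placed in `defaults` reach `_single_tensor_update` as keyword arguments
        super().__init__(dict(alpha=alpha))

    @torch.no_grad
    def _single_tensor_update(self, vars, ascent, param, grad, alpha, **kwargs):
        # `ascent` is this parameter's current update tensor; return the new one
        return ascent * alpha
```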
torchzero/core/tensorlist_optimizer.py
CHANGED
```diff
@@ -149,7 +149,7 @@ class TensorListOptimizer(torch.optim.Optimizer, ABC):
 
     # def get_group_keys[CLS: MutableSequence](self, *keys: str, cls: type[CLS] = NumberList) -> list[CLS]:
     def get_group_keys(self, *keys: str, cls: type[CLS] = NumberList) -> list[CLS]:
-        """Returns a
+        """Returns a list with the param_groups `key` setting of each param."""
 
         all_values: list[CLS] = [cls() for _ in keys]
         for group in self.param_groups:
```
torchzero/modules/adaptive/adaptive.py
CHANGED
```diff
@@ -33,16 +33,16 @@ class Cautious(OptimizerModule):
         self.mode: typing.Literal['zero', 'grad', 'backtrack'] = mode
 
     @torch.no_grad
-    def _update(self,
+    def _update(self, vars, ascent):
         params = self.get_params()
-        grad =
+        grad = vars.maybe_compute_grad_(params)
 
         # mask will be > 0 for parameters where both signs are the same
         mask = (ascent * grad) > 0
         if self.mode in ('zero', 'grad'):
             if self.normalize and self.mode == 'zero':
                 fmask = mask.to(ascent[0].dtype)
-                fmask /= fmask.total_mean() + self.eps
+                fmask /= fmask.total_mean() + self.eps # type:ignore
             else:
                 fmask = mask
 
@@ -66,9 +66,9 @@ class UseGradSign(OptimizerModule):
         super().__init__({})
 
     @torch.no_grad
-    def _update(self,
+    def _update(self, vars, ascent):
         params = self.get_params()
-        grad =
+        grad = vars.maybe_compute_grad_(params)
 
         return ascent.abs_().mul_(grad.sign())
 
@@ -80,9 +80,9 @@ class UseGradMagnitude(OptimizerModule):
         super().__init__({})
 
     @torch.no_grad
-    def _update(self,
+    def _update(self, vars, ascent):
         params = self.get_params()
-        grad =
+        grad = vars.maybe_compute_grad_(params)
 
         return ascent.sign_().mul_(grad.abs())
 
@@ -109,10 +109,10 @@ class ScaleLRBySignChange(OptimizerModule):
         self.use_grad = use_grad
 
     @torch.no_grad
-    def _update(self,
+    def _update(self, vars, ascent):
         params = self.get_params()
 
-        if self.use_grad: cur =
+        if self.use_grad: cur = vars.maybe_compute_grad_(params)
         else: cur = ascent
 
         nplus, nminus, lb, ub = self.get_group_keys('nplus', 'nminus', 'lb', 'ub')
@@ -168,10 +168,10 @@ class NegateOnSignChange(OptimizerModule):
         self.current_step = 0
 
     @torch.no_grad
-    def _update(self,
+    def _update(self, vars, ascent):
         params = self.get_params()
 
-        if self.use_grad: cur =
+        if self.use_grad: cur = vars.maybe_compute_grad_(params)
         else: cur = ascent
 
         prev = self.get_state_key('prev')
```
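For reference, the masking that `Cautious` applies in 'zero' mode keeps only the coordinates where the proposed update and the gradient agree in sign, optionally rescaling the mask by its mean. A single-tensor sketch in plain PyTorch (the library itself works on `TensorList`s and normalizes by the mean over all parameters; the `eps` default here is illustrative):

```python
import torch

def cautious_zero_mode(ascent: torch.Tensor, grad: torch.Tensor,
                       normalize: bool = True, eps: float = 1e-6) -> torch.Tensor:
    """Zero out update entries whose sign disagrees with the gradient."""
    mask = (ascent * grad) > 0               # True where update and gradient point the same way
    fmask = mask.to(ascent.dtype)
    if normalize:
        fmask = fmask / (fmask.mean() + eps) # keep the average step size roughly unchanged
    return ascent * fmask
```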
torchzero/modules/experimental/experimental.py
CHANGED
```diff
@@ -35,9 +35,9 @@ class MinibatchRprop(OptimizerModule):
         self.next_mode = next_mode
 
     @torch.no_grad
-    def step(self,
-        if
-        if
+    def step(self, vars):
+        if vars.closure is None: raise ValueError("Minibatch Rprop requires closure")
+        if vars.ascent is not None: raise ValueError("Minibatch Rprop must be the first module.")
         params = self.get_params()
 
         nplus, nminus, lb, ub = self.get_group_keys('nplus', 'nminus', 'lb', 'ub')
@@ -47,7 +47,7 @@ class MinibatchRprop(OptimizerModule):
             params=params
         )
 
-        g1_sign =
+        g1_sign = vars.maybe_compute_grad_(params).sign() # no inplace to not modify grads
         # initialize on 1st iteration
         if self.current_step == 0:
             magnitudes.fill_(self.get_group_key('alpha')).clamp_(lb, ub)
@@ -58,8 +58,8 @@ class MinibatchRprop(OptimizerModule):
         # first step
         ascent = g1_sign.mul_(magnitudes).mul_(allowed)
         params -= ascent
-        with torch.enable_grad():
-        f0 =
+        with torch.enable_grad(): vars.fx0_approx = vars.closure()
+        f0 = vars.fx0; f1 = vars.fx0_approx
         assert f0 is not None and f1 is not None
 
         # if loss increased, reduce all lrs and undo the update
@@ -73,9 +73,9 @@ class MinibatchRprop(OptimizerModule):
         # on `continue` we move to params after 1st update
         # therefore state must be updated to have all attributes after 1st update
         if self.next_mode == 'continue':
-
-
-            sign =
+            vars.fx0 = vars.fx0_approx
+            vars.grad = params.ensure_grad_().grad
+            sign = vars.grad.sign()
 
         else:
             sign = params.ensure_grad_().grad.sign_() # can use in-place as this is not fx0 grad
@@ -109,19 +109,19 @@ class MinibatchRprop(OptimizerModule):
 
         # update params or step
         if self.next_mode == 'continue' or (self.next_mode == 'add' and self.next_module is None):
-
-            return self._update_params_or_step_with_next(
+            vars.ascent = ascent2
+            return self._update_params_or_step_with_next(vars, params)
 
         if self.next_mode == 'add':
             # undo 1st step
             params += ascent
-
-            return self._update_params_or_step_with_next(
+            vars.ascent = ascent + ascent2
+            return self._update_params_or_step_with_next(vars, params)
 
         if self.next_mode == 'undo':
             params += ascent
-
-            return self._update_params_or_step_with_next(
+            vars.ascent = ascent2
+            return self._update_params_or_step_with_next(vars, params)
 
         raise ValueError(f'invalid next_mode: {self.next_mode}')
 
@@ -140,9 +140,9 @@ class GradMin(OptimizerModule):
         self.create_graph = create_graph
 
     @torch.no_grad
-    def step(self,
-        if
-        if
+    def step(self, vars):
+        if vars.closure is None: raise ValueError()
+        if vars.ascent is not None:
             raise ValueError("GradMin doesn't accept ascent_direction")
 
         params = self.get_params()
@@ -150,26 +150,26 @@ class GradMin(OptimizerModule):
 
         self.zero_grad()
         with torch.enable_grad():
-
-            grads = jacobian([
+            vars.fx0 = vars.closure(False)
+            grads = jacobian([vars.fx0], params, create_graph=True, batched=False) # type:ignore
             grads = TensorList(grads).squeeze_(0)
             if self.square:
                 grads = grads ** 2
             else:
                 grads = grads.abs()
 
-            if self.maximize_grad: grads: TensorList = grads - (
-            else: grads = grads + (
+            if self.maximize_grad: grads: TensorList = grads - (vars.fx0 * loss_term) # type:ignore
+            else: grads = grads + (vars.fx0 * loss_term)
             grad_mean = torch.sum(torch.stack(grads.sum())) / grads.total_numel()
 
         if self.create_graph: grad_mean.backward(create_graph=True)
         else: grad_mean.backward(retain_graph=False)
 
-        if self.maximize_grad:
-        else:
+        if self.maximize_grad: vars.grad = params.ensure_grad_().grad.neg_()
+        else: vars.grad = params.ensure_grad_().grad
 
-
-        return self._update_params_or_step_with_next(
+        vars.maybe_use_grad_(params)
+        return self._update_params_or_step_with_next(vars)
 
 
 class HVPDiagNewton(OptimizerModule):
@@ -182,26 +182,26 @@ class HVPDiagNewton(OptimizerModule):
         super().__init__(dict(eps=eps))
 
     @torch.no_grad
-    def step(self,
-        if
-        if
+    def step(self, vars):
+        if vars.closure is None: raise ValueError()
+        if vars.ascent is not None:
             raise ValueError("HVPDiagNewton doesn't accept ascent_direction")
 
         params = self.get_params()
         eps = self.get_group_key('eps')
-        grad_fx0 =
-
+        grad_fx0 = vars.maybe_compute_grad_(params).clone()
+        vars.grad = grad_fx0 # set state grad to the cloned version, since it will be overwritten
 
         params += grad_fx0 * eps
-        with torch.enable_grad(): _ =
+        with torch.enable_grad(): _ = vars.closure()
 
         params -= grad_fx0 * eps
 
         newton = grad_fx0 * ((grad_fx0 * eps) / (params.grad - grad_fx0))
         newton.nan_to_num_(0,0,0)
 
-
-        return self._update_params_or_step_with_next(
+        vars.ascent = newton
+        return self._update_params_or_step_with_next(vars)
 
 
 
@@ -219,11 +219,11 @@ class ReduceOutwardLR(OptimizerModule):
         self.invert = invert
 
     @torch.no_grad
-    def _update(self,
+    def _update(self, vars, ascent):
         params = self.get_params()
         mul = self.get_group_key('mul')
 
-        if self.use_grad: cur =
+        if self.use_grad: cur = vars.maybe_compute_grad_(params)
         else: cur = ascent
 
         # mask of weights where sign matches with update sign (minus ascent sign), multiplied by `mul`.
@@ -241,7 +241,7 @@ class NoiseSign(OptimizerModule):
         self.distribution:Distributions = distribution
 
 
-    def _update(self,
+    def _update(self, vars, ascent):
         return ascent.sample_like(self.alpha, self.distribution).copysign_(ascent)
 
 class ParamSign(OptimizerModule):
@@ -250,7 +250,7 @@ class ParamSign(OptimizerModule):
         super().__init__({})
 
 
-    def _update(self,
+    def _update(self, vars, ascent):
         params = self.get_params()
 
         return params.copysign(ascent)
@@ -261,7 +261,7 @@ class NegParamSign(OptimizerModule):
         super().__init__({})
 
 
-    def _update(self,
+    def _update(self, vars, ascent):
         neg_params = self.get_params().abs()
         max = neg_params.total_max()
         neg_params = neg_params.neg_().add(max)
@@ -274,7 +274,7 @@ class InvParamSign(OptimizerModule):
         self.eps = eps
 
 
-    def _update(self,
+    def _update(self, vars, ascent):
         inv_params = self.get_params().abs().add_(self.eps).reciprocal_()
         return inv_params.copysign(ascent)
 
@@ -286,7 +286,7 @@ class ParamWhereConsistentSign(OptimizerModule):
         self.eps = eps
 
 
-    def _update(self,
+    def _update(self, vars, ascent):
         params = self.get_params()
         same_sign = params.sign() == ascent.sign()
         ascent.masked_set_(same_sign, params)
```
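The `HVPDiagNewton` hunk above probes the parameters along the gradient, re-evaluates the gradient, and uses the finite difference of the two gradients as a diagonal curvature estimate. A single-tensor sketch of that step, assuming `closure()` re-evaluates the loss and writes fresh gradients into `.grad` (as torchzero closures do):

```python
import torch

def hvp_diag_newton_step(closure, param: torch.Tensor, eps: float = 1e-2) -> torch.Tensor:
    """Finite-difference diagonal Newton step for a single parameter tensor."""
    with torch.enable_grad():
        closure()                 # populates param.grad at the current point
    g0 = param.grad.clone()

    with torch.no_grad():
        param += g0 * eps         # probe along the gradient direction
    with torch.enable_grad():
        closure()                 # param.grad now holds the gradient at the probed point
    with torch.no_grad():
        param -= g0 * eps         # undo the probe

    # (g0 * eps) / (g1 - g0) is a secant estimate of the inverse diagonal curvature
    newton = g0 * ((g0 * eps) / (param.grad - g0))
    return torch.nan_to_num(newton, nan=0.0, posinf=0.0, neginf=0.0)
```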
torchzero/modules/experimental/quad_interp.py
CHANGED
```diff
@@ -4,7 +4,7 @@ import numpy as np
 import torch
 
 from ...tensorlist import TensorList
-from ...core import
+from ...core import OptimizationVars
 from ..line_search.base_ls import LineSearchBase
 
 _FloatOrTensor = float | torch.Tensor
@@ -47,12 +47,12 @@ class QuadraticInterpolation2Point(LineSearchBase):
         self.min_dist = min_dist
 
     @torch.no_grad
-    def _find_best_lr(self,
-        if
-        closure =
-        if
-        grad =
-        if grad is None: grad =
+    def _find_best_lr(self, vars: OptimizationVars, params: TensorList) -> float:
+        if vars.closure is None: raise ValueError('QuardaticLS requires closure')
+        closure = vars.closure
+        if vars.fx0 is None: vars.fx0 = vars.closure(False)
+        grad = vars.grad
+        if grad is None: grad = vars.ascent # in case we used FDM
         if grad is None: raise ValueError('QuardaticLS requires gradients.')
 
         params = self.get_params()
@@ -67,7 +67,7 @@ class QuadraticInterpolation2Point(LineSearchBase):
 
         # make min_dist relative
         min_dist = abs(lr) * self.min_dist
-        points = sorted([Point(0, _ensure_float(
+        points = sorted([Point(0, _ensure_float(vars.fx0), dfx0), Point(lr, _ensure_float(fx1))], key = lambda x: x.fx)
 
         for i in range(self.max_evals):
             # find new point
```