torchzero 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchzero/core/__init__.py +1 -1
- torchzero/core/module.py +72 -49
- torchzero/core/tensorlist_optimizer.py +1 -1
- torchzero/modules/adaptive/adaptive.py +11 -11
- torchzero/modules/experimental/experimental.py +41 -41
- torchzero/modules/experimental/quad_interp.py +8 -8
- torchzero/modules/experimental/subspace.py +37 -37
- torchzero/modules/gradient_approximation/base_approximator.py +19 -24
- torchzero/modules/gradient_approximation/fdm.py +1 -1
- torchzero/modules/gradient_approximation/newton_fdm.py +13 -13
- torchzero/modules/gradient_approximation/rfdm.py +1 -1
- torchzero/modules/line_search/armijo.py +8 -8
- torchzero/modules/line_search/base_ls.py +8 -8
- torchzero/modules/line_search/directional_newton.py +14 -14
- torchzero/modules/line_search/grid_ls.py +7 -7
- torchzero/modules/line_search/scipy_minimize_scalar.py +3 -3
- torchzero/modules/meta/alternate.py +4 -4
- torchzero/modules/meta/grafting.py +23 -23
- torchzero/modules/meta/optimizer_wrapper.py +14 -14
- torchzero/modules/meta/return_overrides.py +8 -8
- torchzero/modules/misc/accumulate.py +6 -6
- torchzero/modules/misc/basic.py +16 -16
- torchzero/modules/misc/lr.py +2 -2
- torchzero/modules/misc/multistep.py +7 -7
- torchzero/modules/misc/on_increase.py +9 -9
- torchzero/modules/momentum/momentum.py +4 -4
- torchzero/modules/operations/multi.py +44 -44
- torchzero/modules/operations/reduction.py +28 -28
- torchzero/modules/operations/singular.py +9 -9
- torchzero/modules/optimizers/adagrad.py +1 -1
- torchzero/modules/optimizers/adam.py +8 -8
- torchzero/modules/optimizers/lion.py +1 -1
- torchzero/modules/optimizers/rmsprop.py +1 -1
- torchzero/modules/optimizers/rprop.py +1 -1
- torchzero/modules/optimizers/sgd.py +2 -2
- torchzero/modules/orthogonalization/newtonschulz.py +3 -3
- torchzero/modules/orthogonalization/svd.py +1 -1
- torchzero/modules/regularization/dropout.py +1 -1
- torchzero/modules/regularization/noise.py +3 -3
- torchzero/modules/regularization/normalization.py +5 -5
- torchzero/modules/regularization/ortho_grad.py +1 -1
- torchzero/modules/regularization/weight_decay.py +1 -1
- torchzero/modules/scheduling/lr_schedulers.py +2 -2
- torchzero/modules/scheduling/step_size.py +8 -8
- torchzero/modules/second_order/newton.py +12 -12
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/gaussian_smoothing.py +7 -7
- torchzero/modules/smoothing/laplacian_smoothing.py +1 -1
- torchzero/modules/weight_averaging/ema.py +3 -3
- torchzero/modules/weight_averaging/swa.py +8 -8
- torchzero/optim/first_order/forward_gradient.py +1 -1
- torchzero/optim/modular.py +4 -4
- torchzero/tensorlist.py +8 -1
- {torchzero-0.1.3.dist-info → torchzero-0.1.5.dist-info}/METADATA +1 -1
- torchzero-0.1.5.dist-info/RECORD +104 -0
- torchzero-0.1.3.dist-info/RECORD +0 -104
- {torchzero-0.1.3.dist-info → torchzero-0.1.5.dist-info}/LICENSE +0 -0
- {torchzero-0.1.3.dist-info → torchzero-0.1.5.dist-info}/WHEEL +0 -0
- {torchzero-0.1.3.dist-info → torchzero-0.1.5.dist-info}/top_level.txt +0 -0

torchzero/modules/experimental/subspace.py

@@ -5,14 +5,14 @@ from collections import abc
 import torch

 from ... import tensorlist as tl
-from ...core import
+from ...core import OptimizationVars, OptimizerModule, _Chain, _maybe_pass_backward
 # this whole thing can also be implemented via parameter vectors.
 # Need to test which one is more efficient...

 class Projection(ABC):
     n = 1
     @abstractmethod
-    def sample(self, params: tl.TensorList,
+    def sample(self, params: tl.TensorList, vars: OptimizationVars) -> list[tl.TensorList]:
         """Generate a projection.

         Args:
@@ -28,7 +28,7 @@ class ProjRandom(Projection):
         self.distribution: tl.Distributions = distribution
         self.n = n

-    def sample(self, params: tl.TensorList,
+    def sample(self, params: tl.TensorList, vars: OptimizationVars):
         return [params.sample_like(distribution=self.distribution) for _ in range(self.n)]


@@ -42,7 +42,7 @@ class Proj2Masks(Projection):
     def n(self):
         return self.n_pairs * 2

-    def sample(self, params: tl.TensorList,
+    def sample(self, params: tl.TensorList, vars: OptimizationVars):
         projections = []
         for i in range(self.n_pairs):
             mask = params.bernoulli_like(0.5)
@@ -55,9 +55,9 @@ class Proj2Masks(Projection):

 class ProjAscent(Projection):
     """Use ascent direction as the projection."""
-    def sample(self, params: tl.TensorList,
-        if
-        return [
+    def sample(self, params: tl.TensorList, vars: OptimizationVars):
+        if vars.ascent is None: raise ValueError
+        return [vars.ascent]

 class ProjAscentRay(Projection):
     def __init__(self, eps = 0.1, n = 1, distribution: tl.Distributions = 'normal', ):
@@ -65,14 +65,14 @@ class ProjAscentRay(Projection):
         self.distribution: tl.Distributions = distribution
         self.n = n

-    def sample(self, params: tl.TensorList,
-        if
+    def sample(self, params: tl.TensorList, vars: OptimizationVars):
+        if vars.ascent is None: raise ValueError
         mean = params.total_mean().detach().cpu().item()
-        return [
+        return [vars.ascent + vars.ascent.sample_like(mean * self.eps, distribution=self.distribution) for _ in range(self.n)]

 class ProjGrad(Projection):
-    def sample(self, params: tl.TensorList,
-        grad =
+    def sample(self, params: tl.TensorList, vars: OptimizationVars):
+        grad = vars.maybe_compute_grad_(params)
         return [grad]

 class ProjGradRay(Projection):
@@ -81,8 +81,8 @@ class ProjGradRay(Projection):
         self.distribution: tl.Distributions = distribution
         self.n = n

-    def sample(self, params: tl.TensorList,
-        grad =
+    def sample(self, params: tl.TensorList, vars: OptimizationVars):
+        grad = vars.maybe_compute_grad_(params)
         mean = params.total_mean().detach().cpu().item()
         return [grad + grad.sample_like(mean * self.eps, distribution=self.distribution) for _ in range(self.n)]

@@ -95,23 +95,23 @@ class ProjGradAscentDifference(Projection):
         """
         self.normalize = normalize

-    def sample(self, params: tl.TensorList,
-        grad =
+    def sample(self, params: tl.TensorList, vars: OptimizationVars):
+        grad = vars.maybe_compute_grad_(params)
         if self.normalize:
-            return [
+            return [vars.ascent / vars.ascent.total_vector_norm(2) - grad / grad.total_vector_norm(2)] # type:ignore

-        return [
+        return [vars.ascent - grad] # type:ignore

 class ProjLastGradDifference(Projection):
     def __init__(self):
         """Use difference between last two gradients as the projection."""
         self.last_grad = None
-    def sample(self, params: tl.TensorList,
+    def sample(self, params: tl.TensorList, vars: OptimizationVars):
         if self.last_grad is None:
-            self.last_grad =
+            self.last_grad = vars.maybe_compute_grad_(params)
             return [self.last_grad]

-        grad =
+        grad = vars.maybe_compute_grad_(params)
         diff = grad - self.last_grad
         self.last_grad = grad
         return [diff]
@@ -121,13 +121,13 @@ class ProjLastAscentDifference(Projection):
         """Use difference between last two ascent directions as the projection."""
         self.last_direction = T.cast(tl.TensorList, None)

-    def sample(self, params: tl.TensorList,
+    def sample(self, params: tl.TensorList, vars: OptimizationVars):
         if self.last_direction is None:
-            self.last_direction: tl.TensorList =
+            self.last_direction: tl.TensorList = vars.ascent # type:ignore
             return [self.last_direction]

-        diff =
-        self.last_direction =
+        diff = vars.ascent - self.last_direction # type:ignore
+        self.last_direction = vars.ascent # type:ignore
         return [diff]

 class ProjNormalize(Projection):
@@ -139,10 +139,10 @@ class ProjNormalize(Projection):
     def n(self):
         return sum(proj.n for proj in self.projections)

-    def sample(self, params: tl.TensorList,
-        vecs = [proj for obj in self.projections for proj in obj.sample(params,
+    def sample(self, params: tl.TensorList, vars: OptimizationVars): # type:ignore
+        vecs = [proj for obj in self.projections for proj in obj.sample(params, vars)]
         norms = [v.total_vector_norm(2) for v in vecs]
-        return [v/norm if norm!=0 else v.randn_like() for v,norm in zip(vecs,norms)]
+        return [v/norm if norm!=0 else v.randn_like() for v,norm in zip(vecs,norms)] # type:ignore

 class Subspace(OptimizerModule):
     """This is pretty inefficient, I thought of a much better way to do this via jvp and I will rewrite this soon.
@@ -198,17 +198,17 @@ class Subspace(OptimizerModule):
         child.add_param_group({"params": params})

     @torch.no_grad
-    def step(self,
+    def step(self, vars):
         #if self.next_module is None: raise ValueError('RandomProjection needs a child')
-        if
-        closure =
+        if vars.closure is None: raise ValueError('RandomProjection needs a closure')
+        closure = vars.closure
         params = self.get_params()

         # every `regenerate_every` steps we generate new random projections.
         if self.current_step == 0 or (self.update_every is not None and self.current_step % self.update_every == 0):

             # generate n projection vetors
-            self.projection_vectors = [sample for proj in self.projections for sample in proj.sample(params,
+            self.projection_vectors = [sample for proj in self.projections for sample in proj.sample(params, vars)]

             # child params is n scalars corresponding to each projection vector
             self.projected_params = self.children['subspace']._params[0] # type:ignore
@@ -235,7 +235,7 @@ class Subspace(OptimizerModule):
         # ascent_direction = tl.sum([ascent_direction*v for v in self.projection_vectors])

         # perform a step with the child
-        subspace_state =
+        subspace_state = vars.copy(False)
         subspace_state.closure = projected_closure
         subspace_state.ascent = None
         if subspace_state.grad is not None:
@@ -244,11 +244,11 @@ class Subspace(OptimizerModule):

         # that is going to update child's paramers, which we now project back to the full parameter space
         residual = tl.sum([vec * p for vec, p in zip(self.projection_vectors, self.projected_params)])
-
+        vars.ascent = residual.neg_()

         # move fx0 and fx0 approx to state
-        if subspace_state.fx0 is not None:
-        if subspace_state.fx0_approx is not None:
+        if subspace_state.fx0 is not None: vars.fx0 = subspace_state.fx0
+        if subspace_state.fx0_approx is not None: vars.fx0 = subspace_state.fx0_approx
         # projected_params are residuals that have been applied to actual params on previous step in some way
         # therefore they need to now become zero (otherwise they work like momentum with no decay).
         # note: THIS WON'T WORK WITH INTEGRATIONS, UNLESS THEY PERFORM FULL MINIMIZATION EACH STEP
@@ -256,4 +256,4 @@ class Subspace(OptimizerModule):
         self.projected_params.zero_()

         self.current_step += 1
-        return self._update_params_or_step_with_next(
+        return self._update_params_or_step_with_next(vars)
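
The hunks above change every Projection.sample signature to sample(self, params, vars), where vars is the new OptimizationVars object carrying the closure, the current ascent direction and cached loss values. Below is a hedged sketch of a custom projection written against that 0.1.5 signature; the ProjScaledGrad class is hypothetical, and the absolute import paths are assumptions inferred from the relative imports visible in the diff (only vars.maybe_compute_grad_ and the TensorList arithmetic shown in the hunks are relied on).

# Hedged sketch, not package code: a hypothetical projection using the 0.1.5
# sample(params, vars) signature shown in the hunks above. Import paths are
# assumptions based on the relative imports in the diff.
from torchzero import tensorlist as tl
from torchzero.core import OptimizationVars
from torchzero.modules.experimental.subspace import Projection

class ProjScaledGrad(Projection):
    """Hypothetical projection: the (lazily computed) gradient, rescaled."""
    def __init__(self, scale: float = 1.0):
        self.scale = scale

    def sample(self, params: tl.TensorList, vars: OptimizationVars):
        # maybe_compute_grad_ evaluates the gradient via vars.closure only if needed
        grad = vars.maybe_compute_grad_(params)
        return [grad * self.scale]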

torchzero/modules/gradient_approximation/base_approximator.py

@@ -5,7 +5,7 @@ from typing import Any, Literal
 import torch

 from ...core import (
-
+    OptimizationVars,
     OptimizerModule,
     _ClosureType,
     _maybe_pass_backward,
@@ -39,12 +39,12 @@ class GradientApproximatorBase(OptimizerModule, ABC):
         super().__init__(defaults, target)
         self.requires_fx0 = requires_fx0

-    def _step_make_closure_(self,
-        if
-        closure =
+    def _step_make_closure_(self, vars: OptimizationVars, params: TensorList):
+        if vars.closure is None: raise ValueError("gradient approximation requires closure")
+        closure = vars.closure

-        if self.requires_fx0: fx0 =
-        else: fx0 =
+        if self.requires_fx0: fx0 = vars.evaluate_fx0_(False)
+        else: fx0 = vars.fx0

         def new_closure(backward=True) -> _ScalarLoss:
             if backward:
@@ -56,35 +56,35 @@ class GradientApproximatorBase(OptimizerModule, ABC):

             return closure(False)

-
+        vars.closure = new_closure

-    def _step_make_target_(self,
-        if
+    def _step_make_target_(self, vars: OptimizationVars, params: TensorList):
+        if vars.closure is None: raise ValueError("gradient approximation requires closure")

-        if self.requires_fx0: fx0 =
-        else: fx0 =
+        if self.requires_fx0: fx0 = vars.evaluate_fx0_(False)
+        else: fx0 = vars.fx0

-        g,
-        if self._default_step_target == 'ascent':
-        elif self._default_step_target == 'grad':
+        g, vars.fx0, vars.fx0_approx = self._make_ascent(vars.closure, params, fx0)
+        if self._default_step_target == 'ascent': vars.ascent = g
+        elif self._default_step_target == 'grad': vars.set_grad_(g, params)
         else: raise ValueError(f"Unknown target {self._default_step_target}")

     @torch.no_grad
-    def step(self,
+    def step(self, vars: OptimizationVars):
         params = self.get_params()
         if self._default_step_target == 'closure':
-            self._step_make_closure_(
+            self._step_make_closure_(vars, params)

         else:
-            self._step_make_target_(
+            self._step_make_target_(vars, params)

-        return self._update_params_or_step_with_next(
+        return self._update_params_or_step_with_next(vars, params)

     @abstractmethod
     @torch.no_grad
     def _make_ascent(
         self,
-        #
+        # vars: OptimizationVars,
         closure: _ClosureType,
         params: TensorList,
         fx0: Any,
@@ -95,11 +95,6 @@ class GradientApproximatorBase(OptimizerModule, ABC):

         (ascent, fx0, fx0_approx)

-        :code:`ascent` is the approximated gradient,
-        :code:`fx0` is loss value strictly with initial parameters of the current step,
-        :code:`fx0_approx` is loss value with perturbed parameters (will be returned by optimizer step if fx0 is None).
-        :code:`fx0` and :code:`fx0_approx` can be None.
-
         Args:
             closure (_ClosureType): closure
             params (TensorList): parameters
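
For reference, the docstring lines removed in the last hunk described the _make_ascent contract: it returns (ascent, fx0, fx0_approx), where ascent is the approximated gradient, fx0 the loss at the unperturbed parameters, fx0_approx the loss at perturbed parameters, and either loss value may be None. The snippet below is a self-contained plain-PyTorch illustration of that three-tuple idea using a naive forward-difference estimate on a single tensor; it does not use or depend on the package, and all names in it are made up for the example.

# Self-contained sketch (plain PyTorch, not torchzero code): the same
# (ascent, fx0, fx0_approx) idea with a forward-difference gradient estimate.
import torch

def forward_difference_ascent(closure, param: torch.Tensor, eps: float = 1e-4):
    """Approximate d closure / d param by forward differences."""
    with torch.no_grad():
        fx0 = closure()                       # loss at the unperturbed parameters
        ascent = torch.zeros_like(param)
        flat_p, flat_g = param.view(-1), ascent.view(-1)
        for i in range(flat_p.numel()):
            orig = flat_p[i].item()
            flat_p[i] = orig + eps            # perturb one coordinate
            flat_g[i] = (closure() - fx0) / eps
            flat_p[i] = orig                  # restore it
        return ascent, fx0, None              # no perturbed-loss value kept here

# usage: one approximate gradient step on f(x) = ||x||^2 without autograd
x = torch.tensor([1.0, -2.0, 3.0])
g, fx0, _ = forward_difference_ascent(lambda: (x ** 2).sum(), x)
x -= 0.1 * g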

torchzero/modules/gradient_approximation/fdm.py

@@ -4,7 +4,7 @@ import torch

 from ...utils.python_tools import _ScalarLoss
 from ...tensorlist import TensorList
-from ...core import _ClosureType, OptimizerModule,
+from ...core import _ClosureType, OptimizerModule, OptimizationVars
 from ._fd_formulas import _FD_Formulas
 from .base_approximator import GradientApproximatorBase


torchzero/modules/gradient_approximation/newton_fdm.py

@@ -121,16 +121,16 @@ class NewtonFDM(OptimizerModule):
         self.tol = tol

     @torch.no_grad
-    def step(self,
+    def step(self, vars):
         """Returns a new ascent direction."""
-        if
-        if
+        if vars.closure is None: raise ValueError('NewtonFDM requires a closure.')
+        if vars.ascent is not None: raise ValueError('NewtonFDM got ascent direction')

         params = self.get_params()
         epsilons = self.get_group_key('eps')

         # evaluate fx0.
-        if
+        if vars.fx0 is None: vars.fx0 = vars.closure(False)

         # evaluate gradients and hessian via finite differences.
         grads = params.zeros_like()
@@ -152,7 +152,7 @@ class NewtonFDM(OptimizerModule):
                     cur2 += 1
                     continue
                 _three_point_2cd_(
-                    closure =
+                    closure = vars.closure,
                     idx1 = idx1,
                     idx2 = idx2,
                     p1 = flat_param1,
@@ -161,7 +161,7 @@ class NewtonFDM(OptimizerModule):
                     hessian = hessian,
                     eps1 = eps1,
                     eps2 = eps2,
-                    fx0 =
+                    fx0 = vars.fx0,
                     i1 = cur1,
                     i2 = cur2,
                 )
@@ -181,18 +181,18 @@ class NewtonFDM(OptimizerModule):
             newton_step, success = _fallback_gd(hessian, gvec)

         # update params or pass the gradients to the child.
-
+        vars.ascent = grads.from_vec(newton_step)


         # validate if newton step decreased loss
         if self.validate:

-            params.sub_(
-            fx1 =
-            params.add_(
+            params.sub_(vars.ascent)
+            fx1 = vars.closure(False)
+            params.add_(vars.ascent)

             # if loss increases, set ascent direction to gvec times lr
-            if fx1 -
-
+            if fx1 - vars.fx0 > vars.fx0 * self.tol:
+                vars.ascent = grads.from_vec(gvec) * self.gd_lr

-        return self._update_params_or_step_with_next(
+        return self._update_params_or_step_with_next(vars, params)
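
The validation branch above accepts the finite-difference Newton step only if it does not increase the loss by more than fx0 * tol, and otherwise falls back to a plain gradient step scaled by gd_lr. Below is a standalone illustration of that accept-or-fall-back rule in plain PyTorch, independent of the package's hessian and gvec bookkeeping; the function name, default values, and the toy quadratic are made up for the example.

# Standalone sketch of the rule in the hunk above, not package code.
import torch

def validated_newton_direction(f, x, grad, hessian, fx0, gd_lr=1e-2, tol=1.0):
    d = torch.linalg.solve(hessian, grad)    # Newton direction H^{-1} g
    fx1 = f(x - d)                           # loss after taking the step
    if fx1 - fx0 > fx0 * tol:                # worsened beyond the tolerance
        d = grad * gd_lr                     # gradient-descent fallback
    return d

# usage on the quadratic f(x) = 0.5 x^T A x - b^T x, whose minimizer is A^{-1} b
A = torch.tensor([[3.0, 0.0], [0.0, 1.0]])
b = torch.tensor([1.0, 2.0])
f = lambda v: 0.5 * v @ A @ v - b @ v
x = torch.zeros(2)
d = validated_newton_direction(f, x, A @ x - b, A, f(x))
x = x - d                                    # lands exactly on A^{-1} b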

torchzero/modules/gradient_approximation/rfdm.py

@@ -4,7 +4,7 @@ import torch

 from ...utils.python_tools import _ScalarLoss
 from ...tensorlist import Distributions, TensorList
-from ...core import _ClosureType, OptimizerModule,
+from ...core import _ClosureType, OptimizerModule, OptimizationVars
 from ._fd_formulas import _FD_Formulas
 from .base_approximator import GradientApproximatorBase


torchzero/modules/line_search/armijo.py

@@ -1,7 +1,7 @@
 import torch

 from ...tensorlist import TensorList
-from ...core import
+from ...core import OptimizationVars
 from .base_ls import LineSearchBase


@@ -32,23 +32,23 @@ class ArmijoLS(LineSearchBase):
         self.max_iter = max_iter

     @torch.no_grad
-    def _find_best_lr(self,
-        if
-        ascent =
-        grad =
+    def _find_best_lr(self, vars: OptimizationVars, params: TensorList) -> float:
+        if vars.closure is None: raise RuntimeError(f"Line searches ({self.__class__.__name__}) require a closure")
+        ascent = vars.maybe_use_grad_(params)
+        grad = vars.maybe_compute_grad_(params)
         alpha = self.get_first_group_key('alpha')
-        if
+        if vars.fx0 is None: vars.fx0 = vars.closure(False)

         # loss decrease per lr=1 if function was linear
         decrease_per_lr = (grad*ascent).total_sum()

         for _ in range(self.max_iter):
-            loss = self._evaluate_lr_(alpha,
+            loss = self._evaluate_lr_(alpha, vars.closure, ascent, params)

             # expected decrease
             expected_decrease = decrease_per_lr * alpha

-            if (
+            if (vars.fx0 - loss) / expected_decrease >= self.beta:
                 return alpha

             alpha *= self.mul
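
The acceptance test above is the classic Armijo sufficient-decrease condition: with decrease_per_lr = <grad, ascent>, a step size alpha is accepted once (fx0 - loss) / (decrease_per_lr * alpha) >= beta, i.e. the realised decrease is at least beta times what the linear model predicts, and otherwise alpha is multiplied by mul and retried. A standalone backtracking sketch of the same rule follows (plain PyTorch; the function name and parameter defaults are chosen for the example, not taken from the package).

# Standalone sketch of the Armijo backtracking rule used by ArmijoLS above.
import torch

def armijo_backtrack(f, x, direction, grad, alpha=1.0, beta=1e-4, mul=0.5, max_iter=20):
    fx0 = f(x)
    decrease_per_lr = torch.dot(grad, direction)     # predicted decrease for lr=1 if f were linear
    for _ in range(max_iter):
        loss = f(x - alpha * direction)              # candidate step along minus the direction
        if (fx0 - loss) / (decrease_per_lr * alpha) >= beta:
            return alpha                             # sufficient decrease reached
        alpha *= mul                                 # shrink and retry
    return alpha

# usage: f(x) = ||x||^2, descending along its gradient
x = torch.tensor([3.0, -4.0])
g = 2 * x
lr = armijo_backtrack(lambda v: (v ** 2).sum(), x, g, g)    # accepts lr = 0.5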

torchzero/modules/line_search/base_ls.py

@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
 import torch

 from ...tensorlist import TensorList
-from ...core import _ClosureType,
+from ...core import _ClosureType, OptimizationVars, OptimizerModule, _maybe_pass_backward
 from ...utils.python_tools import _ScalarLoss


@@ -108,20 +108,20 @@ class LineSearchBase(OptimizerModule, ABC):
         if isinstance(v, torch.Tensor): return v.detach().cpu().item()
         return float(v)

-    def _find_best_lr(self,
+    def _find_best_lr(self, vars: OptimizationVars, params: TensorList) -> float:
         """This should return the best lr."""
         ... # pylint:disable=unnecessary-ellipsis

     @torch.no_grad
-    def step(self,
+    def step(self, vars: OptimizationVars):
         self._reset()
         if self.log_lrs: self._lrs.append({})

         params = self.get_params()
-        ascent_direction =
+        ascent_direction = vars.maybe_use_grad_(params)

         try:
-            lr = self._find_best_lr(
+            lr = self._find_best_lr(vars, params) # pylint:disable=assignment-from-no-return
         except MaxIterReached:
             lr = self._best_lr

@@ -133,7 +133,7 @@ class LineSearchBase(OptimizerModule, ABC):
         # otherwise undo the update by setting lr to 0 and instead multiply ascent direction by lr.
         self._set_lr_(0, ascent_direction, params)
         ascent_direction.mul_(self._best_lr)
-
-        if
-        return self.next_module.step(
+        vars.ascent = ascent_direction
+        if vars.fx0_approx is None: vars.fx0_approx = self._lowest_loss
+        return self.next_module.step(vars)

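
Together these hunks define the subclass contract after the rename: a line search overrides _find_best_lr(self, vars, params) and returns a float, while the base step() applies the chosen lr. A hedged sketch of a minimal subclass against that contract is below; the absolute import paths, the base-class constructor behaviour, and the ThreeCandidateLS class name are assumptions, and _evaluate_lr_ is used exactly the way the armijo.py and grid_ls.py hunks use it.

# Hedged sketch, not package code: a hypothetical line search over three fixed
# candidate step sizes, written against the 0.1.5 _find_best_lr(vars, params)
# contract shown above. Import paths and base-class constructor details are
# assumptions; only calls visible in the diff hunks are relied on.
import torch
from torchzero.tensorlist import TensorList
from torchzero.core import OptimizationVars
from torchzero.modules.line_search.base_ls import LineSearchBase

class ThreeCandidateLS(LineSearchBase):
    @torch.no_grad
    def _find_best_lr(self, vars: OptimizationVars, params: TensorList) -> float:
        if vars.closure is None: raise ValueError("line searches require a closure")
        ascent = vars.maybe_use_grad_(params)        # the gradient becomes the ascent direction if none is set
        best_lr, best_loss = 0.0, float('inf')
        for lr in (0.01, 0.1, 1.0):
            loss = self._evaluate_lr_(lr, vars.closure, ascent, params)
            if loss < best_loss:
                best_lr, best_loss = lr, loss
        return best_lr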

torchzero/modules/line_search/directional_newton.py

@@ -2,7 +2,7 @@ import numpy as np
 import torch

 from ...tensorlist import TensorList
-from ...core import
+from ...core import OptimizationVars
 from .base_ls import LineSearchBase

 _FloatOrTensor = float | torch.Tensor
@@ -57,14 +57,14 @@ class DirectionalNewton(LineSearchBase):
         self.validate_step = validate_step

     @torch.no_grad
-    def _find_best_lr(self,
-        if
-        closure =
+    def _find_best_lr(self, vars: OptimizationVars, params: TensorList) -> float:
+        if vars.closure is None: raise ValueError('QuardaticLS requires closure')
+        closure = vars.closure

         params = self.get_params()
-        grad =
-        ascent =
-        if
+        grad = vars.maybe_compute_grad_(params)
+        ascent = vars.maybe_use_grad_(params)
+        if vars.fx0 is None: vars.fx0 = vars.closure(False) # at this stage maybe_compute_grad could've evaluated fx0

         alpha: float = self.get_first_group_key('alpha') # this doesn't support variable lrs but we still want to support schedulers

@@ -78,7 +78,7 @@ class DirectionalNewton(LineSearchBase):
         if y1_prime != 0:
             xmin, a = _fit_and_minimize_quadratic_2points_grad(
                 x1=0,
-                y1=
+                y1=vars.fx0,
                 y1_prime=-y1_prime,
                 x2=alpha,
                 # we stepped in the direction of minus gradient times lr.
@@ -172,14 +172,14 @@ class DirectionalNewton3Points(LineSearchBase):
         self.validate_step = validate_step

     @torch.no_grad
-    def _find_best_lr(self,
-        if
-        closure =
-        ascent_direction =
+    def _find_best_lr(self, vars: OptimizationVars, params: TensorList) -> float:
+        if vars.closure is None: raise ValueError('QuardaticLS requires closure')
+        closure = vars.closure
+        ascent_direction = vars.ascent
         if ascent_direction is None: raise ValueError('Ascent direction is None')
         alpha: float = self.get_first_group_key('alpha')

-        if
+        if vars.fx0 is None: vars.fx0 = vars.closure(False)
         params = self.get_params()

         # make a step in the direction and evaluate f(x2)
@@ -190,7 +190,7 @@ class DirectionalNewton3Points(LineSearchBase):

         # if gradients weren't 0
         xmin, a = _newton_step_3points(
-            0,
+            0, vars.fx0,
             # we stepped in the direction of minus ascent_direction.
             alpha, y2,
             alpha * 2, y3
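
DirectionalNewton3Points samples the loss at step sizes 0, alpha and 2 * alpha along minus the ascent direction and jumps to the vertex of the interpolating parabola; the package's _newton_step_3points helper is assumed to compute something equivalent, since its body is not shown in the diff. The closed form of that vertex is easy to state on its own:

# Standalone helper (not package code): vertex of the parabola through
# (0, f0), (a, f1), (2a, f2); mirrors the three-point model described above.
def quadratic_vertex_3points(a, f0, f1, f2):
    curvature = f2 - 2 * f1 + f0              # proportional to the second derivative
    if curvature == 0:
        return None                            # the three samples are collinear
    return a * (3 * f0 - 4 * f1 + f2) / (2 * curvature)

# example: f(t) = (t - 1)^2 sampled at t = 0, 1, 2 recovers the minimizer t = 1
assert quadratic_vertex_3points(1.0, 1.0, 0.0, 1.0) == 1.0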

torchzero/modules/line_search/grid_ls.py

@@ -5,7 +5,7 @@ import numpy as np
 import torch

 from ...tensorlist import TensorList
-from ...core import _ClosureType,
+from ...core import _ClosureType, OptimizationVars
 from .base_ls import LineSearchBase

 class GridLS(LineSearchBase):
@@ -34,16 +34,16 @@ class GridLS(LineSearchBase):
         self.stop_on_worsened = stop_on_worsened

     @torch.no_grad
-    def _find_best_lr(self,
-        if
-        if
+    def _find_best_lr(self, vars: OptimizationVars, params: TensorList) -> float:
+        if vars.closure is None: raise ValueError("closure is not set")
+        if vars.ascent is None: raise ValueError("ascent_direction is not set")

         if self.stop_on_improvement:
-            if
-            self._lowest_loss =
+            if vars.fx0 is None: vars.fx0 = vars.closure(False)
+            self._lowest_loss = vars.fx0

         for lr in self.lrs:
-            loss = self._evaluate_lr_(float(lr),
+            loss = self._evaluate_lr_(float(lr), vars.closure, vars.ascent, params)

             # if worsened
             if self.stop_on_worsened and loss != self._lowest_loss:
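
The scan in GridLS is a plain best-of-grid search; stripped of the module plumbing, its stop_on_worsened behaviour reduces to the loop below (a standalone sketch with made-up names, not the package's code, and without the stop_on_improvement branch).

# Standalone sketch of the GridLS scan: try each lr, keep the best, optionally
# stop as soon as a candidate is worse than the best seen so far.
def grid_line_search(evaluate_lr, lrs, stop_on_worsened=True):
    best_lr, best_loss = 0.0, float('inf')
    for lr in lrs:
        loss = evaluate_lr(lr)
        if loss < best_loss:
            best_lr, best_loss = lr, loss
        elif stop_on_worsened:
            break
    return best_lr

# usage: the 1-D quadratic (lr - 0.3)^2 is minimised at lr = 0.3
best = grid_line_search(lambda lr: (lr - 0.3) ** 2, [0.1, 0.2, 0.3, 0.4, 0.5])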

torchzero/modules/line_search/scipy_minimize_scalar.py

@@ -7,7 +7,7 @@ except ModuleNotFoundError:
     scopt = typing.cast(typing.Any, None)

 from ...tensorlist import TensorList
-from ...core import
+from ...core import OptimizationVars

 from .base_ls import LineSearchBase, MaxIterReached

@@ -45,11 +45,11 @@ class ScipyMinimizeScalarLS(LineSearchBase):
         self.options = options

     @torch.no_grad
-    def _find_best_lr(self,
+    def _find_best_lr(self, vars: OptimizationVars, params: TensorList) -> float:
         try:
             res = scopt.minimize_scalar(
                 self._evaluate_lr_ensure_float,
-                args = (
+                args = (vars.closure, vars.ascent, params),
                 method = self.method,
                 tol = self.tol,
                 bracket = self.bracket,
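
ScipyMinimizeScalarLS delegates the step-size search to scipy.optimize.minimize_scalar, treating lr -> loss(params - lr * ascent) as a scalar objective via its _evaluate_lr_ensure_float helper. A standalone sketch of that delegation, with a toy NumPy objective in place of the module's helper (all names below are made up for the example):

# Standalone sketch (NumPy + SciPy, not package code): pick a step size for a
# fixed descent direction by scalar minimisation, as the module above does.
import numpy as np
from scipy.optimize import minimize_scalar

x = np.array([3.0, -4.0])
direction = 2 * x                                  # gradient of f(x) = ||x||^2
objective = lambda lr: float(np.sum((x - lr * direction) ** 2))
res = minimize_scalar(objective, method='brent', tol=1e-6)
x = x - res.x * direction                          # res.x is the chosen learning rate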

torchzero/modules/meta/alternate.py

@@ -40,7 +40,7 @@ class Alternate(OptimizerModule):
         if len(self.mode) != len(self.children):
             raise ValueError(f"got {len(self.children)} modules but {len(mode)} repeats, they should be the same")

-    def step(self,
+    def step(self, vars):
         if self.mode == 'random':
             module = self.random.choice(list(self.children.values()))

@@ -58,8 +58,8 @@ class Alternate(OptimizerModule):
         self.remaining -= 1

         if self.next_module is None:
-            return module.step(
+            return module.step(vars)

-
-        return self._update_params_or_step_with_next(
+        vars.ascent = module.return_ascent(vars)
+        return self._update_params_or_step_with_next(vars)
