torchzero 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- torchzero/core/__init__.py +1 -1
- torchzero/core/module.py +72 -49
- torchzero/core/tensorlist_optimizer.py +1 -1
- torchzero/modules/adaptive/adaptive.py +11 -11
- torchzero/modules/experimental/experimental.py +41 -41
- torchzero/modules/experimental/quad_interp.py +8 -8
- torchzero/modules/experimental/subspace.py +37 -37
- torchzero/modules/gradient_approximation/base_approximator.py +19 -24
- torchzero/modules/gradient_approximation/fdm.py +1 -1
- torchzero/modules/gradient_approximation/newton_fdm.py +13 -13
- torchzero/modules/gradient_approximation/rfdm.py +1 -1
- torchzero/modules/line_search/armijo.py +8 -8
- torchzero/modules/line_search/base_ls.py +8 -8
- torchzero/modules/line_search/directional_newton.py +14 -14
- torchzero/modules/line_search/grid_ls.py +7 -7
- torchzero/modules/line_search/scipy_minimize_scalar.py +3 -3
- torchzero/modules/meta/alternate.py +4 -4
- torchzero/modules/meta/grafting.py +23 -23
- torchzero/modules/meta/optimizer_wrapper.py +14 -14
- torchzero/modules/meta/return_overrides.py +8 -8
- torchzero/modules/misc/accumulate.py +6 -6
- torchzero/modules/misc/basic.py +16 -16
- torchzero/modules/misc/lr.py +2 -2
- torchzero/modules/misc/multistep.py +7 -7
- torchzero/modules/misc/on_increase.py +9 -9
- torchzero/modules/momentum/momentum.py +4 -4
- torchzero/modules/operations/multi.py +44 -44
- torchzero/modules/operations/reduction.py +28 -28
- torchzero/modules/operations/singular.py +9 -9
- torchzero/modules/optimizers/adagrad.py +1 -1
- torchzero/modules/optimizers/adam.py +8 -8
- torchzero/modules/optimizers/lion.py +1 -1
- torchzero/modules/optimizers/rmsprop.py +1 -1
- torchzero/modules/optimizers/rprop.py +1 -1
- torchzero/modules/optimizers/sgd.py +2 -2
- torchzero/modules/orthogonalization/newtonschulz.py +3 -3
- torchzero/modules/orthogonalization/svd.py +1 -1
- torchzero/modules/regularization/dropout.py +1 -1
- torchzero/modules/regularization/noise.py +3 -3
- torchzero/modules/regularization/normalization.py +5 -5
- torchzero/modules/regularization/ortho_grad.py +1 -1
- torchzero/modules/regularization/weight_decay.py +1 -1
- torchzero/modules/scheduling/lr_schedulers.py +2 -2
- torchzero/modules/scheduling/step_size.py +8 -8
- torchzero/modules/second_order/newton.py +12 -12
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/gaussian_smoothing.py +7 -7
- torchzero/modules/smoothing/laplacian_smoothing.py +1 -1
- torchzero/modules/weight_averaging/ema.py +3 -3
- torchzero/modules/weight_averaging/swa.py +8 -8
- torchzero/optim/first_order/forward_gradient.py +1 -1
- torchzero/optim/modular.py +4 -4
- torchzero/tensorlist.py +8 -1
- {torchzero-0.1.3.dist-info → torchzero-0.1.5.dist-info}/METADATA +1 -1
- torchzero-0.1.5.dist-info/RECORD +104 -0
- torchzero-0.1.3.dist-info/RECORD +0 -104
- {torchzero-0.1.3.dist-info → torchzero-0.1.5.dist-info}/LICENSE +0 -0
- {torchzero-0.1.3.dist-info → torchzero-0.1.5.dist-info}/WHEEL +0 -0
- {torchzero-0.1.3.dist-info → torchzero-0.1.5.dist-info}/top_level.txt +0 -0
torchzero/modules/operations/multi.py

@@ -16,11 +16,11 @@ class Add(OptimizerModule):
         self.value = value

     @torch.no_grad()
-    def _update(self,
+    def _update(self, vars, ascent):
         if isinstance(self.value, (int, float)):
             return ascent.add_(self.value)

-        state_copy =
+        state_copy = vars.copy(clone_ascent = True)
         v = self.children['value'].return_ascent(state_copy)
         return ascent.add_(v)

@@ -36,11 +36,11 @@ class Sub(OptimizerModule):
         self.subtrahend = subtrahend

     @torch.no_grad()
-    def _update(self,
+    def _update(self, vars, ascent):
         if isinstance(self.subtrahend, (int, float)):
             return ascent.sub_(self.subtrahend)

-        state_copy =
+        state_copy = vars.copy(clone_ascent = True)
         subtrahend = self.children['subtrahend'].return_ascent(state_copy)
         return ascent.sub_(subtrahend)

@@ -55,11 +55,11 @@ class RSub(OptimizerModule):
         self.minuend = minuend

     @torch.no_grad()
-    def _update(self,
+    def _update(self, vars, ascent):
         if isinstance(self.minuend, (int, float)):
             return ascent.sub_(self.minuend).neg_()

-        state_copy =
+        state_copy = vars.copy(clone_ascent = True)
         minuend = self.children['minuend'].return_ascent(state_copy)
         return ascent.sub_(minuend).neg_()

@@ -75,14 +75,14 @@ class Subtract(OptimizerModule):
         self._set_child_('subtrahend', subtrahend)

     @torch.no_grad
-    def step(self,
-        state_copy =
+    def step(self, vars):
+        state_copy = vars.copy(clone_ascent = True)
         minuend = self.children['minuend'].return_ascent(state_copy)
-
-        subtrahend = self.children['subtrahend'].return_ascent(
+        vars.update_attrs_(state_copy)
+        subtrahend = self.children['subtrahend'].return_ascent(vars)

-
-        return self._update_params_or_step_with_next(
+        vars.ascent = minuend.sub_(subtrahend)
+        return self._update_params_or_step_with_next(vars)

 class Mul(OptimizerModule):
     """multiplies update by `value`. `value` can be a scalar, an OptimizerModule or sequence of OptimizerModules"""

@@ -95,11 +95,11 @@ class Mul(OptimizerModule):
         self.value = value

     @torch.no_grad()
-    def _update(self,
+    def _update(self, vars, ascent):
         if isinstance(self.value, (int, float)):
             return ascent.mul_(self.value)

-        state_copy =
+        state_copy = vars.copy(clone_ascent = True)
         v = self.children['value'].return_ascent(state_copy)
         return ascent.mul_(v)

@@ -115,11 +115,11 @@ class Div(OptimizerModule):
         self.denominator = denominator

     @torch.no_grad()
-    def _update(self,
+    def _update(self, vars, ascent):
         if isinstance(self.denominator, (int, float)):
             return ascent.div_(self.denominator)

-        state_copy =
+        state_copy = vars.copy(clone_ascent = True)
         denominator = self.children['denominator'].return_ascent(state_copy)
         return ascent.div_(denominator)

@@ -134,11 +134,11 @@ class RDiv(OptimizerModule):
         self.numerator = numerator

     @torch.no_grad()
-    def _update(self,
+    def _update(self, vars, ascent):
         if isinstance(self.numerator, (int, float)):
             return ascent.reciprocal_().mul_(self.numerator)

-        state_copy =
+        state_copy = vars.copy(clone_ascent = True)
         numerator = self.children['numerator'].return_ascent(state_copy)
         return ascent.reciprocal_().mul_(numerator)

@@ -154,14 +154,14 @@ class Divide(OptimizerModule):
         self._set_child_('denominator', denominator)

     @torch.no_grad
-    def step(self,
-        state_copy =
+    def step(self, vars):
+        state_copy = vars.copy(clone_ascent = True)
         numerator = self.children['numerator'].return_ascent(state_copy)
-
-        denominator = self.children['denominator'].return_ascent(
+        vars.update_attrs_(state_copy)
+        denominator = self.children['denominator'].return_ascent(vars)

-
-        return self._update_params_or_step_with_next(
+        vars.ascent = numerator.div_(denominator)
+        return self._update_params_or_step_with_next(vars)


 class Pow(OptimizerModule):

@@ -175,11 +175,11 @@ class Pow(OptimizerModule):
         self.power = power

     @torch.no_grad()
-    def _update(self,
+    def _update(self, vars, ascent):
         if isinstance(self.power, (int, float)):
             return ascent.pow_(self.power)

-        state_copy =
+        state_copy = vars.copy(clone_ascent = True)
         power = self.children['power'].return_ascent(state_copy)
         return ascent.pow_(power)

@@ -194,11 +194,11 @@ class RPow(OptimizerModule):
         self.base = base

     @torch.no_grad()
-    def _update(self,
+    def _update(self, vars, ascent):
         if isinstance(self.base, (int, float)):
             return self.base ** ascent

-        state_copy =
+        state_copy = vars.copy(clone_ascent = True)
         base = self.children['base'].return_ascent(state_copy)
         return base.pow_(ascent)

@@ -214,14 +214,14 @@ class Power(OptimizerModule):
         self._set_child_('power', power)

     @torch.no_grad
-    def step(self,
-        state_copy =
+    def step(self, vars):
+        state_copy = vars.copy(clone_ascent = True)
         base = self.children['base'].return_ascent(state_copy)
-
-        power = self.children['power'].return_ascent(
+        vars.update_attrs_(state_copy)
+        power = self.children['power'].return_ascent(vars)

-
-        return self._update_params_or_step_with_next(
+        vars.ascent = base.pow_(power)
+        return self._update_params_or_step_with_next(vars)


 class Lerp(OptimizerModule):

@@ -235,9 +235,9 @@ class Lerp(OptimizerModule):
         self.weight = weight

     @torch.no_grad()
-    def _update(self,
+    def _update(self, vars, ascent):

-        state_copy =
+        state_copy = vars.copy(clone_ascent = True)
         end = self.children['end'].return_ascent(state_copy)
         return ascent.lerp_(end, self.weight)

@@ -259,15 +259,15 @@ class Interpolate(OptimizerModule):
         self.weight = weight

     @torch.no_grad
-    def step(self,
-        state_copy =
+    def step(self, vars):
+        state_copy = vars.copy(clone_ascent = True)
         input = self.children['input'].return_ascent(state_copy)
-
-        end = self.children['end'].return_ascent(
+        vars.update_attrs_(state_copy)
+        end = self.children['end'].return_ascent(vars)

-
+        vars.ascent = input.lerp_(end, weight = self.weight)

-        return self._update_params_or_step_with_next(
+        return self._update_params_or_step_with_next(vars)

 class AddMagnitude(OptimizerModule):
     """Add `value` multiplied by sign of the ascent, i.e. this adds `value` to the magnitude of the update.

@@ -288,11 +288,11 @@ class AddMagnitude(OptimizerModule):
         self.add_to_zero = add_to_zero

     @torch.no_grad()
-    def _update(self,
+    def _update(self, vars, ascent):
         if isinstance(self.value, (int, float)):
             if self.add_to_zero: return ascent.add_(ascent.clamp_magnitude(min=1).sign_().mul_(self.value))
             return ascent.add_(ascent.sign_().mul_(self.value))

-        state_copy =
+        state_copy = vars.copy(clone_ascent = True)
         v = self.children['value'].return_ascent(state_copy)
         return ascent.add_(v.abs_().mul_(ascent.sign()))
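These hunks all apply the same mechanical change: `_update` and `step` now receive a `vars` object, scalar operands are still applied to `ascent` in place, and module operands are evaluated through `vars.copy(clone_ascent = True)` plus `return_ascent`, with the combined result written back to `vars.ascent` before calling `_update_params_or_step_with_next(vars)`. The following is a minimal sketch of a custom module written against this 0.1.5-style interface; the import path and the exact semantics of `vars` are assumptions based only on the added lines above, and `WeightedSum` itself is hypothetical, not part of torchzero.

```python
# Hypothetical example that follows the pattern of the added lines above.
# Assumes `OptimizerModule` is importable as shown and that `vars` exposes
# copy(clone_ascent=...), update_attrs_(), ascent, as the diff suggests.
import torch
from torchzero.core.module import OptimizerModule  # assumed import path


class WeightedSum(OptimizerModule):
    """Hypothetical module: ascent = a * first_child + b * second_child."""

    def __init__(self, first, second, a=1.0, b=1.0):
        super().__init__({})
        self.a, self.b = a, b
        self._set_child_('first', first)
        self._set_child_('second', second)

    @torch.no_grad()
    def step(self, vars):
        # evaluate the first child on a copy so the shared vars are not consumed
        state_copy = vars.copy(clone_ascent=True)
        first = self.children['first'].return_ascent(state_copy)

        # merge whatever the first child computed (loss, grads) back into vars,
        # then evaluate the second child on vars itself
        vars.update_attrs_(state_copy)
        second = self.children['second'].return_ascent(vars)

        # write the combined ascent back and let the next module (or the
        # parameter update) consume it
        vars.ascent = first.mul_(self.a).add_(second.mul_(self.b))
        return self._update_params_or_step_with_next(vars)
```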
torchzero/modules/operations/reduction.py

@@ -26,26 +26,26 @@ class Sum(OptimizerModule):
             self._set_child_(i, module)

     @torch.no_grad
-    def step(self,
+    def step(self, vars):
         if len(self.children) == 1:
-
-            if self.scalar is not None:
-            return self._update_params_or_step_with_next(
+            vars.ascent = self.children[0].return_ascent(vars)
+            if self.scalar is not None: vars.ascent += self.scalar
+            return self._update_params_or_step_with_next(vars)

         sum = None
         for i, c in sorted(self.children.items(), key=lambda x: x[0]):
-            if i == len(self.children) - 1: cur_state =
-            else: cur_state =
+            if i == len(self.children) - 1: cur_state = vars
+            else: cur_state = vars.copy(clone_ascent = True)

             if sum is None: sum = c.return_ascent(cur_state)
             else: sum += c.return_ascent(cur_state)

-            if i != len(self.children) - 1:
+            if i != len(self.children) - 1: vars.update_attrs_(cur_state)

         assert sum is not None
         if self.scalar is not None: sum += self.scalar
-
-        return self._update_params_or_step_with_next(
+        vars.ascent = sum
+        return self._update_params_or_step_with_next(vars)

 class Mean(OptimizerModule):
     """calculates mean of multiple updates.

@@ -69,28 +69,28 @@ class Mean(OptimizerModule):
             self._set_child_(i, module)

     @torch.no_grad
-    def step(self,
+    def step(self, vars):
         if len(self.children) == 1:
-
-            if self.scalar is not None:
-            if self.n_values > 1:
-            return self._update_params_or_step_with_next(
+            vars.ascent = self.children[0].return_ascent(vars)
+            if self.scalar is not None: vars.ascent += self.scalar
+            if self.n_values > 1: vars.ascent /= self.n_values
+            return self._update_params_or_step_with_next(vars)

         sum = None
         for i, c in sorted(self.children.items(), key=lambda x: x[0]):
-            if i == len(self.children) - 1: cur_state =
-            else: cur_state =
+            if i == len(self.children) - 1: cur_state = vars
+            else: cur_state = vars.copy(clone_ascent = True)

             if sum is None: sum = c.return_ascent(cur_state)
             else: sum += c.return_ascent(cur_state)

-            if i != len(self.children) - 1:
+            if i != len(self.children) - 1: vars.update_attrs_(cur_state)

         assert sum is not None
         if self.scalar is not None: sum += self.scalar
         if self.n_values > 1: sum /= self.n_values
-
-        return self._update_params_or_step_with_next(
+        vars.ascent = sum
+        return self._update_params_or_step_with_next(vars)

 class Product(OptimizerModule):
     """calculates product of multiple updates.

@@ -112,23 +112,23 @@ class Product(OptimizerModule):
             self._set_child_(i, module)

     @torch.no_grad
-    def step(self,
+    def step(self, vars):
         if len(self.children) == 1:
-
-            if self.scalar is not None:
-            return self._update_params_or_step_with_next(
+            vars.ascent = self.children[0].return_ascent(vars)
+            if self.scalar is not None: vars.ascent *= self.scalar
+            return self._update_params_or_step_with_next(vars)

         prod = None
         for i, c in sorted(self.children.items(), key=lambda x: x[0]):
-            if i == len(self.children) - 1: cur_state =
-            else: cur_state =
+            if i == len(self.children) - 1: cur_state = vars
+            else: cur_state = vars.copy(clone_ascent = True)

             if prod is None: prod = c.return_ascent(cur_state)
             else: prod *= c.return_ascent(cur_state)

-            if i != len(self.children) - 1:
+            if i != len(self.children) - 1: vars.update_attrs_(cur_state)

         assert prod is not None
         if self.scalar is not None: prod *= self.scalar
-
-        return self._update_params_or_step_with_next(
+        vars.ascent = prod
+        return self._update_params_or_step_with_next(vars)
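The Sum, Mean, and Product hunks share one aggregation loop: only the last child is stepped on `vars` itself, every earlier child gets a `vars.copy(clone_ascent = True)`, and whatever those copies compute is folded back through `vars.update_attrs_(cur_state)`. Below is a hedged sketch of that loop in isolation; `aggregate_ascents` and its `combine` argument are hypothetical helpers, and the `vars`/children behavior is assumed only from the added lines above.

```python
# Sketch of the shared aggregation loop used by Sum/Mean/Product in 0.1.5.
# `module.children` is assumed to map integer keys to child OptimizerModules,
# exactly as the added lines above imply; nothing here is a public API guarantee.
def aggregate_ascents(module, vars, combine):
    total = None
    last = len(module.children) - 1
    for i, child in sorted(module.children.items(), key=lambda x: x[0]):
        # the last child consumes `vars` directly; earlier children work on copies
        cur_state = vars if i == last else vars.copy(clone_ascent=True)

        asc = child.return_ascent(cur_state)
        total = asc if total is None else combine(total, asc)

        # fold loss/gradient info computed on a copy back into the shared vars
        if i != last:
            vars.update_attrs_(cur_state)

    vars.ascent = total
    return module._update_params_or_step_with_next(vars)
```

With `combine=lambda a, b: a + b` this mirrors the Sum path; Mean additionally divides by `n_values`, and Product multiplies instead of adding.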
torchzero/modules/operations/singular.py

@@ -18,7 +18,7 @@ class Operation(OptimizerModule):
         self.operation = methodcaller(f'{operation}_')

     @torch.no_grad
-    def _update(self,
+    def _update(self, vars, ascent): return self.operation(ascent)

 class Reciprocal(OptimizerModule):
     """*1 / update*"""

@@ -26,7 +26,7 @@ class Reciprocal(OptimizerModule):
         super().__init__({})

     @torch.no_grad()
-    def _update(self,
+    def _update(self, vars, ascent): return ascent.reciprocal_()

 class Negate(OptimizerModule):
     """minus update"""

@@ -34,7 +34,7 @@ class Negate(OptimizerModule):
         super().__init__({})

     @torch.no_grad()
-    def _update(self,
+    def _update(self, vars, ascent): return ascent.neg_()


 def sign_grad_(params: Iterable[torch.Tensor]):

@@ -51,7 +51,7 @@ class Sign(OptimizerModule):
         super().__init__({})

     @torch.no_grad
-    def _update(self,
+    def _update(self, vars, ascent): return ascent.sign_()

 class Abs(OptimizerModule):
     """takes absolute values of the update."""

@@ -59,7 +59,7 @@ class Abs(OptimizerModule):
         super().__init__({})

     @torch.no_grad
-    def _update(self,
+    def _update(self, vars, ascent): return ascent.abs_()

 class Sin(OptimizerModule):
     """applies sin function to the ascent"""

@@ -67,7 +67,7 @@ class Sin(OptimizerModule):
         super().__init__({})

     @torch.no_grad
-    def _update(self,
+    def _update(self, vars, ascent): return ascent.sin_()

 class Cos(OptimizerModule):
     """applies cos function to the ascent"""

@@ -75,7 +75,7 @@ class Cos(OptimizerModule):
         super().__init__({})

     @torch.no_grad
-    def _update(self,
+    def _update(self, vars, ascent): return ascent.cos_()


 class NanToNum(OptimizerModule):

@@ -97,7 +97,7 @@ class NanToNum(OptimizerModule):
         self.neginf = neginf

     @torch.no_grad()
-    def _update(self,
+    def _update(self, vars, ascent): return ascent.nan_to_num_(self.nan, self.posinf, self.neginf)


 class MagnitudePower(OptimizerModule):

@@ -107,7 +107,7 @@ class MagnitudePower(OptimizerModule):
         self.value = value

     @torch.no_grad()
-    def _update(self,
+    def _update(self, vars, ascent):
         if self.value % 2 == 1: return ascent.pow_(self.value)
         return ascent.abs().pow_(self.value) * ascent.sign()
torchzero/modules/optimizers/adagrad.py

@@ -31,7 +31,7 @@ class Adagrad(OptimizerModule):
         self.cur_step = 0

     @torch.no_grad
-    def _update(self,
+    def _update(self, vars, ascent):
         settings = self.get_all_group_keys()
         if self.cur_step == 0: init = ascent.full_like(settings['initial_accumulator_value'])
         else: init = None

torchzero/modules/optimizers/adam.py

@@ -48,7 +48,7 @@ class Adam(OptimizerModule):
         self.amsgrad = amsgrad

     @torch.no_grad
-    def step(self,
+    def step(self, vars):
         # Adam step is a bit differet from other optimizer steps
         # due to how common it is, I implemented two additional optimizations,

@@ -85,14 +85,14 @@ class Adam(OptimizerModule):
         alpha = settings['alpha']

         # get params if ascent is None so we need params to access their gradient as initial ascent
-        if
+        if vars.ascent is None:
             if params is None: pg = self.get_params()
             else: pg = params
         else:
             pg = None

         ret = _adam_step(
-            ascent=
+            ascent=vars.maybe_use_grad_(pg),
             exp_avg = exp_avg,
             exp_avg_sq = exp_avg_sq,
             alpha = alpha,

@@ -107,12 +107,12 @@ class Adam(OptimizerModule):
         self.cur_step += 1
         if params is None:
             assert ret is not None
-
-            return self._update_params_or_step_with_next(
+            vars.ascent = ret
+            return self._update_params_or_step_with_next(vars)

         # next module is either None or LR
-        if self.next_module is None: return
+        if self.next_module is None: return vars.get_loss()

         # step with LR, which has _skip = True so it won't apply lr, but may step with the scheduler
-        self.next_module._update(
-        return
+        self.next_module._update(vars, None) # type:ignore
+        return vars.get_loss()

torchzero/modules/optimizers/lion.py

@@ -22,7 +22,7 @@ class Lion(OptimizerModule):
         super().__init__(defaults)

     @torch.no_grad
-    def _update(self,
+    def _update(self, vars, ascent):
         beta1, beta2 = self.get_group_keys('beta1', 'beta2')
         ema = self.get_state_key('ema')
         return _lion_step_(ascent,ema,beta1,beta2)

torchzero/modules/optimizers/rmsprop.py

@@ -40,7 +40,7 @@ class RMSProp(OptimizerModule):
         self.centered = centered

     @torch.no_grad
-    def _update(self,
+    def _update(self, vars, ascent):
         settings = self.get_all_group_keys()
         if self.centered:
             mean, mean_sqr = self.get_state_keys('mean', 'mean_sqr')

torchzero/modules/optimizers/sgd.py

@@ -15,7 +15,7 @@ class SGD(OptimizerModule):
         weight_decay (float, optional): weight decay (L2 regularization). Defaults to 0.
         nesterov (bool, optional):
             enables nesterov momentum, otherwise uses heavyball momentum. Defaults to False.
-        alpha (float, optional): learning rate. Defaults to
+        alpha (float, optional): learning rate. Defaults to 1.
     """
     def __init__(
         self,

@@ -32,7 +32,7 @@ class SGD(OptimizerModule):
         self.current_step = 0

     @torch.no_grad
-    def _update(self,
+    def _update(self, vars, ascent):
         params = self.get_params()
         settings = self.get_all_group_keys()

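For stateful optimizer modules the change is the same signature swap: per-parameter buffers still come from `get_state_key` / `get_group_key`, and `_update` simply returns the transformed ascent (the Lion and RMSProp hunks above follow this shape), while `step`-level modules like Adam additionally use `vars.maybe_use_grad_` and `vars.get_loss()`. Below is a hedged sketch of a heavy-ball style module under the 0.1.5 signature; `HeavyBall` is illustrative and not part of the package, and the zero-initialization of `get_state_key` plus TensorList `clone()` are assumptions inferred from the hunks, not documented behavior.

```python
# Hypothetical stateful module in the 0.1.5 `_update(self, vars, ascent)` style.
# `get_state_key` / `get_group_key` usage mirrors the Lion and RMSProp hunks above.
import torch
from torchzero.core.module import OptimizerModule  # assumed import path


class HeavyBall(OptimizerModule):
    """Hypothetical heavy-ball momentum: buf = momentum * buf + ascent."""

    def __init__(self, momentum: float = 0.9):
        super().__init__(dict(momentum=momentum))

    @torch.no_grad()
    def _update(self, vars, ascent):
        momentum = self.get_group_key('momentum')
        # per-parameter state, assumed to start at zeros as the Lion hunk suggests
        buf = self.get_state_key('momentum_buffer')
        buf.mul_(momentum).add_(ascent)
        # return a clone so later modules don't mutate the stored buffer
        return buf.clone()
```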
torchzero/modules/orthogonalization/newtonschulz.py

@@ -116,7 +116,7 @@ class ZeropowerViaNewtonSchulz(OptimizerModule):
         if compiled: self._zeropower_via_newtonschulz5 = _compiled_zeropower_via_newtonschulz5
         else: self._zeropower_via_newtonschulz5 = _zeropower_via_newtonschulz5

-    def _update(self,
+    def _update(self, vars, ascent):
         toggle, ns_steps, adaptive = self.get_group_keys('newtonshultz', 'ns_steps', 'adaptive', cls=list)

         for asc, enable, steps, ada in zip(ascent, toggle, ns_steps, adaptive):

@@ -146,11 +146,11 @@ class DualNormCorrection(OptimizerModule):
         defaults = dict(adaptive_scale_min = adaptive_scale_min, adaptive_scale_max = adaptive_scale_max)
         super().__init__(defaults)

-    def _update(self,
+    def _update(self, vars, ascent):
         params = self.get_params()
         adaptive_scale_min, adaptive_scale_max = self.get_group_keys('adaptive_scale_min', 'adaptive_scale_max')

-        for asc, grad, min, max in zip(ascent,
+        for asc, grad, min, max in zip(ascent, vars.maybe_compute_grad_(params), adaptive_scale_min, adaptive_scale_max):
             if len([i for i in asc.shape if i > 1]) != 0:
                 scale = torch.einsum('ij,ij->', grad.view(grad.shape[0], -1), asc.view(asc.shape[0], -1))
                 if min is not None or max is not None: scale = scale.clip(min, max)

torchzero/modules/orthogonalization/svd.py

@@ -80,7 +80,7 @@ class Orthogonalize(OptimizerModule):
         super().__init__(defaults, target = target)
         self.warn_fail = warn_fail

-    def _update(self,
+    def _update(self, vars, ascent):
         toggle = self.get_group_key('orth', cls=list)
         _orthogonalize_update_(ascent, toggle, self.warn_fail)
         return ascent
torchzero/modules/regularization/noise.py

@@ -18,7 +18,7 @@ def add_noise_(
         grads += grads.sample_like(alpha, distribution)

     elif mode == 'global':
-        grads += grads.sample_like((grads.total_vector_norm(1)/grads.total_numel() * alpha).detach().cpu().item(), distribution)
+        grads += grads.sample_like((grads.total_vector_norm(1)/grads.total_numel() * alpha).detach().cpu().item(), distribution) # type:ignore

     elif mode == 'param':
         grads += grads.sample_like(grads.abs().mean()*alpha, distribution)

@@ -57,7 +57,7 @@ class AddNoise(OptimizerModule):
         self.mode: Literal["absolute", "global", "param", "channel"] = mode

     @torch.no_grad
-    def _update(self,
+    def _update(self, vars, ascent):
         alpha = self.get_group_key('alpha')

         add_noise_(ascent, alpha, self.distribution, self.mode)

@@ -72,6 +72,6 @@ class Random(OptimizerModule):
         self.distribution: Distributions = distribution

     @torch.no_grad
-    def _update(self,
+    def _update(self, vars, ascent):
         alpha = self.get_group_key('alpha')
         return ascent.sample_like(alpha, self.distribution)
torchzero/modules/regularization/normalization.py

@@ -29,7 +29,7 @@ def _normalize_grad_(
     if not isinstance(grads, TensorList): grads = TensorList(grads)
     norm = grads.total_vector_norm(ord)
     if norm > min:
-        grads /= norm / norm_value
+        grads /= norm / norm_value # type:ignore

 @torch.no_grad
 def normalize_grad_(

@@ -112,7 +112,7 @@ class Normalize(OptimizerModule):
         self.min_numel = min_numel

     @torch.no_grad
-    def _update(self,
+    def _update(self, vars, ascent):
         _normalize_grad_(
             ascent,
             norm_value = self.norm_value,

@@ -225,7 +225,7 @@ class Centralize(OptimizerModule):
         self.min_numel = min_numel

     @torch.no_grad
-    def _update(self,
+    def _update(self, vars, ascent):
         _centralize_grad_(
             ascent,
             mode = self.mode,

@@ -258,7 +258,7 @@ class ClipValue(OptimizerModule):
         super().__init__(defaults)

     @torch.no_grad
-    def _update(self,
+    def _update(self, vars, ascent):
         value = self.get_group_key('value')
         ascent.clamp_(-value, value)
         return ascent

@@ -317,7 +317,7 @@ class ClipNorm(OptimizerModule):
         self.mode: typing.Literal["global", "param", "channel"] = mode

     @torch.no_grad
-    def _update(self,
+    def _update(self, vars, ascent):
         _normalize_grad_(
             ascent,
             norm_value = self.max_norm,

torchzero/modules/regularization/ortho_grad.py

@@ -58,7 +58,7 @@ class OrthoGrad(OptimizerModule):
         self.renormalize = renormalize
         self.sqrt_scale = sqrt_scale

-    def _update(self,
+    def _update(self, vars, ascent):
         params = self.get_params()

         if self.renormalize: orig_norm = ascent.norm(2) + self.eps
torchzero/modules/scheduling/lr_schedulers.py

@@ -81,7 +81,7 @@ if TYPE_CHECKING:

     # self.id = random.random()

-    # def step(self,
+    # def step(self, vars):
     #     if self.cur % self.update_every == 0:
     #         self.scheduler_step_fn()
     #         self.cur_lr = self.dummy_opt.first_param_group['lr']

@@ -113,7 +113,7 @@ class LRWarmup(OptimizerModule):

         self.cur = 0

-    def _update(self,
+    def _update(self, vars, ascent):
         if self.cur < self.delay_steps:
             if self.start_lr != 1: ascent *= self.start_lr
