torchzero 0.3.10__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/source/conf.py +6 -4
- docs/source/docstring template.py +46 -0
- tests/test_identical.py +2 -3
- tests/test_opts.py +64 -50
- tests/test_vars.py +1 -0
- torchzero/core/module.py +138 -6
- torchzero/core/transform.py +158 -51
- torchzero/modules/__init__.py +3 -2
- torchzero/modules/clipping/clipping.py +114 -17
- torchzero/modules/clipping/ema_clipping.py +27 -13
- torchzero/modules/clipping/growth_clipping.py +8 -7
- torchzero/modules/experimental/__init__.py +22 -5
- torchzero/modules/experimental/absoap.py +5 -2
- torchzero/modules/experimental/adadam.py +8 -2
- torchzero/modules/experimental/adamY.py +8 -2
- torchzero/modules/experimental/adam_lambertw.py +149 -0
- torchzero/modules/{line_search/trust_region.py → experimental/adaptive_step_size.py} +21 -4
- torchzero/modules/experimental/adasoap.py +7 -2
- torchzero/modules/experimental/cosine.py +214 -0
- torchzero/modules/experimental/cubic_adam.py +97 -0
- torchzero/modules/{projections → experimental}/dct.py +11 -11
- torchzero/modules/experimental/eigendescent.py +4 -1
- torchzero/modules/experimental/etf.py +32 -9
- torchzero/modules/experimental/exp_adam.py +113 -0
- torchzero/modules/experimental/expanded_lbfgs.py +141 -0
- torchzero/modules/{projections → experimental}/fft.py +10 -10
- torchzero/modules/experimental/hnewton.py +85 -0
- torchzero/modules/{quasi_newton/experimental → experimental}/modular_lbfgs.py +27 -28
- torchzero/modules/experimental/newtonnewton.py +7 -3
- torchzero/modules/experimental/parabolic_search.py +220 -0
- torchzero/modules/experimental/reduce_outward_lr.py +4 -4
- torchzero/modules/{projections/structural.py → experimental/structural_projections.py} +12 -54
- torchzero/modules/experimental/subspace_preconditioners.py +11 -4
- torchzero/modules/experimental/{tada.py → tensor_adagrad.py} +10 -6
- torchzero/modules/functional.py +12 -2
- torchzero/modules/grad_approximation/fdm.py +30 -3
- torchzero/modules/grad_approximation/forward_gradient.py +13 -3
- torchzero/modules/grad_approximation/grad_approximator.py +51 -6
- torchzero/modules/grad_approximation/rfdm.py +285 -38
- torchzero/modules/higher_order/higher_order_newton.py +152 -89
- torchzero/modules/line_search/__init__.py +4 -4
- torchzero/modules/line_search/adaptive.py +99 -0
- torchzero/modules/line_search/backtracking.py +34 -9
- torchzero/modules/line_search/line_search.py +70 -12
- torchzero/modules/line_search/polynomial.py +233 -0
- torchzero/modules/line_search/scipy.py +2 -2
- torchzero/modules/line_search/strong_wolfe.py +34 -7
- torchzero/modules/misc/__init__.py +27 -0
- torchzero/modules/{ops → misc}/debug.py +24 -1
- torchzero/modules/misc/escape.py +60 -0
- torchzero/modules/misc/gradient_accumulation.py +70 -0
- torchzero/modules/misc/misc.py +316 -0
- torchzero/modules/misc/multistep.py +158 -0
- torchzero/modules/misc/regularization.py +171 -0
- torchzero/modules/{ops → misc}/split.py +29 -1
- torchzero/modules/{ops → misc}/switch.py +44 -3
- torchzero/modules/momentum/__init__.py +1 -1
- torchzero/modules/momentum/averaging.py +6 -6
- torchzero/modules/momentum/cautious.py +45 -8
- torchzero/modules/momentum/ema.py +7 -7
- torchzero/modules/momentum/experimental.py +2 -2
- torchzero/modules/momentum/matrix_momentum.py +90 -63
- torchzero/modules/momentum/momentum.py +2 -1
- torchzero/modules/ops/__init__.py +3 -31
- torchzero/modules/ops/accumulate.py +6 -10
- torchzero/modules/ops/binary.py +72 -26
- torchzero/modules/ops/multi.py +77 -16
- torchzero/modules/ops/reduce.py +15 -7
- torchzero/modules/ops/unary.py +29 -13
- torchzero/modules/ops/utility.py +20 -12
- torchzero/modules/optimizers/__init__.py +12 -3
- torchzero/modules/optimizers/adagrad.py +23 -13
- torchzero/modules/optimizers/adahessian.py +223 -0
- torchzero/modules/optimizers/adam.py +7 -6
- torchzero/modules/optimizers/adan.py +110 -0
- torchzero/modules/optimizers/adaptive_heavyball.py +57 -0
- torchzero/modules/optimizers/esgd.py +171 -0
- torchzero/modules/{experimental/spectral.py → optimizers/ladagrad.py} +91 -71
- torchzero/modules/optimizers/lion.py +1 -1
- torchzero/modules/optimizers/mars.py +91 -0
- torchzero/modules/optimizers/msam.py +186 -0
- torchzero/modules/optimizers/muon.py +30 -5
- torchzero/modules/optimizers/orthograd.py +1 -1
- torchzero/modules/optimizers/rmsprop.py +7 -4
- torchzero/modules/optimizers/rprop.py +42 -8
- torchzero/modules/optimizers/sam.py +163 -0
- torchzero/modules/optimizers/shampoo.py +39 -5
- torchzero/modules/optimizers/soap.py +29 -19
- torchzero/modules/optimizers/sophia_h.py +71 -14
- torchzero/modules/projections/__init__.py +2 -4
- torchzero/modules/projections/cast.py +51 -0
- torchzero/modules/projections/galore.py +3 -1
- torchzero/modules/projections/projection.py +188 -94
- torchzero/modules/quasi_newton/__init__.py +12 -2
- torchzero/modules/quasi_newton/cg.py +160 -59
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +163 -0
- torchzero/modules/quasi_newton/lbfgs.py +154 -97
- torchzero/modules/quasi_newton/lsr1.py +101 -57
- torchzero/modules/quasi_newton/quasi_newton.py +863 -215
- torchzero/modules/quasi_newton/trust_region.py +397 -0
- torchzero/modules/second_order/__init__.py +2 -2
- torchzero/modules/second_order/newton.py +220 -41
- torchzero/modules/second_order/newton_cg.py +300 -11
- torchzero/modules/second_order/nystrom.py +104 -1
- torchzero/modules/smoothing/gaussian.py +34 -0
- torchzero/modules/smoothing/laplacian.py +14 -4
- torchzero/modules/step_size/__init__.py +2 -0
- torchzero/modules/step_size/adaptive.py +122 -0
- torchzero/modules/step_size/lr.py +154 -0
- torchzero/modules/weight_decay/__init__.py +1 -1
- torchzero/modules/weight_decay/weight_decay.py +89 -7
- torchzero/modules/wrappers/optim_wrapper.py +29 -1
- torchzero/optim/wrappers/directsearch.py +39 -2
- torchzero/optim/wrappers/fcmaes.py +21 -13
- torchzero/optim/wrappers/mads.py +5 -6
- torchzero/optim/wrappers/nevergrad.py +16 -1
- torchzero/optim/wrappers/optuna.py +1 -1
- torchzero/optim/wrappers/scipy.py +5 -3
- torchzero/utils/__init__.py +2 -2
- torchzero/utils/derivatives.py +3 -3
- torchzero/utils/linalg/__init__.py +1 -1
- torchzero/utils/linalg/solve.py +251 -12
- torchzero/utils/numberlist.py +2 -0
- torchzero/utils/python_tools.py +10 -0
- torchzero/utils/tensorlist.py +40 -28
- {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/METADATA +65 -40
- torchzero-0.3.11.dist-info/RECORD +159 -0
- torchzero/modules/experimental/diagonal_higher_order_newton.py +0 -225
- torchzero/modules/experimental/soapy.py +0 -163
- torchzero/modules/experimental/structured_newton.py +0 -111
- torchzero/modules/lr/__init__.py +0 -2
- torchzero/modules/lr/adaptive.py +0 -93
- torchzero/modules/lr/lr.py +0 -63
- torchzero/modules/ops/misc.py +0 -418
- torchzero/modules/quasi_newton/experimental/__init__.py +0 -1
- torchzero/modules/quasi_newton/olbfgs.py +0 -196
- torchzero-0.3.10.dist-info/RECORD +0 -139
- {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/WHEEL +0 -0
- {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/licenses/LICENSE +0 -0
- {torchzero-0.3.10.dist-info → torchzero-0.3.11.dist-info}/top_level.txt +0 -0
torchzero/modules/line_search/polynomial.py
@@ -0,0 +1,233 @@
+import numpy as np
+import torch
+
+from .line_search import LineSearchBase
+
+
+# polynomial interpolation
+# this code is from https://github.com/hjmshi/PyTorch-LBFGS/blob/master/functions/LBFGS.py
+# PyTorch-LBFGS: A PyTorch Implementation of L-BFGS
+def polyinterp(points, x_min_bound=None, x_max_bound=None, plot=False):
+    """
+    Gives the minimizer and minimum of the interpolating polynomial over given points
+    based on function and derivative information. Defaults to bisection if no critical
+    points are valid.
+
+    Based on polyinterp.m Matlab function in minFunc by Mark Schmidt with some slight
+    modifications.
+
+    Implemented by: Hao-Jun Michael Shi and Dheevatsa Mudigere
+    Last edited 12/6/18.
+
+    Inputs:
+        points (nparray): two-dimensional array with each point of form [x f g]
+        x_min_bound (float): minimum value that brackets minimum (default: minimum of points)
+        x_max_bound (float): maximum value that brackets minimum (default: maximum of points)
+        plot (bool): plot interpolating polynomial
+
+    Outputs:
+        x_sol (float): minimizer of interpolating polynomial
+        F_min (float): minimum of interpolating polynomial
+
+    Note:
+        . Set f or g to np.nan if they are unknown
+
+    """
+    no_points = points.shape[0]
+    order = np.sum(1 - np.isnan(points[:, 1:3]).astype('int')) - 1
+
+    x_min = np.min(points[:, 0])
+    x_max = np.max(points[:, 0])
+
+    # compute bounds of interpolation area
+    if x_min_bound is None:
+        x_min_bound = x_min
+    if x_max_bound is None:
+        x_max_bound = x_max
+
+    # explicit formula for quadratic interpolation
+    if no_points == 2 and order == 2 and plot is False:
+        # Solution to quadratic interpolation is given by:
+        # a = -(f1 - f2 - g1(x1 - x2))/(x1 - x2)^2
+        # x_min = x1 - g1/(2a)
+        # if x1 = 0, then is given by:
+        # x_min = - (g1*x2^2)/(2(f2 - f1 - g1*x2))
+
+        if points[0, 0] == 0:
+            x_sol = -points[0, 2] * points[1, 0] ** 2 / (2 * (points[1, 1] - points[0, 1] - points[0, 2] * points[1, 0]))
+        else:
+            a = -(points[0, 1] - points[1, 1] - points[0, 2] * (points[0, 0] - points[1, 0])) / (points[0, 0] - points[1, 0]) ** 2
+            x_sol = points[0, 0] - points[0, 2]/(2*a)
+
+        x_sol = np.minimum(np.maximum(x_min_bound, x_sol), x_max_bound)
+
+    # explicit formula for cubic interpolation
+    elif no_points == 2 and order == 3 and plot is False:
+        # Solution to cubic interpolation is given by:
+        # d1 = g1 + g2 - 3((f1 - f2)/(x1 - x2))
+        # d2 = sqrt(d1^2 - g1*g2)
+        # x_min = x2 - (x2 - x1)*((g2 + d2 - d1)/(g2 - g1 + 2*d2))
+        d1 = points[0, 2] + points[1, 2] - 3 * ((points[0, 1] - points[1, 1]) / (points[0, 0] - points[1, 0]))
+        d2 = np.sqrt(d1 ** 2 - points[0, 2] * points[1, 2])
+        if np.isreal(d2):
+            x_sol = points[1, 0] - (points[1, 0] - points[0, 0]) * ((points[1, 2] + d2 - d1) / (points[1, 2] - points[0, 2] + 2 * d2))
+            x_sol = np.minimum(np.maximum(x_min_bound, x_sol), x_max_bound)
+        else:
+            x_sol = (x_max_bound + x_min_bound)/2
+
+    # solve linear system
+    else:
+        # define linear constraints
+        A = np.zeros((0, order + 1))
+        b = np.zeros((0, 1))
+
+        # add linear constraints on function values
+        for i in range(no_points):
+            if not np.isnan(points[i, 1]):
+                constraint = np.zeros((1, order + 1))
+                for j in range(order, -1, -1):
+                    constraint[0, order - j] = points[i, 0] ** j
+                A = np.append(A, constraint, 0)
+                b = np.append(b, points[i, 1])
+
+        # add linear constraints on gradient values
+        for i in range(no_points):
+            if not np.isnan(points[i, 2]):
+                constraint = np.zeros((1, order + 1))
+                for j in range(order):
+                    constraint[0, j] = (order - j) * points[i, 0] ** (order - j - 1)
+                A = np.append(A, constraint, 0)
+                b = np.append(b, points[i, 2])
+
+        # check if system is solvable
+        if A.shape[0] != A.shape[1] or np.linalg.matrix_rank(A) != A.shape[0]:
+            x_sol = (x_min_bound + x_max_bound)/2
+            f_min = np.inf
+        else:
+            # solve linear system for interpolating polynomial
+            coeff = np.linalg.solve(A, b)
+
+            # compute critical points
+            dcoeff = np.zeros(order)
+            for i in range(len(coeff) - 1):
+                dcoeff[i] = coeff[i] * (order - i)
+
+            crit_pts = np.array([x_min_bound, x_max_bound])
+            crit_pts = np.append(crit_pts, points[:, 0])
+
+            if not np.isinf(dcoeff).any():
+                roots = np.roots(dcoeff)
+                crit_pts = np.append(crit_pts, roots)
+
+            # test critical points
+            f_min = np.inf
+            x_sol = (x_min_bound + x_max_bound) / 2 # defaults to bisection
+            for crit_pt in crit_pts:
+                if np.isreal(crit_pt) and crit_pt >= x_min_bound and crit_pt <= x_max_bound:
+                    F_cp = np.polyval(coeff, crit_pt)
+                    if np.isreal(F_cp) and F_cp < f_min:
+                        x_sol = np.real(crit_pt)
+                        f_min = np.real(F_cp)
+
+            if(plot):
+                import matplotlib.pyplot as plt
+                plt.figure()
+                x = np.arange(x_min_bound, x_max_bound, (x_max_bound - x_min_bound)/10000)
+                f = np.polyval(coeff, x)
+                plt.plot(x, f)
+                plt.plot(x_sol, f_min, 'x')
+
+    return x_sol
+
+
+
+# class PolynomialLineSearch(LineSearch):
+#     """TODO
+
+#     Line search via polynomial interpolation.
+
+#     Args:
+#         init (float, optional): Initial step size. Defaults to 1.0.
+#         c1 (float, optional): Acceptance value for weak wolfe condition. Defaults to 1e-4.
+#         c2 (float, optional): Acceptance value for strong wolfe condition (set to 0.1 for conjugate gradient). Defaults to 0.9.
+#         maxiter (int, optional): Maximum number of line search iterations. Defaults to 25.
+#         maxzoom (int, optional): Maximum number of zoom iterations. Defaults to 10.
+#         expand (float, optional): Expansion factor (multipler to step size when weak condition not satisfied). Defaults to 2.0.
+#         adaptive (bool, optional):
+#             when enabled, if line search failed, initial step size is reduced.
+#             Otherwise it is reset to initial value. Defaults to True.
+#         plus_minus (bool, optional):
+#             If enabled and the direction is not descent direction, performs line search in opposite direction. Defaults to False.
+
+
+#     Examples:
+#         Conjugate gradient method with strong wolfe line search. Nocedal, Wright recommend setting c2 to 0.1 for CG.
+
+#         .. code-block:: python
+
+#             opt = tz.Modular(
+#                 model.parameters(),
+#                 tz.m.PolakRibiere(),
+#                 tz.m.StrongWolfe(c2=0.1)
+#             )
+
+#         LBFGS strong wolfe line search:
+
+#         .. code-block:: python
+
+#             opt = tz.Modular(
+#                 model.parameters(),
+#                 tz.m.LBFGS(),
+#                 tz.m.StrongWolfe()
+#             )
+
+#     """
+#     def __init__(
+#         self,
+#         init: float = 1.0,
+#         c1: float = 1e-4,
+#         c2: float = 0.9,
+#         maxiter: int = 25,
+#         maxzoom: int = 10,
+#         # a_max: float = 1e10,
+#         expand: float = 2.0,
+#         adaptive = True,
+#         plus_minus = False,
+#     ):
+#         defaults=dict(init=init,c1=c1,c2=c2,maxiter=maxiter,maxzoom=maxzoom,
+#                       expand=expand, adaptive=adaptive, plus_minus=plus_minus)
+#         super().__init__(defaults=defaults)
+
+#         self.global_state['initial_scale'] = 1.0
+#         self.global_state['beta_scale'] = 1.0
+
+#     @torch.no_grad
+#     def search(self, update, var):
+#         objective = self.make_objective_with_derivative(var=var)
+
+#         init, c1, c2, maxiter, maxzoom, expand, adaptive, plus_minus = itemgetter(
+#             'init', 'c1', 'c2', 'maxiter', 'maxzoom',
+#             'expand', 'adaptive', 'plus_minus')(self.settings[var.params[0]])
+
+#         f_0, g_0 = objective(0)
+
+#         step_size,f_a = strong_wolfe(
+#             objective,
+#             f_0=f_0, g_0=g_0,
+#             init=init * self.global_state.setdefault("initial_scale", 1),
+#             c1=c1,
+#             c2=c2,
+#             maxiter=maxiter,
+#             maxzoom=maxzoom,
+#             expand=expand,
+#             plus_minus=plus_minus,
+#         )
+
+#         if f_a is not None and (f_a > f_0 or _notfinite(f_a)): step_size = None
+#         if step_size is not None and step_size != 0 and not _notfinite(step_size):
+#             self.global_state['initial_scale'] = min(1.0, self.global_state['initial_scale'] * math.sqrt(2))
+#             return step_size
+
+#         # fallback to backtracking on fail
+#         if adaptive: self.global_state['initial_scale'] *= 0.5
+#         return 0
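
The new polynomial.py module is essentially the polyinterp helper shown above. As a quick illustration of what it computes (this sketch is not part of the diff; it only assumes the function is importable from the path of the new file), the quadratic branch recovers the exact minimizer of a parabola from two function values and one derivative:

    import numpy as np
    from torchzero.modules.line_search.polynomial import polyinterp

    # Two points in [x, f, g] form for f(x) = (x - 2)**2:
    # at x=0 we know f=4 and g=-4; at x=5 we know f=9 but not the derivative (np.nan).
    points = np.array([
        [0.0, 4.0, -4.0],
        [5.0, 9.0, np.nan],
    ])

    # Three of the four f/g entries are known, so order == 2 and the closed-form
    # quadratic formula applies: x_sol = -(g1 * x2**2) / (2 * (f2 - f1 - g1 * x2)) = 2.0
    x_sol = polyinterp(points)
    print(x_sol)  # 2.0, the exact minimizer of (x - 2)**2, clipped to [0, 5]
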
torchzero/modules/line_search/scipy.py
@@ -3,10 +3,10 @@ from operator import itemgetter

 import torch

-from .line_search import
+from .line_search import LineSearchBase


-class ScipyMinimizeScalar(
+class ScipyMinimizeScalar(LineSearchBase):
     """Line search via :code:`scipy.optimize.minimize_scalar` which implements brent, golden search and bounded brent methods.

     Args:
torchzero/modules/line_search/strong_wolfe.py
@@ -1,3 +1,4 @@
+"""this needs to be reworked maybe but it also works"""
 import math
 import warnings
 from operator import itemgetter
@@ -5,8 +6,7 @@ from operator import itemgetter
 import torch
 from torch.optim.lbfgs import _cubic_interpolate

-from .line_search import
-from .backtracking import backtracking_line_search
+from .line_search import LineSearchBase
 from ...utils import totensor


@@ -182,7 +182,7 @@ def _notfinite(x):
     if isinstance(x, torch.Tensor): return not torch.isfinite(x).all()
     return not math.isfinite(x)

-class StrongWolfe(
+class StrongWolfe(LineSearchBase):
     """Cubic interpolation line search satisfying Strong Wolfe condition.

     Args:
@@ -192,11 +192,36 @@ class StrongWolfe(LineSearch):
         maxiter (int, optional): Maximum number of line search iterations. Defaults to 25.
         maxzoom (int, optional): Maximum number of zoom iterations. Defaults to 10.
         expand (float, optional): Expansion factor (multipler to step size when weak condition not satisfied). Defaults to 2.0.
+        use_prev (bool, optional):
+            if True, previous step size is used as the initial step size on the next step.
         adaptive (bool, optional):
             when enabled, if line search failed, initial step size is reduced.
             Otherwise it is reset to initial value. Defaults to True.
         plus_minus (bool, optional):
             If enabled and the direction is not descent direction, performs line search in opposite direction. Defaults to False.
+
+
+    Examples:
+        Conjugate gradient method with strong wolfe line search. Nocedal, Wright recommend setting c2 to 0.1 for CG.
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.PolakRibiere(),
+                tz.m.StrongWolfe(c2=0.1)
+            )
+
+        LBFGS strong wolfe line search:
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.LBFGS(),
+                tz.m.StrongWolfe()
+            )
+
     """
     def __init__(
         self,
@@ -207,11 +232,12 @@ class StrongWolfe(LineSearch):
         maxzoom: int = 10,
         # a_max: float = 1e10,
         expand: float = 2.0,
+        use_prev: bool = False,
         adaptive = True,
         plus_minus = False,
     ):
         defaults=dict(init=init,c1=c1,c2=c2,maxiter=maxiter,maxzoom=maxzoom,
-                      expand=expand, adaptive=adaptive, plus_minus=plus_minus)
+                      expand=expand, adaptive=adaptive, plus_minus=plus_minus,use_prev=use_prev)
         super().__init__(defaults=defaults)

         self.global_state['initial_scale'] = 1.0
@@ -221,11 +247,12 @@ class StrongWolfe(LineSearch):
     def search(self, update, var):
         objective = self.make_objective_with_derivative(var=var)

-        init, c1, c2, maxiter, maxzoom, expand, adaptive, plus_minus = itemgetter(
+        init, c1, c2, maxiter, maxzoom, expand, adaptive, plus_minus, use_prev = itemgetter(
             'init', 'c1', 'c2', 'maxiter', 'maxzoom',
-            'expand', 'adaptive', 'plus_minus')(self.settings[var.params[0]])
+            'expand', 'adaptive', 'plus_minus', 'use_prev')(self.settings[var.params[0]])

         f_0, g_0 = objective(0)
+        if use_prev: init = self.global_state.get('prev_alpha', init)

         step_size,f_a = strong_wolfe(
             objective,
@@ -242,8 +269,8 @@ class StrongWolfe(LineSearch):
         if f_a is not None and (f_a > f_0 or _notfinite(f_a)): step_size = None
         if step_size is not None and step_size != 0 and not _notfinite(step_size):
             self.global_state['initial_scale'] = min(1.0, self.global_state['initial_scale'] * math.sqrt(2))
+            self.global_state['prev_alpha'] = step_size
             return step_size

-        # fallback to backtracking on fail
         if adaptive: self.global_state['initial_scale'] *= 0.5
         return 0
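
The functional change to StrongWolfe in this release is the new use_prev flag: when a search succeeds, the accepted step size is stored in global_state['prev_alpha'] and reused as the initial step size of the next search. A hedged construction sketch, following the pattern of the docstring examples above (whether warm-starting helps is problem-dependent):

    import torch.nn as nn
    import torchzero as tz

    model = nn.Linear(10, 1)

    opt = tz.Modular(
        model.parameters(),
        tz.m.LBFGS(),
        # new in 0.3.11: start each line search from the previously accepted step size
        tz.m.StrongWolfe(use_prev=True),
    )
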
torchzero/modules/misc/__init__.py
@@ -0,0 +1,27 @@
+from .debug import PrintLoss, PrintParams, PrintShape, PrintUpdate
+from .escape import EscapeAnnealing
+from .gradient_accumulation import GradientAccumulation
+from .misc import (
+    DivByLoss,
+    FillLoss,
+    GradSign,
+    GraftGradToUpdate,
+    GraftToGrad,
+    GraftToParams,
+    HpuEstimate,
+    LastAbsoluteRatio,
+    LastDifference,
+    LastGradDifference,
+    LastProduct,
+    LastRatio,
+    MulByLoss,
+    NoiseSign,
+    Previous,
+    RandomHvp,
+    Relative,
+    UpdateSign,
+)
+from .multistep import Multistep, NegateOnLossIncrease, Online, Sequential
+from .regularization import Dropout, PerturbWeights, WeightDropout
+from .split import Split
+from .switch import Alternate, Switch
torchzero/modules/{ops → misc}/debug.py
@@ -6,6 +6,7 @@ from ...core import Module
 from ...utils.tensorlist import Distributions

 class PrintUpdate(Module):
+    """Prints current update."""
     def __init__(self, text = 'update = ', print_fn = print):
         defaults = dict(text=text, print_fn=print_fn)
         super().__init__(defaults)
@@ -15,6 +16,7 @@ class PrintUpdate(Module):
         return var

 class PrintShape(Module):
+    """Prints shapes of the update."""
     def __init__(self, text = 'shapes = ', print_fn = print):
         defaults = dict(text=text, print_fn=print_fn)
         super().__init__(defaults)
@@ -22,4 +24,25 @@ class PrintShape(Module):
     def step(self, var):
         shapes = [u.shape for u in var.update] if var.update is not None else None
         self.settings[var.params[0]]["print_fn"](f'{self.settings[var.params[0]]["text"]}{shapes}')
-        return var
+        return var
+
+class PrintParams(Module):
+    """Prints current update."""
+    def __init__(self, text = 'params = ', print_fn = print):
+        defaults = dict(text=text, print_fn=print_fn)
+        super().__init__(defaults)
+
+    def step(self, var):
+        self.settings[var.params[0]]["print_fn"](f'{self.settings[var.params[0]]["text"]}{var.params}')
+        return var
+
+
+class PrintLoss(Module):
+    """Prints var.get_loss()."""
+    def __init__(self, text = 'loss = ', print_fn = print):
+        defaults = dict(text=text, print_fn=print_fn)
+        super().__init__(defaults)
+
+    def step(self, var):
+        self.settings[var.params[0]]["print_fn"](f'{self.settings[var.params[0]]["text"]}{var.get_loss(False)}')
+        return var
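
The new PrintParams and PrintLoss follow the same pass-through pattern as the existing PrintUpdate and PrintShape: they print and return var unchanged, so they can be dropped anywhere in a chain for debugging. A hedged sketch of one possible placement (it assumes these classes are re-exported under tz.m like the other modules referenced in this diff):

    import torch.nn as nn
    import torchzero as tz

    model = nn.Linear(10, 1)

    opt = tz.Modular(
        model.parameters(),
        tz.m.PrintLoss(),    # prints var.get_loss(False) at the start of the chain
        tz.m.Adam(),
        tz.m.PrintUpdate(),  # prints the update produced by Adam
        tz.m.LR(1e-2),
    )
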
torchzero/modules/misc/escape.py
@@ -0,0 +1,60 @@
+import torch
+
+from ...core import Module
+from ...utils import TensorList, NumberList
+
+
+class EscapeAnnealing(Module):
+    """If parameters stop changing, this runs a backward annealing random search"""
+    def __init__(self, max_region:float = 1, max_iter:int = 1000, tol=1e-6, n_tol: int = 10):
+        defaults = dict(max_region=max_region, max_iter=max_iter, tol=tol, n_tol=n_tol)
+        super().__init__(defaults)
+
+
+    @torch.no_grad
+    def step(self, var):
+        closure = var.closure
+        if closure is None: raise RuntimeError("Escape requries closure")
+
+        params = TensorList(var.params)
+        settings = self.settings[params[0]]
+        max_region = self.get_settings(params, 'max_region', cls=NumberList)
+        max_iter = settings['max_iter']
+        tol = settings['tol']
+        n_tol = settings['n_tol']
+
+        n_bad = self.global_state.get('n_bad', 0)
+
+        prev_params = self.get_state(params, 'prev_params', cls=TensorList)
+        diff = params-prev_params
+        prev_params.copy_(params)
+
+        if diff.abs().global_max() <= tol:
+            n_bad += 1
+
+        else:
+            n_bad = 0
+
+        self.global_state['n_bad'] = n_bad
+
+        # no progress
+        f_0 = var.get_loss(False)
+        if n_bad >= n_tol:
+            for i in range(1, max_iter+1):
+                alpha = max_region * (i / max_iter)
+                pert = params.sample_like(distribution='sphere').mul_(alpha)
+
+                params.add_(pert)
+                f_star = closure(False)
+
+                if f_star < f_0-1e-10:
+                    var.update = None
+                    var.stop = True
+                    var.skip_update = True
+                    return var
+
+                else:
+                    params.sub_(pert)
+
+            self.global_state['n_bad'] = 0
+        return var
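
EscapeAnnealing only acts once the parameters have moved by at most tol for n_tol consecutive steps; it then tries random spherical perturbations of linearly growing radius (up to max_region) and keeps the first one that lowers the loss, skipping the normal update for that step. A hedged sketch of chaining it after a regular optimizer (it assumes the class is re-exported as tz.m.EscapeAnnealing, and since the module evaluates the closure itself, the optimizer has to be stepped with a closure, as with the line-search modules in this diff):

    import torch.nn as nn
    import torchzero as tz

    model = nn.Linear(10, 1)

    opt = tz.Modular(
        model.parameters(),
        tz.m.Adam(),
        tz.m.LR(1e-2),
        # kicks in only after parameters change by <= 1e-6 for 10 consecutive steps
        tz.m.EscapeAnnealing(max_region=1, max_iter=1000, tol=1e-6, n_tol=10),
    )
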
torchzero/modules/misc/gradient_accumulation.py
@@ -0,0 +1,70 @@
+import torch
+
+from ...core import Chainable, Module
+
+
+class GradientAccumulation(Module):
+    """Uses :code:`n` steps to accumulate gradients, after :code:`n` gradients have been accumulated, they are passed to :code:`modules` and parameters are updates.
+
+    Accumulating gradients for :code:`n` steps is equivalent to increasing batch size by :code:`n`. Increasing the batch size
+    is more computationally efficient, but sometimes it is not feasible due to memory constraints.
+
+    .. note::
+        Technically this can accumulate any inputs, including updates generated by previous modules. As long as this module is first, it will accumulate the gradients.
+
+    Args:
+        modules (Chainable): modules that perform a step every :code:`n` steps using the accumulated gradients.
+        n (int): number of gradients to accumulate.
+        mean (bool, optional): if True, uses mean of accumulated gradients, otherwise uses sum. Defaults to True.
+        stop (bool, optional):
+            this module prevents next modules from stepping unless :code:`n` gradients have been accumulate. Setting this argument to False disables that. Defaults to True.
+
+    Examples:
+        Adam with gradients accumulated for 16 batches.
+
+        .. code-block:: python
+
+            opt = tz.Modular(
+                model.parameters(),
+                tz.m.GradientAccumulation(
+                    modules=[tz.m.Adam(), tz.m.LR(1e-2)],
+                    n=16
+                )
+            )
+
+    """
+    def __init__(self, modules: Chainable, n: int, mean=True, stop=True):
+        defaults = dict(n=n, mean=mean, stop=stop)
+        super().__init__(defaults)
+        self.set_child('modules', modules)
+
+
+    @torch.no_grad
+    def step(self, var):
+        accumulator = self.get_state(var.params, 'accumulator')
+        settings = self.settings[var.params[0]]
+        n = settings['n']; mean = settings['mean']; stop = settings['stop']
+        step = self.global_state['step'] = self.global_state.get('step', 0) + 1
+
+        # add update to accumulator
+        torch._foreach_add_(accumulator, var.get_update())
+
+        # step with accumulated updates
+        if step % n == 0:
+            if mean:
+                torch._foreach_div_(accumulator, n)
+
+            var.update = [a.clone() for a in accumulator]
+            var = self.children['modules'].step(var)
+
+            # zero accumulator
+            torch._foreach_zero_(accumulator)
+
+        else:
+            # prevent update
+            if stop:
+                var.stop=True
+                var.skip_update=True
+
+        return var
+