torchzero 0.3.14__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_identical.py +2 -2
- tests/test_module_autograd.py +586 -0
- tests/test_objective.py +188 -0
- tests/test_opts.py +47 -36
- tests/test_tensorlist.py +0 -8
- tests/test_utils_optimizer.py +0 -1
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +8 -2
- torchzero/core/chain.py +47 -0
- torchzero/core/functional.py +103 -0
- torchzero/core/modular.py +233 -0
- torchzero/core/module.py +132 -643
- torchzero/core/objective.py +948 -0
- torchzero/core/reformulation.py +56 -23
- torchzero/core/transform.py +261 -365
- torchzero/linalg/__init__.py +10 -0
- torchzero/linalg/eigh.py +34 -0
- torchzero/linalg/linalg_utils.py +14 -0
- torchzero/{utils/linalg → linalg}/linear_operator.py +132 -34
- torchzero/linalg/matrix_power.py +28 -0
- torchzero/linalg/orthogonalize.py +95 -0
- torchzero/{utils/linalg → linalg}/qr.py +4 -2
- torchzero/{utils/linalg → linalg}/solve.py +76 -88
- torchzero/linalg/svd.py +20 -0
- torchzero/linalg/torch_linalg.py +168 -0
- torchzero/modules/__init__.py +0 -1
- torchzero/modules/adaptive/__init__.py +1 -1
- torchzero/modules/adaptive/adagrad.py +163 -213
- torchzero/modules/adaptive/adahessian.py +74 -103
- torchzero/modules/adaptive/adam.py +53 -76
- torchzero/modules/adaptive/adan.py +49 -30
- torchzero/modules/adaptive/adaptive_heavyball.py +11 -6
- torchzero/modules/adaptive/aegd.py +12 -12
- torchzero/modules/adaptive/esgd.py +98 -119
- torchzero/modules/adaptive/lion.py +5 -10
- torchzero/modules/adaptive/lmadagrad.py +87 -32
- torchzero/modules/adaptive/mars.py +5 -5
- torchzero/modules/adaptive/matrix_momentum.py +47 -51
- torchzero/modules/adaptive/msam.py +70 -52
- torchzero/modules/adaptive/muon.py +59 -124
- torchzero/modules/adaptive/natural_gradient.py +33 -28
- torchzero/modules/adaptive/orthograd.py +11 -15
- torchzero/modules/adaptive/rmsprop.py +83 -75
- torchzero/modules/adaptive/rprop.py +48 -47
- torchzero/modules/adaptive/sam.py +55 -45
- torchzero/modules/adaptive/shampoo.py +123 -129
- torchzero/modules/adaptive/soap.py +207 -143
- torchzero/modules/adaptive/sophia_h.py +106 -130
- torchzero/modules/clipping/clipping.py +15 -18
- torchzero/modules/clipping/ema_clipping.py +31 -25
- torchzero/modules/clipping/growth_clipping.py +14 -17
- torchzero/modules/conjugate_gradient/cg.py +26 -37
- torchzero/modules/experimental/__init__.py +3 -6
- torchzero/modules/experimental/coordinate_momentum.py +36 -0
- torchzero/modules/experimental/curveball.py +25 -41
- torchzero/modules/experimental/gradmin.py +2 -2
- torchzero/modules/{higher_order → experimental}/higher_order_newton.py +14 -40
- torchzero/modules/experimental/newton_solver.py +22 -53
- torchzero/modules/experimental/newtonnewton.py +20 -17
- torchzero/modules/experimental/reduce_outward_lr.py +7 -7
- torchzero/modules/experimental/scipy_newton_cg.py +21 -24
- torchzero/modules/experimental/spsa1.py +5 -5
- torchzero/modules/experimental/structural_projections.py +1 -4
- torchzero/modules/functional.py +8 -1
- torchzero/modules/grad_approximation/forward_gradient.py +7 -7
- torchzero/modules/grad_approximation/grad_approximator.py +23 -16
- torchzero/modules/grad_approximation/rfdm.py +20 -17
- torchzero/modules/least_squares/gn.py +90 -42
- torchzero/modules/line_search/__init__.py +1 -1
- torchzero/modules/line_search/_polyinterp.py +3 -1
- torchzero/modules/line_search/adaptive.py +3 -3
- torchzero/modules/line_search/backtracking.py +3 -3
- torchzero/modules/line_search/interpolation.py +160 -0
- torchzero/modules/line_search/line_search.py +42 -51
- torchzero/modules/line_search/strong_wolfe.py +5 -5
- torchzero/modules/misc/debug.py +12 -12
- torchzero/modules/misc/escape.py +10 -10
- torchzero/modules/misc/gradient_accumulation.py +10 -78
- torchzero/modules/misc/homotopy.py +16 -8
- torchzero/modules/misc/misc.py +120 -122
- torchzero/modules/misc/multistep.py +63 -61
- torchzero/modules/misc/regularization.py +49 -44
- torchzero/modules/misc/split.py +30 -28
- torchzero/modules/misc/switch.py +37 -32
- torchzero/modules/momentum/averaging.py +14 -14
- torchzero/modules/momentum/cautious.py +34 -28
- torchzero/modules/momentum/momentum.py +11 -11
- torchzero/modules/ops/__init__.py +4 -4
- torchzero/modules/ops/accumulate.py +21 -21
- torchzero/modules/ops/binary.py +67 -66
- torchzero/modules/ops/higher_level.py +19 -19
- torchzero/modules/ops/multi.py +44 -41
- torchzero/modules/ops/reduce.py +26 -23
- torchzero/modules/ops/unary.py +53 -53
- torchzero/modules/ops/utility.py +47 -46
- torchzero/modules/projections/galore.py +1 -1
- torchzero/modules/projections/projection.py +43 -43
- torchzero/modules/quasi_newton/__init__.py +2 -0
- torchzero/modules/quasi_newton/damping.py +1 -1
- torchzero/modules/quasi_newton/lbfgs.py +7 -7
- torchzero/modules/quasi_newton/lsr1.py +7 -7
- torchzero/modules/quasi_newton/quasi_newton.py +25 -16
- torchzero/modules/quasi_newton/sg2.py +292 -0
- torchzero/modules/restarts/restars.py +26 -24
- torchzero/modules/second_order/__init__.py +6 -3
- torchzero/modules/second_order/ifn.py +58 -0
- torchzero/modules/second_order/inm.py +101 -0
- torchzero/modules/second_order/multipoint.py +40 -80
- torchzero/modules/second_order/newton.py +105 -228
- torchzero/modules/second_order/newton_cg.py +102 -154
- torchzero/modules/second_order/nystrom.py +158 -178
- torchzero/modules/second_order/rsn.py +237 -0
- torchzero/modules/smoothing/laplacian.py +13 -12
- torchzero/modules/smoothing/sampling.py +11 -10
- torchzero/modules/step_size/adaptive.py +23 -23
- torchzero/modules/step_size/lr.py +15 -15
- torchzero/modules/termination/termination.py +32 -30
- torchzero/modules/trust_region/cubic_regularization.py +2 -2
- torchzero/modules/trust_region/levenberg_marquardt.py +25 -28
- torchzero/modules/trust_region/trust_cg.py +1 -1
- torchzero/modules/trust_region/trust_region.py +27 -22
- torchzero/modules/variance_reduction/svrg.py +21 -18
- torchzero/modules/weight_decay/__init__.py +2 -1
- torchzero/modules/weight_decay/reinit.py +83 -0
- torchzero/modules/weight_decay/weight_decay.py +12 -13
- torchzero/modules/wrappers/optim_wrapper.py +57 -50
- torchzero/modules/zeroth_order/cd.py +9 -6
- torchzero/optim/root.py +3 -3
- torchzero/optim/utility/split.py +2 -1
- torchzero/optim/wrappers/directsearch.py +27 -63
- torchzero/optim/wrappers/fcmaes.py +14 -35
- torchzero/optim/wrappers/mads.py +11 -31
- torchzero/optim/wrappers/moors.py +66 -0
- torchzero/optim/wrappers/nevergrad.py +4 -4
- torchzero/optim/wrappers/nlopt.py +31 -25
- torchzero/optim/wrappers/optuna.py +6 -13
- torchzero/optim/wrappers/pybobyqa.py +124 -0
- torchzero/optim/wrappers/scipy/__init__.py +7 -0
- torchzero/optim/wrappers/scipy/basin_hopping.py +117 -0
- torchzero/optim/wrappers/scipy/brute.py +48 -0
- torchzero/optim/wrappers/scipy/differential_evolution.py +80 -0
- torchzero/optim/wrappers/scipy/direct.py +69 -0
- torchzero/optim/wrappers/scipy/dual_annealing.py +115 -0
- torchzero/optim/wrappers/scipy/experimental.py +141 -0
- torchzero/optim/wrappers/scipy/minimize.py +151 -0
- torchzero/optim/wrappers/scipy/sgho.py +111 -0
- torchzero/optim/wrappers/wrapper.py +121 -0
- torchzero/utils/__init__.py +7 -25
- torchzero/utils/compile.py +2 -2
- torchzero/utils/derivatives.py +112 -88
- torchzero/utils/optimizer.py +4 -77
- torchzero/utils/python_tools.py +31 -0
- torchzero/utils/tensorlist.py +11 -5
- torchzero/utils/thoad_tools.py +68 -0
- {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/METADATA +1 -1
- torchzero-0.4.0.dist-info/RECORD +191 -0
- tests/test_vars.py +0 -185
- torchzero/modules/experimental/momentum.py +0 -160
- torchzero/modules/higher_order/__init__.py +0 -1
- torchzero/optim/wrappers/scipy.py +0 -572
- torchzero/utils/linalg/__init__.py +0 -12
- torchzero/utils/linalg/matrix_funcs.py +0 -87
- torchzero/utils/linalg/orthogonalize.py +0 -12
- torchzero/utils/linalg/svd.py +0 -20
- torchzero/utils/ops.py +0 -10
- torchzero-0.3.14.dist-info/RECORD +0 -167
- /torchzero/{utils/linalg → linalg}/benchmark.py +0 -0
- {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/WHEEL +0 -0
- {torchzero-0.3.14.dist-info → torchzero-0.4.0.dist-info}/top_level.txt +0 -0
torchzero/modules/line_search/backtracking.py
CHANGED

@@ -117,7 +117,7 @@ class Backtracking(LineSearchBase):
 
         # # directional derivative
         if c == 0: d = 0
-        else: d = -sum(t.sum() for t in torch._foreach_mul(var.
+        else: d = -sum(t.sum() for t in torch._foreach_mul(var.get_grads(), var.get_updates()))
 
         # scale init
         init_scale = self.global_state.get('init_scale', 1)
@@ -136,7 +136,7 @@ class Backtracking(LineSearchBase):
         if adaptive:
             finfo = torch.finfo(var.params[0].dtype)
             if init_scale <= finfo.tiny * 2:
-                self.global_state["init_scale"] =
+                self.global_state["init_scale"] = init * 2
             else:
                 self.global_state['init_scale'] = init_scale * beta**maxiter
         return 0
@@ -199,7 +199,7 @@ class AdaptiveBacktracking(LineSearchBase):
 
         # directional derivative (0 if c = 0 because it is not needed)
         if c == 0: d = 0
-        else: d = -sum(t.sum() for t in torch._foreach_mul(var.
+        else: d = -sum(t.sum() for t in torch._foreach_mul(var.get_grads(), update))
 
         # scale beta
         beta = beta * self.global_state['beta_scale']
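The quantity `d` in the hunks above is the directional derivative of the loss along the proposed update: the update is later subtracted from the parameters, so `d = -<g, u>`, summed over every parameter tensor. A tiny standalone sketch of that reduction with made-up values (illustration only, not torchzero code):

import torch

# d = -<g, u> accumulated across a list of parameter tensors,
# matching the torch._foreach_mul pattern in the diff above.
grads   = [torch.tensor([1.0, 2.0]), torch.tensor([3.0])]
updates = [torch.tensor([0.5, 0.5]), torch.tensor([1.0])]

d = -sum(t.sum() for t in torch._foreach_mul(grads, updates))
print(float(d))  # -(1*0.5 + 2*0.5 + 3*1.0) = -4.5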
torchzero/modules/line_search/interpolation.py
ADDED

@@ -0,0 +1,160 @@
+import math
+from bisect import insort
+
+import numpy as np
+from numpy.polynomial import Polynomial
+
+
+# we have a list of points in ascending order of their `y` value
+class Point:
+    __slots__ = ("x", "y", "d")
+    def __init__(self, x, y, d):
+        self.x = x
+        self.y = y
+        self.d = d
+
+    def __lt__(self, other):
+        return self.y < other.y
+
+def _get_dpoint(points: list[Point]):
+    """returns lowest point with derivative and list of other points"""
+    for i,p in enumerate(points):
+        if p.d is not None:
+            cpoints = points.copy()
+            del cpoints[i]
+            return p, cpoints
+    return None, points
+
+# -------------------------------- quadratic2 -------------------------------- #
+def _fitmin_quadratic2(x1, y1, d1, x2, y2):
+
+    a = (y2 - y1 - d1*(x2 - x1)) / (x2 - x1)**2
+    if a <= 0: return None
+
+    b = d1 - 2*a*x1
+    # c = y_1 - d_1*x_1 + a*x_1**2
+
+    return -b / (2*a)
+
+def quadratic2(points:list[Point]):
+    pd, points = _get_dpoint(points)
+    if pd is None: return None
+    if len(points) == 0: return None
+
+    pn = points[0]
+    return _fitmin_quadratic2(pd.x, pd.y, pd.d, pn.x, pn.y)
+
+# -------------------------------- quadratic3 -------------------------------- #
+def _fitmin_quadratic3(x1, y1, x2, y2, x3, y3):
+    quad = Polynomial.fit([x1,x2,x3], [y1,y2,y3], deg=2)
+    a,b,c = quad.coef
+    if a <= 0: return None
+    return -b / (2*a)
+
+def quadratic3(points:list[Point]):
+    if len(points) < 3: return None
+
+    p1,p2,p3 = points[:3]
+    return _fitmin_quadratic3(p1.x, p1.y, p2.x, p2.y, p3.x, p3.y)
+
+# ---------------------------------- cubic3 ---------------------------------- #
+def _minimize_polynomial(poly: Polynomial):
+    roots = poly.deriv().roots()
+    vals = poly(roots)
+    argmin = np.argmin(vals)
+    return roots[argmin], vals[argmin]
+
+
+def _fitmin_cubic3(x1,y1,x2,y2,x3,y3,x4,d4):
+    """x4 is allowed to be equal to x1"""
+
+    A = np.array([
+        [x1**3, x1**2, x1, 1],
+        [x2**3, x2**2, x2, 1],
+        [x3**3, x3**2, x3, 1],
+        [3*x4**2, 2*x4, 1, 0]
+    ])
+
+    B = np.array([y1, y2, y3, d4])
+
+    try:
+        coeffs = np.linalg.solve(A, B)
+    except np.linalg.LinAlgError:
+        return None
+
+    cubic = Polynomial(coeffs)
+    x_min, y_min = _minimize_polynomial(cubic)
+    if y_min < min(y1,y2,y3): return x_min
+    return None
+
+def cubic3(points: list[Point]):
+    pd, points = _get_dpoint(points)
+    if pd is None: return None
+    if len(points) < 2: return None
+    p1, p2 = points[:2]
+    return _fitmin_cubic3(pd.x, pd.y, p1.x, p1.y, p2.x, p2.y, pd.x, pd.d)
+
+# ---------------------------------- cubic4 ---------------------------------- #
+def _fitmin_cubic4(x1, y1, x2, y2, x3, y3, x4, y4):
+    cubic = Polynomial.fit([x1,x2,x3,x4], [y1,y2,y3,y4], deg=3)
+    x_min, y_min = _minimize_polynomial(cubic)
+    if y_min < min(y1,y2,y3,y4): return x_min
+    return None
+
+def cubic4(points:list[Point]):
+    if len(points) < 4: return None
+
+    p1,p2,p3,p4 = points[:4]
+    return _fitmin_cubic4(p1.x, p1.y, p2.x, p2.y, p3.x, p3.y, p4.x, p4.y)
+
+# ---------------------------------- linear3 --------------------------------- #
+def _linear_intersection(x1,y1,s1,x2,y2,s2):
+    if s1 == 0 or s2 == 0 or s1 == s2: return None
+    return (y1 - s1*x1 - y2 + s2*x2) / (s2 - s1)
+
+def _fitmin_linear3(x1, y1, d1, x2, y2, x3, y3):
+    # we have that
+    # s2 = (y2 - y3) / (x2 - x3) # slope origin in x2 y2
+    # f1(x) = y1 + d1 * (x - x1)
+    # f2(x) = y2 + s2 * (x - x2)
+    # y1 + d1 * (x - x1) = y2 + s2 * (x - x2)
+    # y1 + d1 x - d1 x1 - y2 - s2 x + s2 x2 = 0
+    # s2 x - d1 x = y1 - d1 x1 - y2 + s2 x2
+    # x = (y1 - d1 x1 - y2 + s2 x2) / (s2 - d1)
+
+    if x2 < x1 < x3 or x3 < x1 < x2: # point with derivative in between
+        return None
+
+    if d1 > 0:
+        if x2 > x1 or x3 > x1: return None # intersection is above to the right
+        if x2 > x3: x2,y2,x3,y3 = x3,y3,x2,y2
+    if d1 < 0:
+        if x2 < x1 or x3 < x1: return None # intersection is above to the left
+        if x2 < x3: x2,y2,x3,y3 = x3,y3,x2,y2
+
+    s2 = (y2 - y3) / (x2 - x3)
+    return _linear_intersection(x1,y1,d1,x2,y2,s2)
+
+def linear3(points:list[Point]):
+    pd, points = _get_dpoint(points)
+    if pd is None: return None
+    if len(points) < 2: return None
+    p1, p2 = points[:2]
+    return _fitmin_linear3(pd.x, pd.y, pd.d, p1.x, p1.y, p2.x, p2.y)
+
+# ---------------------------------- linear4 --------------------------------- #
+def _fitmin_linear4(x1, y1, x2, y2, x3, y3, x4, y4):
+    # sort by x
+    points = ((x1,y1), (x2,y2), (x3,y3), (x4,y4))
+    points = sorted(points, key=lambda x: x[0])
+
+    (x1,y1), (x2,y2), (x3,y3), (x4,y4) = points
+    s1 = (y1 - y2) / (x1 - x2)
+    s3 = (y3 - y4) / (x3 - x4)
+
+    return _linear_intersection(x1,y1,s1,x3,y3,s3)
+
+def linear4(points:list[Point]):
+    if len(points) < 4: return None
+    p1,p2,p3,p4 = points[:4]
+    return _fitmin_linear4(p1.x, p1.y, p2.x, p2.y, p3.x, p3.y, p4.x, p4.y)
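Each helper above fits a simple model to already-evaluated points, sorted by ascending loss, and returns the abscissa of its minimum (or None when the fit is unusable). A minimal usage sketch; the import path is inferred from the file listing at the top and the numbers are made up:

from torchzero.modules.line_search.interpolation import Point, quadratic2  # path inferred from the file listing

# Point fields are (x, y, d) = (step size, loss, directional derivative or None).
p0 = Point(x=0.0, y=1.0, d=-2.0)   # loss and slope at step size 0
p1 = Point(x=1.0, y=0.5, d=None)   # loss only, at step size 1

pts = sorted([p0, p1])             # Point.__lt__ orders by ascending y
print(quadratic2(pts))             # minimum of the fitted parabola: 2/3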
torchzero/modules/line_search/line_search.py
CHANGED

@@ -8,8 +8,9 @@ from typing import Any, Literal
 import numpy as np
 import torch
 
-from ...core import Module,
+from ...core import Module, Objective
 from ...utils import tofloat, set_storage_
+from ..functional import clip_by_finfo
 
 
 class MaxLineSearchItersReached(Exception): pass
@@ -103,23 +104,18 @@ class LineSearchBase(Module, ABC):
     ):
         if not math.isfinite(step_size): return
 
-        #
-        step_size =
+        # avoid overflow error
+        step_size = clip_by_finfo(tofloat(step_size), torch.finfo(update[0].dtype))
 
         # skip is parameters are already at suggested step size
         if self._current_step_size == step_size: return
 
-        # this was basically causing floating point imprecision to build up
-        #if False:
-        #     if abs(alpha) < abs(step_size) and step_size != 0:
-        #         torch._foreach_add_(params, update, alpha=alpha)
-
-        # else:
         assert self._initial_params is not None
         if step_size == 0:
             new_params = [p.clone() for p in self._initial_params]
         else:
             new_params = torch._foreach_sub(self._initial_params, update, alpha=step_size)
+
         for c, n in zip(params, new_params):
             set_storage_(c, n)
 
@@ -131,10 +127,7 @@ class LineSearchBase(Module, ABC):
         params: list[torch.Tensor],
         update: list[torch.Tensor],
     ):
-
-        # alpha = [self._current_step_size - s for s in step_size]
-        # if any(a!=0 for a in alpha):
-        #     torch._foreach_add_(params, torch._foreach_mul(update, alpha))
+
         assert self._initial_params is not None
         if not np.isfinite(step_size).all(): step_size = [0 for _ in step_size]
 
@@ -146,7 +139,7 @@ class LineSearchBase(Module, ABC):
         for c, n in zip(params, new_params):
             set_storage_(c, n)
 
-    def _loss(self, step_size: float, var:
+    def _loss(self, step_size: float, var: Objective, closure, params: list[torch.Tensor],
              update: list[torch.Tensor], backward:bool=False) -> float:
 
         # if step_size is 0, we might already know the loss
@@ -172,16 +165,16 @@ class LineSearchBase(Module, ABC):
         # if evaluated loss at step size 0, set it to var.loss
         if step_size == 0:
             var.loss = loss
-            if backward: var.
+            if backward: var.grads = [p.grad if p.grad is not None else torch.zeros_like(p) for p in params]
 
         return tofloat(loss)
 
-    def _loss_derivative_gradient(self, step_size: float, var:
+    def _loss_derivative_gradient(self, step_size: float, var: Objective, closure,
                                   params: list[torch.Tensor], update: list[torch.Tensor]):
         # if step_size is 0, we might already know the derivative
-        if (var.
+        if (var.grads is not None) and (step_size == 0):
            loss = self._loss(step_size=step_size,var=var,closure=closure,params=params,update=update,backward=False)
-            derivative = - sum(t.sum() for t in torch._foreach_mul(var.
+           derivative = - sum(t.sum() for t in torch._foreach_mul(var.grads, update))
 
        else:
            # loss with a backward pass sets params.grad
@@ -191,81 +184,79 @@ class LineSearchBase(Module, ABC):
            derivative = - sum(t.sum() for t in torch._foreach_mul([p.grad if p.grad is not None
                                                                    else torch.zeros_like(p) for p in params], update))
 
-        assert var.
-        return loss, tofloat(derivative), var.
+        assert var.grads is not None
+        return loss, tofloat(derivative), var.grads
 
-    def _loss_derivative(self, step_size: float, var:
+    def _loss_derivative(self, step_size: float, var: Objective, closure,
                          params: list[torch.Tensor], update: list[torch.Tensor]):
         return self._loss_derivative_gradient(step_size=step_size, var=var,closure=closure,params=params,update=update)[:2]
 
-    def evaluate_f(self, step_size: float, var:
+    def evaluate_f(self, step_size: float, var: Objective, backward:bool=False):
         """evaluate function value at alpha `step_size`."""
         closure = var.closure
         if closure is None: raise RuntimeError('line search requires closure')
-        return self._loss(step_size=step_size, var=var, closure=closure, params=var.params,update=var.
+        return self._loss(step_size=step_size, var=var, closure=closure, params=var.params,update=var.get_updates(),backward=backward)
 
-    def evaluate_f_d(self, step_size: float, var:
+    def evaluate_f_d(self, step_size: float, var: Objective):
         """evaluate function value and directional derivative in the direction of the update at step size `step_size`."""
         closure = var.closure
         if closure is None: raise RuntimeError('line search requires closure')
-        return self._loss_derivative(step_size=step_size, var=var, closure=closure, params=var.params,update=var.
+        return self._loss_derivative(step_size=step_size, var=var, closure=closure, params=var.params,update=var.get_updates())
 
-    def evaluate_f_d_g(self, step_size: float, var:
+    def evaluate_f_d_g(self, step_size: float, var: Objective):
         """evaluate function value, directional derivative, and gradient list at step size `step_size`."""
         closure = var.closure
         if closure is None: raise RuntimeError('line search requires closure')
-        return self._loss_derivative_gradient(step_size=step_size, var=var, closure=closure, params=var.params,update=var.
+        return self._loss_derivative_gradient(step_size=step_size, var=var, closure=closure, params=var.params,update=var.get_updates())
 
-    def make_objective(self, var:
+    def make_objective(self, var: Objective, backward:bool=False):
         closure = var.closure
         if closure is None: raise RuntimeError('line search requires closure')
-        return partial(self._loss, var=var, closure=closure, params=var.params, update=var.
+        return partial(self._loss, var=var, closure=closure, params=var.params, update=var.get_updates(), backward=backward)
 
-    def make_objective_with_derivative(self, var:
+    def make_objective_with_derivative(self, var: Objective):
         closure = var.closure
         if closure is None: raise RuntimeError('line search requires closure')
-        return partial(self._loss_derivative, var=var, closure=closure, params=var.params, update=var.
+        return partial(self._loss_derivative, var=var, closure=closure, params=var.params, update=var.get_updates())
 
-    def make_objective_with_derivative_and_gradient(self, var:
+    def make_objective_with_derivative_and_gradient(self, var: Objective):
         closure = var.closure
         if closure is None: raise RuntimeError('line search requires closure')
-        return partial(self._loss_derivative_gradient, var=var, closure=closure, params=var.params, update=var.
+        return partial(self._loss_derivative_gradient, var=var, closure=closure, params=var.params, update=var.get_updates())
 
     @abstractmethod
-    def search(self, update: list[torch.Tensor], var:
+    def search(self, update: list[torch.Tensor], var: Objective) -> float:
         """Finds the step size to use"""
 
     @torch.no_grad
-    def
+    def apply(self, objective: Objective) -> Objective:
         self._reset()
 
-        params =
+        params = objective.params
         self._initial_params = [p.clone() for p in params]
-        update =
+        update = objective.get_updates()
 
         try:
-            step_size = self.search(update=update, var=
+            step_size = self.search(update=update, var=objective)
         except MaxLineSearchItersReached:
             step_size = self._best_step_size
 
-
-        if var.loss_approx is None: var.loss_approx = self._lowest_loss
+        step_size = clip_by_finfo(step_size, torch.finfo(update[0].dtype))
 
-        #
-        if
-        if var.last_module_lrs is None:
-            self.set_step_size_(step_size, params=params, update=update)
+        # set loss_approx
+        if objective.loss_approx is None: objective.loss_approx = self._lowest_loss
 
-
-
+        # if this is last module, directly update parameters to avoid redundant operations
+        if objective.modular is not None and self is objective.modular.modules[-1]:
+            self.set_step_size_(step_size, params=params, update=update)
 
-
-            return
+            objective.stop = True; objective.skip_update = True
+            return objective
 
         # revert parameters and multiply update by step size
         self.set_step_size_(0, params=params, update=update)
-        torch._foreach_mul_(
-        return
+        torch._foreach_mul_(objective.updates, step_size)
+        return objective
 
 
 
@@ -277,7 +268,7 @@ class GridLineSearch(LineSearchBase):
 
     @torch.no_grad
     def search(self, update, var):
-        start,end,num=itemgetter('start','end','num')(self.defaults)
+        start, end, num = itemgetter('start', 'end', 'num')(self.defaults)
 
         for lr in torch.linspace(start,end,num):
            self.evaluate_f(lr.item(), var=var, backward=False)
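Under the 0.4.0 API the hook is apply(self, objective) rather than the old step(self, var): the base class snapshots the parameters, asks search() for a step size, clips it, and then either writes the parameters directly (when the line search is the last module of the chain) or rescales objective.updates in place. A rough sketch of a custom subclass under that contract; only search, evaluate_f, and get_loss follow the code above, while the constructor signature is an assumption:

import torch

class ThreePointSearch(LineSearchBase):
    """Hedged sketch, not part of torchzero: probe a few fixed step sizes
    and return the one with the lowest loss (0 if none improves)."""
    def __init__(self, candidates=(0.25, 0.5, 1.0)):
        super().__init__(dict(candidates=candidates))  # assumed base-class signature

    @torch.no_grad
    def search(self, update, var):
        best_a, best_f = 0.0, var.get_loss(False)
        for a in self.defaults['candidates']:
            f = self.evaluate_f(a, var=var, backward=False)
            if f < best_f:
                best_a, best_f = a, f
        # apply() in the base class reverts the parameters and either rescales
        # objective.updates by best_a or, as the last module, applies it directly.
        return best_a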
torchzero/modules/line_search/strong_wolfe.py
CHANGED

@@ -7,7 +7,7 @@ import numpy as np
 import torch
 from torch.optim.lbfgs import _cubic_interpolate
 
-from ...utils import as_tensorlist, totensor
+from ...utils import as_tensorlist, totensor, tofloat
 from ._polyinterp import polyinterp, polyinterp2
 from .line_search import LineSearchBase, TerminationCondition, termination_condition
 from ..step_size.adaptive import _bb_geom
@@ -92,7 +92,7 @@ class _StrongWolfe:
             return _apply_bounds(a_lo + 0.5 * (a_hi - a_lo), bounds)
 
         if self.interpolation in ('polynomial', 'polynomial2'):
-            finite_history = [(a, f, g) for a, (f,g) in self.history.items() if math.isfinite(a) and math.isfinite(f) and math.isfinite(g)]
+            finite_history = [(tofloat(a), tofloat(f), tofloat(g)) for a, (f,g) in self.history.items() if math.isfinite(a) and math.isfinite(f) and math.isfinite(g)]
             if bounds is None: bounds = (None, None)
             polyinterp_fn = polyinterp if self.interpolation == 'polynomial' else polyinterp2
             try:
@@ -284,8 +284,8 @@ class StrongWolfe(LineSearchBase):
             'init_value', 'init', 'c1', 'c2', 'a_max', 'maxiter', 'maxzoom',
             'maxeval', 'interpolation', 'adaptive', 'plus_minus', 'fallback', 'tol_change')(self.defaults)
 
-        dir = as_tensorlist(var.
-        grad_list = var.
+        dir = as_tensorlist(var.get_updates())
+        grad_list = var.get_grads()
 
         g_0 = -sum(t.sum() for t in torch._foreach_mul(grad_list, dir))
         f_0 = var.get_loss(False)
@@ -370,6 +370,6 @@ class StrongWolfe(LineSearchBase):
             self.global_state['initial_scale'] = self.global_state.get('initial_scale', 1) * 0.5
             finfo = torch.finfo(dir[0].dtype)
             if self.global_state['initial_scale'] < finfo.tiny * 2:
-                self.global_state['initial_scale'] =
+                self.global_state['initial_scale'] = init_value * 2
 
         return 0
torchzero/modules/misc/debug.py
CHANGED

@@ -11,9 +11,9 @@ class PrintUpdate(Module):
         defaults = dict(text=text, print_fn=print_fn)
         super().__init__(defaults)
 
-    def
-        self.defaults["print_fn"](f'{self.defaults["text"]}{
-        return
+    def apply(self, objective):
+        self.defaults["print_fn"](f'{self.defaults["text"]}{objective.updates}')
+        return objective
 
 class PrintShape(Module):
     """Prints shapes of the update."""
@@ -21,10 +21,10 @@ class PrintShape(Module):
         defaults = dict(text=text, print_fn=print_fn)
         super().__init__(defaults)
 
-    def
-        shapes = [u.shape for u in
+    def apply(self, objective):
+        shapes = [u.shape for u in objective.updates] if objective.updates is not None else None
         self.defaults["print_fn"](f'{self.defaults["text"]}{shapes}')
-        return
+        return objective
 
 class PrintParams(Module):
     """Prints current update."""
@@ -32,9 +32,9 @@ class PrintParams(Module):
         defaults = dict(text=text, print_fn=print_fn)
         super().__init__(defaults)
 
-    def
-        self.defaults["print_fn"](f'{self.defaults["text"]}{
-        return
+    def apply(self, objective):
+        self.defaults["print_fn"](f'{self.defaults["text"]}{objective.params}')
+        return objective
 
 
 class PrintLoss(Module):
@@ -43,6 +43,6 @@ class PrintLoss(Module):
         defaults = dict(text=text, print_fn=print_fn)
         super().__init__(defaults)
 
-    def
-        self.defaults["print_fn"](f'{self.defaults["text"]}{
-        return
+    def apply(self, objective):
+        self.defaults["print_fn"](f'{self.defaults["text"]}{objective.get_loss(False)}')
+        return objective
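All four debug modules now return the Objective they were given, so they can be dropped anywhere into a chain without changing the update. A hedged construction sketch; text and print_fn mirror the defaults dict above, the registry names tz.m.Adam and tz.m.LR come from the package docstrings, and tz.m.PrintLoss is an assumed registry name:

import logging
import torch
import torchzero as tz

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("opt")

model = torch.nn.Linear(2, 1)

# Hypothetical chain: log the loss between the direction module and the LR module.
opt = tz.Modular(
    model.parameters(),
    tz.m.Adam(),
    tz.m.PrintLoss(text="loss: ", print_fn=log.info),
    tz.m.LR(1e-2),
)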
torchzero/modules/misc/escape.py
CHANGED

@@ -3,7 +3,7 @@ import math
 from typing import Literal
 import torch
 
-from ...core import Modular, Module,
+from ...core import Modular, Module, Objective, Chainable
 from ...utils import NumberList, TensorList
 
 
@@ -15,11 +15,11 @@ class EscapeAnnealing(Module):
 
 
     @torch.no_grad
-    def
-        closure =
+    def apply(self, objective):
+        closure = objective.closure
         if closure is None: raise RuntimeError("Escape requries closure")
 
-        params = TensorList(
+        params = TensorList(objective.params)
         settings = self.settings[params[0]]
         max_region = self.get_settings(params, 'max_region', cls=NumberList)
         max_iter = settings['max_iter']
@@ -41,7 +41,7 @@ class EscapeAnnealing(Module):
         self.global_state['n_bad'] = n_bad
 
         # no progress
-        f_0 =
+        f_0 = objective.get_loss(False)
         if n_bad >= n_tol:
             for i in range(1, max_iter+1):
                 alpha = max_region * (i / max_iter)
@@ -51,12 +51,12 @@ class EscapeAnnealing(Module):
                 f_star = closure(False)
 
                 if math.isfinite(f_star) and f_star < f_0-1e-12:
-
-
-
-                    return
+                    objective.updates = None
+                    objective.stop = True
+                    objective.skip_update = True
+                    return objective
 
                 params.sub_(pert)
 
         self.global_state['n_bad'] = 0
-        return
+        return objective
torchzero/modules/misc/gradient_accumulation.py
CHANGED

@@ -3,74 +3,6 @@ import torch
 from ...core import Chainable, Module
 
 
-# class GradientAccumulation(Module):
-#     """Uses :code:`n` steps to accumulate gradients, after :code:`n` gradients have been accumulated, they are passed to :code:`modules` and parameters are updates.
-
-#     Accumulating gradients for :code:`n` steps is equivalent to increasing batch size by :code:`n`. Increasing the batch size
-#     is more computationally efficient, but sometimes it is not feasible due to memory constraints.
-
-#     .. note::
-#         Technically this can accumulate any inputs, including updates generated by previous modules. As long as this module is first, it will accumulate the gradients.
-
-#     Args:
-#         modules (Chainable): modules that perform a step every :code:`n` steps using the accumulated gradients.
-#         n (int): number of gradients to accumulate.
-#         mean (bool, optional): if True, uses mean of accumulated gradients, otherwise uses sum. Defaults to True.
-#         stop (bool, optional):
-#             this module prevents next modules from stepping unless :code:`n` gradients have been accumulate. Setting this argument to False disables that. Defaults to True.
-
-#     Examples:
-#         Adam with gradients accumulated for 16 batches.
-
-#         .. code-block:: python
-
-#             opt = tz.Modular(
-#                 model.parameters(),
-#                 tz.m.GradientAccumulation(
-#                     [tz.m.Adam(), tz.m.LR(1e-2)],
-#                     n=16
-#                 )
-#             )
-
-#     """
-#     def __init__(self, modules: Chainable, n: int, mean=True, stop=True):
-#         defaults = dict(n=n, mean=mean, stop=stop)
-#         super().__init__(defaults)
-#         self.set_child('modules', modules)
-
-
-#     @torch.no_grad
-#     def step(self, var):
-#         accumulator = self.get_state(var.params, 'accumulator')
-#         settings = self.defaults
-#         n = settings['n']; mean = settings['mean']; stop = settings['stop']
-#         step = self.global_state['step'] = self.global_state.get('step', 0) + 1
-
-#         # add update to accumulator
-#         torch._foreach_add_(accumulator, var.get_update())
-
-#         # step with accumulated updates
-#         if step % n == 0:
-#             if mean:
-#                 torch._foreach_div_(accumulator, n)
-
-#             var.update = [a.clone() for a in accumulator]
-#             var = self.children['modules'].step(var)
-
-#             # zero accumulator
-#             torch._foreach_zero_(accumulator)
-
-#         else:
-#             # prevent update
-#             if stop:
-#                 var.update = None
-#                 var.stop=True
-#                 var.skip_update=True
-
-#         return var
-
-
-
 
 class GradientAccumulation(Module):
     """Uses ``n`` steps to accumulate gradients, after ``n`` gradients have been accumulated, they are passed to :code:`modules` and parameters are updates.
@@ -106,21 +38,21 @@ class GradientAccumulation(Module):
 
 
     @torch.no_grad
-    def
-        accumulator = self.get_state(
+    def apply(self, objective):
+        accumulator = self.get_state(objective.params, 'accumulator')
         settings = self.defaults
         n = settings['n']; mean = settings['mean']; stop = settings['stop']
-        step = self.
+        step = self.increment_counter("step", 0)
 
         # add update to accumulator
-        torch._foreach_add_(accumulator,
+        torch._foreach_add_(accumulator, objective.get_updates())
 
         # step with accumulated updates
-        if step % n == 0:
+        if (step + 1) % n == 0:
            if mean:
                torch._foreach_div_(accumulator, n)
 
-
+           objective.updates = accumulator
 
            # zero accumulator
            self.clear_state_keys('accumulator')
@@ -128,9 +60,9 @@ class GradientAccumulation(Module):
         else:
            # prevent update
            if stop:
-
-
-
+               objective.updates = None
+               objective.stop=True
+               objective.skip_update=True
 
-        return
+        return objective
 
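The surviving docstring (unchanged from the commented-out version removed above) shows the intended usage. Restated as a runnable sketch with a toy model and a closure added for completeness; the backward flag on the closure follows the closure(False) calls visible elsewhere in this diff, and the step/zero_grad calls assume tz.Modular exposes the usual torch.optim closure-based interface:

import torch
import torchzero as tz

model = torch.nn.Linear(4, 1)
X, y = torch.randn(64, 4), torch.randn(64, 1)

# Adam step every 16 accumulated (averaged) gradients, as in the docstring example.
opt = tz.Modular(
    model.parameters(),
    tz.m.GradientAccumulation([tz.m.Adam(), tz.m.LR(1e-2)], n=16),
)

def closure(backward=True):
    loss = torch.nn.functional.mse_loss(model(X), y)
    if backward:
        opt.zero_grad()
        loss.backward()
    return loss

for _ in range(32):
    opt.step(closure)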