torchzero 0.3.9__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff shows the contents of the publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between the two package versions.
Files changed (108)
  1. tests/test_opts.py +54 -21
  2. tests/test_tensorlist.py +2 -2
  3. tests/test_vars.py +61 -61
  4. torchzero/core/__init__.py +2 -3
  5. torchzero/core/module.py +49 -49
  6. torchzero/core/transform.py +219 -158
  7. torchzero/modules/__init__.py +1 -0
  8. torchzero/modules/clipping/clipping.py +10 -10
  9. torchzero/modules/clipping/ema_clipping.py +14 -13
  10. torchzero/modules/clipping/growth_clipping.py +16 -18
  11. torchzero/modules/experimental/__init__.py +12 -3
  12. torchzero/modules/experimental/absoap.py +50 -156
  13. torchzero/modules/experimental/adadam.py +15 -14
  14. torchzero/modules/experimental/adamY.py +17 -27
  15. torchzero/modules/experimental/adasoap.py +19 -129
  16. torchzero/modules/experimental/curveball.py +12 -12
  17. torchzero/modules/experimental/diagonal_higher_order_newton.py +225 -0
  18. torchzero/modules/experimental/eigendescent.py +117 -0
  19. torchzero/modules/experimental/etf.py +172 -0
  20. torchzero/modules/experimental/gradmin.py +2 -2
  21. torchzero/modules/experimental/newton_solver.py +11 -11
  22. torchzero/modules/experimental/newtonnewton.py +88 -0
  23. torchzero/modules/experimental/reduce_outward_lr.py +8 -5
  24. torchzero/modules/experimental/soapy.py +19 -146
  25. torchzero/modules/experimental/spectral.py +79 -204
  26. torchzero/modules/experimental/structured_newton.py +12 -12
  27. torchzero/modules/experimental/subspace_preconditioners.py +13 -10
  28. torchzero/modules/experimental/tada.py +38 -0
  29. torchzero/modules/grad_approximation/fdm.py +2 -2
  30. torchzero/modules/grad_approximation/forward_gradient.py +5 -5
  31. torchzero/modules/grad_approximation/grad_approximator.py +21 -21
  32. torchzero/modules/grad_approximation/rfdm.py +28 -15
  33. torchzero/modules/higher_order/__init__.py +1 -0
  34. torchzero/modules/higher_order/higher_order_newton.py +256 -0
  35. torchzero/modules/line_search/backtracking.py +42 -23
  36. torchzero/modules/line_search/line_search.py +40 -40
  37. torchzero/modules/line_search/scipy.py +18 -3
  38. torchzero/modules/line_search/strong_wolfe.py +21 -32
  39. torchzero/modules/line_search/trust_region.py +18 -6
  40. torchzero/modules/lr/__init__.py +1 -1
  41. torchzero/modules/lr/{step_size.py → adaptive.py} +22 -26
  42. torchzero/modules/lr/lr.py +20 -16
  43. torchzero/modules/momentum/averaging.py +25 -10
  44. torchzero/modules/momentum/cautious.py +73 -35
  45. torchzero/modules/momentum/ema.py +92 -41
  46. torchzero/modules/momentum/experimental.py +21 -13
  47. torchzero/modules/momentum/matrix_momentum.py +96 -54
  48. torchzero/modules/momentum/momentum.py +24 -4
  49. torchzero/modules/ops/accumulate.py +51 -21
  50. torchzero/modules/ops/binary.py +36 -36
  51. torchzero/modules/ops/debug.py +7 -7
  52. torchzero/modules/ops/misc.py +128 -129
  53. torchzero/modules/ops/multi.py +19 -19
  54. torchzero/modules/ops/reduce.py +16 -16
  55. torchzero/modules/ops/split.py +26 -26
  56. torchzero/modules/ops/switch.py +4 -4
  57. torchzero/modules/ops/unary.py +20 -20
  58. torchzero/modules/ops/utility.py +37 -37
  59. torchzero/modules/optimizers/adagrad.py +33 -24
  60. torchzero/modules/optimizers/adam.py +31 -34
  61. torchzero/modules/optimizers/lion.py +4 -4
  62. torchzero/modules/optimizers/muon.py +6 -6
  63. torchzero/modules/optimizers/orthograd.py +4 -5
  64. torchzero/modules/optimizers/rmsprop.py +13 -16
  65. torchzero/modules/optimizers/rprop.py +52 -49
  66. torchzero/modules/optimizers/shampoo.py +17 -23
  67. torchzero/modules/optimizers/soap.py +12 -19
  68. torchzero/modules/optimizers/sophia_h.py +13 -13
  69. torchzero/modules/projections/dct.py +4 -4
  70. torchzero/modules/projections/fft.py +6 -6
  71. torchzero/modules/projections/galore.py +1 -1
  72. torchzero/modules/projections/projection.py +57 -57
  73. torchzero/modules/projections/structural.py +17 -17
  74. torchzero/modules/quasi_newton/__init__.py +33 -4
  75. torchzero/modules/quasi_newton/cg.py +67 -17
  76. torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +24 -24
  77. torchzero/modules/quasi_newton/lbfgs.py +12 -12
  78. torchzero/modules/quasi_newton/lsr1.py +11 -11
  79. torchzero/modules/quasi_newton/olbfgs.py +19 -19
  80. torchzero/modules/quasi_newton/quasi_newton.py +254 -47
  81. torchzero/modules/second_order/newton.py +32 -20
  82. torchzero/modules/second_order/newton_cg.py +13 -12
  83. torchzero/modules/second_order/nystrom.py +21 -21
  84. torchzero/modules/smoothing/gaussian.py +21 -21
  85. torchzero/modules/smoothing/laplacian.py +7 -9
  86. torchzero/modules/weight_decay/__init__.py +1 -1
  87. torchzero/modules/weight_decay/weight_decay.py +43 -9
  88. torchzero/modules/wrappers/optim_wrapper.py +11 -11
  89. torchzero/optim/wrappers/directsearch.py +244 -0
  90. torchzero/optim/wrappers/fcmaes.py +97 -0
  91. torchzero/optim/wrappers/mads.py +90 -0
  92. torchzero/optim/wrappers/nevergrad.py +4 -4
  93. torchzero/optim/wrappers/nlopt.py +28 -14
  94. torchzero/optim/wrappers/optuna.py +70 -0
  95. torchzero/optim/wrappers/scipy.py +162 -13
  96. torchzero/utils/__init__.py +2 -6
  97. torchzero/utils/derivatives.py +2 -1
  98. torchzero/utils/optimizer.py +55 -74
  99. torchzero/utils/python_tools.py +17 -4
  100. {torchzero-0.3.9.dist-info → torchzero-0.3.10.dist-info}/METADATA +14 -14
  101. torchzero-0.3.10.dist-info/RECORD +139 -0
  102. {torchzero-0.3.9.dist-info → torchzero-0.3.10.dist-info}/WHEEL +1 -1
  103. torchzero/core/preconditioner.py +0 -138
  104. torchzero/modules/experimental/algebraic_newton.py +0 -145
  105. torchzero/modules/experimental/tropical_newton.py +0 -136
  106. torchzero-0.3.9.dist-info/RECORD +0 -131
  107. {torchzero-0.3.9.dist-info → torchzero-0.3.10.dist-info}/licenses/LICENSE +0 -0
  108. {torchzero-0.3.9.dist-info → torchzero-0.3.10.dist-info}/top_level.txt +0 -0
torchzero/modules/experimental/newtonnewton.py (new file)
@@ -0,0 +1,88 @@
+import itertools
+import warnings
+from collections.abc import Callable
+from contextlib import nullcontext
+from functools import partial
+from typing import Literal
+
+import torch
+
+from ...core import Chainable, Module, apply_transform
+from ...utils import TensorList, vec_to_tensors
+from ...utils.derivatives import (
+    hessian_list_to_mat,
+    jacobian_wrt,
+)
+from ..second_order.newton import (
+    cholesky_solve,
+    eigh_solve,
+    least_squares_solve,
+    lu_solve,
+)
+
+
+class NewtonNewton(Module):
+    """
+    Method that I thought of and then it worked.
+
+    1. Calculate newton step by solving Hx=g
+
+    2. Calculate jacobian of x wrt parameters and call it H2
+
+    3. Solve H2 x2 = x for x2.
+
+    4. Optionally, repeat (if order is higher than 3.)
+
+    Memory is n^order. It tends to converge faster on convex functions, but can be unstable on non-convex. Orders higher than 3 are usually too unstable and have little benefit.
+    """
+    def __init__(
+        self,
+        reg: float = 1e-6,
+        order: int = 3,
+        search_negative: bool = False,
+        vectorize: bool = True,
+        eigval_tfm: Callable[[torch.Tensor], torch.Tensor] | None = None,
+    ):
+        defaults = dict(order=order, reg=reg, vectorize=vectorize, eigval_tfm=eigval_tfm, search_negative=search_negative)
+        super().__init__(defaults)
+
+    @torch.no_grad
+    def step(self, var):
+        params = TensorList(var.params)
+        closure = var.closure
+        if closure is None: raise RuntimeError('NewtonCG requires closure')
+
+        settings = self.settings[params[0]]
+        reg = settings['reg']
+        vectorize = settings['vectorize']
+        order = settings['order']
+        search_negative = settings['search_negative']
+        eigval_tfm = settings['eigval_tfm']
+
+        # ------------------------ calculate grad and hessian ------------------------ #
+        with torch.enable_grad():
+            loss = var.loss = var.loss_approx = closure(False)
+            g_list = torch.autograd.grad(loss, params, create_graph=True)
+            var.grad = list(g_list)
+
+            xp = torch.cat([t.ravel() for t in g_list])
+            I = torch.eye(xp.numel(), dtype=xp.dtype, device=xp.device)
+
+            for o in range(2, order + 1):
+                is_last = o == order
+                H_list = jacobian_wrt([xp], params, create_graph=not is_last, batched=vectorize)
+                with torch.no_grad() if is_last else nullcontext():
+                    H = hessian_list_to_mat(H_list)
+                    if reg != 0: H = H + I * reg
+
+                    x = None
+                    if search_negative or (is_last and eigval_tfm is not None):
+                        x = eigh_solve(H, xp, eigval_tfm, search_negative=search_negative)
+                    if x is None: x = cholesky_solve(H, xp)
+                    if x is None: x = lu_solve(H, xp)
+                    if x is None: x = least_squares_solve(H, xp)
+                    xp = x.squeeze()
+
+        var.update = vec_to_tensors(xp, params)
+        return var
+
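For readers skimming the new module above: the loop is simply "solve, differentiate the solution, solve again". The standalone sketch below reproduces that idea with plain torch on a toy function, without any torchzero machinery. The function and helper names are ours, not the package's, and the dense Jacobian keeps it strictly toy-scale.

import torch

def newton_newton_direction(f, x, order=3, reg=1e-6):
    # v starts as the gradient; each pass builds the Jacobian of v w.r.t. x
    # (the Hessian on the first pass), solves J d = v, then repeats on d.
    x = x.detach().clone().requires_grad_(True)
    v = torch.autograd.grad(f(x), x, create_graph=True)[0]
    I = torch.eye(x.numel(), dtype=x.dtype, device=x.device)
    for o in range(2, order + 1):
        is_last = o == order
        J = torch.stack([
            torch.autograd.grad(v[i], x, retain_graph=True, create_graph=not is_last)[0]
            for i in range(v.numel())
        ])
        v = torch.linalg.solve(J + reg * I, v)  # order 2 stops here: the damped Newton step
    return v.detach()

# on a convex quadratic the order-2 (plain Newton) direction lands on the minimum
A = torch.tensor([[3.0, 1.0], [1.0, 2.0]])
f = lambda x: 0.5 * x @ A @ x
x0 = torch.tensor([1.0, -2.0])
print(x0 - newton_newton_direction(f, x0, order=2))  # ≈ tensor([0., 0.])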
torchzero/modules/experimental/reduce_outward_lr.py
@@ -1,30 +1,33 @@
 import torch
 
 from ...core import Target, Transform
-from ...utils import TensorList
+from ...utils import TensorList, unpack_states, unpack_dicts
 
 class ReduceOutwardLR(Transform):
     """
     When update sign matches weight sign, the learning rate for that weight is multiplied by `mul`.
 
     This means updates that move weights towards zero have higher learning rates.
+
+    A note on this is that it sounded good but it's really bad in practice.
     """
     def __init__(self, mul = 0.5, use_grad=False, invert=False, target: Target = 'update'):
         defaults = dict(mul=mul, use_grad=use_grad, invert=invert)
         super().__init__(defaults, uses_grad=use_grad, target=target)
 
     @torch.no_grad
-    def transform(self, tensors, params, grads, vars):
+    def apply(self, tensors, params, grads, loss, states, settings):
         params = TensorList(params)
         tensors = TensorList(tensors)
 
-        mul = self.get_settings('mul', params=params)
-        s = self.settings[params[0]]
+        mul = [s['mul'] for s in settings]
+        s = settings[0]
         use_grad = s['use_grad']
         invert = s['invert']
 
-        if use_grad: cur = vars.get_grad()
+        if use_grad: cur = grads
         else: cur = tensors
+        assert cur is not None
 
         # mask of weights where sign matches with update sign (minus ascent sign), multiplied by `mul`.
         if invert: mask = (params * cur) > 0
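The signature change in this hunk, transform(self, tensors, params, grads, vars) → apply(self, tensors, params, grads, loss, states, settings), is the pattern repeated across most transforms in this release: per-parameter state and settings are now passed in as lists rather than looked up through self.settings or vars. Below is a minimal sketch of a custom transform written against that shape, inferred only from the hunks in this diff; the class, its behaviour, and the assumption that Transform is importable from torchzero.core (as the relative imports suggest) are our own illustration, and the exact return contract is not documented here.

import torch
from torchzero.core import Transform  # assumed public import path

class MulByConstant(Transform):
    """Toy transform using the 0.3.10-style `apply` signature seen above (illustrative only)."""
    def __init__(self, factor: float = 0.1):
        defaults = dict(factor=factor)
        super().__init__(defaults, uses_grad=False)

    @torch.no_grad
    def apply(self, tensors, params, grads, loss, states, settings):
        # `settings` (and `states`) line up one-to-one with `tensors`/`params`,
        # so per-parameter options are read from the matching dict, as SOAPY does below.
        return [t * s['factor'] for t, s in zip(tensors, settings)]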
torchzero/modules/experimental/soapy.py
@@ -2,147 +2,22 @@ from operator import itemgetter
 
 import torch
 
-from ...core import Chainable, Transform, apply
+from ...core import Chainable, Transform
 from ..optimizers.shampoo import _merge_small_dims, _unmerge_small_dims
-
-@torch.no_grad
-def update_soap_covariances_(
-    grad: torch.Tensor,
-    GGs_: list[torch.Tensor | None],
-    beta: float | None,
-):
-    for i, GG in enumerate(GGs_):
-        if GG is None: continue
-
-        axes = list(range(i)) + list(range(i + 1, grad.ndim)) # this works fine with 1d params
-        if beta is None: GG.add_(torch.tensordot(grad, grad, (axes, axes))) # pyright:ignore[reportArgumentType]
-        else: GG.lerp_(torch.tensordot(grad, grad, (axes, axes)), 1-beta) # pyright:ignore[reportArgumentType]
-
-@torch.no_grad
-def project(tensors: torch.Tensor, Q: list[torch.Tensor | None]):
-    """
-    Projects the gradient to the eigenbases of the preconditioner.
-    """
-    for mat in Q:
-        if mat is None: continue
-        if len(mat) > 0:
-            tensors = torch.tensordot(tensors, mat, dims=[[0], [0]]) # pyright:ignore[reportArgumentType]
-        else:
-            # I don't understand this part but it is in https://github.com/nikhilvyas/SOAP/blob/main/soap.py
-            permute_order = list(range(1, len(tensors.shape))) + [0]
-            tensors = tensors.permute(permute_order)
-
-    return tensors
-
-@torch.no_grad
-def project_back(tensors: torch.Tensor, Q: list[torch.Tensor| None]):
-    """
-    Projects the gradient back to the original space.
-    """
-    for mat in Q:
-        if mat is None: continue
-        if len(mat) > 0:
-            tensors = torch.tensordot(tensors, mat,dims=[[0], [1]]) # pyright:ignore[reportArgumentType]
-        else:
-            permute_order = list(range(1, len(tensors.shape))) + [0]
-            tensors = tensors.permute(permute_order)
-
-    return tensors
-
-# function from https://github.com/nikhilvyas/SOAP/blob/main/soap.py
-@torch.no_grad
-def get_orthogonal_matrix(mat: list[torch.Tensor | None]):
-    """
-    Computes the eigenbases of the preconditioner using torch.linalg.eigh decomposition.
-    """
-    matrix = []
-    float_data = False
-    original_type = original_device = None
-    for m in mat:
-        if m is None: continue
-        if len(m) == 0:
-            matrix.append([])
-            continue
-        if m.dtype != torch.float:
-            original_type = m.dtype
-            original_device = m.device
-            matrix.append(m.float())
-        else:
-            float_data = True
-            matrix.append(m)
-
-    final = []
-    for m in matrix:
-        if len(m) == 0:
-            final.append([])
-            continue
-        try:
-            _, Q = torch.linalg.eigh(m+1e-30*torch.eye(m.shape[0], device=m.device)) # pylint:disable=not-callable
-        except Exception:
-            _, Q = torch.linalg.eigh(m.to(torch.float64)+1e-30*torch.eye(m.shape[0], device=m.device)) # pylint:disable=not-callable
-            Q = Q.to(m.dtype)
-        Q = torch.flip(Q, [1])
-
-        if not float_data:
-            Q = Q.to(original_device).type(original_type)
-        final.append(Q)
-    return final
-
-# function from https://github.com/nikhilvyas/SOAP/blob/main/soap.py#L240
-@torch.no_grad
-def get_orthogonal_matrix_QR(exp_avg_sq: torch.Tensor, GG: list[torch.Tensor | None], Q_list: list[torch.Tensor | None]):
-    """
-    Computes the eigenbases of the preconditioner using one round of power iteration
-    followed by torch.linalg.qr decomposition.
-    """
-    matrix = []
-    orth_matrix = []
-    float_data = False
-    original_type = original_device = None
-    for m,o in zip(GG, Q_list):
-        if m is None: continue
-        assert o is not None
-
-        if len(m) == 0:
-            matrix.append([])
-            orth_matrix.append([])
-            continue
-        if m.data.dtype != torch.float:
-            original_type = m.data.dtype
-            original_device = m.data.device
-            matrix.append(m.data.float())
-            orth_matrix.append(o.data.float())
-        else:
-            float_data = True
-            matrix.append(m.data.float())
-            orth_matrix.append(o.data.float())
-
-    final = []
-    for ind, (m,o) in enumerate(zip(matrix, orth_matrix)):
-        if len(m)==0:
-            final.append([])
-            continue
-        est_eig = torch.diag(o.T @ m @ o)
-        sort_idx = torch.argsort(est_eig, descending=True)
-        exp_avg_sq = exp_avg_sq.index_select(ind, sort_idx)
-        o = o[:,sort_idx]
-        power_iter = m @ o
-        Q, _ = torch.linalg.qr(power_iter) # pylint:disable=not-callable
-
-        if not float_data:
-            Q = Q.to(original_device).type(original_type)
-        final.append(Q)
-
-    return final, exp_avg_sq
+from ..optimizers.soap import (
+    update_soap_covariances_,
+    get_orthogonal_matrix,
+    get_orthogonal_matrix_QR,
+    project,
+    project_back,
+)
 
 class SOAPY(Transform):
-    """SOAP but uses scaled gradient differences
-
-    new args
-
-    scale by s whether to scale gradient differences by parameter differences
+    """Adam but uses scaled gradient differences for GGᵀ. Please note that this is experimental and isn't guaranteed to work.
 
-    y_to_ema2 whether to use gradient differences for exponential moving average too
+    New args:
+        scale_by_s - whether to scale gradient differences by parameter differences
+        y_to_ema2 - whether to use gradient differences for exponential moving average too
     """
     def __init__(
         self,
@@ -178,16 +53,14 @@ class SOAPY(Transform):
         super().__init__(defaults, uses_grad=False)
 
     @torch.no_grad
-    def transform(self, tensors, params, grads, vars):
+    def apply(self, tensors, params, grads, loss, states, settings):
         updates = []
         # update preconditioners
-        for i,(p,t) in enumerate(zip(params, tensors)):
-            state = self.state[p]
-            settings = self.settings[p]
+        for i,(p,t, state, setting) in enumerate(zip(params, tensors, states, settings)):
             beta1, beta2, shampoo_beta, merge_small, max_dim, precondition_1d, eps, alpha = itemgetter(
-                'beta1', 'beta2', 'shampoo_beta', 'merge_small', 'max_dim', 'precondition_1d', 'eps', 'alpha')(settings)
-            scale_by_s = settings['scale_by_s']
-            y_to_ema2 = settings['y_to_ema2']
+                'beta1', 'beta2', 'shampoo_beta', 'merge_small', 'max_dim', 'precondition_1d', 'eps', 'alpha')(setting)
+            scale_by_s = setting['scale_by_s']
+            y_to_ema2 = setting['y_to_ema2']
 
             if merge_small:
                 t, state['flat_sizes'], state['sort_idxs'] = _merge_small_dims(t, max_dim)
@@ -268,7 +141,7 @@ class SOAPY(Transform):
             if z_projected is not None:
                 update = project_back(update, state["Q"])
 
-            if settings['bias_correction']:
+            if setting['bias_correction']:
                 bias_correction1 = 1.0 - beta1 ** (state["step"]+1)
                 bias_correction2 = 1.0 - beta2 ** (state["step"]+1)
                 update *= ((bias_correction2 ** .5) / bias_correction1) * alpha
@@ -284,7 +157,7 @@ class SOAPY(Transform):
             # Update is done after the gradient step to avoid using current gradients in the projection.
             if state['GG'] is not None:
                 update_soap_covariances_(y, state['GG'], shampoo_beta)
-                if state['step'] % settings['precond_freq'] == 0:
+                if state['step'] % setting['precond_freq'] == 0:
                     state['Q'], state['exp_avg_sq'] = get_orthogonal_matrix_QR(exp_avg_sq, state['GG'], state['Q'])
 
         return updates
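The roughly 130 helper lines deleted from soapy.py above are not gone: the new import block pulls update_soap_covariances_, get_orthogonal_matrix, get_orthogonal_matrix_QR, project and project_back from torchzero/modules/optimizers/soap.py instead. The round-trip property the projection helpers rely on is easy to sanity-check in isolation. The snippet below re-implements the two tensordot loops from the deleted block as stripped-down copies of our own (ignoring the None/empty-matrix branches) and checks that projecting into per-mode orthonormal bases and back recovers the original tensor; it is a sanity check, not the package's test.

import torch

def project(t, Q):
    # contract axis 0 with each basis' rows; the rotated axis cycles to the back
    for mat in Q:
        t = torch.tensordot(t, mat, dims=([0], [0]))
    return t

def project_back(t, Q):
    # contract axis 0 with each basis' columns, undoing the rotation above
    for mat in Q:
        t = torch.tensordot(t, mat, dims=([0], [1]))
    return t

g = torch.randn(4, 3)
Q = [torch.linalg.qr(torch.randn(n, n)).Q for n in g.shape]  # one orthonormal basis per mode
roundtrip = project_back(project(g, Q), Q)
assert torch.allclose(roundtrip, g, atol=1e-5)
print((roundtrip - g).abs().max())  # ~1e-7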