torchzero 0.1.7__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (200)
  1. docs/source/conf.py +57 -0
  2. tests/test_identical.py +230 -0
  3. tests/test_module.py +50 -0
  4. tests/test_opts.py +884 -0
  5. tests/test_tensorlist.py +1787 -0
  6. tests/test_utils_optimizer.py +170 -0
  7. tests/test_vars.py +184 -0
  8. torchzero/__init__.py +4 -4
  9. torchzero/core/__init__.py +3 -13
  10. torchzero/core/module.py +629 -494
  11. torchzero/core/preconditioner.py +137 -0
  12. torchzero/core/transform.py +252 -0
  13. torchzero/modules/__init__.py +13 -21
  14. torchzero/modules/clipping/__init__.py +3 -0
  15. torchzero/modules/clipping/clipping.py +320 -0
  16. torchzero/modules/clipping/ema_clipping.py +135 -0
  17. torchzero/modules/clipping/growth_clipping.py +187 -0
  18. torchzero/modules/experimental/__init__.py +13 -18
  19. torchzero/modules/experimental/absoap.py +350 -0
  20. torchzero/modules/experimental/adadam.py +111 -0
  21. torchzero/modules/experimental/adamY.py +135 -0
  22. torchzero/modules/experimental/adasoap.py +282 -0
  23. torchzero/modules/experimental/algebraic_newton.py +145 -0
  24. torchzero/modules/experimental/curveball.py +89 -0
  25. torchzero/modules/experimental/dsoap.py +290 -0
  26. torchzero/modules/experimental/gradmin.py +85 -0
  27. torchzero/modules/experimental/reduce_outward_lr.py +35 -0
  28. torchzero/modules/experimental/spectral.py +286 -0
  29. torchzero/modules/experimental/subspace_preconditioners.py +128 -0
  30. torchzero/modules/experimental/tropical_newton.py +136 -0
  31. torchzero/modules/functional.py +209 -0
  32. torchzero/modules/grad_approximation/__init__.py +4 -0
  33. torchzero/modules/grad_approximation/fdm.py +120 -0
  34. torchzero/modules/grad_approximation/forward_gradient.py +81 -0
  35. torchzero/modules/grad_approximation/grad_approximator.py +66 -0
  36. torchzero/modules/grad_approximation/rfdm.py +259 -0
  37. torchzero/modules/line_search/__init__.py +5 -30
  38. torchzero/modules/line_search/backtracking.py +186 -0
  39. torchzero/modules/line_search/line_search.py +181 -0
  40. torchzero/modules/line_search/scipy.py +37 -0
  41. torchzero/modules/line_search/strong_wolfe.py +260 -0
  42. torchzero/modules/line_search/trust_region.py +61 -0
  43. torchzero/modules/lr/__init__.py +2 -0
  44. torchzero/modules/lr/lr.py +59 -0
  45. torchzero/modules/lr/step_size.py +97 -0
  46. torchzero/modules/momentum/__init__.py +14 -4
  47. torchzero/modules/momentum/averaging.py +78 -0
  48. torchzero/modules/momentum/cautious.py +181 -0
  49. torchzero/modules/momentum/ema.py +173 -0
  50. torchzero/modules/momentum/experimental.py +189 -0
  51. torchzero/modules/momentum/matrix_momentum.py +124 -0
  52. torchzero/modules/momentum/momentum.py +43 -106
  53. torchzero/modules/ops/__init__.py +103 -0
  54. torchzero/modules/ops/accumulate.py +65 -0
  55. torchzero/modules/ops/binary.py +240 -0
  56. torchzero/modules/ops/debug.py +25 -0
  57. torchzero/modules/ops/misc.py +419 -0
  58. torchzero/modules/ops/multi.py +137 -0
  59. torchzero/modules/ops/reduce.py +149 -0
  60. torchzero/modules/ops/split.py +75 -0
  61. torchzero/modules/ops/switch.py +68 -0
  62. torchzero/modules/ops/unary.py +115 -0
  63. torchzero/modules/ops/utility.py +112 -0
  64. torchzero/modules/optimizers/__init__.py +18 -10
  65. torchzero/modules/optimizers/adagrad.py +146 -49
  66. torchzero/modules/optimizers/adam.py +112 -118
  67. torchzero/modules/optimizers/lion.py +18 -11
  68. torchzero/modules/optimizers/muon.py +222 -0
  69. torchzero/modules/optimizers/orthograd.py +55 -0
  70. torchzero/modules/optimizers/rmsprop.py +103 -51
  71. torchzero/modules/optimizers/rprop.py +342 -99
  72. torchzero/modules/optimizers/shampoo.py +197 -0
  73. torchzero/modules/optimizers/soap.py +286 -0
  74. torchzero/modules/optimizers/sophia_h.py +129 -0
  75. torchzero/modules/projections/__init__.py +5 -0
  76. torchzero/modules/projections/dct.py +73 -0
  77. torchzero/modules/projections/fft.py +73 -0
  78. torchzero/modules/projections/galore.py +10 -0
  79. torchzero/modules/projections/projection.py +218 -0
  80. torchzero/modules/projections/structural.py +151 -0
  81. torchzero/modules/quasi_newton/__init__.py +7 -4
  82. torchzero/modules/quasi_newton/cg.py +218 -0
  83. torchzero/modules/quasi_newton/experimental/__init__.py +1 -0
  84. torchzero/modules/quasi_newton/experimental/modular_lbfgs.py +265 -0
  85. torchzero/modules/quasi_newton/lbfgs.py +228 -0
  86. torchzero/modules/quasi_newton/lsr1.py +170 -0
  87. torchzero/modules/quasi_newton/olbfgs.py +196 -0
  88. torchzero/modules/quasi_newton/quasi_newton.py +475 -0
  89. torchzero/modules/second_order/__init__.py +3 -4
  90. torchzero/modules/second_order/newton.py +142 -165
  91. torchzero/modules/second_order/newton_cg.py +84 -0
  92. torchzero/modules/second_order/nystrom.py +168 -0
  93. torchzero/modules/smoothing/__init__.py +2 -5
  94. torchzero/modules/smoothing/gaussian.py +164 -0
  95. torchzero/modules/smoothing/{laplacian_smoothing.py → laplacian.py} +115 -128
  96. torchzero/modules/weight_decay/__init__.py +1 -0
  97. torchzero/modules/weight_decay/weight_decay.py +52 -0
  98. torchzero/modules/wrappers/__init__.py +1 -0
  99. torchzero/modules/wrappers/optim_wrapper.py +91 -0
  100. torchzero/optim/__init__.py +2 -10
  101. torchzero/optim/utility/__init__.py +1 -0
  102. torchzero/optim/utility/split.py +45 -0
  103. torchzero/optim/wrappers/nevergrad.py +2 -28
  104. torchzero/optim/wrappers/nlopt.py +31 -16
  105. torchzero/optim/wrappers/scipy.py +79 -156
  106. torchzero/utils/__init__.py +27 -0
  107. torchzero/utils/compile.py +175 -37
  108. torchzero/utils/derivatives.py +513 -99
  109. torchzero/utils/linalg/__init__.py +5 -0
  110. torchzero/utils/linalg/matrix_funcs.py +87 -0
  111. torchzero/utils/linalg/orthogonalize.py +11 -0
  112. torchzero/utils/linalg/qr.py +71 -0
  113. torchzero/utils/linalg/solve.py +168 -0
  114. torchzero/utils/linalg/svd.py +20 -0
  115. torchzero/utils/numberlist.py +132 -0
  116. torchzero/utils/ops.py +10 -0
  117. torchzero/utils/optimizer.py +284 -0
  118. torchzero/utils/optuna_tools.py +40 -0
  119. torchzero/utils/params.py +149 -0
  120. torchzero/utils/python_tools.py +40 -25
  121. torchzero/utils/tensorlist.py +1081 -0
  122. torchzero/utils/torch_tools.py +48 -12
  123. torchzero-0.3.1.dist-info/METADATA +379 -0
  124. torchzero-0.3.1.dist-info/RECORD +128 -0
  125. {torchzero-0.1.7.dist-info → torchzero-0.3.1.dist-info}/WHEEL +1 -1
  126. {torchzero-0.1.7.dist-info → torchzero-0.3.1.dist-info/licenses}/LICENSE +0 -0
  127. torchzero-0.3.1.dist-info/top_level.txt +3 -0
  128. torchzero/core/tensorlist_optimizer.py +0 -219
  129. torchzero/modules/adaptive/__init__.py +0 -4
  130. torchzero/modules/adaptive/adaptive.py +0 -192
  131. torchzero/modules/experimental/experimental.py +0 -294
  132. torchzero/modules/experimental/quad_interp.py +0 -104
  133. torchzero/modules/experimental/subspace.py +0 -259
  134. torchzero/modules/gradient_approximation/__init__.py +0 -7
  135. torchzero/modules/gradient_approximation/_fd_formulas.py +0 -3
  136. torchzero/modules/gradient_approximation/base_approximator.py +0 -105
  137. torchzero/modules/gradient_approximation/fdm.py +0 -125
  138. torchzero/modules/gradient_approximation/forward_gradient.py +0 -163
  139. torchzero/modules/gradient_approximation/newton_fdm.py +0 -198
  140. torchzero/modules/gradient_approximation/rfdm.py +0 -125
  141. torchzero/modules/line_search/armijo.py +0 -56
  142. torchzero/modules/line_search/base_ls.py +0 -139
  143. torchzero/modules/line_search/directional_newton.py +0 -217
  144. torchzero/modules/line_search/grid_ls.py +0 -158
  145. torchzero/modules/line_search/scipy_minimize_scalar.py +0 -62
  146. torchzero/modules/meta/__init__.py +0 -12
  147. torchzero/modules/meta/alternate.py +0 -65
  148. torchzero/modules/meta/grafting.py +0 -195
  149. torchzero/modules/meta/optimizer_wrapper.py +0 -173
  150. torchzero/modules/meta/return_overrides.py +0 -46
  151. torchzero/modules/misc/__init__.py +0 -10
  152. torchzero/modules/misc/accumulate.py +0 -43
  153. torchzero/modules/misc/basic.py +0 -115
  154. torchzero/modules/misc/lr.py +0 -96
  155. torchzero/modules/misc/multistep.py +0 -51
  156. torchzero/modules/misc/on_increase.py +0 -53
  157. torchzero/modules/operations/__init__.py +0 -29
  158. torchzero/modules/operations/multi.py +0 -298
  159. torchzero/modules/operations/reduction.py +0 -134
  160. torchzero/modules/operations/singular.py +0 -113
  161. torchzero/modules/optimizers/sgd.py +0 -54
  162. torchzero/modules/orthogonalization/__init__.py +0 -2
  163. torchzero/modules/orthogonalization/newtonschulz.py +0 -159
  164. torchzero/modules/orthogonalization/svd.py +0 -86
  165. torchzero/modules/regularization/__init__.py +0 -22
  166. torchzero/modules/regularization/dropout.py +0 -34
  167. torchzero/modules/regularization/noise.py +0 -77
  168. torchzero/modules/regularization/normalization.py +0 -328
  169. torchzero/modules/regularization/ortho_grad.py +0 -78
  170. torchzero/modules/regularization/weight_decay.py +0 -92
  171. torchzero/modules/scheduling/__init__.py +0 -2
  172. torchzero/modules/scheduling/lr_schedulers.py +0 -131
  173. torchzero/modules/scheduling/step_size.py +0 -80
  174. torchzero/modules/smoothing/gaussian_smoothing.py +0 -90
  175. torchzero/modules/weight_averaging/__init__.py +0 -2
  176. torchzero/modules/weight_averaging/ema.py +0 -72
  177. torchzero/modules/weight_averaging/swa.py +0 -171
  178. torchzero/optim/experimental/__init__.py +0 -20
  179. torchzero/optim/experimental/experimental.py +0 -343
  180. torchzero/optim/experimental/ray_search.py +0 -83
  181. torchzero/optim/first_order/__init__.py +0 -18
  182. torchzero/optim/first_order/cautious.py +0 -158
  183. torchzero/optim/first_order/forward_gradient.py +0 -70
  184. torchzero/optim/first_order/optimizers.py +0 -570
  185. torchzero/optim/modular.py +0 -132
  186. torchzero/optim/quasi_newton/__init__.py +0 -1
  187. torchzero/optim/quasi_newton/directional_newton.py +0 -58
  188. torchzero/optim/second_order/__init__.py +0 -1
  189. torchzero/optim/second_order/newton.py +0 -94
  190. torchzero/optim/zeroth_order/__init__.py +0 -4
  191. torchzero/optim/zeroth_order/fdm.py +0 -87
  192. torchzero/optim/zeroth_order/newton_fdm.py +0 -146
  193. torchzero/optim/zeroth_order/rfdm.py +0 -217
  194. torchzero/optim/zeroth_order/rs.py +0 -85
  195. torchzero/random/__init__.py +0 -1
  196. torchzero/random/random.py +0 -46
  197. torchzero/tensorlist.py +0 -826
  198. torchzero-0.1.7.dist-info/METADATA +0 -120
  199. torchzero-0.1.7.dist-info/RECORD +0 -104
  200. torchzero-0.1.7.dist-info/top_level.txt +0 -1
torchzero/modules/quasi_newton/olbfgs.py (new file)
@@ -0,0 +1,196 @@
+ from collections import deque
+ from functools import partial
+ from operator import itemgetter
+ from typing import Literal
+
+ import torch
+
+ from ...core import Chainable, Module, Transform, Vars, apply
+ from ...utils import NumberList, TensorList, as_tensorlist
+ from .lbfgs import _adaptive_damping, lbfgs
+
+
+ @torch.no_grad
+ def _store_sk_yk_after_step_hook(optimizer, vars: Vars, prev_params: TensorList, prev_grad: TensorList, damping, init_damping, eigval_bounds, s_history: deque[TensorList], y_history: deque[TensorList], sy_history: deque[torch.Tensor]):
+     assert vars.closure is not None
+     with torch.enable_grad(): vars.closure()
+     grad = [p.grad if p.grad is not None else torch.zeros_like(p) for p in vars.params]
+     s_k = vars.params - prev_params
+     y_k = grad - prev_grad
+     ys_k = s_k.dot(y_k)
+
+     if damping:
+         s_k, y_k, ys_k = _adaptive_damping(s_k, y_k, ys_k, init_damping=init_damping, eigval_bounds=eigval_bounds)
+
+     if ys_k > 1e-10:
+         s_history.append(s_k)
+         y_history.append(y_k)
+         sy_history.append(ys_k)
+
+
+
+ class OnlineLBFGS(Module):
+     """Online L-BFGS.
+     Parameter and gradient differences are sampled from the same mini-batch by performing an extra forward and backward pass.
+     However I did a bunch of experiments and the online part doesn't seem to help. Normal L-BFGS is usually still
+     better because it performs twice as many steps, and it is reasonably stable with normalization or grafting.
+
+     Args:
+         history_size (int, optional): number of past parameter differences and gradient differences to store. Defaults to 10.
+         sample_grads (str, optional):
+             - "before" - samples current mini-batch gradient at previous and current parameters, calculates y_k
+               and adds it to history before stepping.
+             - "after" - samples current mini-batch gradient at parameters before stepping and after updating parameters.
+               s_k and y_k are added after parameter update, therefore they are delayed by 1 step.
+
+             In practice both modes behave very similarly. Defaults to 'before'.
+         tol (float | None, optional):
+             tolerance for minimal gradient difference to avoid instability after converging to minima. Defaults to 1e-10.
+         damping (bool, optional):
+             whether to use adaptive damping. Learning rate might need to be lowered with this enabled. Defaults to False.
+         init_damping (float, optional):
+             initial damping for adaptive dampening. Defaults to 0.9.
+         eigval_bounds (tuple, optional):
+             eigenvalue bounds for adaptive dampening. Defaults to (0.5, 50).
+         params_beta (float | None, optional):
+             if not None, EMA of parameters is used for preconditioner update. Defaults to None.
+         grads_beta (float | None, optional):
+             if not None, EMA of gradients is used for preconditioner update. Defaults to None.
+         update_freq (int, optional):
+             how often to update L-BFGS history. Defaults to 1.
+         z_beta (float | None, optional):
+             optional EMA for initial H^-1 @ q. Acts as a kind of momentum but is prone to get stuck. Defaults to None.
+         inner (Chainable | None, optional):
+             optional inner modules applied after updating L-BFGS history and before preconditioning. Defaults to None.
+     """
+     def __init__(
+         self,
+         history_size=10,
+         sample_grads: Literal['before', 'after'] = 'before',
+         tol: float | None = 1e-10,
+         damping: bool = False,
+         init_damping=0.9,
+         eigval_bounds=(0.5, 50),
+         z_beta: float | None = None,
+         inner: Chainable | None = None,
+     ):
+         defaults = dict(history_size=history_size, tol=tol, damping=damping, init_damping=init_damping, eigval_bounds=eigval_bounds, sample_grads=sample_grads, z_beta=z_beta)
+         super().__init__(defaults)
+
+         self.global_state['s_history'] = deque(maxlen=history_size)
+         self.global_state['y_history'] = deque(maxlen=history_size)
+         self.global_state['sy_history'] = deque(maxlen=history_size)
+
+         if inner is not None:
+             self.set_child('inner', inner)
+
+     def reset(self):
+         """Resets the internal state of the L-SR1 module."""
+         # super().reset() # Clears self.state (per-parameter) if any, and "step"
+         # Re-initialize L-SR1 specific global state
+         self.state.clear()
+         self.global_state['step'] = 0
+         self.global_state['s_history'].clear()
+         self.global_state['y_history'].clear()
+         self.global_state['sy_history'].clear()
+
+     @torch.no_grad
+     def step(self, vars):
+         assert vars.closure is not None
+
+         params = as_tensorlist(vars.params)
+         update = as_tensorlist(vars.get_update())
+         step = self.global_state.get('step', 0)
+         self.global_state['step'] = step + 1
+
+         # history of s and k
+         s_history: deque[TensorList] = self.global_state['s_history']
+         y_history: deque[TensorList] = self.global_state['y_history']
+         sy_history: deque[torch.Tensor] = self.global_state['sy_history']
+
+         tol, damping, init_damping, eigval_bounds, sample_grads, z_beta = itemgetter(
+             'tol', 'damping', 'init_damping', 'eigval_bounds', 'sample_grads', 'z_beta')(self.settings[params[0]])
+
+         # sample gradient at previous params with current mini-batch
+         if sample_grads == 'before':
+             prev_params = self.get_state('prev_params', params=params, cls=TensorList)
+             if step == 0:
+                 s_k = None; y_k = None; ys_k = None
+             else:
+                 s_k = params - prev_params
+
+                 current_params = params.clone()
+                 params.set_(prev_params)
+                 with torch.enable_grad(): vars.closure()
+                 y_k = update - params.grad
+                 ys_k = s_k.dot(y_k)
+                 params.set_(current_params)
+
+                 if damping:
+                     s_k, y_k, ys_k = _adaptive_damping(s_k, y_k, ys_k, init_damping=init_damping, eigval_bounds=eigval_bounds)
+
+                 if ys_k > 1e-10:
+                     s_history.append(s_k)
+                     y_history.append(y_k)
+                     sy_history.append(ys_k)
+
+             prev_params.copy_(params)
+
+         # use previous s_k, y_k pair, samples gradient at current batch before and after updating parameters
+         elif sample_grads == 'after':
+             if len(s_history) == 0:
+                 s_k = None; y_k = None; ys_k = None
+             else:
+                 s_k = s_history[-1]
+                 y_k = y_history[-1]
+                 ys_k = s_k.dot(y_k)
+
+             # this will run after params are updated by Modular after running all future modules
+             vars.post_step_hooks.append(
+                 partial(
+                     _store_sk_yk_after_step_hook,
+                     prev_params=params.clone(),
+                     prev_grad=update.clone(),
+                     damping=damping,
+                     init_damping=init_damping,
+                     eigval_bounds=eigval_bounds,
+                     s_history=s_history,
+                     y_history=y_history,
+                     sy_history=sy_history,
+                 ))
+
+         else:
+             raise ValueError(sample_grads)
+
+         # step with inner module before applying preconditioner
+         if self.children:
+             update = TensorList(apply(self.children['inner'], tensors=update, params=params, grads=vars.grad, vars=vars))
+
+         # tolerance on gradient difference to avoid exploding after converging
+         if tol is not None:
+             if y_k is not None and y_k.abs().global_max() <= tol:
+                 vars.update = update # may have been updated by inner module, probably makes sense to use it here?
+                 return vars
+
+         # lerp initial H^-1 @ q guess
+         z_ema = None
+         if z_beta is not None:
+             z_ema = self.get_state('z_ema', params=vars.params, cls=TensorList)
+
+         # precondition
+         dir = lbfgs(
+             tensors_=as_tensorlist(update),
+             s_history=s_history,
+             y_history=y_history,
+             sy_history=sy_history,
+             y_k=y_k,
+             ys_k=ys_k,
+             z_beta = z_beta,
+             z_ema = z_ema,
+             step=step
+         )
+
+         vars.update = dir
+
+         return vars
+
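The preconditioning itself is delegated to the `lbfgs` helper imported from `.lbfgs`, which this diff does not include. For orientation, below is a minimal sketch of the standard two-loop recursion such a helper typically implements; it operates on flat tensors rather than torchzero's `TensorList` and ignores the `z_beta`/`z_ema` smoothing, so it is an assumption about the helper's behaviour, not a copy of it.

import torch

def lbfgs_two_loop(g, s_history, y_history, sy_history):
    # Approximate H^{-1} @ g from stored (s, y) pairs with the two-loop recursion.
    q = g.clone()
    alphas = []
    # first loop: newest pair to oldest
    for s, y, sy in zip(reversed(s_history), reversed(y_history), reversed(sy_history)):
        alpha = torch.dot(s, q) / sy
        q -= alpha * y
        alphas.append(alpha)
    # initial Hessian guess H0 = gamma * I with gamma = s^T y / y^T y
    if s_history:
        gamma = sy_history[-1] / torch.dot(y_history[-1], y_history[-1])
        z = gamma * q
    else:
        z = q
    # second loop: oldest pair to newest
    for (s, y, sy), alpha in zip(zip(s_history, y_history, sy_history), reversed(alphas)):
        beta = torch.dot(y, z) / sy
        z += s * (alpha - beta)
    return z

# smoke test on a quadratic f(x) = 0.5 * x^T A x, mirroring the ys_k > 1e-10 guard above
A = torch.diag(torch.tensor([1.0, 10.0]))
x = torch.tensor([1.0, 1.0])
s_hist, y_hist, sy_hist = [], [], []
for _ in range(10):
    g = A @ x
    x_new = x - 0.5 * lbfgs_two_loop(g, s_hist, y_hist, sy_hist)
    s, y = x_new - x, A @ x_new - g
    if torch.dot(s, y) > 1e-10:
        s_hist.append(s); y_hist.append(y); sy_hist.append(torch.dot(s, y))
    x = x_new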
torchzero/modules/quasi_newton/quasi_newton.py (new file)
@@ -0,0 +1,475 @@
+ """Use BFGS or maybe SR1."""
+ from typing import Any, Literal
+ from abc import ABC, abstractmethod
+ from collections.abc import Mapping
+ import torch
+
+ from ...core import Chainable, Module, Preconditioner, TensorwisePreconditioner
+ from ...utils import TensorList, set_storage_
+
+ def _safe_dict_update_(d1_:dict, d2:dict):
+     inter = set(d1_.keys()).intersection(d2.keys())
+     if len(inter) > 0: raise RuntimeError(f"Duplicate keys {inter}")
+     d1_.update(d2)
+
+ def _maybe_lerp_(state, key, value: torch.Tensor, beta: float | None):
+     if (beta is None) or (beta == 0) or (key not in state): state[key] = value
+     elif state[key].shape != value.shape: state[key] = value
+     else: state[key].lerp_(value, 1-beta)
+
+ class HessianUpdateStrategy(TensorwisePreconditioner, ABC):
+     def __init__(
+         self,
+         defaults: dict | None = None,
+         init_scale: float | Literal["auto"] = "auto",
+         tol: float = 1e-10,
+         tol_reset: bool = True,
+         reset_interval: int | None = None,
+         beta: float | None = None,
+         update_freq: int = 1,
+         scale_first: bool = True,
+         scale_second: bool = False,
+         concat_params: bool = True,
+         inverse: bool = True,
+         inner: Chainable | None = None,
+     ):
+         if defaults is None: defaults = {}
+         _safe_dict_update_(defaults, dict(init_scale=init_scale, tol=tol, tol_reset=tol_reset, scale_second=scale_second, inverse=inverse, beta=beta, reset_interval=reset_interval))
+         super().__init__(defaults, uses_grad=False, concat_params=concat_params, update_freq=update_freq, scale_first=scale_first, inner=inner)
+
+     def _get_init_scale(self,s:torch.Tensor,y:torch.Tensor) -> torch.Tensor | float:
+         """returns multiplier to H or B"""
+         ys = y.dot(s)
+         yy = y.dot(y)
+         if ys != 0 and yy != 0: return yy/ys
+         return 1
+
+     def _reset_M_(self, M: torch.Tensor, s:torch.Tensor,y:torch.Tensor,inverse:bool, init_scale: Any):
+         set_storage_(M, torch.eye(M.size(-1), device=M.device, dtype=M.dtype))
+         if init_scale == 'auto': init_scale = self._get_init_scale(s,y)
+         if init_scale >= 1:
+             if inverse: M /= init_scale
+             else: M *= init_scale
+
+     def update_H(self, H:torch.Tensor, s:torch.Tensor, y:torch.Tensor, p:torch.Tensor, g:torch.Tensor,
+                  p_prev:torch.Tensor, g_prev:torch.Tensor, state: dict[str, Any], settings: Mapping[str, Any]) -> torch.Tensor:
+         """update hessian inverse"""
+         raise NotImplementedError
+
+     def update_B(self, B:torch.Tensor, s:torch.Tensor, y:torch.Tensor, p:torch.Tensor, g:torch.Tensor,
+                  p_prev:torch.Tensor, g_prev:torch.Tensor, state: dict[str, Any], settings: Mapping[str, Any]) -> torch.Tensor:
+         """update hessian"""
+         raise NotImplementedError
+
+     @torch.no_grad
+     def update_tensor(self, tensor, param, grad, state, settings):
+         p = param.view(-1); g = tensor.view(-1)
+         inverse = settings['inverse']
+         M_key = 'H' if inverse else 'B'
+         M = state.get(M_key, None)
+         step = state.get('step', 0)
+         init_scale = settings['init_scale']
+         tol = settings['tol']
+         tol_reset = settings['tol_reset']
+         reset_interval = settings['reset_interval']
+
+         if M is None:
+             M = torch.eye(p.size(0), device=p.device, dtype=p.dtype)
+             if isinstance(init_scale, (int, float)) and init_scale != 1:
+                 if inverse: M /= init_scale
+                 else: M *= init_scale
+
+             state[M_key] = M
+             state['p_prev'] = p.clone()
+             state['g_prev'] = g.clone()
+             return
+
+         p_prev = state['p_prev']
+         g_prev = state['g_prev']
+         s: torch.Tensor = p - p_prev
+         y: torch.Tensor = g - g_prev
+         state['p_prev'].copy_(p)
+         state['g_prev'].copy_(g)
+
+
+         if reset_interval is not None and step % reset_interval == 0:
+             self._reset_M_(M, s, y, inverse, init_scale)
+             return
+
+         # tolerance on gradient difference to avoid exploding after converging
+         if y.abs().max() <= tol:
+             # reset history
+             if tol_reset: self._reset_M_(M, s, y, inverse, init_scale)
+             return
+
+         if step == 1 and init_scale == 'auto':
+             if inverse: M /= self._get_init_scale(s,y)
+             else: M *= self._get_init_scale(s,y)
+
+         beta = settings['beta']
+         if beta is not None and beta != 0: M = M.clone() # because all of them update it in-place
+
+         if inverse:
+             H_new = self.update_H(H=M, s=s, y=y, p=p, g=g, p_prev=p_prev, g_prev=g_prev, state=state, settings=settings)
+             _maybe_lerp_(state, 'H', H_new, beta)
+
+         else:
+             B_new = self.update_B(B=M, s=s, y=y, p=p, g=g, p_prev=p_prev, g_prev=g_prev, state=state, settings=settings)
+             _maybe_lerp_(state, 'B', B_new, beta)
+
+     @torch.no_grad
+     def apply_tensor(self, tensor, param, grad, state, settings):
+         step = state['step'] = state.get('step', 0) + 1
+
+         if settings['scale_second'] and step == 2:
+             s = max(1, tensor.abs().sum()) # pyright:ignore[reportArgumentType]
+             if s < settings['tol']: tensor = tensor/s
+
+         inverse = settings['inverse']
+         if inverse:
+             H = state['H']
+             return (H @ tensor.view(-1)).view_as(tensor)
+
+         B = state['B']
+
+         return torch.linalg.solve_ex(B, tensor.view(-1))[0].view_as(tensor) # pylint:disable=not-callable
+
+ # to avoid typing all arguments for each method
+ class QuasiNewtonH(HessianUpdateStrategy):
+     def __init__(
+         self,
+         init_scale: float | Literal["auto"] = "auto",
+         tol: float = 1e-10,
+         tol_reset: bool = True,
+         reset_interval: int | None = None,
+         beta: float | None = None,
+         update_freq: int = 1,
+         scale_first: bool = True,
+         scale_second: bool = False,
+         concat_params: bool = True,
+         inner: Chainable | None = None,
+     ):
+         super().__init__(
+             defaults=None,
+             init_scale=init_scale,
+             tol=tol,
+             tol_reset=tol_reset,
+             reset_interval=reset_interval,
+             beta=beta,
+             update_freq=update_freq,
+             scale_first=scale_first,
+             scale_second=scale_second,
+             concat_params=concat_params,
+             inverse=True,
+             inner=inner,
+         )
+ # ----------------------------------- BFGS ----------------------------------- #
+ def bfgs_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
+     sy = torch.dot(s, y)
+     if sy <= tol: return H # don't reset H in this case
+     num1 = (sy + (y @ H @ y)) * s.outer(s)
+     term1 = num1.div_(sy**2)
+     num2 = (torch.outer(H @ y, s).add_(torch.outer(s, y) @ H))
+     term2 = num2.div_(sy)
+     H += term1.sub_(term2)
+     return H
+
+ class BFGS(QuasiNewtonH):
+     def update_H(self, H, s, y, p, g, p_prev, g_prev, state, settings):
+         return bfgs_H_(H=H, s=s, y=y, tol=settings['tol'])
+
+ # ------------------------------------ SR1 ----------------------------------- #
+ def sr1_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol:float):
+     z = s - H@y
+     denom = torch.dot(z, y)
+
+     z_norm = torch.linalg.norm(z) # pylint:disable=not-callable
+     y_norm = torch.linalg.norm(y) # pylint:disable=not-callable
+
+     if y_norm*z_norm < tol: return H
+
+     # check as in Nocedal, Wright. “Numerical optimization” 2nd p.146
+     if denom.abs() <= tol * y_norm * z_norm: return H # pylint:disable=not-callable
+     H += torch.outer(z, z).div_(denom)
+     return H
+
+ class SR1(QuasiNewtonH):
+     def update_H(self, H, s, y, p, g, p_prev, g_prev, state, settings):
+         return sr1_H_(H=H, s=s, y=y, tol=settings['tol'])
+
+ # BFGS has defaults - init_scale = "auto" and scale_second = False
+ # SR1 has defaults - init_scale = 1 and scale_second = True
+ # basically some methods work better with first and some with second.
+ # I inherit from BFGS or SR1 to avoid writing all those arguments again
+ # ------------------------------------ DFP ----------------------------------- #
+ def dfp_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
+     sy = torch.dot(s, y)
+     if sy.abs() <= tol: return H
+     term1 = torch.outer(s, s).div_(sy)
+     denom = torch.dot(y, H @ y) #
+     if denom.abs() <= tol: return H
+     num = H @ torch.outer(y, y) @ H
+     term2 = num.div_(denom)
+     H += term1.sub_(term2)
+     return H
+
+ class DFP(QuasiNewtonH):
+     def update_H(self, H, s, y, p, g, p_prev, g_prev, state, settings):
+         return dfp_H_(H=H, s=s, y=y, tol=settings['tol'])
+
+
+ # formulas for methods below from Spedicato, E., & Huang, Z. (1997). Numerical experience with newton-like methods for nonlinear algebraic systems. Computing, 58(1), 69–89. doi:10.1007/bf02684472
+ # H' = H - (Hy - S)c^T / c^T*y
+ # the difference is how `c` is calculated
+
+ def broyden_good_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
+     c = H.T @ s
+     denom = c.dot(y)
+     if denom.abs() <= tol: return H
+     num = (H@y).sub_(s).outer(c)
+     H -= num/denom
+     return H
+
+ def broyden_bad_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
+     c = y
+     denom = c.dot(y)
+     if denom.abs() <= tol: return H
+     num = (H@y).sub_(s).outer(c)
+     H -= num/denom
+     return H
+
+ def greenstadt1_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, g_prev: torch.Tensor, tol: float):
+     c = g_prev
+     denom = c.dot(y)
+     if denom.abs() <= tol: return H
+     num = (H@y).sub_(s).outer(c)
+     H -= num/denom
+     return H
+
+ def greenstadt2_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
+     c = torch.linalg.multi_dot([H,H,y]) # pylint:disable=not-callable
+     denom = c.dot(y)
+     if denom.abs() <= tol: return H
+     num = (H@y).sub_(s).outer(c)
+     H -= num/denom
+     return H
+
+ class BroydenGood(QuasiNewtonH):
+     def update_H(self, H, s, y, p, g, p_prev, g_prev, state, settings):
+         return broyden_good_H_(H=H, s=s, y=y, tol=settings['tol'])
+
+ class BroydenBad(QuasiNewtonH):
+     def update_H(self, H, s, y, p, g, p_prev, g_prev, state, settings):
+         return broyden_bad_H_(H=H, s=s, y=y, tol=settings['tol'])
+
+ class Greenstadt1(QuasiNewtonH):
+     def update_H(self, H, s, y, p, g, p_prev, g_prev, state, settings):
+         return greenstadt1_H_(H=H, s=s, y=y, g_prev=g_prev, tol=settings['tol'])
+
+ class Greenstadt2(QuasiNewtonH):
+     def update_H(self, H, s, y, p, g, p_prev, g_prev, state, settings):
+         return greenstadt2_H_(H=H, s=s, y=y, tol=settings['tol'])
+
+
+ def column_updating_H_(H:torch.Tensor, s:torch.Tensor, y:torch.Tensor, tol:float):
+     n = H.shape[0]
+
+     j = y.abs().argmax()
+     u = torch.zeros(n, device=H.device, dtype=H.dtype)
+     u[j] = 1.0
+
+     denom = y[j]
+     if denom.abs() < tol: return H
+
+     Hy = H @ y.unsqueeze(1)
+     num = s.unsqueeze(1) - Hy
+
+     H[:, j] += num.squeeze() / denom
+     return H
+
+ class ColumnUpdatingMethod(QuasiNewtonH):
+     """Lopes, V. L., & Martínez, J. M. (1995). Convergence properties of the inverse column-updating method. Optimization Methods & Software, 6(2), 127–144. from https://www.ime.unicamp.br/sites/default/files/pesquisa/relatorios/rp-1993-76.pdf"""
+     def update_H(self, H, s, y, p, g, p_prev, g_prev, state, settings):
+         return column_updating_H_(H=H, s=s, y=y, tol=settings['tol'])
+
+ def thomas_H_(H: torch.Tensor, R:torch.Tensor, s: torch.Tensor, y: torch.Tensor, tol:float):
+     s_norm = torch.linalg.vector_norm(s) # pylint:disable=not-callable
+     I = torch.eye(H.size(-1), device=H.device, dtype=H.dtype)
+     d = (R + I * (s_norm/2)) @ s
+     denom = d.dot(s)
+     if denom.abs() <= tol: return H, R
+     R = (1 + s_norm) * ((I*s_norm).add_(R).sub_(d.outer(d).div_(denom)))
+
+     c = H.T @ d
+     denom = c.dot(y)
+     if denom.abs() <= tol: return H, R
+     num = (H@y).sub_(s).outer(c)
+     H -= num/denom
+     return H, R
+
+ class ThomasOptimalMethod(QuasiNewtonH):
+     """Thomas, Stephen Walter. Sequential estimation techniques for quasi-Newton algorithms. Cornell University, 1975."""
+     def update_H(self, H, s, y, p, g, p_prev, g_prev, state, settings):
+         if 'R' not in state: state['R'] = torch.eye(H.size(-1), device=H.device, dtype=H.dtype)
+         H, state['R'] = thomas_H_(H=H, R=state['R'], s=s, y=y, tol=settings['tol'])
+         return H
+
+ # ------------------------ powell's symmetric broyden ------------------------ #
+ def psb_B_(B: torch.Tensor, s: torch.Tensor, y: torch.Tensor, tol:float):
+     y_Bs = y - B@s
+     ss = s.dot(s)
+     if ss.abs() < tol: return B
+     num1 = y_Bs.outer(s).add_(s.outer(y_Bs))
+     term1 = num1.div_(ss)
+     term2 = s.outer(s).mul_(y_Bs.dot(s)/(ss**2))
+     B += term1.sub_(term2)
+     return B
+
+ class PSB(HessianUpdateStrategy):
+     def __init__(
+         self,
+         init_scale: float | Literal["auto"] = 'auto',
+         tol: float = 1e-10,
+         tol_reset: bool = True,
+         reset_interval: int | None = None,
+         beta: float | None = None,
+         update_freq: int = 1,
+         scale_first: bool = True,
+         scale_second: bool = False,
+         concat_params: bool = True,
+         inner: Chainable | None = None,
+     ):
+         super().__init__(
+             defaults=None,
+             init_scale=init_scale,
+             tol=tol,
+             tol_reset=tol_reset,
+             reset_interval=reset_interval,
+             beta=beta,
+             update_freq=update_freq,
+             scale_first=scale_first,
+             scale_second=scale_second,
+             concat_params=concat_params,
+             inverse=False,
+             inner=inner,
+         )
+
+     def update_B(self, B, s, y, p, g, p_prev, g_prev, state, settings):
+         return psb_B_(B=B, s=s, y=y, tol=settings['tol'])
+
+ def pearson2_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, tol: float):
+     sy = s.dot(y)
+     if sy.abs() <= tol: return H
+     num = (s - H@y).outer(s)
+     H += num.div_(sy)
+     return H
+
+ class Pearson2(QuasiNewtonH):
+     """finally found a reference in https://www.recotechnologies.com/~beigi/ps/asme-jdsmc-93-2.pdf"""
+     def update_H(self, H, s, y, p, g, p_prev, g_prev, state, settings):
+         return pearson2_H_(H=H, s=s, y=y, tol=settings['tol'])
+
+ # Oren, S. S., & Spedicato, E. (1976). Optimal conditioning of self-scaling variable metric algorithms. Mathematical programming, 10(1), 70-90.
+ def ssvm_H_(H:torch.Tensor, s: torch.Tensor, y:torch.Tensor, g:torch.Tensor, switch: tuple[float,float] | Literal[1,2,3,4], tol: float):
+     # in notation p is s, q is y, H is D
+     # another p is lr
+     # omega (o) = sy
+     # tau (t) = yHy
+     # epsilon = p'D^-1 p
+     # however p.12 says eps = gs / gHy
+
+     Hy = H@y
+     gHy = g.dot(Hy)
+     yHy = y.dot(Hy)
+     sy = s.dot(y)
+     if sy < tol: return H
+     if yHy.abs() < tol: return H
+     if gHy.abs() < tol: return H
+
+     v_mul = yHy.sqrt()
+     v_term1 = s/sy
+     v_term2 = Hy/yHy
+     v = (v_term1.sub_(v_term2)).mul_(v_mul)
+     gs = g.dot(s)
+
+     if isinstance(switch, tuple): phi, theta = switch
+     else:
+         o = sy
+         t = yHy
+         e = gs / gHy
+         if switch in (1, 3):
+             if e/o <= 1:
+                 if o.abs() <= tol: return H
+                 phi = e/o
+                 theta = 0
+             elif o/t >= 1:
+                 if t.abs() <= tol: return H
+                 phi = o/t
+                 theta = 1
+             else:
+                 phi = 1
+                 denom = e*t - o**2
+                 if denom.abs() <= tol: return H
+                 if switch == 1: theta = o * (e - o) / denom
+                 else: theta = o * (t - o) / denom
+
+         elif switch == 2:
+             if t.abs() <= tol or o.abs() <= tol or e.abs() <= tol: return H
+             phi = (e / t) ** 0.5
+             theta = 1 / (1 + (t*e / o**2)**0.5)
+
+         elif switch == 4:
+             if t.abs() <= tol: return H
+             phi = e/t
+             theta = 1/2
+
+         else: raise ValueError(switch)
+
+
+     u = phi * (gs/gHy) + (1 - phi) * (sy/yHy)
+     term1 = (H @ y.outer(y) @ H).div_(yHy)
+     term2 = v.outer(v).mul_(theta)
+     term3 = s.outer(s).div_(sy)
+
+     H -= term1
+     H += term2
+     H *= u
+     H += term3
+     return H
+
+
+ class SSVM(HessianUpdateStrategy):
+     """This one is from Oren, S. S., & Spedicato, E. (1976). Optimal conditioning of self-scaling variable Metric algorithms. Mathematical Programming, 10(1), 70–90. doi:10.1007/bf01580654
+     """
+     def __init__(
+         self,
+         switch: tuple[float,float] | Literal[1,2,3,4] = 3,
+         init_scale: float | Literal["auto"] = 'auto',
+         tol: float = 1e-10,
+         tol_reset: bool = True,
+         reset_interval: int | None = None,
+         beta: float | None = None,
+         update_freq: int = 1,
+         scale_first: bool = True,
+         scale_second: bool = False,
+         concat_params: bool = True,
+         inner: Chainable | None = None,
+     ):
+         defaults = dict(switch=switch)
+         super().__init__(
+             defaults=defaults,
+             init_scale=init_scale,
+             tol=tol,
+             tol_reset=tol_reset,
+             reset_interval=reset_interval,
+             beta=beta,
+             update_freq=update_freq,
+             scale_first=scale_first,
+             scale_second=scale_second,
+             concat_params=concat_params,
+             inverse=True,
+             inner=inner,
+         )
+
+     def update_H(self, H, s, y, p, g, p_prev, g_prev, state, settings):
+         return ssvm_H_(H=H, s=s, y=y, g=g, switch=settings['switch'], tol=settings['tol'])
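bfgs_H_ above applies the rank-two BFGS update to the inverse Hessian approximation in expanded form, H += ((s.y + y^T H y)/(s.y)^2) * s s^T - (H y s^T + s y^T H)/(s.y). Below is a small standalone check (illustrative only, not part of the package) that this expansion agrees with the textbook product form H' = (I - rho s y^T) H (I - rho y s^T) + rho s s^T, where rho = 1/(s^T y).

import torch

def bfgs_inverse_update(H, s, y):
    # textbook product form of the BFGS inverse-Hessian update
    rho = 1.0 / torch.dot(s, y)
    I = torch.eye(H.shape[0], dtype=H.dtype)
    V = I - rho * torch.outer(s, y)
    return V @ H @ V.T + rho * torch.outer(s, s)

torch.manual_seed(0)
n = 5
A = torch.randn(n, n)
A = A @ A.T + n * torch.eye(n)   # SPD matrix standing in for a Hessian
H = torch.eye(n)                 # current inverse-Hessian approximation
s = torch.randn(n)
y = A @ s                        # curvature pair with s^T y > 0 (the update is skipped otherwise)

sy = torch.dot(s, y)
expanded = H + ((sy + y @ H @ y) / sy**2) * torch.outer(s, s) \
             - (torch.outer(H @ y, s) + torch.outer(s, y) @ H) / sy
assert torch.allclose(bfgs_inverse_update(H, s, y), expanded, atol=1e-5)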
torchzero/modules/second_order/__init__.py
@@ -1,4 +1,3 @@
- r"""
- This includes modules that use the hessian computed via autograd.
- """
- from .newton import ExactNewton, LinearSystemSolvers, FallbackLinearSystemSolvers, LINEAR_SYSTEM_SOLVERS
+ from .newton import Newton
+ from .newton_cg import NewtonCG
+ from .nystrom import NystromSketchAndSolve, NystromPCG