torchzero 0.3.11__py3-none-any.whl → 0.3.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_opts.py +95 -69
- tests/test_tensorlist.py +8 -7
- torchzero/__init__.py +1 -1
- torchzero/core/__init__.py +2 -2
- torchzero/core/module.py +225 -72
- torchzero/core/reformulation.py +65 -0
- torchzero/core/transform.py +44 -24
- torchzero/modules/__init__.py +13 -5
- torchzero/modules/{optimizers → adaptive}/__init__.py +5 -2
- torchzero/modules/adaptive/adagrad.py +356 -0
- torchzero/modules/{optimizers → adaptive}/adahessian.py +53 -52
- torchzero/modules/{optimizers → adaptive}/adam.py +0 -3
- torchzero/modules/{optimizers → adaptive}/adan.py +26 -40
- torchzero/modules/{optimizers → adaptive}/adaptive_heavyball.py +3 -6
- torchzero/modules/adaptive/aegd.py +54 -0
- torchzero/modules/{optimizers → adaptive}/esgd.py +1 -1
- torchzero/modules/{optimizers/ladagrad.py → adaptive/lmadagrad.py} +42 -39
- torchzero/modules/{optimizers → adaptive}/mars.py +24 -36
- torchzero/modules/adaptive/matrix_momentum.py +146 -0
- torchzero/modules/{optimizers → adaptive}/msam.py +14 -12
- torchzero/modules/{optimizers → adaptive}/muon.py +19 -20
- torchzero/modules/adaptive/natural_gradient.py +175 -0
- torchzero/modules/{optimizers → adaptive}/rprop.py +0 -2
- torchzero/modules/{optimizers → adaptive}/sam.py +1 -1
- torchzero/modules/{optimizers → adaptive}/shampoo.py +8 -4
- torchzero/modules/{optimizers → adaptive}/soap.py +27 -50
- torchzero/modules/{optimizers → adaptive}/sophia_h.py +2 -3
- torchzero/modules/clipping/clipping.py +85 -92
- torchzero/modules/clipping/ema_clipping.py +5 -5
- torchzero/modules/conjugate_gradient/__init__.py +11 -0
- torchzero/modules/{quasi_newton → conjugate_gradient}/cg.py +355 -369
- torchzero/modules/experimental/__init__.py +9 -32
- torchzero/modules/experimental/dct.py +2 -2
- torchzero/modules/experimental/fft.py +2 -2
- torchzero/modules/experimental/gradmin.py +4 -3
- torchzero/modules/experimental/l_infinity.py +111 -0
- torchzero/modules/{momentum/experimental.py → experimental/momentum.py} +3 -40
- torchzero/modules/experimental/newton_solver.py +79 -17
- torchzero/modules/experimental/newtonnewton.py +27 -14
- torchzero/modules/experimental/scipy_newton_cg.py +105 -0
- torchzero/modules/experimental/structural_projections.py +1 -1
- torchzero/modules/functional.py +50 -14
- torchzero/modules/grad_approximation/fdm.py +19 -20
- torchzero/modules/grad_approximation/forward_gradient.py +4 -2
- torchzero/modules/grad_approximation/grad_approximator.py +43 -47
- torchzero/modules/grad_approximation/rfdm.py +144 -122
- torchzero/modules/higher_order/__init__.py +1 -1
- torchzero/modules/higher_order/higher_order_newton.py +31 -23
- torchzero/modules/least_squares/__init__.py +1 -0
- torchzero/modules/least_squares/gn.py +161 -0
- torchzero/modules/line_search/__init__.py +2 -2
- torchzero/modules/line_search/_polyinterp.py +289 -0
- torchzero/modules/line_search/adaptive.py +69 -44
- torchzero/modules/line_search/backtracking.py +83 -70
- torchzero/modules/line_search/line_search.py +159 -68
- torchzero/modules/line_search/scipy.py +1 -1
- torchzero/modules/line_search/strong_wolfe.py +319 -218
- torchzero/modules/misc/__init__.py +8 -0
- torchzero/modules/misc/debug.py +4 -4
- torchzero/modules/misc/escape.py +9 -7
- torchzero/modules/misc/gradient_accumulation.py +88 -22
- torchzero/modules/misc/homotopy.py +59 -0
- torchzero/modules/misc/misc.py +82 -15
- torchzero/modules/misc/multistep.py +47 -11
- torchzero/modules/misc/regularization.py +5 -9
- torchzero/modules/misc/split.py +55 -35
- torchzero/modules/misc/switch.py +1 -1
- torchzero/modules/momentum/__init__.py +1 -5
- torchzero/modules/momentum/averaging.py +3 -3
- torchzero/modules/momentum/cautious.py +42 -47
- torchzero/modules/momentum/momentum.py +35 -1
- torchzero/modules/ops/__init__.py +9 -1
- torchzero/modules/ops/binary.py +9 -8
- torchzero/modules/{momentum/ema.py → ops/higher_level.py} +10 -33
- torchzero/modules/ops/multi.py +15 -15
- torchzero/modules/ops/reduce.py +1 -1
- torchzero/modules/ops/utility.py +12 -8
- torchzero/modules/projections/projection.py +4 -4
- torchzero/modules/quasi_newton/__init__.py +1 -16
- torchzero/modules/quasi_newton/damping.py +105 -0
- torchzero/modules/quasi_newton/diagonal_quasi_newton.py +167 -163
- torchzero/modules/quasi_newton/lbfgs.py +256 -200
- torchzero/modules/quasi_newton/lsr1.py +167 -132
- torchzero/modules/quasi_newton/quasi_newton.py +346 -446
- torchzero/modules/restarts/__init__.py +7 -0
- torchzero/modules/restarts/restars.py +252 -0
- torchzero/modules/second_order/__init__.py +2 -1
- torchzero/modules/second_order/multipoint.py +238 -0
- torchzero/modules/second_order/newton.py +133 -88
- torchzero/modules/second_order/newton_cg.py +141 -80
- torchzero/modules/smoothing/__init__.py +1 -1
- torchzero/modules/smoothing/sampling.py +300 -0
- torchzero/modules/step_size/__init__.py +1 -1
- torchzero/modules/step_size/adaptive.py +312 -47
- torchzero/modules/termination/__init__.py +14 -0
- torchzero/modules/termination/termination.py +207 -0
- torchzero/modules/trust_region/__init__.py +5 -0
- torchzero/modules/trust_region/cubic_regularization.py +170 -0
- torchzero/modules/trust_region/dogleg.py +92 -0
- torchzero/modules/trust_region/levenberg_marquardt.py +128 -0
- torchzero/modules/trust_region/trust_cg.py +97 -0
- torchzero/modules/trust_region/trust_region.py +350 -0
- torchzero/modules/variance_reduction/__init__.py +1 -0
- torchzero/modules/variance_reduction/svrg.py +208 -0
- torchzero/modules/weight_decay/weight_decay.py +65 -64
- torchzero/modules/zeroth_order/__init__.py +1 -0
- torchzero/modules/zeroth_order/cd.py +359 -0
- torchzero/optim/root.py +65 -0
- torchzero/optim/utility/split.py +8 -8
- torchzero/optim/wrappers/directsearch.py +0 -1
- torchzero/optim/wrappers/fcmaes.py +3 -2
- torchzero/optim/wrappers/nlopt.py +0 -2
- torchzero/optim/wrappers/optuna.py +2 -2
- torchzero/optim/wrappers/scipy.py +81 -22
- torchzero/utils/__init__.py +40 -4
- torchzero/utils/compile.py +1 -1
- torchzero/utils/derivatives.py +123 -111
- torchzero/utils/linalg/__init__.py +9 -2
- torchzero/utils/linalg/linear_operator.py +329 -0
- torchzero/utils/linalg/matrix_funcs.py +2 -2
- torchzero/utils/linalg/orthogonalize.py +2 -1
- torchzero/utils/linalg/qr.py +2 -2
- torchzero/utils/linalg/solve.py +226 -154
- torchzero/utils/metrics.py +83 -0
- torchzero/utils/python_tools.py +6 -0
- torchzero/utils/tensorlist.py +105 -34
- torchzero/utils/torch_tools.py +9 -4
- torchzero-0.3.13.dist-info/METADATA +14 -0
- torchzero-0.3.13.dist-info/RECORD +166 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.13.dist-info}/top_level.txt +0 -1
- docs/source/conf.py +0 -59
- docs/source/docstring template.py +0 -46
- torchzero/modules/experimental/absoap.py +0 -253
- torchzero/modules/experimental/adadam.py +0 -118
- torchzero/modules/experimental/adamY.py +0 -131
- torchzero/modules/experimental/adam_lambertw.py +0 -149
- torchzero/modules/experimental/adaptive_step_size.py +0 -90
- torchzero/modules/experimental/adasoap.py +0 -177
- torchzero/modules/experimental/cosine.py +0 -214
- torchzero/modules/experimental/cubic_adam.py +0 -97
- torchzero/modules/experimental/eigendescent.py +0 -120
- torchzero/modules/experimental/etf.py +0 -195
- torchzero/modules/experimental/exp_adam.py +0 -113
- torchzero/modules/experimental/expanded_lbfgs.py +0 -141
- torchzero/modules/experimental/hnewton.py +0 -85
- torchzero/modules/experimental/modular_lbfgs.py +0 -265
- torchzero/modules/experimental/parabolic_search.py +0 -220
- torchzero/modules/experimental/subspace_preconditioners.py +0 -145
- torchzero/modules/experimental/tensor_adagrad.py +0 -42
- torchzero/modules/line_search/polynomial.py +0 -233
- torchzero/modules/momentum/matrix_momentum.py +0 -193
- torchzero/modules/optimizers/adagrad.py +0 -165
- torchzero/modules/quasi_newton/trust_region.py +0 -397
- torchzero/modules/smoothing/gaussian.py +0 -198
- torchzero-0.3.11.dist-info/METADATA +0 -404
- torchzero-0.3.11.dist-info/RECORD +0 -159
- torchzero-0.3.11.dist-info/licenses/LICENSE +0 -21
- /torchzero/modules/{optimizers → adaptive}/lion.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/orthograd.py +0 -0
- /torchzero/modules/{optimizers → adaptive}/rmsprop.py +0 -0
- {torchzero-0.3.11.dist-info → torchzero-0.3.13.dist-info}/WHEEL +0 -0
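
The bulk of this release is a reorganization: the contents of `torchzero/modules/optimizers/` move to `torchzero/modules/adaptive/`, conjugate-gradient, trust-region, termination, least-squares, variance-reduction and restart modules become their own packages, and a new `torchzero/utils/metrics.py` backs the `Metrics` type that threads through the clipping diff below. A minimal sketch of what the move means in practice, assuming the deep module paths mirror the file moves listed above and that the `tz.m` re-exports shown in the docstrings are unchanged (both inferred from this diff, not confirmed API):

```python
# Sketch only: deep paths are inferred from the file moves listed above;
# tz.Modular / tz.m usage is taken from the docstrings in the diff below.
import torch
import torchzero as tz

# 0.3.11: from torchzero.modules.optimizers.adam import ...
# 0.3.13: from torchzero.modules.adaptive.adam import ...

model = torch.nn.Linear(4, 2)  # placeholder model for the sketch

# the modular API itself is unaffected by the file moves:
opt = tz.Modular(
    model.parameters(),
    tz.m.Adam(),
    tz.m.LR(1e-2),
)
```

The rest of this page is the diff for the two clipping modules: `clipping.py` (+85 -92) and `ema_clipping.py` (+5 -5).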
--- a/torchzero/modules/clipping/clipping.py
+++ b/torchzero/modules/clipping/clipping.py
@@ -1,11 +1,13 @@
+import math
+from collections.abc import Iterable, Sequence
 from operator import itemgetter
 from typing import Literal
-
-import math
+
 import torch
 
 from ...core import Module, Target, Transform
-from ...utils import NumberList, TensorList
+from ...utils import Metrics, NumberList, TensorList
+from ...utils.metrics import _METRICS
 
 
 def clip_grad_value_(params: Iterable[torch.Tensor], value: float):
@@ -24,7 +26,7 @@ def _clip_norm_(
     min: float | NumberList | None,
     max: float | NumberList | None,
     norm_value: float | NumberList | None,
-    ord:
+    ord: Metrics,
     dim: int | Sequence[int] | Literal["global"] | None,
     inverse_dims: bool,
     min_size: int,
@@ -35,7 +37,7 @@ def _clip_norm_(
         raise ValueError(f'if norm_value is given then min and max must be None got {min = }; {max = }')
 
     # if dim is None: return tensors_.mul_(norm_value / tensors_.norm(ord=ord))
-    if dim == 'global': return tensors_.mul_(norm_value / tensors_.
+    if dim == 'global': return tensors_.mul_(norm_value / tensors_.global_metric(ord))
 
     # if dim is None: return tensors_.clip_norm_(min,max,tensorwise=True,ord=ord)
     if dim == 'global': return tensors_.clip_norm_(min,max,tensorwise=False,ord=ord)
@@ -54,8 +56,8 @@ def _clip_norm_(
         size = math.prod(tensor.size(d) for d in real_dim)
         if size < min_size: continue
 
-        if ord
-        norm =
+        if isinstance(ord, str):
+            norm = _METRICS[ord].evaluate_tensor(tensor, dim=real_dim, keepdim=True)
         else:
             norm: torch.Tensor = torch.linalg.vector_norm(tensor, ord=ord, dim=real_dim, keepdim=True) # pylint:disable=not-callable
 
@@ -94,7 +96,7 @@ def _clip_norm_(
 def clip_grad_norm_(
     params: Iterable[torch.Tensor],
     max_norm: float | None,
-    ord:
+    ord: Metrics = 2,
     dim: int | Sequence[int] | Literal["global"] | None = None,
     inverse_dims: bool = False,
     min_size: int = 2,
@@ -105,7 +107,7 @@ def clip_grad_norm_(
 
     Args:
         params (Iterable[torch.Tensor]): parameters with gradients to clip.
-
+        max_norm (float): value to clip norm to.
         ord (float, optional): norm order. Defaults to 2.
         dim (int | Sequence[int] | str | None, optional):
             calculates norm along those dimensions.
@@ -122,7 +124,7 @@ def clip_grad_norm_(
 def normalize_grads_(
     params: Iterable[torch.Tensor],
     norm_value: float,
-    ord:
+    ord: Metrics = 2,
     dim: int | Sequence[int] | Literal["global"] | None = None,
     inverse_dims: bool = False,
     min_size: int = 1,
@@ -149,35 +151,33 @@ def normalize_grads_(
 
 
 class ClipValue(Transform):
-    """Clips update magnitude to be within
+    """Clips update magnitude to be within ``(-value, value)`` range.
 
     Args:
         value (float): value to clip to.
-        target (str): refer to
+        target (str): refer to ``target argument`` in documentation.
 
     Examples:
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        tz.m.LR(1e-2),
-        )
+        Gradient clipping:
+        ```python
+        opt = tz.Modular(
+            model.parameters(),
+            tz.m.ClipValue(1),
+            tz.m.Adam(),
+            tz.m.LR(1e-2),
+        )
+        ```
+
+        Update clipping:
+        ```python
+        opt = tz.Modular(
+            model.parameters(),
+            tz.m.Adam(),
+            tz.m.ClipValue(1),
+            tz.m.LR(1e-2),
+        )
+        ```
 
     """
     def __init__(self, value: float, target: Target = 'update'):
@@ -193,7 +193,7 @@ class ClipNorm(Transform):
     """Clips update norm to be no larger than `value`.
 
     Args:
-
+        max_norm (float): value to clip norm to.
         ord (float, optional): norm order. Defaults to 2.
         dim (int | Sequence[int] | str | None, optional):
             calculates norm along those dimensions.
@@ -209,32 +209,30 @@ class ClipNorm(Transform):
 
     Examples:
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        tz.m.LR(1e-2),
-        )
+        Gradient norm clipping:
+        ```python
+        opt = tz.Modular(
+            model.parameters(),
+            tz.m.ClipNorm(1),
+            tz.m.Adam(),
+            tz.m.LR(1e-2),
+        )
+        ```
+
+        Update norm clipping:
+        ```python
+        opt = tz.Modular(
+            model.parameters(),
+            tz.m.Adam(),
+            tz.m.ClipNorm(1),
+            tz.m.LR(1e-2),
+        )
+        ```
     """
     def __init__(
         self,
        max_norm: float,
-        ord:
+        ord: Metrics = 2,
        dim: int | Sequence[int] | Literal["global"] | None = None,
        inverse_dims: bool = False,
        min_size: int = 1,
@@ -263,7 +261,7 @@ class Normalize(Transform):
     """Normalizes the update.
 
     Args:
-
+        norm_value (float): desired norm value.
         ord (float, optional): norm order. Defaults to 2.
         dim (int | Sequence[int] | str | None, optional):
             calculates norm along those dimensions.
@@ -278,33 +276,31 @@ class Normalize(Transform):
         what this affects.
 
     Examples:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        tz.m.LR(1e-2),
-        )
+        Gradient normalization:
+        ```python
+        opt = tz.Modular(
+            model.parameters(),
+            tz.m.Normalize(1),
+            tz.m.Adam(),
+            tz.m.LR(1e-2),
+        )
+        ```
+
+        Update normalization:
+
+        ```python
+        opt = tz.Modular(
+            model.parameters(),
+            tz.m.Adam(),
+            tz.m.Normalize(1),
+            tz.m.LR(1e-2),
+        )
+        ```
     """
     def __init__(
         self,
         norm_value: float = 1,
-        ord:
+        ord: Metrics = 2,
         dim: int | Sequence[int] | Literal["global"] | None = None,
         inverse_dims: bool = False,
         min_size: int = 1,
@@ -370,8 +366,6 @@ class Centralize(Transform):
     """Centralizes the update.
 
     Args:
-        value (float): desired norm value.
-        ord (float, optional): norm order. Defaults to 2.
         dim (int | Sequence[int] | str | None, optional):
             calculates norm along those dimensions.
             If list/tuple, tensors are centralized along all dimensios in `dim` that they have.
@@ -384,18 +378,17 @@ class Centralize(Transform):
 
     Examples:
 
-
-
-
-
-
-
-
-
-        )
+        Standard gradient centralization:
+        ```python
+        opt = tz.Modular(
+            model.parameters(),
+            tz.m.Centralize(dim=0),
+            tz.m.LR(1e-2),
+        )
+        ```
 
     References:
-
+        - Yong, H., Huang, J., Hua, X., & Zhang, L. (2020). Gradient centralization: A new optimization technique for deep neural networks. In Computer Vision–ECCV 2020: 16th European Conference, Glasgow, UK, August 23–28, 2020, Proceedings, Part I 16 (pp. 635-652). Springer International Publishing. https://arxiv.org/abs/2004.01461
     """
     def __init__(
         self,
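
The change that repeats through these hunks is the widening of `ord`: every `ord:` parameter is re-typed as `Metrics`, TensorList norms go through the new `metric`/`global_metric` helpers, and per-tensor norms dispatch through the `_METRICS` registry when `ord` is a string. In effect `ord` now accepts either a numeric vector-norm order or a string metric name. A sketch of the dispatch as the `_clip_norm_` hunk shows it (the registry import matches the hunk at the top of the file; the concrete string keys of `_METRICS` are not visible in this diff):

```python
# Sketch of the per-tensor branch in _clip_norm_ above, not confirmed API.
import torch
from torchzero.utils.metrics import _METRICS  # registry added in 0.3.13

def _norm(tensor: torch.Tensor, ord, dim, keepdim: bool = True) -> torch.Tensor:
    if isinstance(ord, str):
        # string metrics dispatch through the new registry
        return _METRICS[ord].evaluate_tensor(tensor, dim=dim, keepdim=keepdim)
    # numeric orders keep the pre-0.3.13 vector-norm path
    return torch.linalg.vector_norm(tensor, ord=ord, dim=dim, keepdim=keepdim)
```

Callers that passed a float `ord` keep the old behavior; only the string path is new.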
--- a/torchzero/modules/clipping/ema_clipping.py
+++ b/torchzero/modules/clipping/ema_clipping.py
@@ -5,7 +5,7 @@ from collections.abc import Iterable, Sequence
 import torch
 
 from ...core import Module, Target, Transform, apply_transform, Chainable
-from ...utils import NumberList, TensorList, unpack_dicts, unpack_states
+from ...utils import NumberList, TensorList, unpack_dicts, unpack_states, Metrics
 
 class ClipNormByEMA(Transform):
     """Clips norm to be no larger than the norm of an exponential moving average of past updates.
@@ -25,7 +25,7 @@ class ClipNormByEMA(Transform):
     def __init__(
         self,
         beta=0.99,
-        ord:
+        ord: Metrics = 2,
         eps=1e-6,
         tensorwise:bool=True,
         max_ema_growth: float | None = 1.5,
@@ -47,7 +47,7 @@ class ClipNormByEMA(Transform):
         ema.lerp_(tensors, 1-beta)
 
         if tensorwise:
-            ema_norm = ema.
+            ema_norm = ema.metric(ord)
 
             # clip ema norm growth
             if max_ema_growth is not None:
@@ -64,7 +64,7 @@ class ClipNormByEMA(Transform):
             else: denom.clip_(min=1)
 
         else:
-            ema_norm = ema.
+            ema_norm = ema.global_metric(ord)
 
             # clip ema norm growth
             if max_ema_growth is not None:
@@ -75,7 +75,7 @@ class ClipNormByEMA(Transform):
                 ema_norm = allowed_norm
                 prev_ema_norm.set_(ema_norm)
 
-            tensors_norm = tensors.
+            tensors_norm = tensors.global_metric(ord)
             denom = tensors_norm / ema_norm.clip(min=eps[0])
             if self.NORMALIZE: denom.clip_(min=eps[0])
            else: denom.clip_(min=1)