torchzero 0.1.7__tar.gz → 0.1.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchzero-0.1.8/PKG-INFO +130 -0
- torchzero-0.1.8/README.md +92 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/pyproject.toml +1 -1
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/core/module.py +16 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/modular.py +16 -0
- torchzero-0.1.8/src/torchzero.egg-info/PKG-INFO +130 -0
- torchzero-0.1.7/PKG-INFO +0 -120
- torchzero-0.1.7/README.md +0 -82
- torchzero-0.1.7/src/torchzero.egg-info/PKG-INFO +0 -120
- {torchzero-0.1.7 → torchzero-0.1.8}/LICENSE +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/setup.cfg +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/core/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/core/tensorlist_optimizer.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/adaptive/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/adaptive/adaptive.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/experimental/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/experimental/experimental.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/experimental/quad_interp.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/experimental/subspace.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/gradient_approximation/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/gradient_approximation/_fd_formulas.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/gradient_approximation/base_approximator.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/gradient_approximation/fdm.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/gradient_approximation/forward_gradient.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/gradient_approximation/newton_fdm.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/gradient_approximation/rfdm.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/line_search/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/line_search/armijo.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/line_search/base_ls.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/line_search/directional_newton.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/line_search/grid_ls.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/line_search/scipy_minimize_scalar.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/meta/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/meta/alternate.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/meta/grafting.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/meta/optimizer_wrapper.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/meta/return_overrides.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/misc/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/misc/accumulate.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/misc/basic.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/misc/lr.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/misc/multistep.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/misc/on_increase.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/momentum/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/momentum/momentum.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/operations/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/operations/multi.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/operations/reduction.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/operations/singular.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/optimizers/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/optimizers/adagrad.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/optimizers/adam.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/optimizers/lion.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/optimizers/rmsprop.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/optimizers/rprop.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/optimizers/sgd.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/orthogonalization/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/orthogonalization/newtonschulz.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/orthogonalization/svd.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/quasi_newton/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/regularization/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/regularization/dropout.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/regularization/noise.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/regularization/normalization.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/regularization/ortho_grad.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/regularization/weight_decay.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/scheduling/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/scheduling/lr_schedulers.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/scheduling/step_size.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/second_order/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/second_order/newton.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/smoothing/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/smoothing/gaussian_smoothing.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/smoothing/laplacian_smoothing.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/weight_averaging/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/weight_averaging/ema.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/modules/weight_averaging/swa.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/experimental/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/experimental/experimental.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/experimental/ray_search.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/first_order/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/first_order/cautious.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/first_order/forward_gradient.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/first_order/optimizers.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/quasi_newton/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/quasi_newton/directional_newton.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/second_order/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/second_order/newton.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/wrappers/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/wrappers/nevergrad.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/wrappers/nlopt.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/wrappers/scipy.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/zeroth_order/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/zeroth_order/fdm.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/zeroth_order/newton_fdm.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/zeroth_order/rfdm.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/zeroth_order/rs.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/random/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/random/random.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/tensorlist.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/utils/__init__.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/utils/compile.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/utils/derivatives.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/utils/python_tools.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/utils/torch_tools.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero.egg-info/SOURCES.txt +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero.egg-info/dependency_links.txt +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero.egg-info/requires.txt +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero.egg-info/top_level.txt +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/tests/test_against_reference.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/tests/test_modules.py +0 -0
- {torchzero-0.1.7 → torchzero-0.1.8}/tests/test_tensorlist.py +0 -0
torchzero-0.1.8/PKG-INFO
ADDED
@@ -0,0 +1,130 @@
+ Metadata-Version: 2.2
+ Name: torchzero
+ Version: 0.1.8
+ Summary: Modular optimization library for PyTorch.
+ Author-email: Ivan Nikishev <nkshv2@gmail.com>
+ License: MIT License
+
+ Copyright (c) 2024 inikishev
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+ Project-URL: Homepage, https://github.com/inikishev/torchzero
+ Project-URL: Repository, https://github.com/inikishev/torchzero
+ Project-URL: Issues, https://github.com/inikishev/torchzero/isses
+ Keywords: optimization,optimizers,torch,neural networks,zeroth order,second order
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: torch
+ Requires-Dist: numpy
+ Requires-Dist: typing_extensions
+
+ 
+
+ # torchzero
+
+ `torchzero` implements a large collection of chainable optimization modules that can be combined to create custom optimizers:
+
+ ```py
+ import torchzero as tz
+
+ optimizer = tz.Modular(
+     model.parameters(),
+     tz.m.Adam(),
+     tz.m.Cautious(),
+     tz.m.LR(1e-3),
+     tz.m.WeightDecay(1e-4)
+ )
+
+ # standard training loop
+ for inputs, targets in dataset:
+     preds = model(inputs)
+     loss = criterion(preds, targets)
+     optimizer.zero_grad()
+     loss.backward()
+     optimizer.step()
+ ```
+
+ Each module takes the output of the previous module and applies a further transformation. This modular design avoids redundant code, such as reimplementing cautioning, orthogonalization, Laplacian smoothing, and so on for every optimizer. It also makes it easy to experiment with grafting, interpolation between different optimizers, and perhaps some weirder combinations like nested momentum.
+
+ Modules are not limited to gradient transformations. They can perform other operations such as line searches, exponential moving average (EMA) and stochastic weight averaging (SWA), gradient accumulation, gradient approximation, and more.
+
+ There are over 100 modules, all accessible within the `tz.m` namespace. For example, the Adam update rule is available as `tz.m.Adam`. A complete list of modules is available in the [documentation](https://torchzero.readthedocs.io/en/latest/autoapi/torchzero/modules/index.html).
+
+ ## Closure
+
+ Some modules and optimizers in torchzero, particularly line-search and gradient-approximation modules, require a closure function. This is similar to how `torch.optim.LBFGS` works in PyTorch. In torchzero, the closure needs to accept a boolean `backward` argument (though the argument can have any name). When `backward=True`, the closure should zero out old gradients using `optimizer.zero_grad()` and compute new gradients using `loss.backward()`.
+
+ ```py
+ def closure(backward = True):
+     preds = model(inputs)
+     loss = loss_fn(preds, targets)
+
+     if backward:
+         optimizer.zero_grad()
+         loss.backward()
+     return loss
+
+ optimizer.step(closure)
+ ```
+
+ If you intend to use gradient-free methods, the `backward` argument is still required in the closure; simply leave it unused. Gradient-free and gradient-approximation methods always call the closure with `backward=False`.
+
+ All built-in PyTorch optimizers, as well as most custom ones, support a closure too, so the code above works with other optimizers out of the box and you can switch between them without rewriting your training loop.
+
+ # Documentation
+
+ For more information on how to create, use, and extend torchzero modules, please refer to the documentation at [torchzero.readthedocs.io](https://torchzero.readthedocs.io/en/latest/index.html).
+
+ # Extra
+
+ Some other optimization-related things in torchzero:
+
+ ### scipy.optimize.minimize wrapper
+
+ A `scipy.optimize.minimize` wrapper with support for both the gradient and the Hessian via batched autograd:
+
+ ```py
+ from torchzero.optim.wrappers.scipy import ScipyMinimize
+ opt = ScipyMinimize(model.parameters(), method = 'trust-krylov')
+ ```
+
+ Use it as any other closure-based optimizer, but make sure the closure accepts the `backward` argument. Note that it performs a full minimization on each step.
+
+ ### Nevergrad wrapper
+
+ [Nevergrad](https://github.com/facebookresearch/nevergrad) is an optimization library by Facebook Research with a very large number of gradient-free methods.
+
+ ```py
+ import nevergrad as ng
+ from torchzero.optim.wrappers.nevergrad import NevergradOptimizer
+ opt = NevergradOptimizer(model.parameters(), ng.optimizers.NGOptBase, budget = 1000)
+ ```
+
+ Use it as any other closure-based optimizer, but make sure the closure accepts the `backward` argument.
+
+ ### NLopt wrapper
+
+ [NLopt](https://nlopt.readthedocs.io/en/latest/NLopt_Algorithms/) is another optimization library, similar to `scipy.optimize.minimize`, with a large number of both gradient-based and gradient-free methods.
+
+ ```py
+ from torchzero.optim.wrappers.nlopt import NLOptOptimizer
+ opt = NLOptOptimizer(model.parameters(), 'LD_TNEWTON_PRECOND_RESTART', maxeval = 1000)
+ ```
+
+ Use it as any other closure-based optimizer, but make sure the closure accepts the `backward` argument. Note that it performs a full minimization on each step.
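The closure protocol described in the new README also covers gradient-free use. Below is a minimal sketch of driving a gradient-approximation chain with a backward-free closure; the module name `tz.m.RandomizedFDM` is taken from the 0.1.7 README and may differ in 0.1.8, and the model, data, and hyperparameters are placeholders rather than anything prescribed by the package.

```py
import torch
import torchzero as tz

model = torch.nn.Linear(4, 1)
inputs, targets = torch.randn(8, 4), torch.randn(8, 1)
loss_fn = torch.nn.MSELoss()

# gradients are approximated by the module via randomized finite differences,
# so the closure never calls backward(); the unused argument must still exist
optimizer = tz.Modular(model.parameters(), tz.m.RandomizedFDM(), tz.m.LR(1e-2))

def closure(backward=True):
    return loss_fn(model(inputs), targets)

for _ in range(100):
    optimizer.step(closure)
```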
torchzero-0.1.8/README.md
ADDED
@@ -0,0 +1,92 @@
(92 added lines, identical to the README portion of torchzero-0.1.8/PKG-INFO shown above)
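The wrapper classes mentioned in the README are driven through the same closure protocol. A minimal sketch using the `ScipyMinimize` import path and `method` argument shown above; the model, data, and the assumption that the wrapper exposes `zero_grad()` like a regular torch optimizer are placeholders, not part of the diff.

```py
import torch
from torchzero.optim.wrappers.scipy import ScipyMinimize

model = torch.nn.Linear(4, 1)
inputs, targets = torch.randn(16, 4), torch.randn(16, 1)
loss_fn = torch.nn.MSELoss()

opt = ScipyMinimize(model.parameters(), method='trust-krylov')

def closure(backward=True):
    loss = loss_fn(model(inputs), targets)
    if backward:
        opt.zero_grad()
        loss.backward()
    return loss

# a single step runs a full scipy.optimize.minimize on the closure,
# as noted in the README
opt.step(closure)
```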
{torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/core/module.py
@@ -212,6 +212,22 @@ class OptimizerModule(TensorListOptimizer, ABC): # type:ignore
         if self._initialized: return super().__repr__()
         return f"uninitialized {self.__class__.__name__}()"
 
+    def state_dict(self):
+        state_dict = {}
+        state_dict['__self__'] = super().state_dict()
+        for k,v in self.children.items():
+            state_dict[k] = v.state_dict()
+        return state_dict
+
+    def load_state_dict(self, state_dict: dict[str, Any]) -> None:
+        super().load_state_dict(state_dict['__self__'])
+        for k, v in self.children.items():
+            if k in state_dict:
+                v.load_state_dict(state_dict[k])
+            else:
+                warnings.warn(f"Tried to load state dict for {k}: {v.__class__.__name__}, but it is not present in state_dict with {list(state_dict.keys()) = }")
+
+
     def set_params(self, params: ParamsT):
         """
         Set parameters to this module. Use this to set per-parameter group settings.
{torchzero-0.1.7 → torchzero-0.1.8}/src/torchzero/optim/modular.py
@@ -2,6 +2,7 @@ from collections import abc
 import warnings
 from inspect import cleandoc
 import torch
+from typing import Any
 
 from ..core import OptimizerModule, TensorListOptimizer, OptimizationVars, _Chain, _Chainable
 from ..utils.python_tools import flatten
@@ -67,6 +68,21 @@ class Modular(TensorListOptimizer):
         for hook in module.post_init_hooks:
             hook(self, module)
 
+    def state_dict(self):
+        state_dict = {}
+        state_dict['__self__'] = super().state_dict()
+        for i,v in enumerate(self.unrolled_modules):
+            state_dict[str(i)] = v.state_dict()
+        return state_dict
+
+    def load_state_dict(self, state_dict: dict[str, Any]) -> None:
+        super().load_state_dict(state_dict['__self__'])
+        for i,v in enumerate(self.unrolled_modules):
+            if str(i) in state_dict:
+                v.load_state_dict(state_dict[str(i)])
+            else:
+                warnings.warn(f"Tried to load state dict for {i}th module: {v.__class__.__name__}, but it is not present in state_dict with {list(state_dict.keys()) = }")
+
     def get_lr_module(self, last=True) -> OptimizerModule:
         """
         Retrieves the module in the chain that controls the learning rate.
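The `state_dict`/`load_state_dict` methods added above are the main functional change in 0.1.8: a modular optimizer can now be checkpointed like a regular PyTorch optimizer. A minimal usage sketch, assuming the API shown in these hunks (the `'__self__'` key and per-module nesting are internal details; the model and file name are placeholders):

```py
import torch
import torchzero as tz

model = torch.nn.Linear(10, 1)
optimizer = tz.Modular(model.parameters(), tz.m.Adam(), tz.m.LR(1e-3))

# ... after some training steps ...

# save model and optimizer state together (optimizer.state_dict() is new in 0.1.8)
torch.save({"model": model.state_dict(), "opt": optimizer.state_dict()}, "checkpoint.pt")

# restore later; per the new load_state_dict, a missing per-module entry
# only emits a warning instead of raising
checkpoint = torch.load("checkpoint.pt")
model.load_state_dict(checkpoint["model"])
optimizer.load_state_dict(checkpoint["opt"])
```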
torchzero-0.1.8/src/torchzero.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,130 @@
(130 added lines, identical to torchzero-0.1.8/PKG-INFO shown above)
torchzero-0.1.7/PKG-INFO
DELETED
@@ -1,120 +0,0 @@
- Metadata-Version: 2.2
- Name: torchzero
- Version: 0.1.7
- Summary: Modular optimization library for PyTorch.
- Author-email: Ivan Nikishev <nkshv2@gmail.com>
- License: MIT License
-
- Copyright (c) 2024 inikishev
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-
- Project-URL: Homepage, https://github.com/inikishev/torchzero
- Project-URL: Repository, https://github.com/inikishev/torchzero
- Project-URL: Issues, https://github.com/inikishev/torchzero/isses
- Keywords: optimization,optimizers,torch,neural networks,zeroth order,second order
- Requires-Python: >=3.10
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: torch
- Requires-Dist: numpy
- Requires-Dist: typing_extensions
-
- 
-
- # torchzero
-
- This is a work-in-progress optimizers library for pytorch with composable zeroth, first, second order and quasi newton methods, gradient approximation, line searches and a whole lot of other stuff.
-
- Most optimizers are modular, meaning you can chain them like this:
-
- ```py
- optimizer = torchzero.optim.Modular(model.parameters(), [*list of modules*])`
- ```
-
- For example you might use `[ClipNorm(4), LR(1e-3), NesterovMomentum(0.9)]` for standard SGD with gradient clipping and nesterov momentum. Move `ClipNorm` to the end to clip the update instead of the gradients. If you don't have access to gradients, add a `RandomizedFDM()` at the beginning to approximate them via randomized finite differences. Add `Cautious()` to make the optimizer cautious.
-
- Each new module takes previous module update and works on it. That way there is no need to reimplement stuff like laplacian smoothing for all optimizers, and it is easy to experiment with grafting, interpolation between different optimizers, and perhaps some weirder combinations like nested momentum.
-
- # How to use
-
- All modules are defined in `torchzero.modules`. You can generally mix and match them however you want. Some pre-made optimizers are available in `torchzero.optim`.
-
- Some optimizers require closure, which should look like this:
-
- ```py
- def closure(backward = True):
-     preds = model(inputs)
-     loss = loss_fn(preds, targets)
-
-     # if you can't call loss.backward(), and instead use gradient-free methods,
-     # they always call closure with backward=False.
-     # so you can remove the part below, but keep the unused backward argument.
-     if backward:
-         optimizer.zero_grad()
-         loss.backward()
-     return loss
-
- optimizer.step(closure)
- ```
-
- This closure will also work with all built in pytorch optimizers, including LBFGS, all optimizers in this library, as well as most custom ones.
-
- # Contents
-
- Docs are available at [torchzero.readthedocs.io](https://torchzero.readthedocs.io/en/latest/). A preliminary list of all modules is available here <https://torchzero.readthedocs.io/en/latest/autoapi/torchzero/modules/index.html#classes>. Some of the implemented algorithms:
-
- - SGD/Rprop/RMSProp/AdaGrad/Adam as composable modules. They are also tested to exactly match built in pytorch versions.
- - Cautious Optimizers (<https://huggingface.co/papers/2411.16085>)
- - Optimizer grafting (<https://openreview.net/forum?id=FpKgG31Z_i9>)
- - Laplacian smoothing (<https://arxiv.org/abs/1806.06317>)
- - Polyak momentum, nesterov momentum
- - Gradient norm and value clipping, gradient normalization
- - Gradient centralization (<https://arxiv.org/abs/2004.01461>)
- - Learning rate droput (<https://pubmed.ncbi.nlm.nih.gov/35286266/>).
- - Forward gradient (<https://arxiv.org/abs/2202.08587>)
- - Gradient approximation via finite difference or randomized finite difference, which includes SPSA, RDSA, FDSA and Gaussian smoothing (<https://arxiv.org/abs/2211.13566v3>)
- - Various line searches
- - Exact Newton's method (with Levenberg-Marquardt regularization), newton with hessian approximation via finite difference, subspace finite differences newton.
- - Directional newton via one additional forward pass
-
- All modules should be quite fast, especially on models with many different parameters, due to `_foreach` operations.
-
- I am getting to the point where I can start focusing on good docs and tests. As of now, the code should be considered experimental, untested and subject to change, so feel free but be careful if using this for actual project.
-
- # Wrappers
-
- ### scipy.optimize.minimize wrapper
-
- scipy.optimize.minimize wrapper with support for both gradient and hessian via batched autograd
-
- ```py
- from torchzero.optim.wrappers.scipy import ScipyMinimize
- opt = ScipyMinimize(model.parameters(), method = 'trust-krylov')
- ```
-
- Use as any other optimizer (make sure closure accepts `backward` argument like one from **How to use**). Note that it performs full minimization on each step.
-
- ### Nevergrad wrapper
-
- ```py
- opt = NevergradOptimizer(bench.parameters(), ng.optimizers.NGOptBase, budget = 1000)
- ```
-
- Use as any other optimizer (make sure closure accepts `backward` argument like one from **How to use**).
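For comparison with the 0.1.8 example earlier in this diff, the removed 0.1.7 README built optimizers from a plain list of modules. A minimal sketch that fills in the `[*list of modules*]` placeholder using the module names stated in that README; the `from torchzero.modules import ...` style and the model are assumptions for illustration only.

```py
import torch
import torchzero
from torchzero.modules import ClipNorm, LR, NesterovMomentum

model = torch.nn.Linear(4, 1)

# 0.1.7-style construction: modules are passed as a list and applied in order,
# giving SGD with gradient norm clipping and Nesterov momentum
optimizer = torchzero.optim.Modular(
    model.parameters(),
    [ClipNorm(4), LR(1e-3), NesterovMomentum(0.9)],
)
```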
torchzero-0.1.7/README.md
DELETED
@@ -1,82 +0,0 @@
(82 removed lines, identical to the README portion of torchzero-0.1.7/PKG-INFO shown above)
torchzero-0.1.7/src/torchzero.egg-info/PKG-INFO
DELETED
@@ -1,120 +0,0 @@
(120 removed lines, identical to torchzero-0.1.7/PKG-INFO shown above)

All remaining files listed above were moved from torchzero-0.1.7 to torchzero-0.1.8 without changes.