AdamWClip 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- adamwclip-0.1.0/AdamWClip/__init__.py +3 -0
- adamwclip-0.1.0/AdamWClip/optimizer.py +83 -0
- adamwclip-0.1.0/AdamWClip.egg-info/PKG-INFO +50 -0
- adamwclip-0.1.0/AdamWClip.egg-info/SOURCES.txt +10 -0
- adamwclip-0.1.0/AdamWClip.egg-info/dependency_links.txt +1 -0
- adamwclip-0.1.0/AdamWClip.egg-info/requires.txt +1 -0
- adamwclip-0.1.0/AdamWClip.egg-info/top_level.txt +1 -0
- adamwclip-0.1.0/PKG-INFO +50 -0
- adamwclip-0.1.0/README.md +27 -0
- adamwclip-0.1.0/pyproject.toml +3 -0
- adamwclip-0.1.0/setup.cfg +4 -0
- adamwclip-0.1.0/setup.py +23 -0
adamwclip-0.1.0/AdamWClip/optimizer.py
ADDED

@@ -0,0 +1,83 @@
import torch
from torch.optim.optimizer import Optimizer


class AdamWClip(Optimizer):
    """
    AdamW optimizer with adaptive gradient clipping.
    """

    def __init__(self, params, lr=0.001, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.01,
                 clip_grad_adapt=3, clip_grad_min=0.01, clip_grad_warm_up=10):
        """
        params, lr, betas, eps and weight_decay are identical to the AdamW optimizer.
        On top of that, the following parameters are available:
        :param clip_grad_adapt: adaptive gradient clipping threshold in terms of standard deviations of the clipped gradient distribution (default: 3)
        :param clip_grad_min: minimum value for the adaptive gradient clipping threshold (default: 0.01)
        :param clip_grad_warm_up: number of initial update steps without gradient clipping, used to obtain reasonable gradient statistics at the beginning (default: 10)
        """
        beta_1, beta_2 = betas
        defaults = dict(lr=lr, beta_1=beta_1, beta_2=beta_2, weight_decay=weight_decay,
                        clip_grad_adapt=clip_grad_adapt, clip_grad_min=clip_grad_min)
        self.iteration = 0
        self.clip_grad_warm_up = clip_grad_warm_up
        self.eps = eps
        super(AdamWClip, self).__init__(params, defaults)

    def step(self, closure=None, end=False):
        """Performs a single optimization step."""
        loss = None
        if closure is not None:
            loss = closure()

        self.iteration += 1

        for group in self.param_groups:
            lr = group['lr']
            beta_1 = group['beta_1']
            beta_2 = group['beta_2']
            weight_decay = group['weight_decay']
            clip_grad_adapt = group['clip_grad_adapt']
            clip_grad_min = group['clip_grad_min']

            for p in group['params']:
                if p.grad is None:
                    continue
                theta = p.data
                grad = p.grad.data

                # decoupled weight decay (as in AdamW)
                if weight_decay != 0:
                    theta.add_(-weight_decay * lr * theta)

                # per-parameter optimizer state
                param_state = self.state[p]

                # Buffers:
                if 'sum_grad' not in param_state:
                    # first moment (exponential moving average of gradients)
                    sum_grad = param_state['sum_grad'] = torch.zeros_like(grad)
                    # second moment (exponential moving average of squared gradients)
                    sum_grad_grad = param_state['sum_grad_grad'] = torch.zeros_like(grad)
                else:
                    # first moment
                    sum_grad = param_state['sum_grad']
                    # second moment
                    sum_grad_grad = param_state['sum_grad_grad']

                # adaptive gradient clipping (skipped during the warm-up phase)
                if clip_grad_adapt is not None and self.iteration > self.clip_grad_warm_up:
                    E_grad_grad = sum_grad_grad / (1 - beta_2 ** self.iteration)
                    clamp_adapt = clip_grad_adapt * torch.sqrt(E_grad_grad).clamp_(min=clip_grad_min)
                    grad.clamp_(-clamp_adapt, clamp_adapt)

                # update first moment
                sum_grad.mul_(beta_1).add_(grad * (1 - beta_1))

                # update second moment
                sum_grad_grad.mul_(beta_2).add_(grad * grad * (1 - beta_2))

                # bias correction
                E_grad = sum_grad / (1 - beta_1 ** self.iteration)
                E_grad_grad = sum_grad_grad / (1 - beta_2 ** self.iteration)  # could be optimized

                step = lr * E_grad / (torch.sqrt(E_grad_grad) + self.eps)  # could be optimized

                # apply update step
                theta.add_(-step)

        return loss
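For context, a minimal sketch of how the optimizer above might be used in a training loop; the linear model, random data and loss below are hypothetical placeholders, not part of the package:

```python
import torch
from AdamWClip import AdamWClip

# hypothetical toy model and data, only to illustrate the call pattern
model = torch.nn.Linear(10, 1)
optimizer = AdamWClip(model.parameters(), lr=1e-3,
                      clip_grad_adapt=3, clip_grad_min=0.01, clip_grad_warm_up=10)

x = torch.randn(32, 10)
y = torch.randn(32, 1)

for _ in range(100):
    optimizer.zero_grad()
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss.backward()
    optimizer.step()  # adaptive clipping kicks in after clip_grad_warm_up steps
```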
adamwclip-0.1.0/AdamWClip.egg-info/PKG-INFO
ADDED

@@ -0,0 +1,50 @@
Metadata-Version: 2.4
Name: AdamWClip
Version: 0.1.0
Summary: AdamWClip is an optimizer that extends AdamW with adaptive gradient clipping.
Home-page: https://github.com/wandeln/AdamWClip
Author: Nils Wandel
Author-email: wandeln@cs.uni-bonn.de
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.8
Description-Content-Type: text/markdown
Requires-Dist: torch
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary

# AdamWClip: AdamW with adaptive gradient clipping

AdamWClip is an optimizer that extends AdamW with adaptive gradient clipping.
This way, the gradient clipping thresholds automatically adapt to the gradient statistics and are equivariant with respect to rescaling of the gradients.
This makes finding suitable clipping thresholds much easier (usually, the default thresholds are good to go).
Furthermore, by directly utilizing the internal state variables of Adam, AdamWClip implements adaptive gradient clipping without an additional memory footprint.

## Usage

To use AdamWClip in your PyTorch project, simply run the following:

```python
# pip install AdamWClip
from AdamWClip import AdamWClip
...
optimizer = AdamWClip(model.parameters(), *args)
```

On top of the standard parameters from AdamW, AdamWClip offers the following additional parameters:

- clip_grad_adapt: adaptive gradient clipping threshold in terms of standard deviations of the clipped gradient distribution (default: 3)
- clip_grad_min: minimum value for the adaptive gradient clipping threshold (default: 0.01)
- clip_grad_warm_up: number of initial update steps without gradient clipping to obtain reasonable gradient statistics at the beginning (default: 10)

In most instances, the default values should be fine.

If this optimizer becomes useful to you, please consider citing this repository :)
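To make the clip_grad_adapt and clip_grad_min description above concrete, here is an illustrative sketch of how the adaptive threshold follows from Adam's bias-corrected second-moment estimate, mirroring the logic in optimizer.py (the helper name is made up and not part of the package API):

```python
import torch

def adaptive_clip_threshold(sum_grad_grad, beta_2, iteration,
                            clip_grad_adapt=3, clip_grad_min=0.01):
    """Illustrative helper only, not part of the AdamWClip API."""
    # bias-corrected estimate of E[g^2], as maintained in Adam's second-moment buffer
    E_grad_grad = sum_grad_grad / (1 - beta_2 ** iteration)
    # threshold = clip_grad_adapt "standard deviations", floored at clip_grad_min
    return clip_grad_adapt * torch.sqrt(E_grad_grad).clamp(min=clip_grad_min)

# gradients are then clamped element-wise to [-threshold, threshold]
```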
adamwclip-0.1.0/AdamWClip.egg-info/SOURCES.txt
ADDED

@@ -0,0 +1,10 @@
README.md
pyproject.toml
setup.py
AdamWClip/__init__.py
AdamWClip/optimizer.py
AdamWClip.egg-info/PKG-INFO
AdamWClip.egg-info/SOURCES.txt
AdamWClip.egg-info/dependency_links.txt
AdamWClip.egg-info/requires.txt
AdamWClip.egg-info/top_level.txt
adamwclip-0.1.0/AdamWClip.egg-info/dependency_links.txt
ADDED

@@ -0,0 +1 @@

adamwclip-0.1.0/AdamWClip.egg-info/requires.txt
ADDED

@@ -0,0 +1 @@
torch
adamwclip-0.1.0/AdamWClip.egg-info/top_level.txt
ADDED

@@ -0,0 +1 @@
AdamWClip
adamwclip-0.1.0/PKG-INFO
ADDED

@@ -0,0 +1,50 @@
Metadata-Version: 2.4
Name: AdamWClip
Version: 0.1.0
Summary: AdamWClip is an optimizer that extends AdamW with adaptive gradient clipping.
Home-page: https://github.com/wandeln/AdamWClip
Author: Nils Wandel
Author-email: wandeln@cs.uni-bonn.de
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.8
Description-Content-Type: text/markdown
Requires-Dist: torch
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary

# AdamWClip: AdamW with adaptive gradient clipping

AdamWClip is an optimizer that extends AdamW with adaptive gradient clipping.
This way, the gradient clipping thresholds automatically adapt to the gradient statistics and are equivariant with respect to rescaling of the gradients.
This makes finding suitable clipping thresholds much easier (usually, the default thresholds are good to go).
Furthermore, by directly utilizing the internal state variables of Adam, AdamWClip implements adaptive gradient clipping without an additional memory footprint.

## Usage

To use AdamWClip in your PyTorch project, simply run the following:

```python
# pip install AdamWClip
from AdamWClip import AdamWClip
...
optimizer = AdamWClip(model.parameters(), *args)
```

On top of the standard parameters from AdamW, AdamWClip offers the following additional parameters:

- clip_grad_adapt: adaptive gradient clipping threshold in terms of standard deviations of the clipped gradient distribution (default: 3)
- clip_grad_min: minimum value for the adaptive gradient clipping threshold (default: 0.01)
- clip_grad_warm_up: number of initial update steps without gradient clipping to obtain reasonable gradient statistics at the beginning (default: 10)

In most instances, the default values should be fine.

If this optimizer becomes useful to you, please consider citing this repository :)
adamwclip-0.1.0/README.md
ADDED

@@ -0,0 +1,27 @@
# AdamWClip: AdamW with adaptive gradient clipping

AdamWClip is an optimizer that extends AdamW with adaptive gradient clipping.
This way, the gradient clipping thresholds automatically adapt to the gradient statistics and are equivariant with respect to rescaling of the gradients.
This makes finding suitable clipping thresholds much easier (usually, the default thresholds are good to go).
Furthermore, by directly utilizing the internal state variables of Adam, AdamWClip implements adaptive gradient clipping without an additional memory footprint.

## Usage

To use AdamWClip in your PyTorch project, simply run the following:

```python
# pip install AdamWClip
from AdamWClip import AdamWClip
...
optimizer = AdamWClip(model.parameters(), *args)
```

On top of the standard parameters from AdamW, AdamWClip offers the following additional parameters:

- clip_grad_adapt: adaptive gradient clipping threshold in terms of standard deviations of the clipped gradient distribution (default: 3)
- clip_grad_min: minimum value for the adaptive gradient clipping threshold (default: 0.01)
- clip_grad_warm_up: number of initial update steps without gradient clipping to obtain reasonable gradient statistics at the beginning (default: 10)

In most instances, the default values should be fine.

If this optimizer becomes useful to you, please consider citing this repository :)
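As a further illustration of the parameters listed in the README: the clamp in optimizer.py is only applied when clip_grad_adapt is not None, so passing None disables clipping altogether. The model below is a hypothetical placeholder:

```python
import torch
from AdamWClip import AdamWClip

model = torch.nn.Linear(10, 1)  # placeholder model for illustration

# passing clip_grad_adapt=None skips the adaptive clamp, leaving AdamW-style updates
optimizer_no_clip = AdamWClip(model.parameters(), lr=1e-3, clip_grad_adapt=None)
```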
adamwclip-0.1.0/setup.py
ADDED

@@ -0,0 +1,23 @@
from setuptools import setup, find_packages

with open("README.md", "r", encoding="utf-8") as f:
    long_description = f.read()

setup(
    name="AdamWClip",
    version="0.1.0",
    packages=find_packages(),
    install_requires=["torch"],
    author="Nils Wandel",
    author_email="wandeln@cs.uni-bonn.de",
    description="AdamWClip is an optimizer that extends AdamW with adaptive gradient clipping.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/wandeln/AdamWClip",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.8",
)