AdamWClip 0.1.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
+ from .optimizer import AdamWClip as AdamWClip
+
+ __all__ = ["AdamWClip"]
@@ -0,0 +1,83 @@
+ import torch
+ from torch.optim.optimizer import Optimizer
+
+ class AdamWClip(Optimizer):
+     """
+     AdamW optimizer with adaptive gradient clipping.
+     """
+
+     def __init__(self, params, lr=0.001, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.01, clip_grad_adapt=3, clip_grad_min=0.01, clip_grad_warm_up=10):
+         """
+         params, lr, betas, eps and weight_decay are identical to the AdamW optimizer.
+         On top of that, we have the following parameters:
+         :clip_grad_adapt: adaptive gradient clipping threshold in terms of standard deviations of the clipped gradient distribution (default: 3)
+         :clip_grad_min: minimum value for the adaptive gradient clipping threshold (default: 0.01)
+         :clip_grad_warm_up: number of initial update steps without gradient clipping, used to obtain reasonable gradient statistics at the beginning (default: 10)
+         """
+         beta_1, beta_2 = betas
+         defaults = dict(lr=lr, beta_1=beta_1, beta_2=beta_2, weight_decay=weight_decay, clip_grad_adapt=clip_grad_adapt, clip_grad_min=clip_grad_min)
+         self.iteration = 0
+         self.clip_grad_warm_up = clip_grad_warm_up
+         self.eps = eps
+         super(AdamWClip, self).__init__(params, defaults)
+
+     def step(self, closure=None, end=False):
+         """Performs a single optimization step."""
+         loss = None
+         if closure is not None:
+             loss = closure()
+
+         self.iteration += 1
+
+         for group in self.param_groups:
+             lr = group['lr']
+             beta_1 = group['beta_1']
+             beta_2 = group['beta_2']
+             weight_decay = group['weight_decay']
+             clip_grad_adapt = group['clip_grad_adapt']
+             clip_grad_min = group['clip_grad_min']
+
+             for p in group['params']:
+                 if p.grad is None:
+                     continue
+                 theta = p.data
+                 grad = p.grad.data
+                 if weight_decay != 0:  # decoupled weight decay, as in AdamW
+                     theta.add_(-weight_decay * lr * theta)
+
+                 # per-parameter optimizer state (first and second moment buffers)
+                 param_state = self.state[p]
+
+                 # buffers:
+                 if 'sum_grad' not in param_state:
+                     # first moment (exponential moving average of gradients)
+                     sum_grad = param_state['sum_grad'] = torch.zeros_like(grad)
+                     # second moment (exponential moving average of squared gradients)
+                     sum_grad_grad = param_state['sum_grad_grad'] = torch.zeros_like(grad)
+                 else:
+                     # first moment
+                     sum_grad = param_state['sum_grad']
+                     # second moment
+                     sum_grad_grad = param_state['sum_grad_grad']
+
+                 if clip_grad_adapt is not None and self.iteration > self.clip_grad_warm_up:  # adaptive gradient clipping after warm-up
+                     E_grad_grad = sum_grad_grad / (1 - beta_2 ** self.iteration)
+                     clamp_adapt = clip_grad_adapt * torch.sqrt(E_grad_grad).clamp_(min=clip_grad_min)
+                     grad.clamp_(-clamp_adapt, clamp_adapt)
+
+                 # update first moment
+                 sum_grad.mul_(beta_1).add_(grad * (1 - beta_1))
+
+                 # update second moment
+                 sum_grad_grad.mul_(beta_2).add_(grad * grad * (1 - beta_2))
+
+                 # bias correction
+                 E_grad = sum_grad / (1 - beta_1 ** self.iteration)
+                 E_grad_grad = sum_grad_grad / (1 - beta_2 ** self.iteration)  # could be optimized
+
+                 step = lr * E_grad / (torch.sqrt(E_grad_grad) + self.eps)  # could be optimized
+
+                 # apply update step
+                 theta.add_(-step)
+
+         return loss
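
For orientation, here is a minimal sketch of how the optimizer defined above can be driven in a training loop. The toy model, random data, and hyperparameter values are placeholders chosen for illustration, not recommendations from the package:

```python
import torch
from AdamWClip import AdamWClip

# Toy setup: a small linear model fitted to random data.
model = torch.nn.Linear(10, 1)
optimizer = AdamWClip(
    model.parameters(),
    lr=1e-3,
    weight_decay=0.01,
    clip_grad_adapt=3,     # clip gradients at 3 standard deviations of their running statistics
    clip_grad_min=0.01,    # lower bound on the clipping threshold
    clip_grad_warm_up=10,  # the first 10 steps run unclipped to gather statistics
)

for _ in range(100):
    x, y = torch.randn(32, 10), torch.randn(32, 1)
    loss = torch.nn.functional.mse_loss(model(x), y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
```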
@@ -0,0 +1,50 @@
+ Metadata-Version: 2.4
+ Name: AdamWClip
+ Version: 0.1.0
+ Summary: AdamWClip is an optimizer that extends AdamW with adaptive gradient clipping.
+ Home-page: https://github.com/wandeln/AdamWClip
+ Author: Nils Wandel
+ Author-email: wandeln@cs.uni-bonn.de
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ Requires-Dist: torch
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # AdamWClip: AdamW with adaptive gradient clipping
+
+ AdamWClip is an optimizer that extends AdamW with adaptive gradient clipping.
+ This way, the gradient clipping thresholds automatically adapt to the gradient statistics and become equivariant with respect to scaling of the gradients.
+ This makes finding suitable clipping thresholds much easier (usually, the default thresholds are good to go).
+ Furthermore, by directly utilizing the internal state variables of Adam, AdamWClip implements adaptive gradient clipping without an additional memory footprint.
+
+ ## Usage
+
+ To use AdamWClip in your PyTorch project, simply run the following:
+
+ ```python
+ # First install the package: `pip install AdamWClip` (or `%pip install AdamWClip` in a notebook)
+ from AdamWClip import AdamWClip
+ ...
+ optimizer = AdamWClip(model.parameters(), *args)
+ ```
+
+ On top of the standard parameters from AdamW, AdamWClip offers the following additional parameters:
+
+ - clip_grad_adapt: adaptive gradient clipping threshold in terms of standard deviations of the clipped gradient distribution (default: 3)
+ - clip_grad_min: minimum value for the adaptive gradient clipping threshold (default: 0.01)
+ - clip_grad_warm_up: number of initial update steps without gradient clipping, used to obtain reasonable gradient statistics at the beginning (default: 10)
+
+ In most instances, the default values should be fine.
+
+ If this optimizer becomes useful to you, please consider citing this repository :)
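
Since clip_grad_adapt and clip_grad_min live in the optimizer's per-group defaults (see optimizer.py above), they can also be overridden per parameter group through the standard torch.optim parameter-group mechanism. The sketch below is illustrative only: the two-part model and the particular thresholds are made up, and clip_grad_warm_up remains a single global setting.

```python
import torch
from AdamWClip import AdamWClip

# Hypothetical two-part model: a backbone and a freshly initialized head.
backbone = torch.nn.Linear(128, 64)
head = torch.nn.Linear(64, 10)

optimizer = AdamWClip(
    [
        # clip the backbone tightly (2 standard deviations) ...
        {"params": backbone.parameters(), "clip_grad_adapt": 2},
        # ... and the head more loosely (5 standard deviations)
        {"params": head.parameters(), "clip_grad_adapt": 5},
    ],
    lr=1e-3,
    weight_decay=0.01,
    clip_grad_warm_up=20,  # warm-up is shared across all parameter groups
)
```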
@@ -0,0 +1,10 @@
+ README.md
+ pyproject.toml
+ setup.py
+ AdamWClip/__init__.py
+ AdamWClip/optimizer.py
+ AdamWClip.egg-info/PKG-INFO
+ AdamWClip.egg-info/SOURCES.txt
+ AdamWClip.egg-info/dependency_links.txt
+ AdamWClip.egg-info/requires.txt
+ AdamWClip.egg-info/top_level.txt
@@ -0,0 +1 @@
+ torch
@@ -0,0 +1 @@
+ AdamWClip
@@ -0,0 +1,27 @@
+ # AdamWClip: AdamW with adaptive gradient clipping
+
+ AdamWClip is an optimizer that extends AdamW with adaptive gradient clipping.
+ This way, the gradient clipping thresholds automatically adapt to the gradient statistics and become equivariant with respect to scaling of the gradients.
+ This makes finding suitable clipping thresholds much easier (usually, the default thresholds are good to go).
+ Furthermore, by directly utilizing the internal state variables of Adam, AdamWClip implements adaptive gradient clipping without an additional memory footprint.
+
+ ## Usage
+
+ To use AdamWClip in your PyTorch project, simply run the following:
+
+ ```python
+ # First install the package: `pip install AdamWClip` (or `%pip install AdamWClip` in a notebook)
+ from AdamWClip import AdamWClip
+ ...
+ optimizer = AdamWClip(model.parameters(), *args)
+ ```
+
+ On top of the standard parameters from AdamW, AdamWClip offers the following additional parameters:
+
+ - clip_grad_adapt: adaptive gradient clipping threshold in terms of standard deviations of the clipped gradient distribution (default: 3)
+ - clip_grad_min: minimum value for the adaptive gradient clipping threshold (default: 0.01)
+ - clip_grad_warm_up: number of initial update steps without gradient clipping, used to obtain reasonable gradient statistics at the beginning (default: 10)
+
+ In most instances, the default values should be fine.
+
+ If this optimizer becomes useful to you, please consider citing this repository :)
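
To make the "equivariant" claim in the README above concrete: according to optimizer.py, the clipping threshold is computed as clip_grad_adapt * sqrt(E[g²]), floored at clip_grad_min, so scaling all gradients by a constant scales the thresholds by the same constant (as long as they stay above the floor). A small numerical sketch with made-up values:

```python
import torch

clip_grad_adapt, clip_grad_min = 3.0, 0.01

def threshold(sum_grad_grad, beta_2=0.999, t=100):
    # same formula as in AdamWClip.step(): bias-correct the second moment,
    # take the square root, floor it at clip_grad_min, and scale by clip_grad_adapt
    E_grad_grad = sum_grad_grad / (1 - beta_2 ** t)
    return clip_grad_adapt * torch.sqrt(E_grad_grad).clamp(min=clip_grad_min)

g2 = torch.tensor([0.5, 2.0, 8.0])  # made-up running sums of squared gradients
print(threshold(g2))                 # baseline thresholds
print(threshold(100.0 ** 2 * g2))    # gradients scaled by 100 -> thresholds scale by 100
```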
@@ -0,0 +1,3 @@
+ [build-system]
+ requires = ["setuptools>=61.0"]
+ build-backend = "setuptools.build_meta"
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -0,0 +1,23 @@
+ from setuptools import setup, find_packages
+
+ with open("README.md", "r", encoding="utf-8") as f:
+     long_description = f.read()
+
+ setup(
+     name="AdamWClip",
+     version="0.1.0",
+     packages=find_packages(),
+     install_requires=["torch"],
+     author="Nils Wandel",
+     author_email="wandeln@cs.uni-bonn.de",
+     description="AdamWClip is an optimizer that extends AdamW with adaptive gradient clipping.",
+     long_description=long_description,
+     long_description_content_type="text/markdown",
+     url="https://github.com/wandeln/AdamWClip",
+     classifiers=[
+         "Programming Language :: Python :: 3",
+         "License :: OSI Approved :: MIT License",
+         "Operating System :: OS Independent",
+     ],
+     python_requires=">=3.8",
+ )