heavyball 0.17.0__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- heavyball/__init__.py +6 -7
- {heavyball-0.17.0.dist-info → heavyball-0.17.1.dist-info}/METADATA +17 -17
- {heavyball-0.17.0.dist-info → heavyball-0.17.1.dist-info}/RECORD +6 -6
- {heavyball-0.17.0.dist-info → heavyball-0.17.1.dist-info}/LICENSE +0 -0
- {heavyball-0.17.0.dist-info → heavyball-0.17.1.dist-info}/WHEEL +0 -0
- {heavyball-0.17.0.dist-info → heavyball-0.17.1.dist-info}/top_level.txt +0 -0
heavyball/__init__.py
CHANGED
@@ -21,26 +21,25 @@ PalmForEachSoap = PaLMForeachSOAP
|
|
21
21
|
PaLMSOAP = PaLMForeachSOAP
|
22
22
|
PaLMSFAdamW = PaLMForeachSFAdamW
|
23
23
|
PaLMSFSoap = SFPaLMForeachSOAP
|
24
|
-
PaLMForeachSOAP = PaLMForeachSOAP
|
25
24
|
PrecondScheduleSFPaLMSOAP = PrecondScheduleSFPaLMSOAP
|
26
25
|
SOAP = ForeachSOAP
|
27
26
|
SFAdamW = ForeachSFAdamW
|
28
27
|
LaProp = ForeachLaProp
|
29
28
|
ADOPT = ForeachADOPT
|
30
|
-
|
31
|
-
|
29
|
+
PrecondScheduleSOAP = PrecondScheduleForeachSOAP
|
30
|
+
PrecondSchedulePaLMSOAP = PrecondSchedulePaLMForeachSOAP
|
32
31
|
PSGDKron = ForeachPSGDKron
|
33
32
|
AdamW = ForeachAdamW
|
34
33
|
PurePSGD = ForeachPurePSGD
|
35
34
|
PaLMPAdam = ForeachPaLMPAdam
|
36
35
|
DelayedPSGD = ForeachDelayedPSGD
|
37
36
|
CachedPSGDKron = ForeachCachedPSGDKron
|
38
|
-
CachedDelayedPSGDKron
|
37
|
+
CachedDelayedPSGDKron = ForeachCachedDelayedPSGDKron
|
39
38
|
|
40
39
|
__all__ = ['PalmForEachSoap', 'PaLMForeachSFAdamW', 'PaLMForeachSOAP', 'SFPaLMForeachSOAP', 'PrecondScheduleSFPaLMSOAP',
|
41
40
|
'ForeachSOAP', 'ForeachSFAdamW', 'ForeachLaProp', 'ForeachADOPT', 'PrecondScheduleForeachSOAP',
|
42
41
|
'PrecondSchedulePaLMForeachSOAP', 'ForeachPSGDKron', 'ForeachAdamW', 'ForeachPurePSGD', 'ForeachPaLMPAdam',
|
43
|
-
'ForeachDelayedPSGD', 'ForeachCachedPSGDKron', 'ForeachCachedDelayedPSGDKron' #
|
44
|
-
'PaLMSOAP', 'PaLMSFAdamW', 'PaLMSFSoap', 'PaLMSFAdamW', '
|
42
|
+
'ForeachDelayedPSGD', 'ForeachCachedPSGDKron', 'ForeachCachedDelayedPSGDKron', #
|
43
|
+
'PaLMSOAP', 'PaLMSFAdamW', 'PaLMSFSoap', 'PaLMSFAdamW', 'PrecondScheduleSFPaLMSOAP',
|
45
44
|
'SOAP', 'SFAdamW', 'LaProp', 'ADOPT', 'PSGDKron', 'AdamW', 'PurePSGD', 'PaLMPAdam', 'DelayedPSGD',
|
46
|
-
'CachedPSGDKron', 'CachedDelayedPSGDKron']
|
45
|
+
'CachedPSGDKron', 'CachedDelayedPSGDKron', 'PrecondScheduleSOAP', 'PrecondSchedulePaLMSOAP']
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: heavyball
|
3
|
-
Version: 0.17.
|
3
|
+
Version: 0.17.1
|
4
4
|
Summary: Efficient optimizers
|
5
5
|
Home-page: https://github.com/clashluke/heavyball
|
6
6
|
Author: Lucas Nestler
|
@@ -32,8 +32,8 @@ A simple package of efficient optimizers
|
|
32
32
|
The goal is not to strive for completeness, full maintenance or abstraction, but instead to provide a simple
|
33
33
|
largely static alternative to `torch.optim` with more and better optimizers.
|
34
34
|
|
35
|
-
Currently (2024-11-
|
36
|
-
recommended experimental optimizer is `
|
35
|
+
Currently (2024-11-20, 0.17.0), the recommended stable optimizer is `PrecondSchedulePaLMSOAP` (see below). The
|
36
|
+
recommended experimental optimizer is `DelayedPSGDKron` ([tuning guide](docs/psgd_efficiency.md)).
|
37
37
|
|
38
38
|
## Features
|
39
39
|
|
@@ -62,7 +62,7 @@ import heavyball
|
|
62
62
|
model = torch.nn.Linear(16, 1)
|
63
63
|
|
64
64
|
# Create an optimizer
|
65
|
-
optimizer = heavyball.
|
65
|
+
optimizer = heavyball.PrecondSchedulePaLMSOAP(model.parameters(), lr=1e-3)
|
66
66
|
|
67
67
|
x = torch.randn(128, 16)
|
68
68
|
y = torch.randn(128, 1)
|
@@ -76,19 +76,19 @@ for _ in range(1000):
|
|
76
76
|
|
77
77
|
## Optimizers
|
78
78
|
|
79
|
-
| Name
|
80
|
-
|
81
|
-
| **
|
82
|
-
| **
|
83
|
-
| **
|
84
|
-
| **
|
85
|
-
| **
|
86
|
-
| **
|
87
|
-
| **
|
88
|
-
| **
|
89
|
-
| **
|
90
|
-
| **
|
91
|
-
| **
|
79
|
+
| Name | Description | Advantages / Disadvantages |
|
80
|
+
|-------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
81
|
+
| **AdamW** | More efficient (speed, memory) [AdamW](https://arxiv.org/abs/1711.05101) | + Faster than AdamW<br>+ Possibly more (numerically) stable
|
82
|
+
| **LaProp** | More efficient (speed, memory) [LaProp](https://arxiv.org/abs/2002.04839) | + Same cost as AdamW<br>+ Marginally better convergence (better proofs)<br>+ Higher hyperparameter stability<br>- Not a guaranteed win (can be neutral)<br>- No "Slingshot" |
|
83
|
+
| **ADOPT** | More efficient (speed, memory) [ADOPT](https://arxiv.org/abs/2411.02853) | + Same cost as AdamW<br>+ Rigorous mathematical convergence proofs, even for challenging models (GANs)<br>- Empirically underperforms LaProp<br>- no bf16 |
|
84
|
+
| **SFAdamW** | More efficient (speed, memory) [ScheduleFree AdamW](https://arxiv.org/abs/2405.15682) | + Same cost as AdamW, but better eval perf<br>+ Full control over hyperparameters |
|
85
|
+
| **PaLMSFAdamW** | ForeachSFAdamW with [PaLM's beta2 schedule](https://arxiv.org/abs/2204.02311) | + Same cost as AdamW, but better eval perf<br>+ Less control, but faster early and more stable late convergence<br>+ ScheduleFree<br>- slow early convergence |
|
86
|
+
| **SOAP** | More efficient (speed, memory) [SOAP](https://arxiv.org/abs/2409.11321) | + Faster convergence (loss-at-step)<br>+ Full control over hyperparameters<br>- more memory usage<br>- more hyperparameters<br>- higher overhead than AdamW (can be amortized; better loss-at-second) |
|
87
|
+
| **PaLMSOAP** | ForeachSOAP with [PaLM's beta2 schedule](https://arxiv.org/abs/2204.02311) | + Faster convergence (loss-at-step)<br>+ Less control, but faster early and more stable late convergence<br>- more memory usage<br>- more hyperparameters<br>- higher overhead than AdamW (can be amortized; better loss-at-second) |
|
88
|
+
| **SFPaLMSOAP** | ScheduleFree PaLMForeachSOAP | + Fast convergence (loss-at-step)<br>+ less memory usage than PaLMForeachSOAP (more than AdamW)<br>- slower initial convergence than PaLMForeachSOAP (but allows higher LRs)<br>- higher overhead than AdamW (can be amortized) |
|
89
|
+
| **PrecondScheduleSFPaLMSOAP** | SFPaLMForeachSOAP with [preconditioner schedule](https://github.com/lixilinx/psgd_torch/), matching the error of PrecondEvery=2 with the cost of PrecondEvery=512 | + Better initial convergence than SFPaLMForeachSOAP<br>+ Significantly faster (sec/it) later<br>+ less memory usage than PaLMForeachSOAP (more than AdamW)<br>- slower initial convergence than PaLMForeachSOAP (but allows higher LRs)<br>- higher overhead than AdamW (can be amortized), goes to 0 with increasing number of steps |
|
90
|
+
| **PrecondSchedulePaLMSOAP** | PrecondScheduleSFPaLMForeachSOAP without schedule-free | + Best initial convergence<br>+ Significantly faster (sec/it) later<br>+ high stability<br>- more memory usage than PrecondScheduleSFPaLMForeachSOAP<br>- higher overhead than AdamW (can be amortized), goes to 0 with increasing number of steps |
|
91
|
+
| **PrecondScheduleSOAP** | PrecondScheduleSFPaLMForeachSOAP without PaLM's beta2 schedule | + Better initial convergence<br>+ Significantly faster (sec/it) later<br>- more memory usage than PrecondScheduleSFPaLMForeachSOAP<br>- higher overhead than AdamW (can be amortized), goes to 0 with increasing number of steps |
|
92
92
|
|
93
93
|
## Precond Schedule
|
94
94
|
|
@@ -1,4 +1,4 @@
|
|
1
|
-
heavyball/__init__.py,sha256=
|
1
|
+
heavyball/__init__.py,sha256=iqP428JWwwx-XDOZ0nUdbCkOLEyfoqVyWZLQLAcwxaw,2214
|
2
2
|
heavyball/cached_delayed_psgd_kron.py,sha256=DvjNNHzbnS-NDq965wve-VQ-ol7IFljYYGTuTwPHOhU,6971
|
3
3
|
heavyball/cached_psgd_kron.py,sha256=xy3-yRKFUvRTstJb_asMVp-k-5Zuw_HyILPi7BsuMKQ,6974
|
4
4
|
heavyball/delayed_psgd.py,sha256=rDDUj3miEn6HRJmKl-ZImsqkqBASSn8aC7MEV_06fzU,6017
|
@@ -17,8 +17,8 @@ heavyball/psgd_kron.py,sha256=2IpPj2TOExNGm8hSewi3er2GczJRNgC7r2J5yYSSA_0,5998
|
|
17
17
|
heavyball/pure_psgd.py,sha256=uA7W9a3Qm1sxHQhtNxaUYrmE5x55lP5iJOKy_qT8XaQ,5341
|
18
18
|
heavyball/schedule_free_palm_foreach_soap.py,sha256=zkcikH5wWbzq4kOrmBjilvY3iWzuUddcv2HNEPKr3MI,6366
|
19
19
|
heavyball/utils.py,sha256=Jqh7VdWGeiSdwaPtUNB9l14wuuFPSReLaTwJA3juFbM,28765
|
20
|
-
heavyball-0.17.
|
21
|
-
heavyball-0.17.
|
22
|
-
heavyball-0.17.
|
23
|
-
heavyball-0.17.
|
24
|
-
heavyball-0.17.
|
20
|
+
heavyball-0.17.1.dist-info/LICENSE,sha256=CGdGJim64YifGmUVPaeyRsxkvyExtClswhRNIp8FY_U,1322
|
21
|
+
heavyball-0.17.1.dist-info/METADATA,sha256=2FAgCpyuH4G-B_m0mhbl-sdkMizS1sd8oNmNkPpAKN0,11810
|
22
|
+
heavyball-0.17.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
23
|
+
heavyball-0.17.1.dist-info/top_level.txt,sha256=SzCxSVg_qCUPA4kZObW3Zyo4v-d_mMOD-p7a-WXTl2E,10
|
24
|
+
heavyball-0.17.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|