heavyball 0.16.0__tar.gz → 0.17.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {heavyball-0.16.0 → heavyball-0.17.1}/PKG-INFO +17 -17
- {heavyball-0.16.0 → heavyball-0.17.1}/README.md +16 -16
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball/__init__.py +7 -6
- heavyball-0.17.1/heavyball/cached_delayed_psgd_kron.py +146 -0
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball/cached_psgd_kron.py +13 -8
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball/delayed_psgd.py +8 -7
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball/p_adam.py +9 -7
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball/psgd_kron.py +8 -7
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball/pure_psgd.py +9 -6
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball/utils.py +18 -13
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball.egg-info/PKG-INFO +17 -17
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball.egg-info/SOURCES.txt +2 -0
- {heavyball-0.16.0 → heavyball-0.17.1}/setup.py +1 -1
- heavyball-0.17.1/test/test_bf16_q.py +52 -0
- {heavyball-0.16.0 → heavyball-0.17.1}/test/test_closure.py +1 -1
- {heavyball-0.16.0 → heavyball-0.17.1}/test/test_memory.py +2 -2
- {heavyball-0.16.0 → heavyball-0.17.1}/test/test_merge.py +1 -1
- {heavyball-0.16.0 → heavyball-0.17.1}/test/test_psgd.py +3 -14
- {heavyball-0.16.0 → heavyball-0.17.1}/LICENSE +0 -0
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball/foreach_adamw.py +0 -0
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball/foreach_adopt.py +0 -0
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball/foreach_laprop.py +0 -0
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball/foreach_sfadamw.py +0 -0
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball/foreach_soap.py +0 -0
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball/palm_foreach_sfadamw.py +0 -0
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball/palm_foreach_soap.py +0 -0
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball/precond_schedule_foreach_soap.py +0 -0
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball/precond_schedule_palm_foreach_soap.py +0 -0
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball/precond_schedule_sfpsoap.py +0 -0
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball/schedule_free_palm_foreach_soap.py +0 -0
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball.egg-info/dependency_links.txt +0 -0
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball.egg-info/requires.txt +0 -0
- {heavyball-0.16.0 → heavyball-0.17.1}/heavyball.egg-info/top_level.txt +0 -0
- {heavyball-0.16.0 → heavyball-0.17.1}/setup.cfg +0 -0
- {heavyball-0.16.0 → heavyball-0.17.1}/test/test_foreach.py +0 -0
- {heavyball-0.16.0 → heavyball-0.17.1}/test/test_no_grad.py +0 -0
- {heavyball-0.16.0 → heavyball-0.17.1}/test/test_soap.py +0 -0
{heavyball-0.16.0 → heavyball-0.17.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: heavyball
-Version: 0.16.0
+Version: 0.17.1
 Summary: Efficient optimizers
 Home-page: https://github.com/clashluke/heavyball
 Author: Lucas Nestler
@@ -32,8 +32,8 @@ A simple package of efficient optimizers
 The goal is not to thrive for completeness, full maintenance or abstraction, but instead to provide a simple
 largely static alternative to `torch.optim` with more and better optimizers.
 
-Currently (2024-11-
-recommended experimental optimizer is `
+Currently (2024-11-20, 0.17.0), the recommended stable optimizer is `PrecondSchedulePaLMSOAP` (see below). The
+recommended experimental optimizer is `DelayedPSGDKron` ([tuning guide](docs/psgd_efficiency.md)).
 
 ## Features
 
@@ -62,7 +62,7 @@ import heavyball
 model = torch.nn.Linear(16, 1)
 
 # Create an optimizer
-optimizer = heavyball.
+optimizer = heavyball.PrecondSchedulePaLMSOAP(model.parameters(), lr=1e-3)
 
 x = torch.randn(128, 16)
 y = torch.randn(128, 1)
@@ -76,19 +76,19 @@ for _ in range(1000):
 
 ## Optimizers
 
-| Name
-|
-| **
-| **
-| **
-| **
-| **
-| **
-| **
-| **
-| **
-| **
-| **
+| Name | Description | Advantages / Disadvantages |
+|-------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| **AdamW** | More efficient (speed, memory) [AdamW](https://arxiv.org/abs/1711.05101) | + Faster than AdamW<br>+ Possibly more (numerically) stable
+| **LaProp** | More efficient (speed, memory) [LaProp](https://arxiv.org/abs/2002.04839) | + Same cost as AdamW<br>+ Marginally better converence (better proofs)<br>+ Higher hyperparameter stability<br>- Not a guaranteed win (can be neutral)<br>- No "Slingshot" |
+| **ADOPT** | More efficient (speed, memory) [ADOPT](https://arxiv.org/abs/2411.02853) | + Same cost as AdamW<br>+ Rigorous mathematical convergence proofs, even for challenging models (GANs)<br>- Empirically underperforms LaProp<br>- no bf16 |
+| **SFAdamW** | More efficient (speed, memory) [ScheduleFree AdamW](https://arxiv.org/abs/2405.15682) | + Same cost as AdamW, but better eval perf<br>+ Full control over hyperparameters |
+| **PaLMSFAdamW** | ForeachSFAdamW with [PaLM's beta2 schedule](https://arxiv.org/abs/2204.02311) | + Same cost as AdamW, but better eval perf<br>+ Less control, but faster early and more stable late convergence<br>+ ScheduleFree<br>- slow early convergence |
+| **SOAP** | More efficient (speed, memory) [SOAP](https://arxiv.org/abs/2409.11321) | + Faster convergence (loss-at-step)<br>+ Full control over hyperparameters<br>- more memory usage<br>- more hyperparameters<br>- higher overhead than AdamW (can be ammortized; better loss-at-second) |
+| **PaLMSOAP** | ForeachSOAP with [PaLM's beta2 schedule](https://arxiv.org/abs/2204.02311) | + Faster convergence (loss-at-step)<br>+ Less control, but faster early and more stable late convergence<br>- more memory usage<br>- more hyperparameters<br>- higher overhead than AdamW (can be ammortized; better loss-at-second) |
+| **SFPaLMSOAP** | ScheduleFree PaLMForeachSOAP | + Fast convergence (loss-at-step)<br>+ less memory usage than PaLMForeachSOAP (more tham AdamW)<br>- slower initial convergence than PaLMForeachSOAP (but allows higher LRs)<br>- higher overhead than AdamW (can be ammortized) |
+| **PrecondScheduleSFPaLMSOAP** | SFPaLMForeachSOAP with [preconditioner schedule](https://github.com/lixilinx/psgd_torch/), matching the error of PrecondEvery=2 with the cost of PrecondEvery=512 | + Better initial convergence than SFPaLMForeachSOAP<br>+ Significantly faster (sec/it) later<br>+ less memory usage than PaLMForeachSOAP (more tham AdamW)<br>- slower initial convergence than PaLMForeachSOAP (but allows higher LRs)<br>- higher overhead than AdamW (can be ammortized), goes to 0 with increasing number of step |
+| **PrecondSchedulePaLMSOAP** | PrecondScheduleSFPaLMForeachSOAP without schedule-free | + Best initial convergence<br>+ Significantly faster (sec/it) later<br>+ high stability<br>- more memory usage than PrecondScheduleSFPaLMForeachSOAP<br>- higher overhead than AdamW (can be ammortized), goes to 0 with increasing number of steps |
+| **PrecondScheduleSOAP** | PrecondScheduleSFPaLMForeachSOAP without PaLM's beta2 schedule | + Better initial convergence<br>+ Significantly faster (sec/it) later<br>- more memory usage than PrecondScheduleSFPaLMForeachSOAP<br>- higher overhead than AdamW (can be ammortized), goes to 0 with increasing number of steps |
 
 ## Precond Schedule
 

{heavyball-0.16.0 → heavyball-0.17.1}/README.md
@@ -8,8 +8,8 @@ A simple package of efficient optimizers
 The goal is not to thrive for completeness, full maintenance or abstraction, but instead to provide a simple
 largely static alternative to `torch.optim` with more and better optimizers.
 
-Currently (2024-11-
-recommended experimental optimizer is `
+Currently (2024-11-20, 0.17.0), the recommended stable optimizer is `PrecondSchedulePaLMSOAP` (see below). The
+recommended experimental optimizer is `DelayedPSGDKron` ([tuning guide](docs/psgd_efficiency.md)).
 
 ## Features
 
@@ -38,7 +38,7 @@ import heavyball
 model = torch.nn.Linear(16, 1)
 
 # Create an optimizer
-optimizer = heavyball.
+optimizer = heavyball.PrecondSchedulePaLMSOAP(model.parameters(), lr=1e-3)
 
 x = torch.randn(128, 16)
 y = torch.randn(128, 1)
@@ -52,19 +52,19 @@ for _ in range(1000):
 
 ## Optimizers
 
-| Name
-|
-| **
-| **
-| **
-| **
-| **
-| **
-| **
-| **
-| **
-| **
-| **
+| Name | Description | Advantages / Disadvantages |
+|-------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| **AdamW** | More efficient (speed, memory) [AdamW](https://arxiv.org/abs/1711.05101) | + Faster than AdamW<br>+ Possibly more (numerically) stable
+| **LaProp** | More efficient (speed, memory) [LaProp](https://arxiv.org/abs/2002.04839) | + Same cost as AdamW<br>+ Marginally better converence (better proofs)<br>+ Higher hyperparameter stability<br>- Not a guaranteed win (can be neutral)<br>- No "Slingshot" |
+| **ADOPT** | More efficient (speed, memory) [ADOPT](https://arxiv.org/abs/2411.02853) | + Same cost as AdamW<br>+ Rigorous mathematical convergence proofs, even for challenging models (GANs)<br>- Empirically underperforms LaProp<br>- no bf16 |
+| **SFAdamW** | More efficient (speed, memory) [ScheduleFree AdamW](https://arxiv.org/abs/2405.15682) | + Same cost as AdamW, but better eval perf<br>+ Full control over hyperparameters |
+| **PaLMSFAdamW** | ForeachSFAdamW with [PaLM's beta2 schedule](https://arxiv.org/abs/2204.02311) | + Same cost as AdamW, but better eval perf<br>+ Less control, but faster early and more stable late convergence<br>+ ScheduleFree<br>- slow early convergence |
+| **SOAP** | More efficient (speed, memory) [SOAP](https://arxiv.org/abs/2409.11321) | + Faster convergence (loss-at-step)<br>+ Full control over hyperparameters<br>- more memory usage<br>- more hyperparameters<br>- higher overhead than AdamW (can be ammortized; better loss-at-second) |
+| **PaLMSOAP** | ForeachSOAP with [PaLM's beta2 schedule](https://arxiv.org/abs/2204.02311) | + Faster convergence (loss-at-step)<br>+ Less control, but faster early and more stable late convergence<br>- more memory usage<br>- more hyperparameters<br>- higher overhead than AdamW (can be ammortized; better loss-at-second) |
+| **SFPaLMSOAP** | ScheduleFree PaLMForeachSOAP | + Fast convergence (loss-at-step)<br>+ less memory usage than PaLMForeachSOAP (more tham AdamW)<br>- slower initial convergence than PaLMForeachSOAP (but allows higher LRs)<br>- higher overhead than AdamW (can be ammortized) |
+| **PrecondScheduleSFPaLMSOAP** | SFPaLMForeachSOAP with [preconditioner schedule](https://github.com/lixilinx/psgd_torch/), matching the error of PrecondEvery=2 with the cost of PrecondEvery=512 | + Better initial convergence than SFPaLMForeachSOAP<br>+ Significantly faster (sec/it) later<br>+ less memory usage than PaLMForeachSOAP (more tham AdamW)<br>- slower initial convergence than PaLMForeachSOAP (but allows higher LRs)<br>- higher overhead than AdamW (can be ammortized), goes to 0 with increasing number of step |
+| **PrecondSchedulePaLMSOAP** | PrecondScheduleSFPaLMForeachSOAP without schedule-free | + Best initial convergence<br>+ Significantly faster (sec/it) later<br>+ high stability<br>- more memory usage than PrecondScheduleSFPaLMForeachSOAP<br>- higher overhead than AdamW (can be ammortized), goes to 0 with increasing number of steps |
+| **PrecondScheduleSOAP** | PrecondScheduleSFPaLMForeachSOAP without PaLM's beta2 schedule | + Better initial convergence<br>+ Significantly faster (sec/it) later<br>- more memory usage than PrecondScheduleSFPaLMForeachSOAP<br>- higher overhead than AdamW (can be ammortized), goes to 0 with increasing number of steps |
 
 ## Precond Schedule
 

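For context, here is how the quick-start that the README hunks above now point at fits together when assembled. This is a minimal sketch: the loop body (loss, `backward`, `step`) is standard PyTorch filled in for illustration, not quoted from the package.

```python
import torch
import heavyball

model = torch.nn.Linear(16, 1)

# Create an optimizer (the newly recommended stable default in this release)
optimizer = heavyball.PrecondSchedulePaLMSOAP(model.parameters(), lr=1e-3)

x = torch.randn(128, 16)
y = torch.randn(128, 1)

for _ in range(1000):
    optimizer.zero_grad()
    loss = torch.nn.functional.mse_loss(model(x), y)  # assumed loss; any objective works
    loss.backward()
    optimizer.step()
```
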
{heavyball-0.16.0 → heavyball-0.17.1}/heavyball/__init__.py
@@ -14,31 +14,32 @@ from .precond_schedule_sfpsoap import PrecondScheduleSFPaLMSOAP
 from .psgd_kron import ForeachPSGDKron
 from .pure_psgd import ForeachPurePSGD
 from .schedule_free_palm_foreach_soap import SFPaLMForeachSOAP
+from .cached_delayed_psgd_kron import ForeachCachedDelayedPSGDKron
 
 PalmForEachSoap = PaLMForeachSOAP
 
 PaLMSOAP = PaLMForeachSOAP
 PaLMSFAdamW = PaLMForeachSFAdamW
 PaLMSFSoap = SFPaLMForeachSOAP
-PaLMForeachSOAP = PaLMForeachSOAP
 PrecondScheduleSFPaLMSOAP = PrecondScheduleSFPaLMSOAP
 SOAP = ForeachSOAP
 SFAdamW = ForeachSFAdamW
 LaProp = ForeachLaProp
 ADOPT = ForeachADOPT
-
-
+PrecondScheduleSOAP = PrecondScheduleForeachSOAP
+PrecondSchedulePaLMSOAP = PrecondSchedulePaLMForeachSOAP
 PSGDKron = ForeachPSGDKron
 AdamW = ForeachAdamW
 PurePSGD = ForeachPurePSGD
 PaLMPAdam = ForeachPaLMPAdam
 DelayedPSGD = ForeachDelayedPSGD
 CachedPSGDKron = ForeachCachedPSGDKron
+CachedDelayedPSGDKron = ForeachCachedDelayedPSGDKron
 
 __all__ = ['PalmForEachSoap', 'PaLMForeachSFAdamW', 'PaLMForeachSOAP', 'SFPaLMForeachSOAP', 'PrecondScheduleSFPaLMSOAP',
            'ForeachSOAP', 'ForeachSFAdamW', 'ForeachLaProp', 'ForeachADOPT', 'PrecondScheduleForeachSOAP',
            'PrecondSchedulePaLMForeachSOAP', 'ForeachPSGDKron', 'ForeachAdamW', 'ForeachPurePSGD', 'ForeachPaLMPAdam',
-           'ForeachDelayedPSGD', 'ForeachCachedPSGDKron', #
-           'PaLMSOAP', 'PaLMSFAdamW', 'PaLMSFSoap', 'PaLMSFAdamW', '
+           'ForeachDelayedPSGD', 'ForeachCachedPSGDKron', 'ForeachCachedDelayedPSGDKron', #
+           'PaLMSOAP', 'PaLMSFAdamW', 'PaLMSFSoap', 'PaLMSFAdamW', 'PrecondScheduleSFPaLMSOAP',
            'SOAP', 'SFAdamW', 'LaProp', 'ADOPT', 'PSGDKron', 'AdamW', 'PurePSGD', 'PaLMPAdam', 'DelayedPSGD',
-           'CachedPSGDKron']
+           'CachedPSGDKron', 'CachedDelayedPSGDKron', 'PrecondScheduleSOAP', 'PrecondSchedulePaLMSOAP']

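The `__init__.py` hunk wires the new optimizer into the package namespace and adds short aliases for the preconditioner-schedule SOAP variants. A small sanity-check sketch, assuming heavyball 0.17.1 is installed:

```python
import heavyball

# The short names added in this release are plain module-level aliases,
# so both spellings refer to the same optimizer class.
assert heavyball.CachedDelayedPSGDKron is heavyball.ForeachCachedDelayedPSGDKron
assert heavyball.PrecondScheduleSOAP is heavyball.PrecondScheduleForeachSOAP
assert heavyball.PrecondSchedulePaLMSOAP is heavyball.PrecondSchedulePaLMForeachSOAP
```
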
heavyball-0.17.1/heavyball/cached_delayed_psgd_kron.py (new file)
@@ -0,0 +1,146 @@
+"""
+Originally from Evan Walters and Omead Pooladzandi, 2024
+Modified under Creative Commons Attribution 4.0 International
+Source available at https://github.com/evanatyourservice/kron_torch/blob/97a2b5ee8a1a4c29e4780bbf6c521e545189eff9/kron_torch/kron.py
+"""
+
+from typing import Optional
+
+import torch
+from heavyball.utils import einsum_base
+
+from .utils import update_param_, warmup, psgd_precond_grad, init_Q_exprs, trust_region_clip_, PSGDBase, \
+    precond_update_prob_schedule, split_p_and_g_in_group, line_to_triu, triu_to_line, set_, einsum_base, promote
+
+
+class ForeachCachedDelayedPSGDKron(PSGDBase):
+    """
+    Implements PSGD with off-by-one preconditioning (akin to ADOPT and SOAP) with cached preconditioners.
+
+
+    Args:
+        params (iterable): Iterable of parameters to optimize or dicts defining
+            parameter groups.
+        lr (float): Learning rate.
+        b1 (float): Momentum parameter.
+        weight_decay (float): Weight decay (L2 penalty).
+        preconditioner_update_probability (callable or float, optional): Probability of
+            updating the preconditioner. If None, defaults to a schedule that anneals
+            from 1.0 to 0.03 by 4000 steps.
+        max_size_triangular (int): Max size for dim's preconditioner to be triangular.
+        min_ndim_triangular (int): Minimum number of dimensions a layer needs
+            to have triangular preconditioners.
+        memory_save_mode: (string, optional), None, 'one_diag', or 'all_diag', None is default
+            to set all preconditioners to be triangular, 'one_diag' sets the largest
+            or last dim to be diagonal per layer, and 'all_diag' sets all preconditioners
+            to be diagonal.
+        momentum_into_precond_update: (bool), whether to send momentum into preconditioner
+            update instead of raw gradients.
+    """
+
+    def __init__(self, params, lr=0.001, beta=0.9, weight_decay=0.0, preconditioner_update_probability=None,
+                 max_size_triangular=2048, min_ndim_triangular=2, memory_save_mode=None,
+                 momentum_into_precond_update=True, warmup_steps: int = 1, merge_dims: bool = False,
+                 split: bool = False, clip_fn: Optional[callable] = None, store_triu_as_line: bool = True,
+                 foreach: bool = True, q_dtype='float32'):
+        if not 0.0 <= lr:
+            raise ValueError(f"Invalid learning rate: {lr}")
+        if not 0.0 <= beta < 1.0:
+            raise ValueError(f"Invalid beta parameter: {beta}")
+        if not 0.0 <= weight_decay:
+            raise ValueError(f"Invalid weight_decay value: {weight_decay}")
+
+        if preconditioner_update_probability is None:
+            preconditioner_update_probability = precond_update_prob_schedule()
+        if clip_fn is None:
+            clip_fn = lambda x: trust_region_clip_(x, 0.9, 1.5)
+        self.preconditioner_update_probability = preconditioner_update_probability
+        self.clip_fn = clip_fn
+
+        defaults = dict(lr=lr, beta=beta, weight_decay=weight_decay, max_size_triangular=max_size_triangular,
+                        min_ndim_triangular=min_ndim_triangular, memory_save_mode=memory_save_mode,
+                        momentum_into_precond_update=momentum_into_precond_update, precond_lr=0.1,
+                        # precond lr hardcoded to 0.1
+                        precond_init_scale=1.0,  # precond init scale hardcoded to 1.0
+                        step=0, warmup_steps=warmup_steps, merge_dims=merge_dims, split=split,
+                        store_triu_as_line=store_triu_as_line,
+                        q_dtype=q_dtype)
+        super().__init__(params, defaults, foreach)
+
+        self._prob_step = 0
+
+    def _step(self, group):
+        # update preconditioners all together
+        update_prob = self.preconditioner_update_probability
+        if callable(update_prob):
+            update_prob = update_prob(self._prob_step)
+        do_update = self.rng.random() < update_prob
+        self._prob_step += 1
+
+        momentum_into_precond_update = group.get("momentum_into_precond_update", True)
+        precond_init_scale = group['precond_init_scale']
+        max_size_triangular = group['max_size_triangular']
+        min_ndim_triangular = group['min_ndim_triangular']
+        memory_save_mode = group['memory_save_mode']
+        precond_lr = group['precond_lr']
+        weight_decay = group['weight_decay']
+        lr = group['lr']
+        beta = group['beta']
+        store_triu_as_line = group['store_triu_as_line']
+        q_dtype = getattr(torch, group['q_dtype'])
+
+        vals = []
+
+        for p, g in split_p_and_g_in_group(group):
+            state = self.state_(p)
+
+            if 'Q' not in state:
+                state["exp_avg"] = torch.zeros_like(g)
+                Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular, min_ndim_triangular,
+                                                 memory_save_mode, dtype=q_dtype)
+                state['Q'] = triu_to_line(Q) if store_triu_as_line else Q
+                state['Q_cache'] = [torch.empty_like(q) for q in Q]
+
+                expr = [f'{c.upper()}{c}' if q_.ndim == 2 else c for c, q_ in zip(einsum_base, Q)]
+                expr = ','.join(expr)
+                grad_expr = ''.join(c for c, _ in zip(einsum_base, g.shape))
+                out_expr = ''.join(c.upper() if c.upper() in expr else c for c in grad_expr)
+                expr = f'{expr},{grad_expr}->{out_expr}'
+
+                state['cache_expr'] = expr
+
+            vals.append((p, g, state["exp_avg"], state["Q"], state['Q_cache']))
+
+        if not vals:
+            return
+
+        p_list, grad_list, exp_avg_list, Q_list, Q_cache_list = zip(*vals)
+        del vals
+
+        group["step"] += 1
+
+        torch._foreach_lerp_(exp_avg_list, grad_list, (1 - beta) / (1 - beta ** group["step"]))
+
+        grad_list, Q_list, Q_cache_list, exp_avg_list = list(grad_list), list(Q_list), list(Q_cache_list), list(
+            exp_avg_list)
+        for i, (p, g) in enumerate(zip(p_list, grad_list)):
+            cached_q = Q_cache_list.pop(0)
+            q_orig = Q_list.pop(0)
+            ea = exp_avg_list.pop(0)
+
+            if do_update:
+                q = line_to_triu(q_orig) if store_triu_as_line else q_orig
+                q32 = [promote(q_) for q_ in q]
+                self.balance([g], [q32])
+                self.do_update([p], [ea if momentum_into_precond_update else g], [q32], precond_lr, [q_orig], store_triu_as_line=store_triu_as_line)
+                for c_, q_ in zip(cached_q, q):
+                    if q_.ndim == 2:
+                        torch.matmul(q_.T.conj(), q_, out=c_)
+                    else:
+                        torch.mul(q_.conj(), q_, out=c_)
+
+            set_(g, torch.einsum(self.state_(p)['cache_expr'], *cached_q, ea))
+        grad_list = self.clip_fn(grad_list)
+
+        lr = -warmup(lr, group['step'], group['warmup_steps'])
+        update_param_(p_list, grad_list, lr, weight_decay)

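The core trick in the new file is the cached einsum: whenever the preconditioner is refreshed, each factor's QᴴQ (or q·q for diagonal factors) is stored in `Q_cache`, and a matching einsum expression is built once per parameter so that the hot path is a single `torch.einsum` over the cached factors. The sketch below mirrors that expression-building logic for one 2-D parameter; `einsum_base` is assumed here to be a run of lowercase letters, which this diff does not show.

```python
import torch

# Assumed stand-in for heavyball.utils.einsum_base (a run of lowercase letters).
einsum_base = 'abcdefghijklmnopqrstuvwxyz'

def build_cache_expr(Q, grad):
    # Mirrors the expression construction in ForeachCachedDelayedPSGDKron._step:
    # a 2-D (triangular) factor contributes a pair like 'Aa', a diagonal one just 'a'.
    expr = ','.join(f'{c.upper()}{c}' if q.ndim == 2 else c for c, q in zip(einsum_base, Q))
    grad_expr = ''.join(c for c, _ in zip(einsum_base, grad.shape))
    out_expr = ''.join(c.upper() if c.upper() in expr else c for c in grad_expr)
    return f'{expr},{grad_expr}->{out_expr}'

g = torch.randn(4, 3)                           # gradient of a 4x3 parameter
Q = [torch.randn(4, 4).triu(), torch.randn(3)]  # one triangular factor, one diagonal factor
expr = build_cache_expr(Q, g)                   # 'Aa,b,ab->Ab'
# The cache holds Q^H Q (2-D) or q*q (diagonal), so preconditioning is one einsum:
cache = [q.T.conj() @ q if q.ndim == 2 else q.conj() * q for q in Q]
out = torch.einsum(expr, *cache, g)
print(expr, out.shape)
```
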
{heavyball-0.16.0 → heavyball-0.17.1}/heavyball/cached_psgd_kron.py
@@ -10,7 +10,7 @@ import torch
 from heavyball.utils import einsum_base
 
 from .utils import update_param_, warmup, psgd_precond_grad, init_Q_exprs, trust_region_clip_, PSGDBase, \
-    precond_update_prob_schedule, split_p_and_g_in_group, line_to_triu, triu_to_line, set_, einsum_base
+    precond_update_prob_schedule, split_p_and_g_in_group, line_to_triu, triu_to_line, set_, einsum_base, promote
 
 
 class ForeachCachedPSGDKron(PSGDBase):
@@ -40,7 +40,7 @@ class ForeachCachedPSGDKron(PSGDBase):
                  max_size_triangular=2048, min_ndim_triangular=2, memory_save_mode=None,
                  momentum_into_precond_update=True, warmup_steps: int = 1, merge_dims: bool = False,
                  split: bool = False, clip_fn: Optional[callable] = None, store_triu_as_line: bool = True,
-                 foreach: bool = True):
+                 foreach: bool = True, q_dtype='float32'):
         if not 0.0 <= lr:
             raise ValueError(f"Invalid learning rate: {lr}")
         if not 0.0 <= beta < 1.0:
@@ -61,7 +61,8 @@ class ForeachCachedPSGDKron(PSGDBase):
                         # precond lr hardcoded to 0.1
                         precond_init_scale=1.0,  # precond init scale hardcoded to 1.0
                         step=0, warmup_steps=warmup_steps, merge_dims=merge_dims, split=split,
-                        store_triu_as_line=store_triu_as_line
+                        store_triu_as_line=store_triu_as_line,
+                        q_dtype=q_dtype)
         super().__init__(params, defaults, foreach)
 
         self._prob_step = 0
@@ -84,6 +85,7 @@ class ForeachCachedPSGDKron(PSGDBase):
         lr = group['lr']
         beta = group['beta']
         store_triu_as_line = group['store_triu_as_line']
+        q_dtype = getattr(torch, group['q_dtype'])
 
         vals = []
 
@@ -93,7 +95,7 @@ class ForeachCachedPSGDKron(PSGDBase):
             if 'Q' not in state:
                 state["exp_avg"] = torch.zeros_like(g)
                 Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular, min_ndim_triangular,
-                                                 memory_save_mode, dtype=
+                                                 memory_save_mode, dtype=q_dtype)
                 state['Q'] = triu_to_line(Q) if store_triu_as_line else Q
                 state['Q_cache'] = [torch.empty_like(q) for q in Q]
 
@@ -124,18 +126,21 @@ class ForeachCachedPSGDKron(PSGDBase):
             q_orig = Q_list.pop(0)
             ea = exp_avg_list.pop(0)
 
+            new = torch.einsum(self.state_(p)['cache_expr'], *cached_q, ea)
+
             if do_update:
                 q = line_to_triu(q_orig) if store_triu_as_line else q_orig
-
-                self.
-
+                q32 = [promote(q_) for q_ in q]
+                self.balance([g], [q32])
+                self.do_update([p], [ea if momentum_into_precond_update else g], [q32], precond_lr, [q_orig], store_triu_as_line=store_triu_as_line)
                 for c_, q_ in zip(cached_q, q):
                     if q_.ndim == 2:
                         torch.matmul(q_.T.conj(), q_, out=c_)
                     else:
                         torch.mul(q_.conj(), q_, out=c_)
 
-            set_(g,
+            set_(g, new)
+
         grad_list = self.clip_fn(grad_list)
 
         lr = -warmup(lr, group['step'], group['warmup_steps'])

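Every PSGD variant in this release gains a `q_dtype` argument. It travels through the param-group defaults as a plain string and is resolved with `getattr(torch, ...)` at step time, exactly as in the hunks above. A minimal sketch of that resolution:

```python
import torch

# q_dtype is stored as a string (e.g. 'float32' or 'bfloat16') and only turned
# into a real torch dtype inside _step.
group = {'q_dtype': 'bfloat16'}          # illustrative param group
q_dtype = getattr(torch, group['q_dtype'])
q = torch.ones(8, 8, dtype=q_dtype)      # a stand-in preconditioner factor
print(q_dtype, q.element_size(), 'bytes per element')  # bf16 halves Q's memory
```
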
{heavyball-0.16.0 → heavyball-0.17.1}/heavyball/delayed_psgd.py
@@ -8,7 +8,7 @@ import torch
 from heavyball.utils import copy_stochastic_list_
 
 from .utils import update_param_, warmup, psgd_precond_grad, init_Q_exprs, trust_region_clip_, PSGDBase, \
-    precond_update_prob_schedule, split_p_and_g_in_group, triu_to_line, line_to_triu, set_
+    precond_update_prob_schedule, split_p_and_g_in_group, triu_to_line, line_to_triu, set_, promote
 
 
 class ForeachDelayedPSGD(PSGDBase):
@@ -39,7 +39,7 @@ class ForeachDelayedPSGD(PSGDBase):
                  max_size_triangular=2048, min_ndim_triangular=2, memory_save_mode=None,
                  momentum_into_precond_update=True, warmup_steps: int = 1, merge_dims: bool = False,
                  split: bool = False, clip_fn: callable = None, store_triu_as_line: bool = True,
-                 foreach: bool = True):
+                 foreach: bool = True, q_dtype='float32'):
         if not 0.0 <= lr:
             raise ValueError(f"Invalid learning rate: {lr}")
         if not 0.0 <= beta < 1.0:
@@ -60,7 +60,7 @@ class ForeachDelayedPSGD(PSGDBase):
                         # precond lr hardcoded to 0.1
                         precond_init_scale=1.0,  # precond init scale hardcoded to 1.0
                         step=0, warmup_steps=warmup_steps, merge_dims=merge_dims, split=split,
-                        store_triu_as_line=store_triu_as_line)
+                        store_triu_as_line=store_triu_as_line, q_dtype=q_dtype)
         super().__init__(params, defaults, foreach)
 
         self._prob_step = 0
@@ -83,6 +83,7 @@ class ForeachDelayedPSGD(PSGDBase):
         lr = group['lr']
         beta = group['beta']
         store_triu_as_line = group['store_triu_as_line']
+        q_dtype = getattr(torch, group['q_dtype'])
 
         vals = []
 
@@ -92,7 +93,7 @@ class ForeachDelayedPSGD(PSGDBase):
             if 'Q' not in state:
                 state["exp_avg"] = torch.zeros_like(g)
                 Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular, min_ndim_triangular,
-                                                 memory_save_mode, dtype=
+                                                 memory_save_mode, dtype=q_dtype)
                 state["Q"] = triu_to_line(Q) if store_triu_as_line else Q
 
             vals.append((p, g, state["exp_avg"], state["Q"]))
@@ -114,9 +115,9 @@ class ForeachDelayedPSGD(PSGDBase):
             q = line_to_triu(q_orig) if store_triu_as_line else q_orig
             new = psgd_precond_grad(q, self.state_(p)["exprs"], ea)
             if do_update:
-
-
-                self.balance([g], [
+                q32 = [promote(q_) for q_ in q]
+                self.do_update([p], [ea if momentum_into_precond_update else g], [q32], precond_lr, [q_orig], store_triu_as_line=store_triu_as_line)
+                self.balance([g], [q32])
             set_(g, new)
 
         grad_list = self.clip_fn(grad_list)

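`ForeachDelayedPSGD` keeps the off-by-one ordering visible in the last hunk: the gradient is preconditioned with the existing Q (`new = psgd_precond_grad(...)`) before the preconditioner update runs, so a refreshed Q only affects later steps. A schematic sketch of that ordering with hypothetical helper names (not heavyball APIs):

```python
# Schematic only: `precondition` and `update_preconditioner` are hypothetical
# stand-ins illustrating the delayed ("off-by-one") order used above.
def delayed_step(q, ea, precondition, update_preconditioner, do_update):
    preconditioned = precondition(q, ea)    # uses the stale q for this step
    if do_update:
        q = update_preconditioner(q, ea)    # refreshed q only affects later steps
    return preconditioned, q
```
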
{heavyball-0.16.0 → heavyball-0.17.1}/heavyball/p_adam.py
@@ -8,7 +8,7 @@ import torch
 from heavyball.utils import triu_to_line, line_to_triu
 
 from .utils import update_param_, warmup, psgd_precond_grad, init_Q_exprs, PSGDBase, precond_update_prob_schedule, \
-    exp_avg_sq_, beta_debias, split_p_and_g_in_group
+    exp_avg_sq_, beta_debias, split_p_and_g_in_group, promote
 
 
 class ForeachPaLMPAdam(PSGDBase):
@@ -39,7 +39,7 @@ class ForeachPaLMPAdam(PSGDBase):
                  momentum_into_precond_update=True, warmup_steps: int = 1, betas=(None, None), beta: float = 0.9,
                  beta2_scale: float = 0.8, merge_dims: bool = False, split: bool = False, clip_fn: callable = None,
                  store_triu_as_line: bool = True,
-                 foreach: bool = True):
+                 foreach: bool = True, q_dtype='float32'):
         if not 0.0 <= lr:
             raise ValueError(f"Invalid learning rate: {lr}")
         if not 0.0 <= weight_decay:
@@ -60,7 +60,7 @@ class ForeachPaLMPAdam(PSGDBase):
                         # precond lr hardcoded to 0.1
                         precond_init_scale=1.0,  # precond init scale hardcoded to 1.0
                         step=0, warmup_steps=warmup_steps, beta=beta, beta2_scale=beta2_scale, merge_dims=merge_dims,
-                        split=split, store_triu_as_line=store_triu_as_line)
+                        split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype)
         super().__init__(params, defaults, foreach)
 
         self._prob_step = 0
@@ -81,6 +81,7 @@ class ForeachPaLMPAdam(PSGDBase):
         weight_decay = group['weight_decay']
         lr = group['lr']
         store_triu_as_line = group['store_triu_as_line']
+        q_dtype = getattr(torch, group['q_dtype'])
 
         vals = []
 
@@ -91,7 +92,7 @@ class ForeachPaLMPAdam(PSGDBase):
             state['exp_avg'] = torch.zeros_like(g)
             state['exp_avg_sq'] = torch.zeros_like(g)
             Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular,
-                                             min_ndim_triangular, memory_save_mode, dtype=
+                                             min_ndim_triangular, memory_save_mode, dtype=q_dtype)
             state['Q'] = triu_to_line(Q) if store_triu_as_line else Q
 
             vals.append((p, g, state["Q"], state['exp_avg'], state['exp_avg_sq']))
@@ -106,9 +107,10 @@ class ForeachPaLMPAdam(PSGDBase):
 
         Q_triu = [line_to_triu(q) if store_triu_as_line else q for q in Q_list]
         if do_update:
-
-
-
+            for g, p, q_, q_orig in zip(grad_list, p_list, Q_triu, Q_list):
+                q32 = [promote(qq_) for qq_ in q_]
+                self.balance([g], [q32])
+                self.do_update([p], [g], [q32], precond_lr, [q_orig], store_triu_as_line=store_triu_as_line)
         torch._foreach_lerp_(exp_avg, grad_list, 1 - beta_debias(group['beta'], group['step']))
 
         beta2 = 1 - group['step'] ** -group['beta2_scale']

{heavyball-0.16.0 → heavyball-0.17.1}/heavyball/psgd_kron.py
@@ -9,7 +9,7 @@ from typing import Optional
 import torch
 
 from .utils import update_param_, warmup, psgd_precond_grad, init_Q_exprs, trust_region_clip_, PSGDBase, \
-    precond_update_prob_schedule, split_p_and_g_in_group, line_to_triu, triu_to_line, set_
+    precond_update_prob_schedule, split_p_and_g_in_group, line_to_triu, triu_to_line, set_, promote
 
 
 class ForeachPSGDKron(PSGDBase):
@@ -39,7 +39,7 @@ class ForeachPSGDKron(PSGDBase):
                  max_size_triangular=2048, min_ndim_triangular=2, memory_save_mode=None,
                  momentum_into_precond_update=True, warmup_steps: int = 1, merge_dims: bool = False,
                  split: bool = False, clip_fn: Optional[callable] = None, store_triu_as_line: bool = True,
-                 foreach: bool = True):
+                 foreach: bool = True, q_dtype='float32'):
         if not 0.0 <= lr:
             raise ValueError(f"Invalid learning rate: {lr}")
         if not 0.0 <= beta < 1.0:
@@ -60,7 +60,7 @@ class ForeachPSGDKron(PSGDBase):
                         # precond lr hardcoded to 0.1
                         precond_init_scale=1.0,  # precond init scale hardcoded to 1.0
                         step=0, warmup_steps=warmup_steps, merge_dims=merge_dims, split=split,
-                        store_triu_as_line=store_triu_as_line)
+                        store_triu_as_line=store_triu_as_line, q_dtype=q_dtype)
         super().__init__(params, defaults, foreach)
 
         self._prob_step = 0
@@ -83,6 +83,7 @@ class ForeachPSGDKron(PSGDBase):
         lr = group['lr']
         beta = group['beta']
         store_triu_as_line = group['store_triu_as_line']
+        q_dtype = getattr(torch, group['q_dtype'])
 
         vals = []
 
@@ -92,7 +93,7 @@ class ForeachPSGDKron(PSGDBase):
             if 'Q' not in state:
                 state["exp_avg"] = torch.zeros_like(g)
                 Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular, min_ndim_triangular,
-                                                 memory_save_mode, dtype=
+                                                 memory_save_mode, dtype=q_dtype)
                 state['Q'] = triu_to_line(Q) if store_triu_as_line else Q
 
             vals.append((p, g, state["exp_avg"], state["Q"]))
@@ -114,9 +115,9 @@ class ForeachPSGDKron(PSGDBase):
             q = line_to_triu(q_orig) if store_triu_as_line else q_orig
 
             if do_update:
-
-                self.
-
+                q32 = [promote(q_) for q_ in q]
+                self.balance([ea if momentum_into_precond_update else g], [q32])
+                self.do_update([p], [g], [q32], precond_lr, [q_orig], store_triu_as_line=store_triu_as_line)
             set_(g, psgd_precond_grad(q, self.state_(p)["exprs"], ea))
 
         grad_list = self.clip_fn(grad_list)

{heavyball-0.16.0 → heavyball-0.17.1}/heavyball/pure_psgd.py
@@ -5,9 +5,10 @@ Source available at https://github.com/evanatyourservice/kron_torch/blob/97a2b5e
 """
 
 import torch
+from heavyball.utils import copy_stochastic_list_
 
 from .utils import update_param_, warmup, psgd_precond_grad, init_Q_exprs, PSGDBase, precond_update_prob_schedule, \
-    split_p_and_g_in_group, line_to_triu, triu_to_line
+    split_p_and_g_in_group, line_to_triu, triu_to_line, promote
 
 
 class ForeachPurePSGD(PSGDBase):
@@ -37,7 +38,7 @@ class ForeachPurePSGD(PSGDBase):
                  max_size_triangular=2048, min_ndim_triangular=2, memory_save_mode=None,
                  momentum_into_precond_update=True, warmup_steps: int = 1, merge_dims: bool = False,
                  split: bool = False, clip_fn: callable = None, store_triu_as_line: bool = True,
-                 foreach: bool = True):
+                 foreach: bool = True, q_dtype='float32'):
         if not 0.0 <= lr:
             raise ValueError(f"Invalid learning rate: {lr}")
         if not 0.0 <= weight_decay:
@@ -56,7 +57,7 @@ class ForeachPurePSGD(PSGDBase):
                         # precond lr hardcoded to 0.1
                         precond_init_scale=1.0,  # precond init scale hardcoded to 1.0
                         step=0, warmup_steps=warmup_steps, merge_dims=merge_dims, split=split,
-                        store_triu_as_line=store_triu_as_line)
+                        store_triu_as_line=store_triu_as_line, q_dtype=q_dtype)
         super().__init__(params, defaults, foreach)
 
         self._prob_step = 0
@@ -77,6 +78,7 @@ class ForeachPurePSGD(PSGDBase):
         weight_decay = group['weight_decay']
         lr = group['lr']
         store_triu_as_line = group['store_triu_as_line']
+        q_dtype = getattr(torch, group['q_dtype'])
 
         vals = []
 
@@ -85,7 +87,7 @@ class ForeachPurePSGD(PSGDBase):
 
             if 'Q' not in state:
                 Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular, min_ndim_triangular,
-                                                 memory_save_mode, dtype=
+                                                 memory_save_mode, dtype=q_dtype)
                 state['Q'] = triu_to_line(Q) if store_triu_as_line else Q
 
             vals.append((p, g, state["Q"]))
@@ -104,8 +106,9 @@ class ForeachPurePSGD(PSGDBase):
             q = line_to_triu(q_orig) if store_triu_as_line else q_orig
 
             if do_update:
-
-                self.
+                q32 = [promote(q_) for q_ in q]
+                self.balance([g], [q32])
+                self.do_update([p], [g], [q32], precond_lr, [q_orig], store_triu_as_line=store_triu_as_line)
             psgd_precond_grad(q, self.state_(p)["exprs"], g, inplace=True)
 
         grad_list = self.clip_fn(grad_list)

{heavyball-0.16.0 → heavyball-0.17.1}/heavyball/utils.py
@@ -325,9 +325,9 @@ def compute_ggt(grad, GG, max_precond_dim, precondition_1d, beta):
 
 
 def promote(x):
-    if x
+    if x in (torch.bfloat16, torch.float16):
         return torch.float32
-    if x.dtype in (torch.bfloat16, torch.float16):
+    if hasattr(x, 'dtype') and x.dtype in (torch.bfloat16, torch.float16):
         return x.float()
     return x
 
@@ -468,15 +468,15 @@ class ScheduleFree(StatefulOptimizer):
 
 def copy_stochastic_list_(target: List[torch.Tensor], source: List[torch.Tensor]):
     for t, s in zip(target, source):
-
-            copy_stochastic_(t, s)
-        else:
-            set_(t, s)
+        copy_stochastic_(t, s)
 
 
 def copy_stochastic_(target: torch.Tensor, source: torch.Tensor):
     if target.data_ptr() == source.data_ptr():
         return
+    if target.dtype != torch.bfloat16:
+        set_(target, source)
+        return
 
     """Taken as-is from https://github.com/pytorch/pytorch/issues/120376#issuecomment-1974828905"""
     # create a random 16 bit integer
@@ -555,7 +555,7 @@ def init_Q_exprs(t, scale, max_size, min_ndim_triangular, memory_save_mode, dtyp
     for i, (size, dim_d) in enumerate(zip(shape, dim_diag)):
         if size == 1 or size > max_size or len(shape) < min_ndim_triangular or dim_d:
             # use diagonal matrix as preconditioner for this dim
-            Q.append(scale * torch.ones(size, dtype=dtype, device=t.device))
+            Q.append(scale * torch.ones(size, dtype=promote(dtype), device=t.device))
 
             piece1A.append(letters[i])
             piece2A = piece2A + letters[i]
@@ -669,11 +669,11 @@ def psgd_update_precond(Q, exprs, V, G, step, tiny):
 @decorator
 def psgd_precond_grad(Q, exprs, G, inplace: bool = False):
     """Precondition gradient G with preconditioner Q."""
-    out = torch.einsum(exprs[-1], *[q.conj() for q in Q], *Q, G)
+    out = torch.einsum(exprs[-1], *[q.conj() for q in Q], *Q, G.to(Q[0].dtype))
     if inplace:
         set_(G, out)
         return G
-    return out
+    return out.to(G.dtype)
 
 
 def norm_clip_(x, scale=None):
@@ -768,28 +768,33 @@ def line_to_triu(Q_list: List[Tuple[Optional[List[int]], torch.Tensor]]):
 def update_triu_(q_state, materialised):
     for (shape0, q), (shape1, m) in zip(q_state, triu_to_line(materialised)):
         assert shape0 == shape1
-
+        copy_stochastic_(q, m)
 
 
 class PSGDBase(StatefulOptimizer):
+    balance_probability: float = 0.01
+
     def __init__(self, parameters, groups, foreach: bool = True):
         super().__init__(parameters, groups, foreach)
         self.rng = random.Random(0x1923213)
         self._tiny = torch.finfo(torch.bfloat16).tiny
 
     def balance(self, grad_list, Q_list):
-        if self.rng.random() >
+        if self.rng.random() > self.balance_probability:
             return
 
        for g, q in zip(grad_list, Q_list):
            if g.dim() > 1:
                psgd_balance_Q(q)
 
-    def do_update(self, p_list, grad_list, q_list, precond_lr, original_q: Optional[List] = None):
+    def do_update(self, p_list, grad_list, q_list, precond_lr, original_q: Optional[List] = None, store_triu_as_line=False):
         for i, (p, grad, Q) in enumerate(zip(p_list, grad_list, q_list)):
             psgd_update_precond(Q, self.state_(p)["exprs"], torch.randn_like(grad), grad, precond_lr, self._tiny)
             if original_q:
-
+                if store_triu_as_line:
+                    update_triu_(original_q[i], Q)
+                else:
+                    copy_stochastic_(original_q[i], Q)
 
 
 def precond_update_prob_schedule(max_prob=1.0, min_prob=0.03, decay=0.001, flat_start=250):

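The `utils.py` changes are what make bf16 preconditioners workable: `promote` now accepts either a dtype or a tensor, `psgd_precond_grad` casts the gradient to the factors' dtype and back, and `copy_stochastic_` falls back to a plain copy for non-bf16 targets. A rough sketch of the resulting round-trip for one stored factor, with an ordinary `copy_` standing in for heavyball's stochastic rounding:

```python
import torch

# Factors stored in bf16 are promoted to fp32 for the preconditioner update,
# then written back into the stored bf16 copy. heavyball uses copy_stochastic_
# (stochastic rounding) for that write-back; a plain copy_ stands in here.
q_stored = torch.randn(64, 64, dtype=torch.bfloat16).triu()  # what the optimizer keeps
q32 = q_stored.float()                                       # promote(...) on the tensor path
q32 += 0.1 * torch.randn_like(q32).triu()                    # stand-in for psgd_update_precond
q_stored.copy_(q32)                                          # round back into bf16 storage
print(q_stored.dtype, q32.dtype)                             # torch.bfloat16 torch.float32
```
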
{heavyball-0.16.0 → heavyball-0.17.1}/heavyball.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: heavyball
-Version: 0.16.0
+Version: 0.17.1
 Summary: Efficient optimizers
 Home-page: https://github.com/clashluke/heavyball
 Author: Lucas Nestler
@@ -32,8 +32,8 @@ A simple package of efficient optimizers
 The goal is not to thrive for completeness, full maintenance or abstraction, but instead to provide a simple
 largely static alternative to `torch.optim` with more and better optimizers.
 
-Currently (2024-11-
-recommended experimental optimizer is `
+Currently (2024-11-20, 0.17.0), the recommended stable optimizer is `PrecondSchedulePaLMSOAP` (see below). The
+recommended experimental optimizer is `DelayedPSGDKron` ([tuning guide](docs/psgd_efficiency.md)).
 
 ## Features
 
@@ -62,7 +62,7 @@ import heavyball
 model = torch.nn.Linear(16, 1)
 
 # Create an optimizer
-optimizer = heavyball.
+optimizer = heavyball.PrecondSchedulePaLMSOAP(model.parameters(), lr=1e-3)
 
 x = torch.randn(128, 16)
 y = torch.randn(128, 1)
@@ -76,19 +76,19 @@ for _ in range(1000):
 
 ## Optimizers
 
-| Name
-|
-| **
-| **
-| **
-| **
-| **
-| **
-| **
-| **
-| **
-| **
-| **
+| Name | Description | Advantages / Disadvantages |
+|-------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| **AdamW** | More efficient (speed, memory) [AdamW](https://arxiv.org/abs/1711.05101) | + Faster than AdamW<br>+ Possibly more (numerically) stable
+| **LaProp** | More efficient (speed, memory) [LaProp](https://arxiv.org/abs/2002.04839) | + Same cost as AdamW<br>+ Marginally better converence (better proofs)<br>+ Higher hyperparameter stability<br>- Not a guaranteed win (can be neutral)<br>- No "Slingshot" |
+| **ADOPT** | More efficient (speed, memory) [ADOPT](https://arxiv.org/abs/2411.02853) | + Same cost as AdamW<br>+ Rigorous mathematical convergence proofs, even for challenging models (GANs)<br>- Empirically underperforms LaProp<br>- no bf16 |
+| **SFAdamW** | More efficient (speed, memory) [ScheduleFree AdamW](https://arxiv.org/abs/2405.15682) | + Same cost as AdamW, but better eval perf<br>+ Full control over hyperparameters |
+| **PaLMSFAdamW** | ForeachSFAdamW with [PaLM's beta2 schedule](https://arxiv.org/abs/2204.02311) | + Same cost as AdamW, but better eval perf<br>+ Less control, but faster early and more stable late convergence<br>+ ScheduleFree<br>- slow early convergence |
+| **SOAP** | More efficient (speed, memory) [SOAP](https://arxiv.org/abs/2409.11321) | + Faster convergence (loss-at-step)<br>+ Full control over hyperparameters<br>- more memory usage<br>- more hyperparameters<br>- higher overhead than AdamW (can be ammortized; better loss-at-second) |
+| **PaLMSOAP** | ForeachSOAP with [PaLM's beta2 schedule](https://arxiv.org/abs/2204.02311) | + Faster convergence (loss-at-step)<br>+ Less control, but faster early and more stable late convergence<br>- more memory usage<br>- more hyperparameters<br>- higher overhead than AdamW (can be ammortized; better loss-at-second) |
+| **SFPaLMSOAP** | ScheduleFree PaLMForeachSOAP | + Fast convergence (loss-at-step)<br>+ less memory usage than PaLMForeachSOAP (more tham AdamW)<br>- slower initial convergence than PaLMForeachSOAP (but allows higher LRs)<br>- higher overhead than AdamW (can be ammortized) |
+| **PrecondScheduleSFPaLMSOAP** | SFPaLMForeachSOAP with [preconditioner schedule](https://github.com/lixilinx/psgd_torch/), matching the error of PrecondEvery=2 with the cost of PrecondEvery=512 | + Better initial convergence than SFPaLMForeachSOAP<br>+ Significantly faster (sec/it) later<br>+ less memory usage than PaLMForeachSOAP (more tham AdamW)<br>- slower initial convergence than PaLMForeachSOAP (but allows higher LRs)<br>- higher overhead than AdamW (can be ammortized), goes to 0 with increasing number of step |
+| **PrecondSchedulePaLMSOAP** | PrecondScheduleSFPaLMForeachSOAP without schedule-free | + Best initial convergence<br>+ Significantly faster (sec/it) later<br>+ high stability<br>- more memory usage than PrecondScheduleSFPaLMForeachSOAP<br>- higher overhead than AdamW (can be ammortized), goes to 0 with increasing number of steps |
+| **PrecondScheduleSOAP** | PrecondScheduleSFPaLMForeachSOAP without PaLM's beta2 schedule | + Better initial convergence<br>+ Significantly faster (sec/it) later<br>- more memory usage than PrecondScheduleSFPaLMForeachSOAP<br>- higher overhead than AdamW (can be ammortized), goes to 0 with increasing number of steps |
 
 ## Precond Schedule
 

{heavyball-0.16.0 → heavyball-0.17.1}/heavyball.egg-info/SOURCES.txt
@@ -2,6 +2,7 @@ LICENSE
 README.md
 setup.py
 heavyball/__init__.py
+heavyball/cached_delayed_psgd_kron.py
 heavyball/cached_psgd_kron.py
 heavyball/delayed_psgd.py
 heavyball/foreach_adamw.py
@@ -24,6 +25,7 @@ heavyball.egg-info/SOURCES.txt
 heavyball.egg-info/dependency_links.txt
 heavyball.egg-info/requires.txt
 heavyball.egg-info/top_level.txt
+test/test_bf16_q.py
 test/test_closure.py
 test/test_foreach.py
 test/test_memory.py

heavyball-0.17.1/test/test_bf16_q.py (new file)
@@ -0,0 +1,52 @@
+import heavyball
+import heavyball.utils
+import pytest
+import torch
+from benchmark.utils import get_optim
+from heavyball.utils import clean, set_torch, PSGDBase
+from torch import nn
+
+
+def get_memory():
+    clean()
+    torch.cuda.synchronize()
+    clean()
+    torch.cuda.synchronize()
+    return torch.cuda.memory_allocated()
+
+
+@pytest.mark.parametrize("opt", heavyball.__all__)
+@pytest.mark.parametrize("size,depth", [(256, 2)])
+def test_foreach(opt, size, depth: int, iterations: int = 128, outer_iterations: int = 3):
+    set_torch()
+
+    opt = getattr(heavyball, opt)
+    if not issubclass(opt, PSGDBase):
+        raise pytest.skip('Only PSGD is supported')
+
+    peaks = []
+    losses = []
+
+    for q_dtype in ['float32', 'bfloat16']:
+        peaks.append([])
+        losses.append([])
+
+        for i in range(outer_iterations):
+            torch.manual_seed(0x2131290)
+            model = nn.Sequential(*[nn.Linear(size, size) for _ in range(depth)]).cuda()
+            o = get_optim(opt, model.parameters(), lr=1e-3, q_dtype=q_dtype)
+
+            for _ in range(iterations):
+                loss = model(torch.randn((1024, size)).cuda()).square().mean()
+                loss.backward()
+                o.step()
+                o.zero_grad()
+                losses[-1].append(loss.detach())
+
+            del model, o
+            clean()
+
+
+    for i, (l0, l1) in enumerate(zip(*losses)):
+        print(i, l0.item(), l1.item())
+        assert torch.allclose(l0, l1, rtol=0.1)

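The new `test/test_bf16_q.py` trains the same model twice from the same seed, once with float32 factors and once with bfloat16, and requires the per-step losses to stay within a 10% relative tolerance. In user-facing terms it exercises roughly the following; this is a sketch (the test itself runs on CUDA and constructs the optimizer through `benchmark.utils.get_optim`):

```python
import torch
from torch import nn
import heavyball

# Any PSGD-family optimizer now accepts q_dtype, so the Kronecker factors can
# be held in bfloat16 instead of float32.
model = nn.Sequential(nn.Linear(256, 256), nn.Linear(256, 256))
opt = heavyball.ForeachCachedDelayedPSGDKron(model.parameters(), lr=1e-3, q_dtype='bfloat16')

loss = model(torch.randn(32, 256)).square().mean()
loss.backward()
opt.step()
opt.zero_grad()
```
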
{heavyball-0.16.0 → heavyball-0.17.1}/test/test_closure.py
@@ -20,7 +20,7 @@ class Param(nn.Module):
 
 @pytest.mark.parametrize("opt", heavyball.__all__)
 @pytest.mark.parametrize("size", [(4, 4, 4, 4), ])
-def
+def test_closure(opt, size: List[int], depth: int = 2, iterations: int = 5, outer_iterations: int = 3):
     clean()
     set_torch()
 

{heavyball-0.16.0 → heavyball-0.17.1}/test/test_memory.py
@@ -25,14 +25,14 @@ expected_memory = {'adamw': {'after': 4, 'peak': 5.1}, 'soap': {'after': 7, 'pea
 @pytest.mark.parametrize("size,depth", [(8192, 1), (2048, 16)])
 def test_memory(opt, method, size, depth: int, iterations: int = 5, outer_iterations: int = 3):
     if 'soap' not in opt.lower() and method != 'qr':
-
+        raise pytest.skip('Only SOAP supports `method` argument')
     set_torch()
 
     for k, v in expected_memory.items():
         if k in opt.lower():
             break
     else:
-        raise
+        raise pytest.skip(f'Opt {opt} not supported')
 
     opt = getattr(heavyball, opt)
     heavyball.utils.zeroth_power_mode = method

{heavyball-0.16.0 → heavyball-0.17.1}/test/test_merge.py
@@ -26,7 +26,7 @@ class Param(nn.Module):
 def test_merge(opt, method, size: List[int], merge, split, depth: int = 2, iterations: int = 5,
                outer_iterations: int = 3):
     if 'soap' not in opt.lower() and method != 'qr':
-
+        raise pytest.skip('Only SOAP supports `method` argument')
     clean()
     set_torch()
 

{heavyball-0.16.0 → heavyball-0.17.1}/test/test_psgd.py
@@ -1,11 +1,10 @@
-import pytest
-import torch
-from torch import nn
-
 import heavyball
 import heavyball.utils
+import pytest
+import torch
 from benchmark.utils import get_optim
 from heavyball.utils import clean, set_torch
+from torch import nn
 
 
 def get_memory():
@@ -16,10 +15,6 @@ def get_memory():
     return torch.cuda.memory_allocated()
 
 
-expected_memory = {'adamw': {'after': 4, 'peak': 5.1}, 'soap': {'after': 7, 'peak': 14},
-                   'psgd': {'after': 4, 'peak': 11.5}, 'padam': {'after': 5, 'peak': 11.4}}
-
-
 @pytest.mark.parametrize("opt", ['ForeachPSGDKron', 'ForeachPaLMPAdam', 'ForeachPurePSGD', 'ForeachDelayedPSGD'])
 @pytest.mark.parametrize("method",
                          ['norm_clip_', 'mu_law_compress', 'a_law_compress', 'trust_region_clip_', 'identity'])
@@ -27,12 +22,6 @@ expected_memory = {'adamw': {'after': 4, 'peak': 5.1}, 'soap': {'after': 7, 'pea
 def test_clip(opt, method, size, depth: int, iterations: int = 100, outer_iterations: int = 3):
     set_torch()
 
-    for k, v in expected_memory.items():
-        if k in opt.lower():
-            break
-    else:
-        raise ValueError(f'Unknown optimizer {opt}')
-
     opt = getattr(heavyball, opt)
 
     for i in range(outer_iterations):
