heavyball 1.5.2__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
heavyball/__init__.py CHANGED
@@ -104,23 +104,17 @@ class ForeachSOAP(C.BaseOpt):
         Nikhil Vyas, Depen Morwani, Rosie Zhao, Itai Shapira, David Brandfonbrener, Lucas Janson, Sham Kakade
         https://arxiv.org/abs/2409.11321
         https://github.com/nikhilvyas/SOAP
-
-    ScheduleFree:
-        The Road Less Scheduled
-        Aaron Defazio, Xingyu Alice Yang, Harsh Mehta, Konstantin Mishchenko, Ahmed Khaled, Ashok Cutkosky
-        https://arxiv.org/abs/2405.15682
-        https://github.com/facebookresearch/schedule_free
     """
     use_precond_schedule: bool = False
 
     def __init__(self, params, lr: float = 3e-3, betas=(0.9, 0.95), shampoo_beta: float = 0.95, eps: float = 1e-8,
                  weight_decay: float = 0.01, precondition_frequency: int = 2, max_precond_dim: int = 2048,  #
                  merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
-                 data_format: str = "channels_first", correct_bias: bool = True, warmup_steps: int = 0,
-                 split: bool = False, foreach: bool = True, mars: bool = False, caution: bool = False,
-                 mars_gamma: float = 0.0025, palm: bool = C.use_default, precond_scheduler=(1 / 3, 9),
-                 beta2_scale: float = 0.8, use_precond_schedule: bool = C.use_default,
-                 gradient_clipping: C.str_or_fn = C.use_default, update_clipping: C.str_or_fn = C.use_default):
+                 correct_bias: bool = True, warmup_steps: int = 0, split: bool = False, foreach: bool = True,
+                 mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025, palm: bool = C.use_default,
+                 precond_scheduler=(1 / 3, 9), beta2_scale: float = 0.8, use_precond_schedule: bool = C.use_default,
+                 gradient_clipping: C.str_or_fn = C.use_default, update_clipping: C.str_or_fn = C.use_default,
+                 storage_dtype: str = 'float32', stochastic_schedule: bool = False):
         use_precond_schedule = C.default(use_precond_schedule, self.use_precond_schedule)
 
         defaults = locals()
@@ -137,6 +131,54 @@ class ForeachSOAP(C.BaseOpt):
                          C.scale_by_soap)
 
 
+class ForeachSignLaProp(C.BaseOpt):
+    def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
+                 foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
+                 mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
+                 update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
+        defaults = locals()
+        defaults.pop("self")
+        params = defaults.pop("params")
+        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, C.scale_by_laprop, C.sign)
+
+
+class ForeachSOLP(C.BaseOpt):
+    """
+    ForeachSOLP
+
+    Sources:
+        Baseline SOAP:
+            SOAP: Improving and Stabilizing Shampoo using Adam
+            Nikhil Vyas, Depen Morwani, Rosie Zhao, Itai Shapira, David Brandfonbrener, Lucas Janson, Sham Kakade
+            https://arxiv.org/abs/2409.11321
+            https://github.com/nikhilvyas/SOAP
+    """
+    use_precond_schedule: bool = False
+
+    def __init__(self, params, lr: float = 3e-3, betas=(0.9, 0.95), shampoo_beta: float = 0.95, eps: float = 1e-8,
+                 weight_decay: float = 0.01, precondition_frequency: int = 2, max_precond_dim: int = 2048,  #
+                 merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
+                 correct_bias: bool = True, warmup_steps: int = 0, split: bool = False, foreach: bool = True,
+                 mars: bool = False, caution: bool = False, mars_gamma: float = 0.0025, palm: bool = C.use_default,
+                 precond_scheduler=(1 / 3, 9), beta2_scale: float = 0.8, use_precond_schedule: bool = C.use_default,
+                 gradient_clipping: C.str_or_fn = C.use_default, update_clipping: C.str_or_fn = C.use_default,
+                 storage_dtype: str = 'float32', stochastic_schedule: bool = False):
+        use_precond_schedule = C.default(use_precond_schedule, self.use_precond_schedule)
+
+        defaults = locals()
+        defaults.pop("self")
+        params = defaults.pop("params")
+
+        if use_precond_schedule:
+            del defaults['precondition_frequency']
+            self.precond_schedule = utils.get_soap_precond_schedule(defaults.pop("precond_scheduler"))
+        else:
+            del defaults['precond_scheduler']
+            self.precond_schedule = 1 / defaults.pop("precondition_frequency")
+        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm,  #
+                         functools.partial(C.scale_by_soap, inner='laprop'))
+
+
 class PaLMForeachSOAP(ForeachSOAP):
     use_precond_schedule: bool = False
     palm: bool = True
@@ -163,6 +205,18 @@ class OrthoLaProp(C.BaseOpt):
                          C.orthogonalize_grad_to_param, C.scale_by_laprop)
 
 
+class LaPropOrtho(C.BaseOpt):
+    def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
+                 foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
+                 mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
+                 update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
+        defaults = locals()
+        defaults.pop("self")
+        params = defaults.pop("params")
+        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, C.scale_by_laprop,
+                         C.orthogonalize_grad_to_param)
+
+
 class ForeachPSGDKron(C.BaseOpt):
     """
     Originally from Evan Walters and Omead Pooladzandi, 2024
@@ -178,10 +232,10 @@ class ForeachPSGDKron(C.BaseOpt):
                  max_size_triangular=2048, min_ndim_triangular=2, memory_save_mode=None,
                  momentum_into_precond_update=True, warmup_steps: int = 0, merge_dims: bool = False,
                  split: bool = False, store_triu_as_line: bool = True, foreach: bool = True, q_dtype='float32',
-                 stochastic_schedule: bool = True, storage_dtype: str = 'float32', mars: bool = False,
+                 stochastic_schedule: bool = False, storage_dtype: str = 'float32', mars: bool = False,
                  caution: bool = False, mars_gamma: float = 0.0025, delayed: Optional[bool] = C.use_default,
                  cached: Optional[bool] = C.use_default, exp_avg_input: Optional[bool] = C.use_default,
-                 gradient_clipping: C.str_or_fn = C.use_default, update_clipping: C.str_or_fn = C.use_default, #
+                 gradient_clipping: C.str_or_fn = C.use_default, update_clipping: C.str_or_fn = C.use_default, #
                  # expert parameters
                  precond_init_scale=1.0, precond_lr=0.1):
         defaults = locals()
@@ -238,10 +292,12 @@ DelayedPSGD = ForeachDelayedPSGD
 CachedPSGDKron = ForeachCachedPSGDKron
 CachedDelayedPSGDKron = ForeachCachedDelayedPSGDKron
 Muon = ForeachMuon
+SignLaProp = ForeachSignLaProp
 
 __all__ = ["Muon", "RMSprop", "PrecondSchedulePaLMSOAP", "PSGDKron", "PurePSGD", "DelayedPSGD", "CachedPSGDKron",
            "CachedDelayedPSGDKron", "PalmForEachSoap", "PaLMSOAP", "PaLMSFAdamW", "LaProp", "ADOPT",
-           "PrecondScheduleSOAP", "PrecondSchedulePaLMSOAP", 'RMSprop', 'MuonLaProp',  #
-           "ForeachAdamW", "ForeachSFAdamW", "ForeachLaProp", "ForeachADOPT", "ForeachSOAP", "ForeachPSGDKron",
-           "ForeachPurePSGD", "ForeachDelayedPSGD", "ForeachCachedPSGDKron", "ForeachCachedDelayedPSGDKron",
-           "ForeachRMSprop", "ForeachMuon", 'ForeachCachedNewtonPSGD']
+           "PrecondScheduleSOAP", "PrecondSchedulePaLMSOAP", 'RMSprop', 'MuonLaProp', 'ForeachSignLaProp'  #
+           "ForeachAdamW", "ForeachSFAdamW",
+           "ForeachLaProp", "ForeachADOPT", "ForeachSOAP", "ForeachPSGDKron", "ForeachPurePSGD", "ForeachDelayedPSGD",
+           "ForeachCachedPSGDKron", "ForeachCachedDelayedPSGDKron", "ForeachRMSprop", "ForeachMuon",
+           'ForeachCachedNewtonPSGD', 'OrthoLaProp', 'LaPropOrtho', 'SignLaProp']
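
The 1.6.0 wheel exports the new ForeachSignLaProp / SignLaProp optimizer (sign of the LaProp update, grafted back to the update's norm) alongside OrthoLaProp and LaPropOrtho. A minimal usage sketch, assuming the 1.6.0 wheel is installed; the model and hyperparameters below are illustrative, not taken from the package:

import torch
import torch.nn as nn
import heavyball

model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 1))
# New in 1.6.0: sign-based LaProp; SignLaProp is an alias for ForeachSignLaProp.
opt = heavyball.ForeachSignLaProp(model.parameters(), lr=2.5e-3, betas=(0.9, 0.99))

x, y = torch.randn(64, 16), torch.randn(64, 1)
for _ in range(10):
    opt.zero_grad()
    loss = (model(x) - y).square().mean()
    loss.backward()
    opt.step()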
heavyball/chainable.py CHANGED
@@ -1,6 +1,6 @@
 import functools
 import random
-from typing import Optional, Union, Literal
+from typing import Optional, Union, Literal, List
 
 import torch
 
@@ -127,7 +127,7 @@ def zero_guard(*names):
 
 
 def copy_guard(index, *names):
-    return functools.partial(CopyGuard, index=index, names=names, )
+    return functools.partial(CopyGuard, index=index, names=names)
 
 
 def general_guard(*names, init_fn, skip_first: bool = True):
@@ -152,6 +152,22 @@ def exp_avg(group, update, grad, param, exp_avg):
     return utils.scale_by_exp_avg_(exp_avg, update, utils.beta_debias(utils.get_beta1(group), group["step"]))
 
 
+@zero_guard('exp_avg')
+@no_state
+def weight_decay_to_ema(group, update, grad, param, exp_avg):
+    utils.weight_decay_to_ema_(exp_avg, update, utils.beta_debias(group['ema_beta'], group['step']),
+                               group['weight_decay_to_ema'] * group['lr'])
+    return update
+
+
+@zero_guard('exp_avg')
+@no_state
+def l1_weight_decay_to_ema(group, update, grad, param, exp_avg):
+    utils.l1_weight_decay_to_ema_(exp_avg, update, utils.beta_debias(group['ema_beta'], group['step']),
+                                  group['weight_decay_to_ema'] * group['lr'])
+    return update
+
+
 @zero_guard("exp_avg_sq")
 @no_state
 def scale_by_exp_avg_sq(group, update, grad, param, exp_avg_sq):
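
The two transforms added above implement weight decay toward an exponential moving average instead of toward zero: an EMA is tracked and the decayed tensor is then nudged using either a linear interpolation toward the EMA or a fixed-size signed step (the L1 variant). A rough standalone illustration of the underlying idea; the function and variable names here are ours, and the shipped helpers (utils.weight_decay_to_ema_, shown later in this diff) operate in-place on tensor lists with stochastic rounding:

import torch

def decay_to_ema(p: torch.Tensor, ema: torch.Tensor, ema_beta: float, strength: float) -> None:
    # Track an EMA, then pull `p` toward that EMA instead of toward zero.
    ema.lerp_(p, 1 - ema_beta)   # ema <- ema_beta * ema + (1 - ema_beta) * p
    p.lerp_(ema, strength)       # p   <- (1 - strength) * p + strength * ema

p, ema = torch.randn(4, 4), torch.zeros(4, 4)
decay_to_ema(p, ema, ema_beta=0.99, strength=0.01 * 1e-3)  # strength ~ weight_decay_to_ema * lr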
@@ -206,14 +222,15 @@ def update_by_schedule_free(group, update, grad, param, z):
 @no_state
 def update_by_adopt(group, update, grad, param, exp_avg, exp_avg_sq):
     if group['step'] == 1:
-        utils.exp_avg_sq_(exp_avg_sq, update, 0, 1)
+        utils.scale_by_exp_avg_sq_(exp_avg_sq, update, 0, group['eps'])
         raise SkipUpdate
 
     if group['step'] == 2:
         update = utils.promote(update)
         easq = utils.promote(exp_avg_sq)
         [utils.set_(ea, u / easq_.sqrt().clamp_(min=group['eps'])) for ea, u, easq_ in zip(exp_avg, update, easq)]
-        utils.exp_avg_sq_(exp_avg_sq, update, utils.beta_debias(utils.get_beta2(group), group['step']), 1)
+        utils.scale_by_exp_avg_sq_(exp_avg_sq, update, utils.beta_debias(utils.get_beta2(group), group['step']),
+                                   group['eps'])
         raise SkipUpdate
 
     utils.fused_adopt_(param, update, grad, exp_avg_sq, exp_avg, utils.get_beta1(group), utils.get_beta2(group),
@@ -225,21 +242,22 @@ def update_by_adopt(group, update, grad, param, exp_avg, exp_avg_sq):
 @no_state
 def scale_by_adopt(group, update, grad, param, exp_avg, exp_avg_sq):
     if group['step'] == 1:
-        utils.exp_avg_sq_(exp_avg_sq, update, 0, 1)
+        utils.scale_by_exp_avg_sq_(exp_avg_sq, update, 0, group['eps'])
         raise SkipUpdate
 
     if group['step'] == 2:
         update = utils.promote(update)
         easq = utils.promote(exp_avg_sq)
         [utils.set_(ea, u / easq_.sqrt().clamp_(min=group['eps'])) for ea, u, easq_ in zip(exp_avg, update, easq)]
-        utils.exp_avg_sq_(exp_avg_sq, update, utils.beta_debias(utils.get_beta2(group), group['step']), 1)
+        utils.scale_by_exp_avg_sq_(exp_avg_sq, update, utils.beta_debias(utils.get_beta2(group), group['step']),
+                                   group['eps'])
         raise SkipUpdate
 
     return utils.adopt(update, exp_avg_sq, exp_avg, utils.get_beta1(group), utils.get_beta2(group), group['step'] - 2)
 
 
-def _init_soap(state, group, update, grad, param):
-    utils.init_preconditioner(grad, state, utils.get_beta2(group), group['max_precond_dim'], group['precondition_1d'])
+def _init_soap(state, group, update, grad, param, inner: str = ''):
+    utils.init_preconditioner(grad, state, group['max_precond_dim'], group['precondition_1d'])
 
 
 def _init_psgd(state, group, update, grad, param, cached: bool = False, prob: Optional[callable] = None):
@@ -295,6 +313,25 @@ def nesterov_momentum(group, updates, grads, params, momentum):
     return utils.nesterov_momentum(momentum, updates, utils.get_beta1(group))
 
 
+@zero_guard('momentum')
+@no_state
+def nesterov_ema(group, updates, grads, params, momentum):  # equivalent to Grokfast
+    return utils.nesterov_ema(momentum, updates, utils.get_beta1(group))
+
+
+def _store_std(state, group, update, grad, param):
+    state['init_std'] = torch.std(grad, dim=0)
+
+
+@general_guard("init_std", init_fn=_store_std)
+@no_state
+def mup_approx(group, updates, grads, params, init_std):
+    _updates = [(u, i) for u, i in zip(updates, init_std) if u.ndim > 1]
+    _updates, _init_std = zip(*_updates)
+    utils.stochastic_multiply_(_updates, _init_std)
+    return updates
+
+
 @zero_guard("momentum")
 @no_state
 def heavyball_momentum(group, updates, grads, params, momentum):
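
The new nesterov_ema transform amplifies the slow component of the gradient by adding an EMA of past gradients back onto the current one, which is the Grokfast idea referenced in the comment; mup_approx separately rescales multi-dimensional updates by a per-column standard deviation captured at initialization. A standalone sketch of the EMA amplification only — the function name is ours, while the shipped code routes through utils.nesterov_ema with stochastic-rounding helpers:

import torch

def grokfast_style_ema(grad: torch.Tensor, ema: torch.Tensor, beta: float) -> torch.Tensor:
    # ema <- beta * ema + (1 - beta) * grad, then return grad + ema (amplified update)
    ema.lerp_(grad, 1 - beta)
    return grad + ema

g = torch.randn(8)
state = torch.zeros_like(g)
update = grokfast_style_ema(g, state, beta=0.9)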
@@ -308,15 +345,16 @@ _optim_fns = {'adam': utils.adam_, 'laprop': utils.laprop_}
 @general_guard("Q", "GG", init_fn=_init_soap)
 @no_state
 def scale_by_soap(group, update, grad, param, exp_avg, exp_avg_sq, Q, GG, inner: str = 'adam'):
-    update = utils.promote(update)
+    update = utils.promote(update)  # Promote to highest precision if needed
 
     grad_projected = [utils.project(u, q, False) for u, q in zip(update, Q)]
     fn = _optim_fns[inner]
-    precond = fn(exp_avg, exp_avg_sq, grad_projected, utils.get_beta1(group), utils.get_beta2(group), group['step'])
+    precond = fn(exp_avg, exp_avg_sq, grad_projected, utils.get_beta1(group), utils.get_beta2(group), group['step'] - 1,
+                 group['eps'])
     precond = [utils.project(p, q, True) for p, q in zip(precond, Q)]
 
-    for u, q, gg, eas in zip(update, Q, GG, exp_avg_sq):
-        utils.update_preconditioner(u, q, gg, eas, group['max_precond_dim'], group['precondition_1d'],
+    for u, q, gg, ea in zip(update, Q, GG, exp_avg):
+        utils.update_preconditioner(u, q, gg, ea, group['max_precond_dim'], group['precondition_1d'],
                                     utils.beta_debias(group['shampoo_beta'], group['step']),
                                     group['is_preconditioning'])
     return precond
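
For orientation, SOAP's project(..., back=False) rotates the update into the eigenbasis of the accumulated outer products, the inner optimizer (Adam or LaProp) runs in that basis, and project(..., back=True) rotates the result back. A 2D illustration of the round trip — the transpose convention here follows the SOAP paper and is not copied from heavyball's einsum implementation:

import torch

g = torch.randn(8, 4)
GG_left, GG_right = g @ g.T, g.T @ g           # accumulated L and R statistics (one step here)
Q_l = torch.linalg.eigh(GG_left).eigenvectors  # per-dimension eigenbases
Q_r = torch.linalg.eigh(GG_right).eigenvectors

projected = Q_l.T @ g @ Q_r                    # into the eigenbasis (back=False)
restored = Q_l @ projected @ Q_r.T             # back to the original space (back=True)
assert torch.allclose(restored, g, atol=1e-5)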
@@ -414,6 +452,11 @@ def update_by_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: st
     raise SkipUpdate
 
 
+@no_state
+def sign(group, update, grad, param, graft: bool = True):
+    return utils.sign_(update, graft)
+
+
 @general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd, skip_first=False)
 @no_state_no_foreach
 def update_by_delayed_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str, cached: bool = False,
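
The new sign transform (used by ForeachSignLaProp) replaces each update with its elementwise sign and, by default, grafts the original norm back on so the overall step size is preserved. A standalone sketch of the same arithmetic; sign_with_graft is an illustrative name, while the shipped version (utils.sign_, shown later in this diff) works in-place on tensor lists:

import torch

def sign_with_graft(update: torch.Tensor, graft: bool = True) -> torch.Tensor:
    signed = update.sign()
    if graft:
        # rescale so the signed update keeps the original update's norm
        signed = signed * (update.norm() / signed.norm().clamp(min=1e-6))
    return signed

u = torch.randn(3, 3)
print(sign_with_graft(u).norm(), u.norm())  # the two norms match when grafting is on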
@@ -439,8 +482,7 @@ def apply_to_idx(fn, idx):
     return _fn
 
 
-def chain(state: Union[callable, dict], group, grad, param, *fns):
-    update = [torch.clone(g, memory_format=torch.preserve_format) for g in grad]
+def _inner_chain(state, group, update, grad, param, *fns):
     skip_update = False
     for fn in fns:
         try:
@@ -450,10 +492,30 @@ def chain(state: Union[callable, dict], group, grad, param, *fns):
             continue
         if update is None:
             break
+    return update, skip_update
+
+
+def chain(state: Union[callable, dict], group, grad, param, *fns):
+    update = [torch.clone(g, memory_format=torch.preserve_format) for g in grad]
+    update, skip_update = _inner_chain(state, group, update, grad, param, *fns)
     if not skip_update and update is not None:
         utils.update_param_(param, update, group['lr'], group['weight_decay'], caution=group['caution'], grad=grad)
 
 
+def create_branch(branches: List[List[callable]], merge_fn: callable):
+    def _branch(state, group, update, grad, param):
+        outputs = []
+        for branch in branches:
+            branch_update = [torch.clone(u, memory_format=torch.preserve_format) for u in update]
+            branch_update, skip_update = _inner_chain(state, group, branch_update, grad, param, *branch)
+            if skip_update:
+                raise ValueError("Branches should not skip updates")
+            outputs.append(branch_update)
+        return merge_fn(outputs)
+
+    return _branch
+
+
 class ChainOpt(utils.StatefulOptimizer):
     promote: bool = False
 
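create_branch turns several sub-chains into one transform: the incoming update is cloned per branch, each branch runs through _inner_chain, and the per-branch results are combined by merge_fn. A hypothetical composition — the transforms named below exist in this module, but this particular pairing and the averaging merge are only an illustration, not shipped behaviour:

import heavyball.chainable as C

def average_merge(outputs):
    # `outputs` holds one list of update tensors per branch; average them elementwise.
    return [sum(tensors) / len(tensors) for tensors in zip(*outputs)]

# One branch applies LaProp scaling, the other a sign step; results are averaged.
branch = C.create_branch([[C.scale_by_laprop], [C.sign]], average_merge)
# `branch(state, group, update, grad, param)` can then be used like any other chainable function.
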
heavyball/utils.py CHANGED
@@ -1,24 +1,40 @@
1
+ import copy
1
2
  import functools
2
3
  import gc
4
+ import inspect
3
5
  import math
4
6
  import random
5
7
  import string
8
+ import sys
9
+ import time
6
10
  import warnings
11
+ from datetime import datetime
7
12
  from typing import List, Optional, Tuple, Callable, Union
13
+ from unittest.mock import patch
8
14
 
15
+ import hyperopt
9
16
  import numpy as np
10
17
  import torch
11
18
  from torch import Tensor
19
+ from torch._dynamo import config
12
20
  from torch._dynamo.exc import TorchDynamoException
13
21
  from torch.backends import cudnn, opt_einsum
14
22
  from torch.utils._pytree import tree_map
15
23
 
24
+ config.cache_size_limit = 2 ** 16
25
+
26
+ np.warnings = warnings
27
+
16
28
  compile_mode = "max-autotune-no-cudagraphs"
17
29
  dynamic = False
18
30
  compile_mode_recommended_to_none = None
19
31
  zeroth_power_mode = 'qr' # 'qr' is baseline, 'newtonschulz' converges better and faster
20
32
  tiny_bf16 = torch.finfo(torch.bfloat16).tiny
21
33
 
34
+ base_args = {'betas': (0.9, 0.999), 'precondition_frequency': 1, 'merge_dims': False, 'warmup_steps': 100,
35
+ 'max_precond_dim': 2 ** 16, 'beta': 0.9, 'max_size_triangular': 2 ** 16, 'split': False, 'eps': 1e-8,
36
+ 'weight_decay': 1e-4}
37
+
22
38
 
23
39
  def decorator(func):
24
40
  compiled = None
@@ -51,7 +67,7 @@ def decorator_knowngood(func: Callable):
51
67
  return _fn
52
68
 
53
69
 
54
- einsum_base = string.ascii_lowercase + string.ascii_uppercase
70
+ einsum_base = string.ascii_lowercase
55
71
 
56
72
 
57
73
  @decorator_knowngood
@@ -103,29 +119,29 @@ def dim_merger(grad, max_precond_dim, split: bool = False):
103
119
  but we want to merge conv kernels into fan-in or at least merge the kernel
104
120
  so, [128, 64, 3, 3] should result in [128, 576] or [128, 64, 9] instead of [73728] or [8192, 3, 3] the baseline
105
121
  would've done
122
+
123
+ By @francois-rozet (commit: 68cde41eaf7e73b4c46eacb6a944865dcc081f1d), re-commited due to faulty merge
106
124
  """
107
- shape = grad.shape
108
125
  new_shape = []
109
-
110
- curr_shape = 1
111
-
112
- for sh in shape[1:][::-1]:
113
- temp_shape = curr_shape * sh
114
- if temp_shape > max_precond_dim:
115
- if curr_shape > 1:
116
- new_shape.append(curr_shape)
117
- curr_shape = sh
126
+ cum_size = 1
127
+
128
+ for s in grad.shape[1:][::-1]:
129
+ temp_size = cum_size * s
130
+ if temp_size > max_precond_dim:
131
+ if cum_size > 1:
132
+ new_shape.append(cum_size)
133
+ cum_size = s
118
134
  else:
119
- new_shape.append(sh)
120
- curr_shape = 1
135
+ new_shape.append(s)
136
+ cum_size = 1
121
137
  else:
122
- curr_shape = temp_shape
123
- new_shape = [*shape[:1], *new_shape[::-1]]
138
+ cum_size = temp_size
124
139
 
125
- if curr_shape > 1 or len(new_shape) == 0:
126
- new_shape.append(curr_shape)
140
+ if cum_size > 1:
141
+ new_shape.append(cum_size)
127
142
 
128
- new_grad = grad.reshape(new_shape) # needs to be .reshape() due to channels_last
143
+ new_shape = [grad.shape[0], *new_shape[::-1]]
144
+ new_grad = grad.reshape(new_shape)
129
145
  if not split:
130
146
  return new_grad
131
147
 
@@ -153,12 +169,11 @@ def beta_debias(beta, step):
153
169
  def _compilable_exp_avg_sq_(state: List[Tensor], grad: List[Tensor], beta2: Tensor, eps: Tensor,
154
170
  out: List[Optional[Tensor]]):
155
171
  s32, g32 = [list(map(promote, x)) for x in (state, grad)]
156
- s32 = torch._foreach_mul(s32, beta2)
157
- s32 = [s + g * g * (1 - beta2) for s, g in zip(s32, g32)]
158
- denom = torch._foreach_sqrt(s32)
159
- denom = [d.clamp(min=eps) for d in denom]
172
+ s32 = [s * beta2 + g * g * (1 - beta2) for s, g in zip(s32, g32)]
160
173
  copy_stochastic_list_(state, s32)
161
174
 
175
+ denom = [d.sqrt().clamp(min=eps) for d in s32]
176
+
162
177
  if out[0] is None:
163
178
  return denom
164
179
 
@@ -174,7 +189,7 @@ def exp_avg_sq_(state, grad, beta2, eps, out=None):
174
189
 
175
190
  @decorator_knowngood
176
191
  def _compilable_scale_by_exp_avg_sq_(state: List[Tensor], grad: List[Tensor], beta2: Tensor, eps: Tensor):
177
- g32 = promote(grad)
192
+ g32 = list(map(promote, grad))
178
193
  denom = _compilable_exp_avg_sq_(state, g32, beta2, eps, [None])
179
194
  out = torch._foreach_div(g32, denom)
180
195
  copy_stochastic_list_(grad, out)
@@ -189,7 +204,7 @@ def scale_by_exp_avg_sq_(exp_avg_sq, grad, beta2, eps):
189
204
 
190
205
  @decorator_knowngood
191
206
  def _compilable_exp_avg_(state, grad, beta):
192
- lerped = _lerp32(state, grad, beta)
207
+ lerped = _lerp(state, grad, beta)
193
208
  copy_stochastic_list_(grad, lerped)
194
209
 
195
210
 
@@ -240,29 +255,31 @@ def clean():
240
255
  gc.collect()
241
256
 
242
257
 
243
- def set_torch():
258
+ def _ignore_warning(msg):
259
+ warnings.filterwarnings('ignore', f'.*{msg}.*')
260
+
261
+
262
+ def set_torch(benchmark_limit: int = 32):
244
263
  cudnn.benchmark = True
245
264
  cudnn.deterministic = False
265
+ cudnn.benchmark_limit = benchmark_limit
246
266
  torch.use_deterministic_algorithms(False)
247
267
  torch.set_float32_matmul_precision("high") # highest: FP32, high: TF32, medium: bf16
248
268
  opt_einsum.enabled = True
249
- opt_einsum.strategy = "auto-hq"
269
+ opt_einsum.strategy = "dp"
270
+
271
+ # Torch calls these for 2nd-order optimization in HeavyBall, but they are explicitly handled.
272
+ _ignore_warning(
273
+ 'Using backward() with create_graph=True will create a reference cycle between the parameter and its gradient which can cause a memory leak')
274
+ _ignore_warning(
275
+ 'We recommend using autograd.grad when creating the graph to avoid this. If you have to use this function, make sure to reset the .grad fields of your parameters to None after use to break the cycle and avoid the leak')
250
276
 
251
277
 
252
278
  @decorator
253
279
  def zeropower_via_newtonschulz5(G, steps=5, eps=1e-7):
254
- """
255
- Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
256
- quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
257
- of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
258
- zero even beyond the point where the iteration no longer converges all the way to one everywhere
259
- on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
260
- where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
261
- performance at all relative to UV^T, where USV^T = G is the SVD.
262
- """
263
280
  assert len(G.shape) == 2
264
281
  a, b, c = (3.4445, -4.7750, 2.0315)
265
- X = G.bfloat16()
282
+ X = G.to(torch.bfloat16 if G.dtype != torch.float64 else G.dtype) # Preserve float64 if present
266
283
  X /= (X.norm() + eps) # ensure top singular value <= 1
267
284
  if G.size(0) > G.size(1):
268
285
  X = X.T
@@ -317,12 +334,24 @@ def nesterov_momentum(state, grad, beta):
317
334
  return grad
318
335
 
319
336
 
337
+ @decorator_knowngood
338
+ def _compilable_nesterov_ema_(state, grad, beta):
339
+ ema32 = _lerp(state, grad, beta)
340
+ stochastic_add_(grad, ema32, 1)
341
+
342
+
343
+ def nesterov_ema(state, grad, beta):
344
+ state, grad = list_guard(state, grad)
345
+ beta = scalar_guard(beta, state[0])
346
+ _compilable_nesterov_ema_(state, grad, beta)
347
+ return grad
348
+
349
+
350
+ @decorator_knowngood
320
351
  def _compilable_grafting(magnitude, direction):
321
352
  return direction * (magnitude.norm() / direction.norm().clamp(min=1e-6))
322
353
 
323
354
 
324
- # mode in ("newtonschulz", "qr", "svd")
325
- # scale_mode in ("none", "scale", "graft")
326
355
  @decorator_knowngood
327
356
  def inplace_orthogonal_(x: Tensor, mode: str, out: Tensor, scale_mode: str):
328
357
  if mode == 'newtonschulz' or x.shape[0] != x.shape[1]:
@@ -350,74 +379,82 @@ def _compilable_scatter_set(target, source, index):
350
379
  target[:] = source.contiguous()[index].reshape_as(target)
351
380
 
352
381
 
353
- def get_orthogonal_matrix_QR(GG, Q, exp_avg_sq):
382
+ @decorator_knowngood
383
+ def get_orthogonal_matrix_QR(GG: List[Tensor], Q: List[Tensor], exp_avg: Optional[Tensor] = None):
354
384
  """
355
385
  Computes the eigenbases of the preconditioner using one round of power iteration
356
- followed by torch.linalg.qr decomposition.
386
+ followed by torch.linalg.qr decomposition, and updates exp_avg in-place from old to new eigenspace.
387
+
388
+ :param GG: List of accumulated gradient outer products.
389
+ :param Q: List of current eigenbases (updated in-place to Q_new).
390
+ :param exp_avg: Exponential moving average in the old eigenspace (updated in-place if provided).
357
391
  """
358
- matrix = []
359
- orth_matrix = []
360
- for m, o in zip(GG, Q):
361
- if len(m) == 0:
362
- matrix.append([])
363
- orth_matrix.append([])
364
- continue
365
- if m.data.dtype != torch.float:
366
- matrix.append(promote(m.data))
367
- orth_matrix.append(promote(o.data))
368
- else:
369
- matrix.append(promote(m.data))
370
- orth_matrix.append(promote(o.data))
392
+ if isinstance(Q, list) and not Q:
393
+ return
394
+
395
+ if exp_avg is not None and exp_avg.dim() != len(Q):
396
+ raise ValueError(f"exp_avg dim {exp_avg.dim()} does not match Q length {len(Q)}")
371
397
 
372
- indices = []
398
+ new_qs = []
373
399
 
374
- for ind, (m, o, q) in enumerate(zip(matrix, orth_matrix, Q)):
400
+ for m, q in zip(GG, Q):
375
401
  if len(m) == 0:
376
- indices.append(None)
377
402
  continue
378
403
 
379
- tmp = m @ o
380
- est_eig = torch.einsum('ij,ij->j', o, tmp)
404
+ m = promote(m.data)
405
+ q_old = promote(q.data)
406
+
407
+ tmp = m @ q_old
408
+ est_eig = torch.einsum('ij,ij->j', q_old, tmp)
381
409
  sort_idx = torch.argsort(est_eig, descending=True)
382
- indices.append(sort_idx)
383
- inplace_orthogonal_(tmp[:, sort_idx], zeroth_power_mode, q, "none")
384
410
 
385
- indices = tuple(slice(None) if ind is None else ind.view(*(1,) * i, -1, *(1,) * (exp_avg_sq.dim() - i - 1)) #
386
- for i, ind in enumerate(indices))
387
- _compilable_scatter_set(exp_avg_sq, exp_avg_sq, indices)
411
+ tmp[:, sort_idx], _ = torch.linalg.qr(tmp[:, sort_idx])
412
+ new_qs.append(tmp)
413
+
414
+ if exp_avg is None:
415
+ for q, q_new in zip(Q, new_qs):
416
+ copy_stochastic_(q, q_new)
417
+ return
418
+
419
+ assert exp_avg.ndim < 13, "exp_avg.ndim must be less than 13"
420
+ in_str = einsum_base[:exp_avg.dim()]
421
+ out_str = einsum_base[exp_avg.dim():2 * exp_avg.dim()]
422
+
423
+ from_shampoo = ",".join([o + i for m, i, o in zip(Q, in_str, in_str.upper()) if len(m) > 0])
424
+ if not from_shampoo:
425
+ return
426
+
427
+ to_shampoo = ','.join([i + o for m, i, o in zip(new_qs, in_str.upper(), out_str) if len(m) > 0])
428
+ out_str = ''.join([o if o in to_shampoo else i for i, o in zip(in_str, out_str)])
429
+
430
+ subscripts = f'{in_str},{from_shampoo},{to_shampoo}->{out_str}'
431
+ exp_avg_new = torch.einsum(subscripts, exp_avg, *[q for q in Q], *[q for q in new_qs])
432
+ copy_stochastic_(exp_avg, exp_avg_new)
433
+
434
+ for q, q_new in zip(Q, new_qs):
435
+ copy_stochastic_(q, q_new)
388
436
 
389
437
 
390
438
  def get_orthogonal_matrix(mat):
391
439
  """
392
440
  Computes the eigenbases of the preconditioner using torch.linalg.eigh decomposition.
393
441
  """
394
- matrix = []
395
- for m in mat:
396
- if len(m) == 0:
397
- matrix.append([])
398
- continue
399
- if m.data.dtype != torch.float:
400
- float_data = False
401
- original_type = m.data.dtype
402
- original_device = m.data.device
403
- matrix.append(promote(m.data))
404
- else:
405
- float_data = True
406
- matrix.append(m.data)
407
442
 
408
443
  final = []
409
- for m in matrix:
444
+ for m in mat:
410
445
  if len(m) == 0:
411
446
  final.append([])
412
447
  continue
413
448
 
449
+ m = promote(m.data)
450
+
414
451
  device, dtype = m.device, m.dtype
415
452
  for modifier in (None, torch.double, 'cpu'):
416
453
  if modifier is not None:
417
454
  m = m.to(modifier)
418
455
  try:
419
- Q = torch.linalg.eigh(m + 1e-30 * torch.eye(m.shape[0], device=m.device))[1].to(device=device,
420
- dtype=dtype)
456
+ eigval, eigvec = torch.linalg.eigh(m + 1e-30 * torch.eye(m.shape[0], device=m.device, dtype=m.dtype))
457
+ eigvec = eigvec.to(device=device, dtype=dtype)
421
458
  break
422
459
  except torch.OutOfMemoryError:
423
460
  pass
@@ -427,9 +464,9 @@ def get_orthogonal_matrix(mat):
427
464
  else:
428
465
  raise RuntimeError("Failed to compute eigenvalues.")
429
466
 
430
- Q = torch.flip(Q, [1])
467
+ eigvec = torch.flip(eigvec, [1])
431
468
 
432
- final.append(Q)
469
+ final.append(eigvec)
433
470
 
434
471
  return final
435
472
 
@@ -454,7 +491,7 @@ def get_beta1(group):
 
 
 def get_beta2(group):
-    if 'beta2_scale' in group:
+    if 'palm' in group and group['palm'] is True and 'beta2_scale' in group:
         step = max(group.get("step", 1), 1)
         return 1 - step ** -group['beta2_scale']
     if 'betas' in group:
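
With this change the PaLM-style schedule beta2(t) = 1 - t ** (-beta2_scale) is only used when palm is explicitly enabled; otherwise the fixed betas[1] applies. For reference, the schedule with the default beta2_scale=0.8 looks like this (values rounded):

for step in (1, 10, 100, 1000, 10000):
    print(step, 1 - step ** -0.8)
# 1 -> 0.0, 10 -> 0.842, 100 -> 0.975, 1000 -> 0.996, 10000 -> 0.9994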
@@ -509,21 +546,46 @@ def stochastic_add_(x: List[Tensor], y: List[Tensor], alpha: Union[float, int, T
509
546
  _compilable_stochastic_add_(x, y, alpha)
510
547
 
511
548
 
549
+ @decorator_knowngood
550
+ def _compilable_stochastic_multiply_(x: List[Tensor], y: List[Tensor]):
551
+ for x_, y_ in zip(x, y):
552
+ x32 = promote(x_)
553
+ y32 = promote(y_)
554
+ copy_stochastic_(x_, x32 * y32)
555
+
556
+
557
+ def stochastic_multiply_(x: List[Tensor], y: List[Tensor]):
558
+ x, y = list_guard(x, y)
559
+ _compilable_stochastic_multiply_(x, y)
560
+
561
+
512
562
  @decorator
513
- def compute_ggt(grad, GG, max_precond_dim, precondition_1d, beta):
563
+ def update_ggt(grad, GG, max_precond_dim, precondition_1d, beta):
564
+ """
565
+ Simplified by @francois-rozet in commit 704ccc4bab52429f945df421647ec82c54cdd65f
566
+ Re-commited due to faulty merge
567
+ """
514
568
  if grad.dim() == 1 and (not precondition_1d or grad.shape[0] > max_precond_dim):
515
569
  return
516
570
 
517
- for idx, sh in enumerate(grad.shape):
518
- if sh > max_precond_dim:
571
+ for idx, m in enumerate(GG):
572
+ if not isinstance(m, Tensor):
519
573
  continue
520
574
  b = einsum_base[idx]
521
575
  g0 = einsum_base[:grad.dim()]
522
576
  g1 = g0.replace(b, b.upper())
523
577
  outer_product = torch.einsum(f'{g0},{g1}->{b + b.upper()}', grad, grad)
524
- GG[idx].lerp_(outer_product, 1 - beta)
578
+ m.lerp_(outer_product, 1 - beta)
579
+
580
+
581
+ def tree_apply(fn):
582
+ def _fn(*args):
583
+ return tree_map(fn, *args)
584
+
585
+ return _fn
525
586
 
526
587
 
588
+ @tree_apply
527
589
  def promote(x):
528
590
  if isinstance(x, torch.dtype) and x in (torch.bfloat16, torch.float16):
529
591
  return torch.float32
@@ -540,45 +602,38 @@ def min_dtype(xs: List[Tensor]):
540
602
  return torch.float32
541
603
 
542
604
 
543
- def update_preconditioner(grad, Q, GG, exp_avg_sq, max_precond_dim, precondition_1d, beta, update_precond):
605
+ def update_preconditioner(grad, Q, GG, exp_avg, max_precond_dim, precondition_1d, beta, update_precond):
544
606
  """
545
607
  Updates the preconditioner matrices and the eigenbases (L, R, Q_L, Q_R in the paper).
546
608
  """
547
- compute_ggt(grad, GG, max_precond_dim, precondition_1d, beta)
609
+ update_ggt(grad, GG, max_precond_dim, precondition_1d, beta)
548
610
  if update_precond:
549
- get_orthogonal_matrix_QR(GG, Q, exp_avg_sq)
611
+ get_orthogonal_matrix_QR(GG, Q, exp_avg)
550
612
 
551
613
 
552
- def init_preconditioner(grad, state, beta, max_precond_dim=10000, precondition_1d=False):
614
+ def init_preconditioner(grad, state, max_precond_dim, precondition_1d):
553
615
  """
554
616
  Initializes the preconditioner matrices (L and R in the paper).
555
617
  """
556
618
  state['GG'] = [] # Will hold all the preconditioner matrices (L and R in the paper).
557
- if grad.dim() == 1:
558
- if precondition_1d or grad.shape[0] > max_precond_dim:
559
- state['GG'].append(torch.zeros(grad.shape[0], grad.shape[0], device=grad.device, dtype=grad.dtype))
560
- else:
561
- state['GG'].append([])
562
-
563
- else:
619
+ if grad.numel() > 1 and (grad.ndim > 1 or precondition_1d):
564
620
  for sh in grad.shape:
565
621
  if sh > max_precond_dim:
566
- state['GG'].append([])
622
+ state['GG'].append(None)
567
623
  else:
568
624
  state['GG'].append(torch.zeros(sh, sh, device=grad.device, dtype=grad.dtype))
625
+ else:
626
+ state['GG'].append(None)
569
627
 
570
- compute_ggt(grad, state['GG'], max_precond_dim, precondition_1d, beta)
628
+ update_ggt(grad, state['GG'], max_precond_dim, precondition_1d, 0)
571
629
  state['Q'] = get_orthogonal_matrix(state['GG'])
572
630
 
573
631
 
574
632
  @decorator
575
633
  def project(grad, Q, back: bool):
576
634
  """
577
-
578
635
  :param grad:
579
636
  :param Q:
580
- :param merge_dims:
581
- :param max_precond_dim:
582
637
  :param back: whether to project to Shampoo eigenbases or back to original space
583
638
  :return:
584
639
  """
@@ -591,12 +646,40 @@ def project(grad, Q, back: bool):
591
646
  return grad
592
647
 
593
648
 
649
+ def modify_closure(closure):
650
+ """
651
+ Modifies the closure function to use create_graph=True in backward().
652
+
653
+ Args:
654
+ closure: The closure function passed to the optimizer.
655
+
656
+ Returns:
657
+ The return value of the modified closure.
658
+ """
659
+
660
+ def patched_backward(self, *args, **kwargs):
661
+ kwargs['create_graph'] = True
662
+ return original_backward(self, *args, **kwargs)
663
+
664
+ original_backward = torch.Tensor.backward
665
+
666
+ with patch.object(torch.Tensor, 'backward', patched_backward):
667
+ return closure()
668
+
669
+
594
670
  class StatefulOptimizer(torch.optim.Optimizer):
671
+ """
672
+ finite_differences saves memory, but needs more compute. (Alternative is true HVP)
673
+ Both `True` and `False` have some edge cases they don't support, so experiment with it.
674
+ The previous (heavyball<=1.5.3) default was `True`, which is incompatible with some benchmarks but works better with RevNet
675
+ Further notice that both methods have different numerics outputs
676
+ """
595
677
  ema_decay: float = 0.001
596
678
  compile_step: bool = False
597
679
  hessian_approx: bool = False
598
680
  precond_schedule: Union[Callable, float, None] = None
599
681
  stochastic_schedule: bool = False
682
+ finite_differences: bool = False
600
683
 
601
684
  def __init__(self, params, defaults, foreach: bool = True, use_ema: bool = False):
602
685
  super().__init__(params, {**defaults, 'foreach': foreach})
@@ -709,38 +792,68 @@ class StatefulOptimizer(torch.optim.Optimizer):
709
792
  set_(self.state_(p)['param_ema'], p.data)
710
793
  set_(p.data, ema_clone)
711
794
 
712
- def step(self, closure: Optional[Callable] = None):
713
- if self.precond_schedule is None:
714
- self._is_preconditioning = False
715
- else:
716
- self._is_preconditioning = psgd_should_update(self._inner_group, self.precond_schedule, self._precond_rng)
795
+ def _handle_closure(self, closure):
717
796
  hessian_approx = self.hessian_approx and self._is_preconditioning
797
+
718
798
  if closure is None:
719
799
  if hessian_approx:
720
800
  raise ValueError("Hessian approximation requires a closure.")
721
- loss = None
722
- else:
801
+ return None
802
+
803
+ if not hessian_approx:
723
804
  with torch.enable_grad():
724
805
  loss = closure()
725
- if hessian_approx:
726
- grads = []
727
- for group in self.param_groups:
728
- for p, g in self.split_p_and_g_in_group(group, skip_none=True, should_promote=False):
729
- grads.append(g)
730
- p.vector = torch.randn_like(p)
731
- p.orig = p.data.clone()
732
- stochastic_add_(p.data, p.vector, tiny_bf16)
733
-
734
- with torch.enable_grad():
735
- closure()
736
-
737
- for group in self.param_groups:
738
- for p, g in self.split_p_and_g_in_group(group, skip_none=True, should_promote=False):
739
- p.grad = grads.pop(0)
740
- stochastic_add_(g, p.grad, -1)
741
- p.hessian_vector = g
742
- p.data.copy_(p.orig)
743
- del p.orig
806
+ return loss
807
+
808
+ if self.finite_differences:
809
+ with torch.enable_grad():
810
+ loss = closure() # closure without retain_graph=True
811
+
812
+ grads = []
813
+ for group in self.param_groups:
814
+ for p, g in self.split_p_and_g_in_group(group, skip_none=True, should_promote=False):
815
+ grads.append(g)
816
+ p.vector = torch.randn_like(p)
817
+ p.orig = p.data.clone()
818
+ stochastic_add_(p.data, p.vector, tiny_bf16)
819
+ else:
820
+ with torch.enable_grad():
821
+ loss = modify_closure(closure)
822
+
823
+ if self.finite_differences:
824
+ with torch.enable_grad():
825
+ closure()
826
+
827
+ for group in self.param_groups:
828
+ for p, g in self.split_p_and_g_in_group(group, skip_none=True, should_promote=False):
829
+ p.grad = grads.pop(0)
830
+ stochastic_add_(g, p.grad, -1)
831
+ p.hessian_vector = g
832
+ p.data.copy_(p.orig)
833
+ del p.orig
834
+ else:
835
+ for group in self.param_groups:
836
+ for p, g in self.split_p_and_g_in_group(group, skip_none=True, should_promote=False):
837
+ p.grad = g
838
+ params, grads = zip(*[x for group in self.param_groups for x in
839
+ self.split_p_and_g_in_group(group, skip_none=True, should_promote=False)])
840
+ vs = [torch.randn_like(p) for p in params]
841
+ with torch.enable_grad():
842
+ hvs = torch.autograd.grad(grads, params, vs)
843
+
844
+ for p, g, v, hv in zip(params, grads, vs, hvs):
845
+ p.hessian_vector = hv
846
+ p.grad = g
847
+ p.vector = v
848
+
849
+ return loss
850
+
851
+ def step(self, closure: Optional[Callable] = None):
852
+ if self.precond_schedule is None:
853
+ self._is_preconditioning = False
854
+ else:
855
+ self._is_preconditioning = psgd_should_update(self._inner_group, self.precond_schedule, self._precond_rng)
856
+ loss = self._handle_closure(closure)
744
857
 
745
858
  # we assume that parameters are constant and that there are no excessive recompiles
746
859
  with torch.no_grad(), torch._dynamo.utils.disable_cache_limit():
@@ -748,7 +861,7 @@ class StatefulOptimizer(torch.optim.Optimizer):
748
861
  group['is_preconditioning'] = self._is_preconditioning
749
862
  self._step(group)
750
863
  if self.use_ema:
751
- self.ema_update(group)
864
+ self.ema_update()
752
865
 
753
866
  return loss
754
867
 
@@ -758,12 +871,12 @@ def copy_stochastic_list_(target: List[Tensor], source: List[Tensor]):
758
871
  copy_stochastic_(t, s)
759
872
 
760
873
 
761
- def _lerp32(state: List[Tensor], grad: List[Tensor], beta):
874
+ @decorator_knowngood
875
+ def _lerp(state: List[Tensor], grad: List[Tensor], beta):
762
876
  ea32 = list(map(promote, state))
763
877
  grad = list(map(promote, grad))
764
878
  beta = promote(beta)
765
-
766
- ea32 = [e.lerp(g, 1 - beta) for e, g in zip(ea32, grad)]
879
+ ea32 = [e * beta + g * (1 - beta) for e, g in zip(ea32, grad)]
767
880
  copy_stochastic_list_(state, ea32)
768
881
  return ea32
769
882
 
@@ -775,15 +888,14 @@ def _compilable_adam_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: Lis
775
888
  beta2 = beta_debias(beta2, step)
776
889
 
777
890
  g32 = list(map(promote, grad))
778
-
779
- exp_avg32 = _lerp32(exp_avg, g32, beta1)
780
- denom = exp_avg_sq_(exp_avg_sq, g32, beta2, eps)
781
- u32 = torch._foreach_div(exp_avg32, denom)
891
+ exp_avg32 = _lerp(exp_avg, g32, beta1)
892
+ denom = _compilable_exp_avg_sq_(exp_avg_sq, g32, beta2, eps, [None])
893
+ u32 = [ea / d for ea, d in zip(exp_avg32, denom)]
782
894
  copy_stochastic_list_(grad, u32)
783
895
 
784
896
 
785
897
  def adam_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: List[Tensor], beta1: float, beta2: float, step: int,
786
- eps: float):
898
+ eps: float = 1e-8):
787
899
  exp_avg, exp_avg_sq, grad = map(list_guard, (exp_avg, exp_avg_sq, grad))
788
900
  beta1, beta2, step, eps = scalar_guard(beta1, beta2, step, eps, exp_avg[0])
789
901
  _compilable_adam_(exp_avg, exp_avg_sq, grad, beta1, beta2, step, eps)
@@ -798,9 +910,8 @@ def _fused_compilable_adam_(y: List[Tensor], exp_avg: List[Tensor], exp_avg_sq:
798
910
  beta2 = beta_debias(beta2, step)
799
911
 
800
912
  u32, g32 = [list(map(promote, x)) for x in [update, grad]]
801
-
802
- exp_avg32 = _lerp32(exp_avg, u32, beta1)
803
- denom = exp_avg_sq_(exp_avg_sq, u32, beta2, 1e-8)
913
+ exp_avg32 = _lerp(exp_avg, u32, beta1)
914
+ denom = _compilable_exp_avg_sq_(exp_avg_sq, u32, beta2, eps, [None])
804
915
  u32 = torch._foreach_div(exp_avg32, denom)
805
916
  _compilable_update_(y, u32, decay, lr, caution, g32)
806
917
 
@@ -810,51 +921,50 @@ def fused_adam_(y: List[Tensor], exp_avg: List[Tensor], exp_avg_sq: List[Tensor]
810
921
  caution: bool):
811
922
  y, exp_avg, exp_avg_sq, grad = list_guard(y, exp_avg, exp_avg_sq, grad)
812
923
  beta1, beta2, step, lr = scalar_guard(beta1, beta2, step, lr, y[0])
813
- return _fused_compilable_adam_(y, exp_avg, exp_avg_sq, update, grad, beta1, beta2, step, decay, lr, eps, caution)
924
+ _fused_compilable_adam_(y, exp_avg, exp_avg_sq, update, grad, beta1, beta2, step, decay, lr, eps, caution)
814
925
 
815
926
 
816
927
  @decorator_knowngood
817
928
  def _compilable_laprop_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: List[Tensor], beta1: Tensor,
818
- beta2: Tensor, step: Tensor):
929
+ beta2: Tensor, step: Tensor, eps: Tensor):
819
930
  beta1 = beta_debias(beta1, step)
820
931
  beta2 = beta_debias(beta2, step)
821
932
 
822
933
  gp32 = list(map(promote, grad))
823
-
824
- denom = exp_avg_sq_(exp_avg_sq, gp32, beta2, 1e-8)
934
+ denom = _compilable_exp_avg_sq_(exp_avg_sq, gp32, beta2, eps, [None])
825
935
  gp32 = torch._foreach_div(gp32, denom)
826
- gp32 = _lerp32(exp_avg, gp32, beta1)
827
-
936
+ gp32 = _lerp(exp_avg, gp32, beta1)
828
937
  copy_stochastic_list_(grad, gp32)
829
938
 
830
939
 
831
- def laprop_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: List[Tensor], beta1: float, beta2: float, step: int):
940
+ def laprop_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: List[Tensor], beta1: float, beta2: float, step: int,
941
+ eps: float = 1e-8):
832
942
  exp_avg, exp_avg_sq, grad = list_guard(exp_avg, exp_avg_sq, grad)
833
- beta1, beta2, step = scalar_guard(beta1, beta2, step, exp_avg[0])
834
- _compilable_laprop_(exp_avg, exp_avg_sq, grad, beta1, beta2, step)
943
+ beta1, beta2, step, eps = scalar_guard(beta1, beta2, step, eps, exp_avg[0])
944
+ _compilable_laprop_(exp_avg, exp_avg_sq, grad, beta1, beta2, step, eps)
835
945
  return grad
836
946
 
837
947
 
838
948
  @decorator_knowngood
839
949
  def _fused_compilable_laprop_(y: List[Tensor], exp_avg: List[Tensor], exp_avg_sq: List[Tensor], update: List[Tensor],
840
950
  grad: List[Tensor], beta1: Tensor, beta2: Tensor, step: Tensor, lr: Tensor, decay: Tensor,
841
- caution: bool):
951
+ caution: bool, eps: Tensor):
842
952
  beta1 = beta_debias(beta1, step)
843
953
  beta2 = beta_debias(beta2, step)
844
954
 
845
955
  u32, gp32 = [list(map(promote, x)) for x in [update, grad]]
846
-
847
- denom = exp_avg_sq_(exp_avg_sq, u32, beta2, 1e-8)
956
+ denom = _compilable_exp_avg_sq_(exp_avg_sq, u32, beta2, eps, [None])
848
957
  u32 = torch._foreach_div(u32, denom)
849
- u32 = _lerp32(exp_avg, u32, beta1)
958
+ u32 = _lerp(exp_avg, u32, beta1)
850
959
  _compilable_update_(y, u32, decay, lr, caution, gp32)
851
960
 
852
961
 
853
962
  def fused_laprop_(y: List[Tensor], exp_avg: List[Tensor], exp_avg_sq: List[Tensor], update: List[Tensor],
854
- grad: List[Tensor], beta1: float, beta2: float, step: int, lr: float, decay: float, caution: bool):
963
+ grad: List[Tensor], beta1: float, beta2: float, step: int, lr: float, decay: float, caution: bool,
964
+ eps: float = 1e-8):
855
965
  exp_avg, exp_avg_sq, grad, y = list_guard(exp_avg, exp_avg_sq, grad, y)
856
- beta1, beta2, step, lr = scalar_guard(beta1, beta2, step, lr, exp_avg[0])
857
- _fused_compilable_laprop_(y, exp_avg, exp_avg_sq, update, grad, beta1, beta2, step, lr, decay, caution)
966
+ beta1, beta2, step, lr, eps = scalar_guard(beta1, beta2, step, lr, eps, exp_avg[0])
967
+ _fused_compilable_laprop_(y, exp_avg, exp_avg_sq, update, grad, beta1, beta2, step, lr, decay, caution, eps)
858
968
 
859
969
 
860
970
  @decorator_knowngood
@@ -891,7 +1001,7 @@ def _compilable_adopt_(grad, exp_avg_sq, exp_avg, beta1, beta2, step):
891
1001
  copy_stochastic_list_(exp_avg, exp_avg32)
892
1002
 
893
1003
  beta2 = beta_debias(beta2, step + 1)
894
- exp_avg_sq32 = [eas32.lerp(g * g, 1 - beta2) for eas32, g in zip(exp_avg_sq32, u32)]
1004
+ exp_avg_sq32 = [eas32.lerp(g * g, 1 - beta2) for eas32, g in zip(exp_avg_sq32, g32)]
895
1005
  copy_stochastic_list_(exp_avg_sq, exp_avg_sq32)
896
1006
 
897
1007
  copy_stochastic_list_(grad, update)
@@ -970,12 +1080,15 @@ def get_soap_precond_schedule(precond_scheduler):
970
1080
  return _inner
971
1081
 
972
1082
 
1083
+ def _max_idx(x: List[int]):
1084
+ return len(x) - 1 - np.argmax(x[::-1]) # we want to start counting from the back, as torch is fan-out/fan-in
1085
+
1086
+
973
1087
  def init_Q_exprs(t, scale, max_size, min_ndim_triangular, memory_save_mode, dtype=None):
974
1088
  """For a scalar or tensor t, we initialize its preconditioner Q and
975
1089
  reusable einsum expressions for updating Q and preconditioning gradient.
976
1090
  """
977
1091
  letters = string.ascii_lowercase + string.ascii_uppercase
978
-
979
1092
  dtype = dtype if dtype is not None else t.dtype
980
1093
  shape = t.shape
981
1094
 
@@ -992,17 +1105,20 @@
 
     scale = scale ** (1 / len(shape))
 
+    dim_diag = [False for _ in shape]
     if memory_save_mode is None:
-        dim_diag = [False for _ in shape]
+        pass
     elif memory_save_mode == "one_diag":
-        rev_sorted_dims = np.argsort(shape)[::-1]
-        dim_diag = [False for _ in shape]
-        dim_diag[rev_sorted_dims[0]] = True
+        dim_diag[_max_idx(shape)] = True
+    elif memory_save_mode == "smart_one_diag":
+        sorted_shape = sorted(shape)
+        if len(shape) >= 2 and sorted_shape[-1] > sorted_shape[-2]:
+            dim_diag[_max_idx(shape)] = True
     elif memory_save_mode == "all_diag":
         dim_diag = [True for _ in shape]
     else:
         raise ValueError(f"Invalid memory_save_mode: {memory_save_mode}, must be one of "
-                         "[None, 'one_diag', 'all_diag']")
+                         "[None, 'one_diag', 'all_diag', 'smart_one_diag']")
 
     Q = []
     piece1A, piece2A, piece3A = ([], "", "")
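
The new smart_one_diag mode only falls back to a diagonal preconditioner when one dimension is strictly the largest; square-ish tensors keep full Kronecker factors. A small illustration of the selection logic, re-implemented outside the library (pick_diag_dims is our name; _max_idx mirrors the helper added earlier in this diff):

def _max_idx(shape):
    # index of the largest dim, preferring the last occurrence (torch's fan-out/fan-in layout)
    return len(shape) - 1 - shape[::-1].index(max(shape))

def pick_diag_dims(shape, mode):
    dim_diag = [False] * len(shape)
    if mode == "one_diag":
        dim_diag[_max_idx(shape)] = True
    elif mode == "smart_one_diag":
        s = sorted(shape)
        if len(shape) >= 2 and s[-1] > s[-2]:  # only when one dim is strictly largest
            dim_diag[_max_idx(shape)] = True
    elif mode == "all_diag":
        dim_diag = [True] * len(shape)
    return dim_diag

print(pick_diag_dims((50257, 768), "smart_one_diag"))  # [True, False] -> the huge vocab dim goes diagonal
print(pick_diag_dims((768, 768), "smart_one_diag"))    # [False, False] -> square weight keeps both factors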
@@ -1016,11 +1132,9 @@ def init_Q_exprs(t, scale, max_size, min_ndim_triangular, memory_save_mode, dtyp
1016
1132
  piece1A.append(letters[i])
1017
1133
  piece2A = piece2A + letters[i]
1018
1134
  piece3A = piece3A + letters[i]
1019
-
1020
1135
  piece1 = "".join([(letters[i + 13] if j == i else letters[j]) for j in range(len(shape))])
1021
1136
  subscripts = piece1 + "," + piece1 + "->" + letters[i + 13]
1022
1137
  exprGs.append(subscripts)
1023
-
1024
1138
  piece1P.append(letters[i + 13])
1025
1139
  piece2P.append(letters[i + 13])
1026
1140
  piece3P = piece3P + letters[i + 13]
@@ -1028,16 +1142,13 @@ def init_Q_exprs(t, scale, max_size, min_ndim_triangular, memory_save_mode, dtyp
1028
1142
  else:
1029
1143
  # use triangular matrix as preconditioner for this dim
1030
1144
  Q.append(scale * torch.eye(size, dtype=dtype, device=t.device))
1031
-
1032
1145
  piece1A.append(letters[i] + letters[i + 13])
1033
1146
  piece2A = piece2A + letters[i + 13]
1034
1147
  piece3A = piece3A + letters[i]
1035
-
1036
1148
  piece1 = "".join([(letters[i + 13] if j == i else letters[j]) for j in range(len(shape))])
1037
1149
  piece2 = "".join([(letters[i + 26] if j == i else letters[j]) for j in range(len(shape))])
1038
1150
  subscripts = (piece1 + "," + piece2 + "->" + letters[i + 13] + letters[i + 26])
1039
1151
  exprGs.append(subscripts)
1040
-
1041
1152
  a, b, c = (letters[i], letters[i + 13], letters[i + 26])
1042
1153
  piece1P.append(a + b)
1043
1154
  piece2P.append(a + c)
@@ -1058,7 +1169,7 @@ def psgd_balance_Q(Q_in):
1058
1169
 
1059
1170
 
1060
1171
  def psgd_calc_A_and_conjB(exprA, G, Q, V=None):
1061
- eps = scalar_guard(math.sqrt(torch.finfo(torch.float32).eps), G)
1172
+ eps = scalar_guard(math.sqrt(torch.finfo(G.dtype).eps), G)
1062
1173
  eps *= G.norm() / G.numel()
1063
1174
  G = G + torch.randn_like(G) * eps
1064
1175
  md = min_dtype(Q + [G])
@@ -1084,9 +1195,7 @@ def psgd_lb(A, max_abs):
1084
1195
  A /= max_abs
1085
1196
  a0 = torch.einsum('ij,ij->j', A, A)
1086
1197
  i = torch.argmax(a0)
1087
-
1088
1198
  x = torch.index_select(A, 1, i).flatten().contiguous()
1089
-
1090
1199
  x = torch.einsum('i,ij->j', x, A)
1091
1200
  x /= x.norm()
1092
1201
  x = torch.einsum('j,kj->k', x, A)
@@ -1099,15 +1208,12 @@ def psgd_lb(A, max_abs):
1099
1208
  def psgd_update_precond(Q, exprs, G, precond_lr, oq, store_triu_as_line, V):
1100
1209
  """Update Kronecker product preconditioner Q with pair (V, G)."""
1101
1210
  exprA, exprGs, _ = exprs
1102
-
1103
1211
  A, conjB = psgd_calc_A_and_conjB(exprA, G, Q, V)
1104
1212
 
1105
1213
  for q, exprG, o in zip(Q, exprGs, oq):
1106
1214
  term1 = promote(torch.einsum(exprG, A, A))
1107
1215
  term2 = promote(torch.einsum(exprG, conjB, conjB))
1108
-
1109
1216
  term1, term2 = term1 - term2, term1 + term2
1110
-
1111
1217
  term1 *= precond_lr
1112
1218
  norm = term2.norm(float('inf'))
1113
1219
  if q.dim() < 2:
@@ -1221,6 +1327,48 @@ def identity(x):
     return x
 
 
+@decorator_knowngood
+def _compilable_weight_decay_to_ema_(p, ema, ema_decay, weight_decay):
+    ema32 = _lerp(ema, p, ema_decay)
+    _lerp(p, ema32, 1 - weight_decay)
+
+
+def weight_decay_to_ema_(p, ema, ema_decay, weight_decay):
+    p, ema = list_guard(p, ema)
+    ema_decay, weight_decay = scalar_guard(ema_decay, weight_decay, p[0])
+    _compilable_weight_decay_to_ema_(p, ema, ema_decay, weight_decay)
+
+
+@decorator_knowngood
+def _compilable_l1_weight_decay_to_ema_(p, ema, ema_decay, weight_decay):
+    ema32 = _lerp(ema, p, ema_decay)
+    for p_, e_ in zip(p, ema32):
+        p32 = promote(p_)
+        p32 = p32 + (p32 - e_).sign() * weight_decay
+        copy_stochastic_(p_, p32)
+
+
+def l1_weight_decay_to_ema_(p, ema, ema_decay, weight_decay):
+    p, ema = list_guard(p, ema)
+    ema_decay, weight_decay = scalar_guard(ema_decay, weight_decay, p[0])
+    _compilable_l1_weight_decay_to_ema_(p, ema, ema_decay, weight_decay)
+
+
+@decorator_knowngood
+def _compilable_sign_(grad: List[Tensor], graft: bool):
+    for g_ in grad:
+        gs = g_.sign()
+        if graft:
+            gs = _compilable_grafting(g_, gs)
+        copy_stochastic_(g_, gs)
+
+
+def sign_(grad: List[Tensor], graft: bool = True):
+    grad = list_guard(grad)
+    _compilable_sign_(grad, graft)
+    return grad
+
+
 @decorator_knowngood
 def _compilable_trust_region_clip_(grad, lerp, scale):
     # (sgn(x) * log(1 + |x|) * 0.1 + tanh(x) * 0.9).clamp_(min=-2, max=2)
@@ -1372,7 +1520,6 @@ def _compilable_orthogonalization(weight: List[Tensor], grad: List[Tensor], eps:
 
     if graft:
         out = _compilable_grafting(g, out)
-
     copy_stochastic_(g, out)
 
 
@@ -1,9 +1,9 @@
 Metadata-Version: 2.1
 Name: heavyball
-Version: 1.5.2
+Version: 1.6.0
 Summary: Efficient optimizers
-Home-page: https://github.com/clashluke/heavyball
-Author: Lucas Nestler
+Home-page: https://github.com/HomebrewML/HeavyBall
+Author: HeavyBall Authors
 Author-email: github.heavyball@nestler.sh
 License: BSD
 Classifier: Development Status :: 5 - Production/Stable
@@ -300,7 +300,7 @@ class ForeachSOAP(C.BaseOpt):
     def __init__(self, params, lr: float = 3e-3, betas=(0.9, 0.95), shampoo_beta: float = 0.95, eps: float = 1e-8,
                  weight_decay: float = 0.01, precondition_frequency: int = 2, max_precond_dim: int = 2048,  #
                  merge_dims: bool = True, precondition_1d: bool = False, normalize_grads: bool = False,
-                 data_format: str = "channels_first", correct_bias: bool = True, warmup_steps: int = 1,
+                 correct_bias: bool = True, warmup_steps: int = 1,
                  split: bool = False, foreach: bool = True, mars: bool = False, caution: bool = False,
                  mars_gamma: float = 0.0025, palm: bool = C.use_default, precond_scheduler=(1 / 3, 9),
                  beta2_scale: float = 0.8, use_precond_schedule: bool = C.use_default,
@@ -324,7 +324,6 @@ the second-order statistics of the gradients to accelerate convergence.
 * **`merge_dims`**: Whether to merge dimensions when forming the preconditioner.
 * **`precondition_1d`**: Whether to use a 1D preconditioner for 1D parameters.
 * **`normalize_grads`**: Whether to normalize gradients before applying SOAP.
-* **`data_format`**: `"channels_first"` or `"channels_last"`. Specifies the data format of the input tensors.
 * **`correct_bias`**: Enables/disables bias correction for the running averages.
 * **`warmup_steps`**: Number of steps for linear learning rate warmup.
 * **`split`**: Whether to split large dimensions when forming the preconditioner.
@@ -931,4 +930,3 @@ tasks. However, the best choice always depends on your specific model, dataset,
 * **`heavyball.utils`:** Remember to utilize the settings and functions in `heavyball.utils` (e.g., `set_torch`,
   `compile_mode`, `zeroth_power_mode`, clipping functions) to optimize performance and experiment with different
   configurations.
-
@@ -0,0 +1,8 @@
+heavyball/__init__.py,sha256=DKp8rEAf7mt2-j9XRVlgjaLjyfuwUsyl_uXJoOKWAHg,15362
+heavyball/chainable.py,sha256=n_u0QS92WitbtnENvNQ0m4dZTHuJ5ObQ88XA3cmhCfo,27298
+heavyball/utils.py,sha256=Nk0q_sfv47F-QC9Wwi5KCt-C_71OhuzM98XHlYGvl24,55905
+heavyball-1.6.0.dist-info/LICENSE,sha256=CGdGJim64YifGmUVPaeyRsxkvyExtClswhRNIp8FY_U,1322
+heavyball-1.6.0.dist-info/METADATA,sha256=5suezTlZCOBwCgHeFgkLaywYwjAWN1SPg6yhvAv1WgE,43441
+heavyball-1.6.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+heavyball-1.6.0.dist-info/top_level.txt,sha256=SzCxSVg_qCUPA4kZObW3Zyo4v-d_mMOD-p7a-WXTl2E,10
+heavyball-1.6.0.dist-info/RECORD,,
@@ -1,8 +0,0 @@
1
- heavyball/__init__.py,sha256=f0wWIjsibgA4_YwkPP8HFD7-snggYsAOFc84W0WnNMA,12049
2
- heavyball/chainable.py,sha256=ygeQU-t3RT0Q1BWrEQ_0b4SlXYy8aGDt0DCZAfbiNiw,25040
3
- heavyball/utils.py,sha256=D7ENwrIex_dgFiUHezymmsIdruoQ4_hYztIolCXo2KE,50636
4
- heavyball-1.5.2.dist-info/LICENSE,sha256=CGdGJim64YifGmUVPaeyRsxkvyExtClswhRNIp8FY_U,1322
5
- heavyball-1.5.2.dist-info/METADATA,sha256=n_2fW7Wcz_btxBRWFibTe8wnM10B2su100bJzW0bfZY,43584
6
- heavyball-1.5.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
7
- heavyball-1.5.2.dist-info/top_level.txt,sha256=SzCxSVg_qCUPA4kZObW3Zyo4v-d_mMOD-p7a-WXTl2E,10
8
- heavyball-1.5.2.dist-info/RECORD,,