heavyball 1.1.0__tar.gz → 1.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {heavyball-1.1.0 → heavyball-1.1.2}/PKG-INFO +1 -1
- {heavyball-1.1.0 → heavyball-1.1.2}/heavyball/__init__.py +40 -35
- {heavyball-1.1.0 → heavyball-1.1.2}/heavyball/chainable.py +24 -25
- {heavyball-1.1.0 → heavyball-1.1.2}/heavyball/utils.py +108 -80
- {heavyball-1.1.0 → heavyball-1.1.2}/heavyball.egg-info/PKG-INFO +1 -1
- {heavyball-1.1.0 → heavyball-1.1.2}/setup.py +1 -1
- {heavyball-1.1.0 → heavyball-1.1.2}/test/test_channels_last.py +2 -1
- {heavyball-1.1.0 → heavyball-1.1.2}/LICENSE +0 -0
- {heavyball-1.1.0 → heavyball-1.1.2}/README.md +0 -0
- {heavyball-1.1.0 → heavyball-1.1.2}/heavyball.egg-info/SOURCES.txt +0 -0
- {heavyball-1.1.0 → heavyball-1.1.2}/heavyball.egg-info/dependency_links.txt +0 -0
- {heavyball-1.1.0 → heavyball-1.1.2}/heavyball.egg-info/requires.txt +0 -0
- {heavyball-1.1.0 → heavyball-1.1.2}/heavyball.egg-info/top_level.txt +0 -0
- {heavyball-1.1.0 → heavyball-1.1.2}/setup.cfg +0 -0
- {heavyball-1.1.0 → heavyball-1.1.2}/test/test_bf16_params.py +0 -0
- {heavyball-1.1.0 → heavyball-1.1.2}/test/test_bf16_q.py +0 -0
- {heavyball-1.1.0 → heavyball-1.1.2}/test/test_bf16_storage.py +0 -0
- {heavyball-1.1.0 → heavyball-1.1.2}/test/test_caution.py +0 -0
- {heavyball-1.1.0 → heavyball-1.1.2}/test/test_closure.py +0 -0
- {heavyball-1.1.0 → heavyball-1.1.2}/test/test_ema.py +0 -0
- {heavyball-1.1.0 → heavyball-1.1.2}/test/test_foreach.py +0 -0
- {heavyball-1.1.0 → heavyball-1.1.2}/test/test_hook.py +0 -0
- {heavyball-1.1.0 → heavyball-1.1.2}/test/test_mars.py +0 -0
- {heavyball-1.1.0 → heavyball-1.1.2}/test/test_memory.py +0 -0
- {heavyball-1.1.0 → heavyball-1.1.2}/test/test_merge.py +0 -0
- {heavyball-1.1.0 → heavyball-1.1.2}/test/test_no_grad.py +0 -0
- {heavyball-1.1.0 → heavyball-1.1.2}/test/test_psgd.py +0 -0
- {heavyball-1.1.0 → heavyball-1.1.2}/test/test_soap.py +0 -0
- {heavyball-1.1.0 → heavyball-1.1.2}/test/test_stochastic_updates.py +0 -0
heavyball/__init__.py

@@ -10,9 +10,9 @@ class ForeachAdamW(C.BaseOpt):
                  foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
                  mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
                  update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
-        defaults =
-
-
+        defaults = locals()
+        defaults.pop("self")
+        params = defaults.pop("params")
         super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, C.update_by_adam)


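The recurring change across these optimizer constructors replaces a hand-written `defaults = dict(...)` (truncated in this rendering) with a `locals()` capture. A minimal sketch of the pattern outside heavyball — the `Base` class below is hypothetical, standing in for `C.BaseOpt`:

```python
class Base:
    def __init__(self, params, defaults):
        self.params = params
        self.defaults = defaults


class MyOpt(Base):
    def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0):
        # locals() at the top of __init__ is exactly the keyword arguments,
        # plus `self` and `params`, which are popped before use.
        defaults = locals()
        defaults.pop("self")
        params = defaults.pop("params")
        super().__init__(params, defaults)


opt = MyOpt([0, 1, 2], lr=1e-3)
assert opt.defaults == {"lr": 1e-3, "betas": (0.9, 0.99), "eps": 1e-8, "weight_decay": 0}
```

Note that the capture only equals the keyword arguments if it happens before any other local variables are bound, which is why the ForeachPSGDKron hunk further down moves `defaults = locals()` ahead of the `C.default(...)` resolution lines.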
@@ -25,9 +25,9 @@ class ForeachRMSprop(C.BaseOpt):
                  weight_lr_power=2.0, foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False,
                  caution: bool = False, mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
                  update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
-        defaults =
-
-
+        defaults = locals()
+        defaults.pop("self")
+        params = defaults.pop("params")
         super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, C.scale_by_exp_avg_sq)


@@ -36,10 +36,9 @@ class ForeachSFAdamW(C.ScheduleFree):
                  weight_lr_power=2.0, foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False,
                  caution: bool = False, mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
                  update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
-        defaults =
-
-
-                        beta2_scale=beta2_scale)
+        defaults = locals()
+        defaults.pop("self")
+        params = defaults.pop("params")
         super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, C.scale_by_exp_avg_sq,
                          C.update_by_schedule_free)

@@ -53,9 +52,9 @@ class ForeachADOPT(C.BaseOpt):
                  foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
                  mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
                  update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
-        defaults =
-
-
+        defaults = locals()
+        defaults.pop("self")
+        params = defaults.pop("params")
         super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, C.update_by_adopt)


@@ -65,9 +64,9 @@ class ForeachMuon(C.BaseOpt):
                  mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
                  update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8,
                  nesterov: bool = True):
-        defaults =
-
-
+        defaults = locals()
+        defaults.pop("self")
+        params = defaults.pop("params")
         super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm,
                          C.nesterov_momentum if nesterov else C.heavyball_momentum, C.orthogonalize_update)

@@ -77,12 +76,24 @@ class ForeachLaProp(C.BaseOpt):
                  foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
                  mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
                  update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
-        defaults =
-
-
+        defaults = locals()
+        defaults.pop("self")
+        params = defaults.pop("params")
         super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, C.update_by_laprop)


+class MuonLaProp(C.BaseOpt):
+    def __init__(self, params, lr=0.0025, betas=(0.9, 0.99), eps=1e-8, weight_decay=0, warmup_steps=0,
+                 foreach: bool = True, storage_dtype: str = 'float32', mars: bool = False, caution: bool = False,
+                 mars_gamma: float = 0.0025, gradient_clipping: C.str_or_fn = C.use_default,
+                 update_clipping: C.str_or_fn = C.use_default, palm: bool = C.use_default, beta2_scale: float = 0.8):
+        defaults = locals()
+        defaults.pop("self")
+        params = defaults.pop("params")
+        super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, palm, C.scale_by_laprop,
+                         C.orthogonalize_update)
+
+
 class ForeachSOAP(C.BaseOpt):
     """
     ForeachSOAP
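The new `MuonLaProp` class chains `C.scale_by_laprop` with `C.orthogonalize_update`, i.e. a LaProp-style second-moment rescaling followed by orthogonalization of matrix-shaped updates. A hedged usage sketch — the toy model and loop below are illustrative, not from the package:

```python
import torch
from torch import nn

import heavyball

model = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 10))
opt = heavyball.MuonLaProp(model.parameters(), lr=0.0025, betas=(0.9, 0.99), weight_decay=0.01)

for _ in range(10):
    x = torch.randn(32, 64)
    y = torch.randint(0, 10, (32,))
    loss = nn.functional.cross_entropy(model(x), y)
    opt.zero_grad()
    loss.backward()
    opt.step()
```

The constructor keyword arguments used above (`lr`, `betas`, `weight_decay`) are taken directly from the signature added in the hunk.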
@@ -112,12 +123,10 @@ class ForeachSOAP(C.BaseOpt):
                  gradient_clipping: C.str_or_fn = C.use_default, update_clipping: C.str_or_fn = C.use_default):
         use_precond_schedule = C.default(use_precond_schedule, self.use_precond_schedule)

-        defaults =
-
-
-
-                    'caution': caution, 'mars_gamma': mars_gamma, 'palm': palm, 'precond_scheduler': precond_scheduler,
-                    'beta2_scale': beta2_scale}
+        defaults = locals()
+        defaults.pop("self")
+        params = defaults.pop("params")
+
         if use_precond_schedule:
             del defaults['precondition_frequency']
         else:
@@ -161,19 +170,15 @@ class ForeachPSGDKron(C.BaseOpt):
                  gradient_clipping: C.str_or_fn = C.use_default, update_clipping: C.str_or_fn = C.use_default,  #
                  # expert parameters
                  precond_init_scale=1.0, precond_lr=0.1):
+        defaults = locals()
+        defaults.pop("self")
+        params = defaults.pop("params")
+
         delayed = C.default(delayed, self.delayed)
         cached = C.default(cached, self.cached)
         exp_avg_input = C.default(exp_avg_input, self.exp_avg_input)
         update_clipping = C.default(update_clipping, utils.trust_region_clip_)

-        defaults = dict(lr=lr, beta=beta, weight_decay=weight_decay, max_size_triangular=max_size_triangular,
-                        min_ndim_triangular=min_ndim_triangular, memory_save_mode=memory_save_mode,
-                        momentum_into_precond_update=momentum_into_precond_update, precond_lr=precond_lr,
-                        precond_init_scale=precond_init_scale, step=0, warmup_steps=warmup_steps, merge_dims=merge_dims,
-                        split=split, store_triu_as_line=store_triu_as_line, q_dtype=q_dtype,
-                        storage_dtype=storage_dtype, caution=caution, mars_gamma=mars_gamma, mars=mars,
-                        stochastic_schedule=stochastic_schedule)
-
         super().__init__(params, defaults, foreach, gradient_clipping, update_clipping, False,  #
                          *(C.exp_avg,) * exp_avg_input,  #
                          functools.partial(C.scale_by_delayed_psgd if delayed else C.scale_by_psgd, cached=cached,
@@ -215,9 +220,9 @@ CachedPSGDKron = ForeachCachedPSGDKron
 CachedDelayedPSGDKron = ForeachCachedDelayedPSGDKron
 Muon = ForeachMuon

-__all__ = ["Muon","RMSprop", "PrecondSchedulePaLMSOAP", "PSGDKron", "PurePSGD", "DelayedPSGD", "CachedPSGDKron",
+__all__ = ["Muon", "RMSprop", "PrecondSchedulePaLMSOAP", "PSGDKron", "PurePSGD", "DelayedPSGD", "CachedPSGDKron",
            "CachedDelayedPSGDKron", "PalmForEachSoap", "PaLMSOAP", "PaLMSFAdamW", "LaProp", "ADOPT",
-           "PrecondScheduleSOAP", "PrecondSchedulePaLMSOAP", 'RMSprop',
+           "PrecondScheduleSOAP", "PrecondSchedulePaLMSOAP", 'RMSprop', 'MuonLaProp',  #
            "ForeachAdamW", "ForeachSFAdamW", "ForeachLaProp", "ForeachADOPT", "ForeachSOAP", "ForeachPSGDKron",
            "ForeachPurePSGD", "ForeachDelayedPSGD", "ForeachCachedPSGDKron", "ForeachCachedDelayedPSGDKron",
            "ForeachRMSprop", "ForeachMuon"]
heavyball/chainable.py

@@ -140,16 +140,14 @@ class SkipUpdate(ValueError):
 @zero_guard("exp_avg")
 @no_state
 def exp_avg(group, update, grad, param, exp_avg):
-    utils.
-    return exp_avg
+    return utils.scale_by_exp_avg_(exp_avg, update, utils.beta_debias(utils.get_beta1(group), group["step"]))


 @zero_guard("exp_avg_sq")
 @no_state
 def scale_by_exp_avg_sq(group, update, grad, param, exp_avg_sq):
-
-
-    return out
+    return utils.scale_by_exp_avg_sq_(exp_avg_sq, update, utils.beta_debias(utils.get_beta2(group), group["step"]),
+                                      group['eps'])


 @zero_guard("exp_avg", "exp_avg_sq")
@@ -162,22 +160,21 @@ def scale_by_adam(group, update, grad, param, exp_avg, exp_avg_sq):
 @zero_guard("exp_avg", "exp_avg_sq")
 @no_state
 def update_by_adam(group, update, grad, param, exp_avg, exp_avg_sq):
-    utils.fused_adam_(param, exp_avg, exp_avg_sq, update, utils.get_beta1(group), utils.get_beta2(group),
-                      group['lr'], group['eps'], group['weight_decay'], group['caution'])
+    utils.fused_adam_(param, exp_avg, exp_avg_sq, update, grad, utils.get_beta1(group), utils.get_beta2(group),
+                      group['step'], group['lr'], group['eps'], group['weight_decay'], group['caution'])
     raise SkipUpdate


 @zero_guard("exp_avg", "exp_avg_sq")
 @no_state
 def scale_by_laprop(group, update, grad, param, exp_avg, exp_avg_sq):
-    return utils.laprop_(exp_avg, exp_avg_sq, update, utils.get_beta1(group), utils.get_beta2(group), group['step']
-                         group['eps'])
+    return utils.laprop_(exp_avg, exp_avg_sq, update, utils.get_beta1(group), utils.get_beta2(group), group['step'])


 @zero_guard("exp_avg", "exp_avg_sq")
 @no_state
 def update_by_laprop(group, update, grad, param, exp_avg, exp_avg_sq):
-    utils.fused_laprop_(param, exp_avg, exp_avg_sq, update, utils.get_beta1(group), utils.get_beta2(group),
+    utils.fused_laprop_(param, exp_avg, exp_avg_sq, update, grad, utils.get_beta1(group), utils.get_beta2(group),
                         group['step'], group['lr'], group['weight_decay'], group['caution'])
     raise SkipUpdate

@@ -205,7 +202,7 @@ def update_by_adopt(group, update, grad, param, exp_avg, exp_avg_sq):
     utils.exp_avg_sq_(exp_avg_sq, update, utils.beta_debias(utils.get_beta2(group), group['step']), 1)
     raise SkipUpdate

-    utils.fused_adopt_(param, update, exp_avg_sq, exp_avg, utils.get_beta1(group), utils.get_beta2(group),
+    utils.fused_adopt_(param, update, grad, exp_avg_sq, exp_avg, utils.get_beta1(group), utils.get_beta2(group),
                        group['step'] - 2, group['lr'], group['eps'], group['weight_decay'], group['caution'])
     raise SkipUpdate

@@ -264,13 +261,13 @@ def precond_schedule(group, prob: Union[callable, float, None] = None, name: str


 @no_state_no_foreach
-def orthogonalize_update(group, update, grad, param):
+def orthogonalize_update(group, update, grad, param, scale_mode: str = "scale"):  # explore scale_mode="graft"
     if update.dim() == 1:
         return update
     original_shape = update.shape
     # doing it this way, as tmp and update are not guaranteed to share memory address or layout
     tmp = update.flatten(1, -1)
-    utils.inplace_orthogonal_(tmp, utils.zeroth_power_mode, tmp)
+    utils.inplace_orthogonal_(tmp, utils.zeroth_power_mode, tmp, scale_mode)
     return tmp.reshape(original_shape)

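`orthogonalize_update` flattens everything after the first dimension before orthogonalizing, so e.g. a conv kernel of shape `(out, in, kh, kw)` is treated as an `(out, in*kh*kw)` matrix and then restored; the new `scale_mode` argument is forwarded to `inplace_orthogonal_` (see the utils.py hunk further down). A shape-only sketch of that round trip in plain PyTorch, using QR as a stand-in for heavyball's Newton-Schulz step:

```python
import torch

update = torch.randn(8, 4, 3, 3)            # e.g. a conv weight update
tmp = update.flatten(1, -1)                 # -> (8, 36): one row per output channel
assert tmp.shape == (8, 36)

# Stand-in for the orthogonalization step: QR on the flattened matrix.
q = torch.linalg.qr(tmp.T).Q.T              # (8, 36) with orthonormal rows
restored = q.reshape(update.shape)          # back to (8, 4, 3, 3)
assert restored.shape == update.shape
```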
@@ -333,7 +330,7 @@ def _update_psgd_cache(cached, Q_cache, q):

 def _cached_psgd_precond_grad(cached, cache_expr, exprs, update, Q_mat, Q_cache):
     if cached:
-        return utils.precond_grad_cached_(cache_expr, update, *
+        return utils.precond_grad_cached_(cache_expr, update, *Q_cache)
     return utils.psgd_precond_grad(exprs[-1], update, *Q_mat)

@@ -350,9 +347,12 @@ def _fused_cached_psgd_precond_grad(group, grad, param, cached, cache_expr, expr
 @no_state_no_foreach
 def scale_by_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str, cached: bool = False,
                   prob: Optional[callable] = None):
+    old = update
+    update = update.to(memory_format=torch.contiguous_format)
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
     _update_psgd_precond(group, param, update, Q_mat, Q, exprs, prob)
-
+    out = _cached_psgd_precond_grad(cached, cache_expr, exprs, update, Q_mat, Q_cache)
+    return torch.as_strided(out, old.shape, old.stride())


 @general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd)
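`scale_by_psgd` now normalizes the incoming update to contiguous memory format before preconditioning and reinterprets the result with the original strides via `torch.as_strided`, which matters for channels-last tensors (hence the `test_channels_last.py` change at the end of this diff). A minimal demonstration of the two PyTorch calls involved, unrelated to the PSGD math itself:

```python
import torch

old = torch.randn(2, 3, 4, 4).to(memory_format=torch.channels_last)
update = old.to(memory_format=torch.contiguous_format)      # dense NCHW copy, strides (48, 16, 4, 1)

out = update * 2.0                                           # stand-in for the preconditioned update
restored = torch.as_strided(out, old.shape, old.stride())    # reinterpret out's storage with the original strides

assert old.stride() == (48, 1, 12, 3)                        # channels-last strides
assert restored.stride() == old.stride()
```

`as_strided` does not move data; it only changes how the existing storage is indexed, so the returned tensor matches the layout of the original parameter.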
@@ -360,7 +360,7 @@ def scale_by_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str
 def scale_by_delayed_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str, cached: bool = False,
                           prob: Optional[callable] = None):
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
-    precond = _cached_psgd_precond_grad(
+    precond = _cached_psgd_precond_grad(cached, cache_expr, exprs, update, Q_mat, Q_cache)
     _update_psgd_precond(group, param, update, Q_mat, Q, exprs, prob)
     return precond

@@ -400,7 +400,7 @@ def apply_to_idx(fn, idx):


 def chain(state: Union[callable, dict], group, grad, param, *fns):
-    update = [torch.clone(g) for g in grad]
+    update = [torch.clone(g, memory_format=torch.preserve_format) for g in grad]
     skip_update = False
     for fn in fns:
         try:
@@ -417,7 +417,6 @@ def chain(state: Union[callable, dict], group, grad, param, *fns):
 class ChainOpt(utils.StatefulOptimizer):
     def __init__(self, params, defaults, foreach: bool, *fns):
         super().__init__(params, defaults, foreach)
-
         self.fns = tuple(fns)

     def _step(self, group):
@@ -472,9 +471,10 @@ class BaseOpt(ChainOpt):
     update_clipping: str_or_fn = None
     palm: bool = False
     auto_fuse: bool = True
+    compile_step: bool = False

     def __init__(self, params, defaults, foreach: bool, gradient_clipping: str_or_fn, update_clipping: str_or_fn,
-                 palm: bool =
+                 palm: bool = use_default, *fns):
         if default(update_clipping, self.update_clipping) is None:
             if fns and self.auto_fuse:
                 args, kwargs = None, None
@@ -489,6 +489,7 @@ class BaseOpt(ChainOpt):
         else:
             if any(fn in (update_by_adopt, update_by_adam, update_by_laprop, update_by_schedule_free) for fn in fns):
                 raise ValueError("`update_by` functions do not support update clipping. Use `scale_by`")
+
         fns = tuple(fns)

         if default(palm, self.palm):
@@ -504,9 +505,9 @@ class BaseOpt(ChainOpt):
 class ScheduleFree(BaseOpt):
     def eval(self):
         for group in self.param_groups:
-            train_mode = group
+            group['train_mode'] = train_mode = not group.get('train_mode')
             beta1 = utils.get_beta1(group)
-            if beta1 > 0 and train_mode:
+            if beta1 > 0 and not train_mode:
                 for p in group['params']:
                     state = self.state_(p)
                     if 'z' in state:
@@ -515,13 +516,12 @@ class ScheduleFree(BaseOpt):
                         p32 = utils.promote(p.data)
                         p32.lerp_(end=z, weight=1 - 1 / beta1)
                         utils.copy_stochastic_(p.data, p32)
-            group['train_mode'] = False

     def train(self):
         for group in self.param_groups:
-            train_mode = group
+            group['train_mode'] = train_mode = not group.get('train_mode')
             beta1 = utils.get_beta1(group)
-            if beta1 > 0 and
+            if beta1 > 0 and train_mode:
                 for p in group['params']:
                     state = self.state_(p)
                     if 'z' in state:
@@ -529,4 +529,3 @@ class ScheduleFree(BaseOpt):
                         p32 = utils.promote(p.data)
                         p32.lerp_(end=z, weight=1 - beta1)
                         utils.copy_stochastic_(p.data, p32)
-            group['train_mode'] = True
heavyball/utils.py

@@ -163,7 +163,7 @@ def beta_debias(beta, step):
 def _compilable_exp_avg_sq_(state: List[Tensor], grad: List[Tensor], beta2: Tensor, eps: Tensor,
                             out: List[Optional[Tensor]]):
     s32, g32 = [list(map(promote, x)) for x in (state, grad)]
-    torch.
+    s32 = torch._foreach_mul(s32, beta2)
     [s.addcmul_(g, g, value=1 - beta2) for s, g in zip(s32, g32)]
     denom = torch._foreach_sqrt(s32)
     [d.clamp_(min=eps) for d in denom]
@@ -185,11 +185,11 @@ def exp_avg_sq_(state, grad, beta2, eps, out=None):
 @decorator_knowngood
 def _compilable_scale_by_exp_avg_sq_(state: List[Tensor], grad: List[Tensor], beta2: Tensor, eps: Tensor):
     s32, g32 = [list(map(promote, x)) for x in (state, grad)]
-    torch.
+    s32 = torch._foreach_mul(s32, beta2)
     [s.addcmul_(g, g, value=1 - beta2) for s, g in zip(s32, g32)]
     denom = torch._foreach_sqrt(s32)
     [d.clamp_(min=eps) for d in denom]
-    out = torch.
+    out = torch._foreach_div(g32, denom)
     copy_stochastic_list_(state, s32)
     copy_stochastic_list_(grad, out)

@@ -201,6 +201,21 @@ def scale_by_exp_avg_sq_(exp_avg_sq, grad, beta2, eps):
     return grad


+@decorator_knowngood
+def _compilable_exp_avg_(state, grad, beta):
+    s32, g32 = [list(map(promote, x)) for x in (state, grad)]
+    s32 = [s.lerp(g, beta) for s, g in zip(s32, g32)]
+    copy_stochastic_list_(state, s32)
+    copy_stochastic_list_(grad, s32)
+
+
+def scale_by_exp_avg_(state, grad, beta):
+    state, grad = list_guard(state, grad)
+    beta = scalar_guard(beta, state[0])
+    _compilable_exp_avg_(state, grad, beta)
+    return grad
+
+
 @decorator_knowngood
 def _compilable_agc_(parameters: List[Tensor], gradients: List[Tensor], clip_val: float, minimum: float, eps: float):
     p_norm = torch._foreach_norm(parameters)
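Several hunks in this diff maintain exponential moving averages through `Tensor.lerp`. Since `a.lerp(b, w)` computes `a + w * (b - a)`, a lerp with weight `1 - beta` is the familiar `beta * state + (1 - beta) * grad` EMA (this is exactly how the `_lerp32` helper added later in this file uses it). A quick equivalence check in plain PyTorch:

```python
import torch

state = torch.randn(4)
grad = torch.randn(4)
beta = 0.9

classic = beta * state + (1 - beta) * grad
via_lerp = state.lerp(grad, 1 - beta)        # state + (1 - beta) * (grad - state)

assert torch.allclose(classic, via_lerp)
```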
@@ -210,7 +225,7 @@ def _compilable_agc_(parameters: List[Tensor], gradients: List[Tensor], clip_val
     torch._foreach_div_(p_norm, g_norm)
     torch._foreach_mul_(p_norm, clip_val)
     torch._foreach_minimum_(p_norm, 1)
-
+    torch._foreach_mul_(gradients, p_norm)


 def adaptive_gradient_clipping_(parameters: List[Tensor], gradients: List[Tensor], clip_val: float,
@@ -219,7 +234,8 @@ def adaptive_gradient_clipping_(parameters: List[Tensor], gradients: List[Tensor
         return gradients
     parameters, gradients = list_guard(parameters, gradients)
     clip_val = scalar_guard(clip_val, parameters[0])
-
+    _compilable_agc_(parameters, gradients, clip_val, minimum, eps)
+    return gradients


 def is_compiling():
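The two AGC hunks restore dropped lines: `_compilable_agc_` now actually rescales the gradients, and `adaptive_gradient_clipping_` runs the compiled kernel and returns the result. The effective rule is the usual adaptive factor `min(1, clip_val * ||p|| / ||g||)` per tensor. A single-tensor reference version of that formula (a sketch that ignores the `minimum` floor on the parameter norm, not heavyball's foreach implementation):

```python
import torch


def agc_reference(param: torch.Tensor, grad: torch.Tensor, clip_val: float, eps: float = 1e-8) -> torch.Tensor:
    p_norm = param.norm()
    g_norm = grad.norm().clamp(min=eps)
    factor = (clip_val * p_norm / g_norm).clamp(max=1.0)   # min(1, clip_val * ||p|| / ||g||)
    return grad * factor


p = torch.randn(10)
g = 100 * torch.randn(10)                                  # an unusually large gradient
clipped = agc_reference(p, g, clip_val=0.01)
assert clipped.norm() <= 0.01 * p.norm() + 1e-6
```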
@@ -289,7 +305,7 @@ def ortho(x):
 @decorator_knowngood
 def _compilable_heavyball_momentum_(state, grad, beta):
     s32, g32 = [list(map(promote, x)) for x in (state, grad)]
-    torch.
+    s32 = torch._foreach_mul(s32, beta)
     torch._foreach_add_(s32, g32)
     copy_stochastic_list_(state, s32)
     copy_stochastic_list_(grad, s32)
@@ -298,7 +314,7 @@ def _compilable_heavyball_momentum_(state, grad, beta):
 @decorator_knowngood
 def _compilable_nesterov_momentum_(state, grad, beta):
     s32, g32 = [list(map(promote, x)) for x in (state, grad)]
-    torch.
+    s32 = torch._foreach_mul(s32, beta)
     torch._foreach_add_(s32, g32)
     [g.add_(s, alpha=beta) for g, s in zip(g32, s32)]
     copy_stochastic_list_(state, s32)
@@ -319,17 +335,27 @@ def nesterov_momentum(state, grad, beta):
     return grad


+# mode in ("newtonschulz", "qr", "svd")
+# scale_mode in ("none", "scale", "graft")
 @decorator_knowngood
-def inplace_orthogonal_(x, mode, out):
-    if mode == '
-        y =
+def inplace_orthogonal_(x: Tensor, mode: str, out: Tensor, scale_mode: str):
+    if mode == 'newtonschulz' or x.shape[0] != x.shape[1]:
+        y = zeropower_via_newtonschulz5(x, 5)
+    elif mode == 'qr':
+        y = torch.linalg.qr(promote(x)).Q
     elif mode == 'svd':
-        u, s, v = torch.linalg.svd(x)
+        u, s, v = torch.linalg.svd(promote(x))
         y = u @ v.T
-    elif mode == 'newtonschulz':
-        y = zeropower_via_newtonschulz5(x, 5)
     else:
         raise NotImplementedError(f"Unknown zeroth_power_mode: {mode}")
+    if scale_mode == "none":
+        pass
+    elif scale_mode == "scale":
+        y *= max(1, x.size(0) / x.size(1)) ** 0.5
+    elif scale_mode == "graft":
+        y *= x.norm() / y.norm().clamp_(min=1e-6)
+    else:
+        raise NotImplementedError(f"Unknown scale_mode: {scale_mode}")
     set_(out, y)

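`inplace_orthogonal_` gains an explicit `scale_mode` on top of the orthogonalization `mode`: `"none"` leaves the orthogonal factor as is, `"scale"` multiplies by a shape-dependent factor `max(1, rows/cols) ** 0.5`, and `"graft"` rescales to the input's norm. A toy comparison of the three factors, using QR as the orthogonalizer (a sketch; heavyball's default path is Newton-Schulz):

```python
import torch

x = torch.randn(16, 4) * 3.0
y = torch.linalg.qr(x).Q                              # orthonormal columns, shape (16, 4)

y_none = y                                            # Frobenius norm sqrt(4) = 2
y_scale = y * max(1, x.size(0) / x.size(1)) ** 0.5    # scaled by sqrt(16 / 4) = 2
y_graft = y * (x.norm() / y.norm().clamp(min=1e-6))   # keeps the original update magnitude

print(y_none.norm(), y_scale.norm(), y_graft.norm(), x.norm())
```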
@@ -363,7 +389,7 @@ def get_orthogonal_matrix_QR(GG, Q, exp_avg_sq):
         est_eig = torch.einsum('ij,ij->j', o, tmp)
         sort_idx = torch.argsort(est_eig, descending=True)
         indices.append(sort_idx)
-        inplace_orthogonal_(tmp[:, sort_idx], q)
+        inplace_orthogonal_(tmp[:, sort_idx], zeroth_power_mode, q, "none")

     indices = tuple(slice(None) if ind is None else ind.view(*(1,) * i, -1, *(1,) * (exp_avg_sq.dim() - i - 1))  #
                     for i, ind in enumerate(indices))
@@ -422,8 +448,7 @@ def _compilable_stochastic_lerp_(x: List[Tensor], y: List[Tensor], a: Union[floa
     for x_, y_ in zip(x, y):
         x32 = promote(x_)
         y32 = promote(y_)
-        x32.
-        copy_stochastic_(x_, x32)
+        copy_stochastic_(x_, x32.lerp(y32, a))


 def get_beta1(group):
@@ -484,7 +509,7 @@ def _compilable_stochastic_add_(x: List[Tensor], y: List[Tensor], alpha: Union[f
     for x_, y_ in zip(x, y):
         x32 = promote(x_)
         y32 = promote(y_)
-        x32.add_(y32, alpha=alpha)
+        x32.add_(y32, alpha=alpha)  # can't use out-of-place here; torch.compile doesn't handle data-dependent inputs
         copy_stochastic_(x_, x32)

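Both stochastic hunks keep the same structure: promote the low-precision tensor to fp32, do the arithmetic there, then write back with `copy_stochastic_`. Stochastic rounding itself is commonly implemented by adding random bits below the bf16 mantissa before truncating; the sketch below illustrates that general technique and is an assumption about the idea, not a copy of heavyball's `copy_stochastic_`:

```python
import torch


def stochastic_round_to_bf16(x32: torch.Tensor) -> torch.Tensor:
    # Add uniform noise to the 16 bits that bf16 discards, then truncate.
    bits = x32.view(torch.int32)
    noise = torch.randint(0, 1 << 16, x32.shape, dtype=torch.int32, device=x32.device)
    rounded = (bits + noise) & -65536            # zero out the lower 16 bits
    return rounded.view(torch.float32).to(torch.bfloat16)


x = torch.full((100_000,), 1.0 + 2 ** -10)       # value not representable in bf16
print(stochastic_round_to_bf16(x).float().mean())  # close to 1 + 2**-10 on average, unlike plain truncation
```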
@@ -506,7 +531,7 @@ def compute_ggt(grad, GG, max_precond_dim, precondition_1d, beta):
         g0 = einsum_base[:grad.dim()]
         g1 = g0.replace(b, b.upper())
         outer_product = torch.einsum(f'{g0},{g1}->{b + b.upper()}', grad, grad)
-        GG[idx].lerp_(
+        GG[idx].lerp_(outer_product, 1 - beta)


 def promote(x):
@@ -571,7 +596,8 @@ def project(grad, Q, back: bool):
     preconditioners = ",".join([(g + g.upper())[::-1 if back else 1] for m, g in zip(Q, param) if len(m) > 0])
     if preconditioners:
         out = ''.join([c.upper() if c.upper() in preconditioners else c for c in param])
-
+        out = torch.einsum(f'{param},{preconditioners}->{out}', promote(grad), *[q for q in Q if len(q) > 0])
+        grad = out.to(grad.dtype)
     return grad

@@ -724,20 +750,26 @@ def copy_stochastic_list_(target: List[Tensor], source: List[Tensor]):
         copy_stochastic_(t, s)


+def _lerp32(state: List[Tensor], grad: List[Tensor], beta):
+    ea32 = list(map(promote, state))
+    grad = list(map(promote, grad))
+
+    ea32 = [e.lerp(g, 1 - beta) for e, g in zip(ea32, grad)]
+    copy_stochastic_list_(state, ea32)
+    return ea32
+
+
 @decorator_knowngood
 def _compilable_adam_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: List[Tensor], beta1: Tensor, beta2: Tensor,
                       step: Tensor):
     beta1 = beta_debias(beta1, step)
     beta2 = beta_debias(beta2, step)

-    g32
+    g32 = list(map(promote, grad))

-
-    denom = exp_avg_sq_(
+    exp_avg32 = _lerp32(exp_avg, g32, beta1)
+    denom = exp_avg_sq_(exp_avg_sq, g32, beta2, 1e-8)
     u32 = torch._foreach_div(exp_avg32, denom)
-
-    copy_stochastic_list_(exp_avg, exp_avg32)
-    copy_stochastic_list_(exp_avg_sq, exp_avg_sq32)
     copy_stochastic_list_(grad, u32)

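`_compilable_adam_` now routes the first moment through the shared `_lerp32` helper and takes the second-moment denominator from `exp_avg_sq_`, with both betas already passed through `beta_debias`. For reference, the per-tensor arithmetic the foreach code computes is the standard Adam direction. A minimal single-tensor version under that reading — a sketch, assuming `beta_debias(beta, step)` folds the bias correction into the decay rate as `1 - (1 - beta) / (1 - beta ** step)`:

```python
import torch


def adam_direction(exp_avg, exp_avg_sq, grad, beta1, beta2, step, eps=1e-8):
    # step counts from 1; debiased decay rates as beta_debias is assumed to produce.
    b1 = 1 - (1 - beta1) / (1 - beta1 ** step)
    b2 = 1 - (1 - beta2) / (1 - beta2 ** step)

    exp_avg.lerp_(grad, 1 - b1)                              # first-moment EMA
    exp_avg_sq.mul_(b2).addcmul_(grad, grad, value=1 - b2)   # second-moment EMA
    denom = exp_avg_sq.sqrt().clamp(min=eps)
    return exp_avg / denom                                   # update direction; lr and decay applied elsewhere
```

The fused variants below additionally thread the raw `grad` (for the cautioning mask) and `step`/`eps` through the signature, which is the visible API change in this release.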
@@ -749,28 +781,26 @@ def adam_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: List[Tensor], b


 @decorator_knowngood
-def _fused_compilable_adam_(y: List[Tensor], exp_avg: List[Tensor], exp_avg_sq: List[Tensor],
-                            beta1: Tensor, beta2: Tensor, step: Tensor, decay: Tensor, lr: Tensor,
-                            caution: bool):
+def _fused_compilable_adam_(y: List[Tensor], exp_avg: List[Tensor], exp_avg_sq: List[Tensor], update: List[Tensor],
+                            grad: List[Tensor], beta1: Tensor, beta2: Tensor, step: Tensor, decay: Tensor, lr: Tensor,
+                            eps: Tensor, caution: bool):
     beta1 = beta_debias(beta1, step)
     beta2 = beta_debias(beta2, step)

-
+    u32, g32 = [list(map(promote, x)) for x in [update, grad]]

-
-    denom = exp_avg_sq_(
+    exp_avg32 = _lerp32(exp_avg, u32, beta1)
+    denom = exp_avg_sq_(exp_avg_sq, u32, beta2, 1e-8)
     u32 = torch._foreach_div(exp_avg32, denom)
-
-    copy_stochastic_list_(exp_avg, exp_avg32)
-    copy_stochastic_list_(exp_avg_sq, exp_avg_sq32)
-    _compilable_update_(y, u32, decay, lambda a, b, c: a.add_(b, alpha=c), lr, caution, g32)
+    _compilable_update_(y, u32, decay, stochastic_add_, lr, caution, g32)


-def fused_adam_(y: List[Tensor], exp_avg: List[Tensor], exp_avg_sq: List[Tensor],
-                beta2: float, step: int, lr: float, eps: float, decay: float,
+def fused_adam_(y: List[Tensor], exp_avg: List[Tensor], exp_avg_sq: List[Tensor], update: List[Tensor],
+                grad: List[Tensor], beta1: float, beta2: float, step: int, lr: float, eps: float, decay: float,
+                caution: bool):
     y, exp_avg, exp_avg_sq, grad = list_guard(y, exp_avg, exp_avg_sq, grad)
     beta1, beta2, step, lr = scalar_guard(beta1, beta2, step, lr, y[0])
-    return _fused_compilable_adam_(y, exp_avg, exp_avg_sq, grad, beta1, beta2, step, decay, lr, eps, caution)
+    return _fused_compilable_adam_(y, exp_avg, exp_avg_sq, update, grad, beta1, beta2, step, decay, lr, eps, caution)


 @decorator_knowngood
@@ -779,14 +809,13 @@ def _compilable_laprop_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: L
     beta1 = beta_debias(beta1, step)
     beta2 = beta_debias(beta2, step)

-    gp32
+    gp32 = list(map(promote, grad))

-    denom = exp_avg_sq_(
+    denom = exp_avg_sq_(exp_avg_sq, gp32, beta2, 1e-8)
     gp32 = torch._foreach_div(gp32, denom)
-
+    gp32 = _lerp32(exp_avg, gp32, beta1)

-    copy_stochastic_list_(
-    copy_stochastic_list_(grad, exp_avg)
+    copy_stochastic_list_(grad, gp32)


 def laprop_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: List[Tensor], beta1: float, beta2: float, step: int):
@@ -797,52 +826,50 @@ def laprop_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: List[Tensor],


 @decorator_knowngood
-def _fused_compilable_laprop_(y: List[Tensor], exp_avg: List[Tensor], exp_avg_sq: List[Tensor],
-
-
+def _fused_compilable_laprop_(y: List[Tensor], exp_avg: List[Tensor], exp_avg_sq: List[Tensor], update: List[Tensor],
+                              grad: List[Tensor], beta1: Tensor, beta2: Tensor, step: Tensor, lr: Tensor, decay: Tensor,
+                              caution: bool):
     beta1 = beta_debias(beta1, step)
     beta2 = beta_debias(beta2, step)

-
+    u32, gp32 = [list(map(promote, x)) for x in [update, grad]]

-    denom = exp_avg_sq_(
-
-
-
-
-    copy_stochastic_list_(exp_avg_sq, exp_avg_sq32)
+    denom = exp_avg_sq_(exp_avg_sq, u32, beta2, 1e-8)
+    u32 = torch._foreach_div(u32, denom)
+    u32 = _lerp32(exp_avg, u32, beta1)
+    _compilable_update_(y, u32, decay, stochastic_add_, lr, caution, gp32)


-def fused_laprop_(y: List[Tensor], exp_avg: List[Tensor], exp_avg_sq: List[Tensor],
-                  beta2: float, step: int, lr: float, decay: float, caution: bool):
+def fused_laprop_(y: List[Tensor], exp_avg: List[Tensor], exp_avg_sq: List[Tensor], update: List[Tensor],
+                  grad: List[Tensor], beta1: float, beta2: float, step: int, lr: float, decay: float, caution: bool):
     exp_avg, exp_avg_sq, grad, y = list_guard(exp_avg, exp_avg_sq, grad, y)
     beta1, beta2, step, lr = scalar_guard(beta1, beta2, step, lr, exp_avg[0])
-    _fused_compilable_laprop_(y, exp_avg, exp_avg_sq, grad, beta1, beta2, step, lr, decay, caution)
+    _fused_compilable_laprop_(y, exp_avg, exp_avg_sq, update, grad, beta1, beta2, step, lr, decay, caution)


 @decorator_knowngood
-def _fused_compilable_adopt_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution):
-
-
+def _fused_compilable_adopt_(y, update, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution):
+    u32, g32, exp_avg_sq32, exp_avg32 = [list(map(promote, x)) for x in [update, grad, exp_avg_sq, exp_avg]]
+    _compilable_update_(y, u32, decay, stochastic_add_, lr, caution, g32)

     beta1 = beta_debias(beta1, step)
     denom = torch._foreach_sqrt(exp_avg_sq32)
     [denom.clamp_(min=eps) for denom in denom]
-    torch.
-    [ea32.addcdiv_(g, d, value=1 - beta1) for ea32, g, d in zip(exp_avg32,
+    exp_avg32 = torch._foreach_mul(exp_avg32, beta1)
+    [ea32.addcdiv_(g, d, value=1 - beta1) for ea32, g, d in zip(exp_avg32, u32, denom)]
+    copy_stochastic_list_(exp_avg, exp_avg32)

     beta2 = beta_debias(beta2, step + 1)
-    torch.
-    [eas32.addcmul_(g, g, value=1 - beta2) for eas32, g in zip(exp_avg_sq32,
-
-    copy_stochastic_list_(exp_avg, exp_avg32)
+    exp_avg_sq32 = torch._foreach_mul(exp_avg_sq32, beta2)
+    [eas32.addcmul_(g, g, value=1 - beta2) for eas32, g in zip(exp_avg_sq32, u32)]
     copy_stochastic_list_(exp_avg_sq, exp_avg_sq32)


-
+
+def fused_adopt_(y, update, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution):
     exp_avg, exp_avg_sq, grad, y = list_guard(exp_avg, exp_avg_sq, grad, y)
     beta1, beta2, step, lr = scalar_guard(beta1, beta2, step, lr, exp_avg[0])
-    _fused_compilable_adopt_(y, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution)
+    _fused_compilable_adopt_(y, update, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution)


 @decorator_knowngood
@@ -853,21 +880,21 @@ def _compilable_adopt_(grad, exp_avg_sq, exp_avg, beta1, beta2, step):
     beta1 = beta_debias(beta1, step)
     denom = torch._foreach_sqrt(exp_avg_sq32)
     [denom.clamp_(min=1e-8) for denom in denom]
-    torch.
+    exp_avg32 = torch._foreach_mul(exp_avg32, beta1)
     [ea32.addcdiv_(g, d, value=1 - beta1) for ea32, g, d in zip(exp_avg32, g32, denom)]
+    copy_stochastic_list_(exp_avg, exp_avg32)

     beta2 = beta_debias(beta2, step + 1)
-    torch.
+    exp_avg_sq32 = torch._foreach_mul(exp_avg_sq32, beta2)
     [eas32.addcmul_(g, g, value=1 - beta2) for eas32, g in zip(exp_avg_sq32, g32)]
-
-    copy_stochastic_list_(exp_avg, exp_avg32)
     copy_stochastic_list_(exp_avg_sq, exp_avg_sq32)
+
     copy_stochastic_list_(grad, update)


 def adopt(grad, exp_avg_sq, exp_avg, beta1, beta2, step):
-    exp_avg, exp_avg_sq, grad = list_guard(exp_avg, exp_avg_sq, grad
-    beta1, beta2, step = scalar_guard(beta1, beta2, step,
+    exp_avg, exp_avg_sq, grad = list_guard(exp_avg, exp_avg_sq, grad)
+    beta1, beta2, step = scalar_guard(beta1, beta2, step, exp_avg[0])
     _compilable_adopt_(grad, exp_avg_sq, exp_avg, beta1, beta2, step)
     return grad

@@ -912,7 +939,7 @@ def _compilable_update_(p: List[Tensor], u: List[Tensor], decay: Tensor, add_fn:

     for p32_, u32_, g_ in zip(p32, u32, g):  # lr is data-dependent -> can't compile a foreach
         if caution:
-
+            u32_ = _compilable_cautioning(promote(g_), u32_)
         add_fn(p32_, u32_, lr)

     copy_stochastic_list_(p, p32)
@@ -1228,7 +1255,7 @@ def psgd_should_update(group, prob: Union[float, callable], rng: Optional[random
     prob = prob(group[f'{name}_prob_step'])
     if group['stochastic_schedule']:
         return rng.random() < prob
-    cumulative_prob =
+    cumulative_prob = group.get(name, 0)
    group[name] = cumulative_prob + prob
     return int(group[name]) > int(cumulative_prob)

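The `psgd_should_update` fix initializes the cumulative probability from the group (`group.get(name, 0)`) instead of leaving it undefined. With the non-stochastic schedule, an update fires exactly when the integer part of the accumulated probability increases, so `prob=0.25` triggers one preconditioner update every fourth call. A self-contained re-implementation of that accumulator for illustration:

```python
def should_update(group: dict, prob: float, name: str = "cumulative_prob") -> bool:
    cumulative_prob = group.get(name, 0)
    group[name] = cumulative_prob + prob
    return int(group[name]) > int(cumulative_prob)


group = {}
fired = [should_update(group, 0.25) for _ in range(8)]
print(fired)  # [False, False, False, True, False, False, False, True]
```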
@@ -1289,15 +1316,16 @@ def mars_correction(g, old_g, beta1, gamma):


 @decorator_knowngood
-def
-    mask = (
-    update.
-    scale = mask.numel() / mask.sum().clamp(min=1)
+def _compilable_cautioning(g: Tensor, update: Tensor):
+    mask = g.signbit() ^ update.signbit()  # "Mask if they point in different directions"
+    update = update.masked_fill(mask, 0)
+    scale = mask.numel() / (mask.numel() - mask.sum()).clamp(min=1)
     update.mul_(scale)
+    return update


 def caution(g, update):
-
+    return _compilable_cautioning(g, update)


 def precond_update_prob_schedule(max_prob=1.0, min_prob=0.03, decay=0.001, flat_start=500):
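The rebuilt `_compilable_cautioning` zeroes update entries whose sign disagrees with the gradient and rescales the survivors by `numel / kept` so the overall magnitude is preserved. A dense-tensor illustration that mirrors the lines above in plain PyTorch rather than calling heavyball:

```python
import torch

g = torch.tensor([1.0, -2.0, 3.0, -4.0])
update = torch.tensor([0.5, 0.5, -0.5, -0.5])

mask = g.signbit() ^ update.signbit()             # True where the signs differ
cautioned = update.masked_fill(mask, 0)
scale = mask.numel() / (mask.numel() - mask.sum()).clamp(min=1)
cautioned = cautioned * scale

print(cautioned)                                  # tensor([1., 0., 0., -1.])
```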
test/test_channels_last.py

@@ -11,13 +11,14 @@ from heavyball.utils import clean, set_torch
 from torch import nn
 from torch._dynamo import config

+heavyball.utils.zeroth_power_mode = 'newtonschulz'
 heavyball.utils.compile_mode = 'default'
 config.cache_size_limit = 128


 @pytest.mark.parametrize("opt", heavyball.__all__)
 @pytest.mark.parametrize("size,depth", [(128, 1)])
-def test_foreach(opt, size, depth: int, iterations: int =
+def test_foreach(opt, size, depth: int, iterations: int = 1024, outer_iterations: int = 1):
     set_torch()
     opt = getattr(heavyball, opt)
