heavyball-1.6.0-py3-none-any.whl → heavyball-1.6.2-py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- heavyball/utils.py +57 -55
- {heavyball-1.6.0.dist-info → heavyball-1.6.2.dist-info}/METADATA +19 -19
- heavyball-1.6.2.dist-info/RECORD +8 -0
- {heavyball-1.6.0.dist-info → heavyball-1.6.2.dist-info}/WHEEL +1 -1
- heavyball-1.6.0.dist-info/RECORD +0 -8
- {heavyball-1.6.0.dist-info → heavyball-1.6.2.dist-info}/LICENSE +0 -0
- {heavyball-1.6.0.dist-info → heavyball-1.6.2.dist-info}/top_level.txt +0 -0
heavyball/utils.py
CHANGED
@@ -1,18 +1,12 @@
-import copy
 import functools
 import gc
-import inspect
 import math
 import random
 import string
-import sys
-import time
 import warnings
-from datetime import datetime
 from typing import List, Optional, Tuple, Callable, Union
 from unittest.mock import patch
 
-import hyperopt
 import numpy as np
 import torch
 from torch import Tensor
@@ -165,14 +159,17 @@ def beta_debias(beta, step):
     return 1 - (1 - beta) / (1 - beta ** step)
 
 
+def eps_sqrt(item, eps):
+    return item.sqrt().clamp(min=eps)
+
+
 @decorator_knowngood
 def _compilable_exp_avg_sq_(state: List[Tensor], grad: List[Tensor], beta2: Tensor, eps: Tensor,
                             out: List[Optional[Tensor]]):
-
-    s32 =
-    copy_stochastic_list_(state, s32)
+    g32 = promote(grad)
+    s32 = _lerp(state, torch._foreach_mul(g32, g32), beta2)
 
-    denom = [d
+    denom = [eps_sqrt(d, eps) for d in s32]
 
     if out[0] is None:
         return denom
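Note: the new eps_sqrt helper clamps the denominator after taking the square root rather than adding eps under or after it, so an exactly-zero second moment yields eps instead of sqrt(eps). A standalone sketch in plain torch (outside heavyball's compiled path) to illustrate the difference:

import torch

def eps_sqrt(item, eps):
    # as added in this diff: sqrt first, then clamp from below
    return item.sqrt().clamp(min=eps)

v = torch.tensor([0.0, 1e-20, 4.0])
print(eps_sqrt(v, 1e-8))  # tensor([1.0000e-08, 1.0000e-08, 2.0000e+00])
print((v + 1e-8).sqrt())  # add-then-sqrt instead gives ~1e-4 for the zero entry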
@@ -189,7 +186,7 @@ def exp_avg_sq_(state, grad, beta2, eps, out=None):
 
 @decorator_knowngood
 def _compilable_scale_by_exp_avg_sq_(state: List[Tensor], grad: List[Tensor], beta2: Tensor, eps: Tensor):
-    g32 =
+    g32 = promote(grad)
     denom = _compilable_exp_avg_sq_(state, g32, beta2, eps, [None])
     out = torch._foreach_div(g32, denom)
     copy_stochastic_list_(grad, out)
@@ -265,8 +262,8 @@ def set_torch(benchmark_limit: int = 32):
     cudnn.benchmark_limit = benchmark_limit
     torch.use_deterministic_algorithms(False)
     torch.set_float32_matmul_precision("high") # highest: FP32, high: TF32, medium: bf16
-    opt_einsum.enabled =
-    opt_einsum.strategy = "
+    opt_einsum.enabled = False
+    opt_einsum.strategy = "auto"
 
     # Torch calls these for 2nd-order optimization in HeavyBall, but they are explicitly handled.
     _ignore_warning(
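Note: opt_einsum here presumably refers to PyTorch's torch.backends.opt_einsum module (assumed from the attribute names; the import lies outside this hunk). A sketch of what the two knobs control:

from torch.backends import opt_einsum  # assumed import; matches the names above

# With opt_einsum disabled, torch.einsum contracts operands left to right
# instead of searching for a cheaper contraction order; "auto" is the default
# planning strategy when it is enabled ("greedy" and "optimal" are the others).
opt_einsum.enabled = False
opt_einsum.strategy = "auto"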
@@ -379,7 +376,7 @@ def _compilable_scatter_set(target, source, index):
     target[:] = source.contiguous()[index].reshape_as(target)
 
 
-@decorator_knowngood
+# @decorator_knowngood
 def get_orthogonal_matrix_QR(GG: List[Tensor], Q: List[Tensor], exp_avg: Optional[Tensor] = None):
     """
     Computes the eigenbases of the preconditioner using one round of power iteration
@@ -398,7 +395,8 @@ def get_orthogonal_matrix_QR(GG: List[Tensor], Q: List[Tensor], exp_avg: Optional[Tensor] = None):
     new_qs = []
 
     for m, q in zip(GG, Q):
-        if
+        if m is None:
+            new_qs.append(None)
             continue
 
         m = promote(m.data)
@@ -420,52 +418,60 @@ def get_orthogonal_matrix_QR(GG: List[Tensor], Q: List[Tensor], exp_avg: Optional[Tensor] = None):
     in_str = einsum_base[:exp_avg.dim()]
     out_str = einsum_base[exp_avg.dim():2 * exp_avg.dim()]
 
-    from_shampoo = ",".join([o + i for m, i, o in zip(Q, in_str, in_str.upper()) if
+    from_shampoo = ",".join([o + i for m, i, o in zip(Q, in_str, in_str.upper()) if m is not None])
     if not from_shampoo:
         return
 
-    to_shampoo = ','.join([i + o for m, i, o in zip(new_qs, in_str.upper(), out_str) if
+    to_shampoo = ','.join([i + o for m, i, o in zip(new_qs, in_str.upper(), out_str) if m is not None])
     out_str = ''.join([o if o in to_shampoo else i for i, o in zip(in_str, out_str)])
 
     subscripts = f'{in_str},{from_shampoo},{to_shampoo}->{out_str}'
-    exp_avg_new = torch.einsum(subscripts, exp_avg, *[q for q in Q
+    exp_avg_new = torch.einsum(subscripts, exp_avg, *[q for q in Q if q is not None],
+                               *[q for q in new_qs if q is not None])
     copy_stochastic_(exp_avg, exp_avg_new)
 
     for q, q_new in zip(Q, new_qs):
-
+        if q is not None:
+            copy_stochastic_(q, q_new)
 
 
-def get_orthogonal_matrix(mat):
+def get_orthogonal_matrix(mat, max_eps: float = 1e-3, min_eps: float = 1e-30):
     """
     Computes the eigenbases of the preconditioner using torch.linalg.eigh decomposition.
     """
 
     final = []
     for m in mat:
-        if
-        final.append(
+        if m is None:
+            final.append(None)
             continue
 
         m = promote(m.data)
 
         device, dtype = m.device, m.dtype
-
-
-        m = m.to(modifier)
+        eps = min_eps
+        while True:
             try:
-
+                eye = torch.eye(m.shape[0], device=m.device, dtype=m.dtype)
+                eigval, eigvec = torch.linalg.eigh(m + eps * eye)
                 eigvec = eigvec.to(device=device, dtype=dtype)
                 break
             except torch.OutOfMemoryError:
-
+                if m.device.type == 'cpu':
+                    raise
+                else:
+                    m = m.cpu()
             except RuntimeError: # failed to compute eigenvalues
-
+                if m.dtype != torch.double:
+                    m = m.double()
+                elif eps < max_eps:
+                    eps = eps ** (2 / 3)
+                else:
+                    raise
             clean()
-        else:
-            raise RuntimeError("Failed to compute eigenvalues.")
 
+        eigvec = eigvec.to(device=m.device, dtype=m.dtype)
         eigvec = torch.flip(eigvec, [1])
-
         final.append(eigvec)
 
     return final
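Note: the rewritten get_orthogonal_matrix retries torch.linalg.eigh with a growing diagonal jitter instead of failing outright. A minimal sketch of just that retry logic, omitting the out-of-memory CPU fallback and heavyball's clean() call:

import torch

def robust_eigh(m, max_eps=1e-3, min_eps=1e-30):
    eps = min_eps
    while True:
        try:
            eye = torch.eye(m.shape[0], device=m.device, dtype=m.dtype)
            return torch.linalg.eigh(m + eps * eye)  # jittered for stability
        except RuntimeError:  # eigendecomposition failed to converge
            if m.dtype != torch.double:
                m = m.double()        # first retry in higher precision
            elif eps < max_eps:
                eps = eps ** (2 / 3)  # eps < 1, so this grows it toward 1
            else:
                raise

Since eps starts at 1e-30, the schedule visits roughly 1e-20, 1e-13, 1e-9, ... until it exceeds max_eps, at which point the error is re-raised.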
@@ -476,7 +482,9 @@ def _compilable_stochastic_lerp_(x: List[Tensor], y: List[Tensor], a: Union[floa
     for x_, y_ in zip(x, y):
         x32 = promote(x_)
         y32 = promote(y_)
-
+        if x32.dtype != y32.dtype:
+            y32 = y32.to(x32.dtype)
+        copy_stochastic_(x_, x32 * (1 - a) + y32 * a)
 
 
 def get_beta1(group):
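Note: after promote(), the two operand lists can still disagree on dtype, and x * (1 - a) + y * a then silently upcasts the result, which presumably motivated the guard. A trivial plain-torch sketch:

import torch

x32 = torch.randn(3, dtype=torch.float32)
y32 = torch.randn(3, dtype=torch.float64)
a = 0.1
if x32.dtype != y32.dtype:
    y32 = y32.to(x32.dtype)  # align before interpolating, as in the fix above
out = x32 * (1 - a) + y32 * a  # lerp: (1 - a) * x + a * y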
@@ -575,7 +583,7 @@ def update_ggt(grad, GG, max_precond_dim, precondition_1d, beta):
     g0 = einsum_base[:grad.dim()]
     g1 = g0.replace(b, b.upper())
     outer_product = torch.einsum(f'{g0},{g1}->{b + b.upper()}', grad, grad)
-    m
+    stochastic_lerp_(m, outer_product, 1 - beta)
 
 
 def tree_apply(fn):
@@ -618,7 +626,8 @@ def init_preconditioner(grad, state, max_precond_dim, precondition_1d):
     state['GG'] = [] # Will hold all the preconditioner matrices (L and R in the paper).
     if grad.numel() > 1 and (grad.ndim > 1 or precondition_1d):
         for sh in grad.shape:
-            if sh > max_precond_dim:
+            if sh > max_precond_dim or sh == 1:
+                # via @francois-rozet: https://github.com/HomebrewML/HeavyBall/commit/8b86be04967e2d095136d5603724f488f2d46592#diff-a430393dd0a6ee393944a9ed16416115c175de2414cf4a96e647197697f265e9R621
                 state['GG'].append(None)
             else:
                 state['GG'].append(torch.zeros(sh, sh, device=grad.device, dtype=grad.dtype))
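Note: a dimension of size 1 would yield a 1×1 Gram matrix that cannot rotate anything, so it is now skipped like an oversized dimension, and the `is not None` filters added throughout this diff make the einsum projections ignore the None entries. A toy illustration with hypothetical shapes:

import torch

grad = torch.zeros(1, 32)  # e.g. a weight with a singleton leading dim
max_precond_dim = 64

GG = []
for sh in grad.shape:
    # mirrors the updated condition: skip oversized *and* singleton dims
    GG.append(None if sh > max_precond_dim or sh == 1 else torch.zeros(sh, sh))

print([None if g is None else tuple(g.shape) for g in GG])  # [None, (32, 32)]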
@@ -638,10 +647,10 @@ def project(grad, Q, back: bool):
     :return:
     """
     param = einsum_base[:grad.dim()]
-    preconditioners = ",".join([(g + g.upper())[::-1 if back else 1] for m, g in zip(Q, param) if
+    preconditioners = ",".join([(g + g.upper())[::-1 if back else 1] for m, g in zip(Q, param) if m is not None])
     if preconditioners:
         out = ''.join([c.upper() if c.upper() in preconditioners else c for c in param])
-        out = torch.einsum(f'{param},{preconditioners}->{out}', promote(grad), *[q for q in Q if
+        out = torch.einsum(f'{param},{preconditioners}->{out}', promote(grad), *[q for q in Q if q is not None])
         grad = out.to(grad.dtype)
     return grad
 
@@ -876,7 +885,7 @@ def _lerp(state: List[Tensor], grad: List[Tensor], beta):
     ea32 = list(map(promote, state))
     grad = list(map(promote, grad))
     beta = promote(beta)
-    ea32
+    stochastic_lerp_(ea32, grad, 1 - beta)
     copy_stochastic_list_(state, ea32)
     return ea32
 
@@ -890,7 +899,7 @@ def _compilable_adam_(exp_avg: List[Tensor], exp_avg_sq: List[Tensor], grad: Lis
     g32 = list(map(promote, grad))
     exp_avg32 = _lerp(exp_avg, g32, beta1)
     denom = _compilable_exp_avg_sq_(exp_avg_sq, g32, beta2, eps, [None])
-    u32 =
+    u32 = torch._foreach_div(exp_avg32, denom)
     copy_stochastic_list_(grad, u32)
 
 
@@ -973,14 +982,11 @@ def _fused_compilable_adopt_(y, update, grad, exp_avg_sq, exp_avg, beta1, beta2,
     _compilable_update_(y, u32, decay, lr, caution, g32)
 
     beta1 = beta_debias(beta1, step)
-    denom =
-
-    exp_avg32 = [ea32.lerp(g / d, 1 - beta1) for ea32, g, d in zip(exp_avg32, g32, denom)]
-    copy_stochastic_list_(exp_avg, exp_avg32)
+    denom = [eps_sqrt(d, eps) for d in exp_avg_sq32]
+    stochastic_lerp_(exp_avg, torch._foreach_div(g32, denom), 1 - beta1)
 
     beta2 = beta_debias(beta2, step + 1)
-
-    copy_stochastic_list_(exp_avg_sq, exp_avg_sq32)
+    stochastic_lerp_(exp_avg_sq, torch._foreach_mul(g32, g32), 1 - beta2)
 
 
 def fused_adopt_(y, update, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution):
@@ -990,27 +996,23 @@ def fused_adopt_(y, update, grad, exp_avg_sq, exp_avg, beta1, beta2, step, lr, eps, decay, caution):
 
 
 @decorator_knowngood
-def _compilable_adopt_(grad, exp_avg_sq, exp_avg, beta1, beta2, step):
+def _compilable_adopt_(grad, exp_avg_sq, exp_avg, beta1, beta2, step, eps):
     g32, exp_avg32, exp_avg_sq32 = [list(map(promote, x)) for x in [grad, exp_avg, exp_avg_sq]]
     update = [e.clone() for e in exp_avg]
 
     beta1 = beta_debias(beta1, step)
-    denom =
-
-    exp_avg32 = [ea32.lerp(g / d, 1 - beta1) for ea32, g, d in zip(exp_avg32, g32, denom)]
-    copy_stochastic_list_(exp_avg, exp_avg32)
+    denom = [eps_sqrt(d, eps) for d in exp_avg_sq32]
+    stochastic_lerp_(exp_avg, torch._foreach_div(g32, denom), 1 - beta1)
 
-
-    exp_avg_sq32 = [eas32.lerp(g * g, 1 - beta2) for eas32, g in zip(exp_avg_sq32, g32)]
-    copy_stochastic_list_(exp_avg_sq, exp_avg_sq32)
+    stochastic_lerp_(exp_avg_sq, torch._foreach_mul(g32, g32), 1 - beta2)
 
     copy_stochastic_list_(grad, update)
 
 
-def adopt(grad, exp_avg_sq, exp_avg, beta1, beta2, step):
+def adopt(grad, exp_avg_sq, exp_avg, beta1, beta2, step, eps: float = 1e-8):
     exp_avg, exp_avg_sq, grad = list_guard(exp_avg, exp_avg_sq, grad)
-    beta1, beta2, step = scalar_guard(beta1, beta2, step, exp_avg[0])
-    _compilable_adopt_(grad, exp_avg_sq, exp_avg, beta1, beta2, step)
+    beta1, beta2, step, eps = scalar_guard(beta1, beta2, step, eps, exp_avg[0])
+    _compilable_adopt_(grad, exp_avg_sq, exp_avg, beta1, beta2, step, eps)
     return grad
 
 
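Note: both ADOPT paths now share one structure: emit the previous exp_avg as the update, advance the first moment with an eps-clamped denominator, then advance the second moment with the squared gradient. A per-tensor plain-torch paraphrase (no _foreach batching or stochastic rounding; step >= 1 assumed):

import torch

def adopt_step(g, exp_avg, exp_avg_sq, beta1=0.9, beta2=0.999, step=1, eps=1e-8):
    update = exp_avg.clone()                     # ADOPT applies the *old* momentum
    db1 = 1 - (1 - beta1) / (1 - beta1 ** step)  # beta_debias
    denom = exp_avg_sq.sqrt().clamp(min=eps)     # eps_sqrt
    exp_avg.lerp_(g / denom, 1 - db1)
    db2 = 1 - (1 - beta2) / (1 - beta2 ** (step + 1))
    exp_avg_sq.lerp_(g * g, 1 - db2)
    return update

g = torch.randn(4)
u = adopt_step(g, torch.zeros(4), torch.ones(4))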
{heavyball-1.6.0.dist-info → heavyball-1.6.2.dist-info}/METADATA
CHANGED
@@ -1,26 +1,26 @@
-Metadata-Version: 2.
+Metadata-Version: 2.2
 Name: heavyball
-Version: 1.6.
-Summary: Efficient
-
-
-
-
-Classifier: Development Status :: 5 - Production/Stable
-Classifier: License :: OSI Approved :: BSD License
-Classifier: Programming Language :: Python
-Classifier: Programming Language :: Python :: 3.7
-Classifier: Programming Language :: Python :: 3.8
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Topic :: Software Development :: Libraries
-Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Version: 1.6.2
+Summary: Efficient Optimizers
+Author-email: HeavyBall Authors <github.heavyball@nestler.sh>
+Project-URL: source, https://github.com/HomebrewML/HeavyBall
+Project-URL: tracker, https://github.com/HomebrewML/HeavyBall/issues
+Keywords: torch,optimizer,muon,soap,psgd
 Classifier: Intended Audience :: Developers
-
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: opt-einsum
-Requires-Dist: torch
-
+Requires-Dist: opt-einsum>=3.0.0
+Requires-Dist: torch>=2.0.0
+Provides-Extra: dev
+Requires-Dist: pre-commit; extra == "dev"
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: ruff; extra == "dev"
 
 # `heavyball`: Efficient Optimizers
 
heavyball-1.6.2.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+heavyball/__init__.py,sha256=DKp8rEAf7mt2-j9XRVlgjaLjyfuwUsyl_uXJoOKWAHg,15362
+heavyball/chainable.py,sha256=n_u0QS92WitbtnENvNQ0m4dZTHuJ5ObQ88XA3cmhCfo,27298
+heavyball/utils.py,sha256=CFBFHTekWaqKhmrSLuMvRsxZ41YxPfsYihEPvJMKOQc,56088
+heavyball-1.6.2.dist-info/LICENSE,sha256=CGdGJim64YifGmUVPaeyRsxkvyExtClswhRNIp8FY_U,1322
+heavyball-1.6.2.dist-info/METADATA,sha256=q2CEAHIg6jdGn7dey36EMExwJBrNFTDgZFpEzEHDvBY,43479
+heavyball-1.6.2.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+heavyball-1.6.2.dist-info/top_level.txt,sha256=SzCxSVg_qCUPA4kZObW3Zyo4v-d_mMOD-p7a-WXTl2E,10
+heavyball-1.6.2.dist-info/RECORD,,
heavyball-1.6.0.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
-heavyball/__init__.py,sha256=DKp8rEAf7mt2-j9XRVlgjaLjyfuwUsyl_uXJoOKWAHg,15362
-heavyball/chainable.py,sha256=n_u0QS92WitbtnENvNQ0m4dZTHuJ5ObQ88XA3cmhCfo,27298
-heavyball/utils.py,sha256=Nk0q_sfv47F-QC9Wwi5KCt-C_71OhuzM98XHlYGvl24,55905
-heavyball-1.6.0.dist-info/LICENSE,sha256=CGdGJim64YifGmUVPaeyRsxkvyExtClswhRNIp8FY_U,1322
-heavyball-1.6.0.dist-info/METADATA,sha256=5suezTlZCOBwCgHeFgkLaywYwjAWN1SPg6yhvAv1WgE,43441
-heavyball-1.6.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-heavyball-1.6.0.dist-info/top_level.txt,sha256=SzCxSVg_qCUPA4kZObW3Zyo4v-d_mMOD-p7a-WXTl2E,10
-heavyball-1.6.0.dist-info/RECORD,,
{heavyball-1.6.0.dist-info → heavyball-1.6.2.dist-info}/LICENSE
File without changes
{heavyball-1.6.0.dist-info → heavyball-1.6.2.dist-info}/top_level.txt
File without changes