PyPI - heavyball - Versions diffs - 0.14.7__py3-none-any.whl → 0.15.1__py3-none-any.whl - Mend

heavyball 0.14.7__py3-none-any.whl → 0.15.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

heavyball/__init__.py +25 -3
heavyball/cached_psgd_kron.py +141 -0
heavyball/delayed_psgd.py +43 -51
heavyball/foreach_adamw.py +22 -32
heavyball/foreach_adopt.py +38 -48
heavyball/foreach_laprop.py +25 -35
heavyball/foreach_sfadamw.py +28 -38
heavyball/foreach_soap.py +56 -70
heavyball/p_adam.py +46 -50
heavyball/palm_foreach_sfadamw.py +31 -41
heavyball/palm_foreach_soap.py +56 -70
heavyball/precond_schedule_foreach_soap.py +57 -71
heavyball/precond_schedule_palm_foreach_soap.py +58 -73
heavyball/precond_schedule_sfpsoap.py +60 -72
heavyball/psgd_kron.py +43 -49
heavyball/pure_psgd.py +36 -43
heavyball/schedule_free_palm_foreach_soap.py +61 -72
heavyball/utils.py +23 -7
{heavyball-0.14.7.dist-info → heavyball-0.15.1.dist-info}/METADATA +1 -1
heavyball-0.15.1.dist-info/RECORD +23 -0
heavyball-0.14.7.dist-info/RECORD +0 -22
{heavyball-0.14.7.dist-info → heavyball-0.15.1.dist-info}/LICENSE +0 -0
{heavyball-0.14.7.dist-info → heavyball-0.15.1.dist-info}/WHEEL +0 -0
{heavyball-0.14.7.dist-info → heavyball-0.15.1.dist-info}/top_level.txt +0 -0

heavyball/schedule_free_palm_foreach_soap.py CHANGED Viewed

@@ -46,84 +46,73 @@ class SFPaLMForeachSOAP(ScheduleFree):
         self._data_format = data_format
         self.rng = random.Random(0x120983109)
-    @torch.no_grad()
-    def step(self, closure=None):
-        """
-        Performs a single optimization step.
-        Arguments:
-            closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss.
-        """
-        if closure is None:
-            loss = None
-        else:
-            loss = closure()
-        for group in self.param_groups:
-            vals = []
-            max_precond_dim = group['max_precond_dim']
-            precondition_1d = group['precondition_1d']
-            step = group['step'] = group.get("step", -1) + 1
-            for p in group["params"]:
-                if p.grad is None:
-                    continue
-                grad = p.grad.float()
-                vals.append((p, grad))
-            p_list, grad = zip(*vals)
-            adaptive_gradient_clipping_(p_list, grad, group["gradient_clip_val"], eps=group["eps"])
-            vals = []
-            for p, g in split_p_and_g_in_group(group):
-                state = self.state_(p)
-                if "z" not in state:
-                    state["z"] = torch.clone(p).float()
-                    state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
-                    init_preconditioner(g, state, max_precond_dim, precondition_1d)
-                    update_preconditioner(g, state, max_precond_dim, precondition_1d, 0, True)
-                    continue  # first step is skipped so that we never use the current gradients in the projection.
-                # Projecting gradients to the eigenbases of Shampoo's preconditioner
-                # i.e. projecting to the eigenbases of matrices in state['GG']
-                grad_projected = project(g, state['Q'], False)
-                z, exp_avg_sq = state["z"], state["exp_avg_sq"]
-                vals.append((p, g, grad_projected, z, exp_avg_sq))
-            if not vals:
+    def _step(self, group):
+        vals = []
+        max_precond_dim = group['max_precond_dim']
+        precondition_1d = group['precondition_1d']
+        step = group['step'] = group.get("step", -1) + 1
+        for p in group["params"]:
+            if p.grad is None:
                 continue
+            grad = p.grad.float()
+            vals.append((p, grad))
+        if not vals:
+            return
+        p_list, grad = zip(*vals)
+        adaptive_gradient_clipping_(p_list, grad, group["gradient_clip_val"], eps=group["eps"])
+        vals = []
+        for p, g in split_p_and_g_in_group(group):
+            state = self.state_(p)
+            if "z" not in state:
+                state["z"] = torch.clone(p).float()
+                state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
+                init_preconditioner(g, state, max_precond_dim, precondition_1d)
+                update_preconditioner(g, state, max_precond_dim, precondition_1d, 0, True)
+                continue  # first step is skipped so that we never use the current gradients in the projection.
+            # Projecting gradients to the eigenbases of Shampoo's preconditioner
+            # i.e. projecting to the eigenbases of matrices in state['GG']
+            grad_projected = project(g, state['Q'], False)
+            z, exp_avg_sq = state["z"], state["exp_avg_sq"]
+            vals.append((p, g, grad_projected, z, exp_avg_sq))
+        if not vals:
+            return
-            p_list, grad, grad_projected, z, exp_avg_sq = zip(*vals)
+        p_list, grad, grad_projected, z, exp_avg_sq = zip(*vals)
-            beta2 = 1 - max(step, 1) ** -group['beta2_scale']
-            new_debiased2 = beta_debias(beta2, step)
+        beta2 = 1 - max(step, 1) ** -group['beta2_scale']
+        new_debiased2 = beta_debias(beta2, step)
-            # Decay the first and second moment running average coefficient
-            # In-place operations to update the averages at the same time
-            denom = exp_avg_sq_(exp_avg_sq, grad, new_debiased2, group["eps"])
-            torch._foreach_div_(grad_projected, denom)
+        # Decay the first and second moment running average coefficient
+        # In-place operations to update the averages at the same time
+        denom = exp_avg_sq_(exp_avg_sq, grad, new_debiased2, group["eps"])
+        torch._foreach_div_(grad_projected, denom)
-            update_precond = group['step'] > 0 and group['step'] % group['precondition_frequency'] == 0
+        update_precond = group['step'] > 0 and group['step'] % group['precondition_frequency'] == 0
-            for p, g, gp in zip(p_list, grad, grad_projected):
-                state = self.state_(p)
-                # Projecting back the preconditioned (by Adam) exponential moving average of gradients
-                # to the original space
-                # CANT DO /= HERE AS EXP_AVG MAY POINT TO THE BUFFER
-                set_(gp, project(gp, state['Q'], back=True))
+        for p, g, gp in zip(p_list, grad, grad_projected):
+            state = self.state_(p)
+            # Projecting back the preconditioned (by Adam) exponential moving average of gradients
+            # to the original space
+            # CANT DO /= HERE AS EXP_AVG MAY POINT TO THE BUFFER
+            set_(gp, project(gp, state['Q'], back=True))
-                update_preconditioner(g, state, max_precond_dim, precondition_1d, 1 - new_debiased2,
-                                      update_precond)
+            update_preconditioner(g, state, max_precond_dim, precondition_1d, 1 - new_debiased2,
+                                  update_precond)
-            # Weight decay calculated at y
-            if group["weight_decay"] > 0:
-                torch._foreach_add_(grad, p_list, alpha=group["weight_decay"])
+        # Weight decay calculated at y
+        if group["weight_decay"] > 0:
+            torch._foreach_add_(grad, p_list, alpha=group["weight_decay"])
-            lr = warmup(group['lr'], step, group['warmup_steps'])
-            group['weight_sum'] = schedule_free_(lr, group['weight_lr_power'], group['weight_sum'], group['beta'],
-                                                 p_list, z, grad_projected, group['r'], step)
-        return loss
+        lr = warmup(group['lr'], step, group['warmup_steps'])
+        group['weight_sum'] = schedule_free_(lr, group['weight_lr_power'], group['weight_sum'], group['beta'],
+                                             p_list, z, grad_projected, group['r'], step)

heavyball/utils.py CHANGED Viewed

@@ -3,7 +3,7 @@ import gc
 import math
 import random
 import string
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Callable
 import numpy as np
 import torch
@@ -29,7 +29,7 @@ def decorator(func):
     return _fn
-_einsum_base = string.ascii_lowercase + string.ascii_uppercase
+einsum_base = string.ascii_lowercase + string.ascii_uppercase
 def warmup(lr: float, step: int, warmup_steps: int):
@@ -317,8 +317,8 @@ def compute_ggt(grad, GG, max_precond_dim, precondition_1d, beta):
     for idx, sh in enumerate(grad.shape):
         if sh > max_precond_dim:
             continue
-        b = _einsum_base[idx]
-        g0 = _einsum_base[:grad.dim()]
+        b = einsum_base[idx]
+        g0 = einsum_base[:grad.dim()]
         g1 = g0.replace(b, b.upper())
         outer_product = torch.einsum(f'{g0},{g1}->{b + b.upper()}', grad, grad)
         GG[idx].lerp_(promote(outer_product), 1 - beta)
@@ -374,7 +374,7 @@ def project(grad, Q, back: bool):
     :param back: whether to project to Shampoo eigenbases or back to original space
     :return:
     """
-    param = _einsum_base[:grad.dim()]
+    param = einsum_base[:grad.dim()]
     preconditioners = ",".join([(g + g.upper())[::-1 if back else 1] for m, g in zip(Q, param) if len(m) > 0])
     if preconditioners:
         out = ''.join([c.upper() if c.upper() in preconditioners else c for c in param])
@@ -399,6 +399,20 @@ class StatefulOptimizer(torch.optim.Optimizer):
                 tree_map(_add, self.state_(p))
         return total_bytes
+    def _step(self, group):
+        raise NotImplementedError
+    def step(self, closure: Optional[Callable] = None):
+        if closure is None:
+            loss = None
+        else:
+            with torch.enable_grad():
+                loss = closure()
+        with torch.no_grad():
+            for group in self.param_groups:
+                self._step(group)
+        return loss
 class ScheduleFree(StatefulOptimizer):
     def eval(self):
@@ -684,9 +698,11 @@ def a_law_compress(x, A=87.6):
     torch._foreach_mul_(xa, 1 / (1 + math.log(A)))
     return xa
 def identity(x):
     return x
 def trust_region_clip_(grad, lerp: float = 0.9, scale: float = 1.5):
     torch._foreach_mul_(grad, 1 / scale)
     tanh = torch._foreach_tanh(grad)
@@ -743,8 +759,8 @@ class PSGDBase(StatefulOptimizer):
         self.rng = random.Random(0x1923213)
         self._tiny = torch.finfo(torch.bfloat16).tiny
-    def balance(self, do_update, grad_list, Q_list):
-        if not do_update or self.rng.random() > 0.01:
+    def balance(self, grad_list, Q_list):
+        if self.rng.random() > 0.01:
             return
         for g, q in zip(grad_list, Q_list):

{heavyball-0.14.7.dist-info → heavyball-0.15.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: heavyball
-Version: 0.14.7
+Version: 0.15.1
 Summary: Efficient optimizers
 Home-page: https://github.com/clashluke/heavyball
 Author: Lucas Nestler

heavyball-0.15.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,23 @@
+heavyball/__init__.py,sha256=KbT0GMU0DKqZxq9laCrD7XgiqS9yxC1W52zhte5kjKs,2054
+heavyball/cached_psgd_kron.py,sha256=mXDtxq2WJST_aUJhrLr_xCCXSFaDvD5gCTSEveBUtac,6754
+heavyball/delayed_psgd.py,sha256=dN3NW1jmjxmUkgqxPwUVrqLY8nnBOFp4TVtJ_BhPDR4,5814
+heavyball/foreach_adamw.py,sha256=NSzoIgNm7eavzbJgkAF0k7TUEnWAgOpt9-4juIFoaSA,1729
+heavyball/foreach_adopt.py,sha256=WA07m5jocLfb1GPU8s6mJ2PteS-03ronkKm-VJrAm5I,1863
+heavyball/foreach_laprop.py,sha256=mE2NDGX9XgvRhsewcWnk_-FulZPqGA65ejYF_9-A1Xk,1768
+heavyball/foreach_sfadamw.py,sha256=ussHfPd99u3RTfMrCuu5oIbwNFLXK19wO1Fbz3JShlc,2097
+heavyball/foreach_soap.py,sha256=WWvssYKg607uoEJHftp8ag8mtKSKSeHrT0QTgqBucVg,4587
+heavyball/p_adam.py,sha256=ms7BoMHu3jKGsuztUeECrsXufGAwBpqGsxgZ5LBXLQg,6073
+heavyball/palm_foreach_sfadamw.py,sha256=wjUb_fNZNUmzWXyKvwB0unP9lvNMmaYSQo5YoeS5cj0,2200
+heavyball/palm_foreach_soap.py,sha256=2Sb4hUHQeexJcCgjHeQM_ENkZ6lG1DVxW72ryrvR6iY,5890
+heavyball/precond_schedule_foreach_soap.py,sha256=bHsDyh-UvHpHjumjqqy0PePoR1ZMsJV6o5wWvpLAA04,4815
+heavyball/precond_schedule_palm_foreach_soap.py,sha256=myLTJNQKLtZ3Xi3MVTB-RYtx_XeMRJw5CIMJW75ndUY,6163
+heavyball/precond_schedule_sfpsoap.py,sha256=xeNWetBzBEYqfOSzl98aAVJsHk43QkrUUhHH_YD_mS4,6740
+heavyball/psgd_kron.py,sha256=rMG5UPEgyfQs_n1MHSEicekVDpbbIzinlL8akEyY918,5795
+heavyball/pure_psgd.py,sha256=LLVJhUAb04hgAmT3BTz_faswwQEQUkLhm_VwGQmbBUo,5088
+heavyball/schedule_free_palm_foreach_soap.py,sha256=w0P7lMmoijTpL9V7NwOHcNBFJQ7S1TS9aCiwPhY2yVw,6319
+heavyball/utils.py,sha256=PWmwjZPL4oxMjK79a5R1e7JHykphNi5GdpYqO_xmmFU,27829
+heavyball-0.15.1.dist-info/LICENSE,sha256=CGdGJim64YifGmUVPaeyRsxkvyExtClswhRNIp8FY_U,1322
+heavyball-0.15.1.dist-info/METADATA,sha256=0wImMJNYM-Zg0akh9hRf7X8ofVW6zlmpyDGgAkK5GFA,11667
+heavyball-0.15.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+heavyball-0.15.1.dist-info/top_level.txt,sha256=SzCxSVg_qCUPA4kZObW3Zyo4v-d_mMOD-p7a-WXTl2E,10
+heavyball-0.15.1.dist-info/RECORD,,

heavyball-0.14.7.dist-info/RECORD DELETED Viewed

@@ -1,22 +0,0 @@
-heavyball/__init__.py,sha256=ef7IWcPF8Uh3WQHzMiMqOFvUnU_LdG7BO9XVecJOph4,1156
-heavyball/delayed_psgd.py,sha256=Gfa1ogkFPPL7ohayYAwbugB8hyLRUI5FgcJfsK69KGI,5936
-heavyball/foreach_adamw.py,sha256=L727chOuVqdiVbYYzloy4g3oKH2FmQh40o_bqWeZtk8,2106
-heavyball/foreach_adopt.py,sha256=M4zZVcYlhGjqO6qekivCuYfX6JtMcp4cJi_RrSHT3H8,2268
-heavyball/foreach_laprop.py,sha256=htYGqgvlZsF_JpILdPMTnX72SqfrokBJ2J4nDeT0JVU,2157
-heavyball/foreach_sfadamw.py,sha256=KIGol7Phfq1DHE_nEle4wDuqNdbGsT3kUcMKzJX3msg,2498
-heavyball/foreach_soap.py,sha256=Ccz9Mc_xaHnrJ_7jUq9ZVxyR0WEqopzOXTUqUY-V8G8,5137
-heavyball/p_adam.py,sha256=jQgTkKekqnLj1XPA4-fgpWG8P_BtUq2976zEt2QymTo,6060
-heavyball/palm_foreach_sfadamw.py,sha256=8IGlRCdkfMzUqcSfmTM3Ce04NLNyrT2xfiBcPkrWwqc,2605
-heavyball/palm_foreach_soap.py,sha256=NEJ3Xeh7pqURUk3cAP2qJe8z2WzYKg60pQe4bsGiaY4,6441
-heavyball/precond_schedule_foreach_soap.py,sha256=H6Oc5IAL5MR-fgu92AboPs3Xm8mBmYUMPLsEcuJ12VI,5370
-heavyball/precond_schedule_palm_foreach_soap.py,sha256=v81hRjcqS6Mm-KxT5Rk3TEiKAE8WI2IbmVbSa-YfBkE,6760
-heavyball/precond_schedule_sfpsoap.py,sha256=7ScnN0in8u9hPiJE7QnOoZOH6Tn-6HeVy4f-bO3bHzY,7279
-heavyball/psgd_kron.py,sha256=AH8ugd_IxKGVtY9y_Ot7myVSxFDbLlRJIqr2bBlAYy8,5911
-heavyball/pure_psgd.py,sha256=jp5fnawUdgccEFlZDPrZr4ZbxYV85IIrev4tybZxBVU,5185
-heavyball/schedule_free_palm_foreach_soap.py,sha256=bV7H-FNNoH5WpposLrNhkqU7mBicMorqKEALBSdROEM,6853
-heavyball/utils.py,sha256=y5VAd9CQjcl_a1WUcORviAYf7Jz_c7n3-b7i5kLUJIA,27464
-heavyball-0.14.7.dist-info/LICENSE,sha256=CGdGJim64YifGmUVPaeyRsxkvyExtClswhRNIp8FY_U,1322
-heavyball-0.14.7.dist-info/METADATA,sha256=5QWB3nuNAp8YjeX0-Y5Uzkek_wjuGF3XG6UWrQk8R0c,11667
-heavyball-0.14.7.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-heavyball-0.14.7.dist-info/top_level.txt,sha256=SzCxSVg_qCUPA4kZObW3Zyo4v-d_mMOD-p7a-WXTl2E,10
-heavyball-0.14.7.dist-info/RECORD,,

{heavyball-0.14.7.dist-info → heavyball-0.15.1.dist-info}/LICENSE RENAMED Viewed

File without changes

{heavyball-0.14.7.dist-info → heavyball-0.15.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{heavyball-0.14.7.dist-info → heavyball-0.15.1.dist-info}/top_level.txt RENAMED Viewed

File without changes

heavyball 0.14.7__py3-none-any.whl → 0.15.1__py3-none-any.whl

heavyball 0.14.7__py3-none-any.whl → 0.15.1__py3-none-any.whl