heavyball 0.14.6__py3-none-any.whl → 0.15.0__py3-none-any.whl

heavyball/__init__.py CHANGED
@@ -1,3 +1,4 @@
+from .delayed_psgd import ForeachDelayedPSGD
 from .foreach_adamw import ForeachAdamW
 from .foreach_adopt import ForeachADOPT
 from .foreach_laprop import ForeachLaProp
@@ -12,7 +13,6 @@ from .precond_schedule_sfpsoap import PrecondScheduleSFPaLMSOAP
 from .psgd_kron import ForeachPSGDKron
 from .pure_psgd import ForeachPurePSGD
 from .schedule_free_palm_foreach_soap import SFPaLMForeachSOAP
-from .delayed_psgd import ForeachDelayedPSGD
 
 PalmForEachSoap = PaLMForeachSOAP
 
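
Note: the `ForeachDelayedPSGD` import only moves to the top of `__init__.py` to keep the imports sorted; the class is re-exported from the package root in both versions. A minimal usage sketch follows — calling the constructor with only `lr` and relying on the remaining defaults is an assumption for illustration, not taken from this diff:

    # Hypothetical usage sketch for the re-exported optimizer.
    import torch
    from heavyball import ForeachDelayedPSGD

    model = torch.nn.Linear(16, 1)
    opt = ForeachDelayedPSGD(model.parameters(), lr=1e-3)  # other arguments left at their defaults

    loss = model(torch.randn(4, 16)).square().mean()
    loss.backward()
    opt.step()
    opt.zero_grad()
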
heavyball/delayed_psgd.py CHANGED
@@ -5,8 +5,8 @@ Source available at https://github.com/evanatyourservice/kron_torch/blob/97a2b5e
 """
 
 import torch
-
 from heavyball.utils import copy_stochastic_list_
+
 from .utils import update_param_, warmup, psgd_precond_grad, init_Q_exprs, trust_region_clip_, PSGDBase, \
     precond_update_prob_schedule, split_p_and_g_in_group, triu_to_line, line_to_triu, set_
 
@@ -63,13 +63,7 @@ class ForeachDelayedPSGD(PSGDBase):
 
         self._prob_step = 0
 
-    @torch.no_grad()
-    def step(self, closure=None):
-        loss = None
-        if closure is not None:
-            with torch.enable_grad():
-                loss = closure()
-
+    def _step(self, group):
         # update preconditioners all together
         update_prob = self.preconditioner_update_probability
         if callable(update_prob):
@@ -77,55 +71,52 @@ class ForeachDelayedPSGD(PSGDBase):
         do_update = self.rng.random() < update_prob
         self._prob_step += 1
 
-        for group in self.param_groups:
-            momentum_into_precond_update = group.get("momentum_into_precond_update", True)
-            precond_init_scale = group['precond_init_scale']
-            max_size_triangular = group['max_size_triangular']
-            min_ndim_triangular = group['min_ndim_triangular']
-            memory_save_mode = group['memory_save_mode']
-            precond_lr = group['precond_lr']
-            weight_decay = group['weight_decay']
-            lr = group['lr']
-            beta = group['beta']
-
-            vals = []
+        momentum_into_precond_update = group.get("momentum_into_precond_update", True)
+        precond_init_scale = group['precond_init_scale']
+        max_size_triangular = group['max_size_triangular']
+        min_ndim_triangular = group['min_ndim_triangular']
+        memory_save_mode = group['memory_save_mode']
+        precond_lr = group['precond_lr']
+        weight_decay = group['weight_decay']
+        lr = group['lr']
+        beta = group['beta']
 
-            for p, g in split_p_and_g_in_group(group):
-                state = self.state_(p)
+        vals = []
 
-                if 'Q' not in state:
-                    state["exp_avg"] = torch.zeros_like(g)
-                    Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular, min_ndim_triangular,
-                                                     memory_save_mode, dtype=g.dtype)
-                    state["Q"] = triu_to_line(Q)
+        for p, g in split_p_and_g_in_group(group):
+            state = self.state_(p)
 
-                vals.append((p, g, state["exp_avg"], state["Q"]))
+            if 'Q' not in state:
+                state["exp_avg"] = torch.zeros_like(g)
+                Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular, min_ndim_triangular,
+                                                 memory_save_mode, dtype=g.dtype)
+                state["Q"] = triu_to_line(Q)
 
-            if not vals:
-                continue
+            vals.append((p, g, state["exp_avg"], state["Q"]))
 
-            p_list, grad_list, exp_avg_list, Q_list = zip(*vals)
-            del vals
+        if not vals:
+            return
 
-            group["step"] += 1
+        p_list, grad_list, exp_avg_list, Q_list = zip(*vals)
+        del vals
 
-            torch._foreach_lerp_(exp_avg_list, grad_list, (1 - beta) / (1 - beta ** group["step"]))
+        group["step"] += 1
 
-            Q_list, exp_avg_list = list(Q_list), list(exp_avg_list)
-            for i, (p, g) in enumerate(zip(p_list, grad_list)):
-                q_orig = Q_list.pop(0)
-                ea = exp_avg_list.pop(0)
-                q = line_to_triu(q_orig)
-                self.balance(do_update, [g], [q])
-                new = psgd_precond_grad(q, self.state_(p)["exprs"], ea)
+        torch._foreach_lerp_(exp_avg_list, grad_list, (1 - beta) / (1 - beta ** group["step"]))
 
-                if do_update:
-                    self.do_update([p], [ea if momentum_into_precond_update else g], [q], precond_lr, [q_orig])
-                set_(g, new)
+        Q_list, exp_avg_list = list(Q_list), list(exp_avg_list)
+        for i, (p, g) in enumerate(zip(p_list, grad_list)):
+            q_orig = Q_list.pop(0)
+            ea = exp_avg_list.pop(0)
+            q = line_to_triu(q_orig)
+            self.balance(do_update, [g], [q])
+            new = psgd_precond_grad(q, self.state_(p)["exprs"], ea)
 
-            grad_list = self.clip_fn(grad_list)
+            if do_update:
+                self.do_update([p], [ea if momentum_into_precond_update else g], [q], precond_lr, [q_orig])
+            set_(g, new)
 
-            lr = -warmup(lr, group['step'], group['warmup_steps'])
-            update_param_(p_list, grad_list, lr, weight_decay)
+        grad_list = self.clip_fn(grad_list)
 
-        return loss
+        lr = -warmup(lr, group['step'], group['warmup_steps'])
+        update_param_(p_list, grad_list, lr, weight_decay)
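
The pattern above repeats for every optimizer in this release: the per-optimizer `@torch.no_grad() step(closure)` method, its closure handling, and its loop over `self.param_groups` are removed, and only a per-group `_step(group)` hook remains. The shared machinery presumably moves into the base classes (`PSGDBase`, `StatefulOptimizer`, `ScheduleFree`), which are not part of the hunks shown here. A minimal sketch of what such a wrapper could look like — only the `_step` name is taken from this diff, everything else is an assumption:

    # Hypothetical base-class wrapper; heavyball's real base classes are not shown in this diff.
    import torch


    class SketchStatefulOptimizer(torch.optim.Optimizer):
        def _step(self, group):
            raise NotImplementedError

        @torch.no_grad()
        def step(self, closure=None):
            # Same closure handling the removed per-optimizer step() methods used.
            loss = None
            if closure is not None:
                with torch.enable_grad():
                    loss = closure()
            for group in self.param_groups:
                self._step(group)  # subclasses only implement the per-group update
            return loss
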
heavyball/foreach_adamw.py CHANGED
@@ -10,42 +10,32 @@ class ForeachAdamW(StatefulOptimizer):
                         lr_max=-1.0, weight_decay=weight_decay)
         super().__init__(params, defaults)
 
-    def step(self, closure=None):
-        """Performs a single optimization step.
+    def _step(self, group):
+        eps = group['eps']
+        decay = group['weight_decay']
+        k = group['k']
 
-        Arguments:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        """
+        if not group['train_mode']:
+            raise Exception("Not in train mode!")
 
-        loss = None
-        if closure is not None:
-            loss = closure()
+        active_p = [p for p in group['params'] if p.grad is not None]
 
-        for group in self.param_groups:
-            eps = group['eps']
-            decay = group['weight_decay']
-            k = group['k']
+        if not active_p:
+            return
 
-            if not group['train_mode']:
-                raise Exception("Not in train mode!")
+        for p in active_p:
+            if 'exp_avg' not in self.state_(p):
+                self.state_(p)['exp_avg'] = torch.zeros_like(p.data, dtype=torch.float32)
+                self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=torch.float32)
 
-            active_p = [p for p in group['params'] if p.grad is not None]
+        y, grad, exp_avg_sq, exp_avg = zip(
+            *[(p.data, p.grad.float(), self.state_(p)['exp_avg_sq'], self.state_(p)['exp_avg']) for p in active_p])
 
-            for p in active_p:
-                if 'exp_avg' not in self.state_(p):
-                    self.state_(p)['exp_avg'] = torch.zeros_like(p.data, dtype=torch.float32)
-                    self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=torch.float32)
+        # Decay the first and second moment running average coefficient
+        torch._foreach_lerp_(exp_avg, grad, 1 - beta_debias(group['betas'][0], k + 1))
+        denom = exp_avg_sq_(exp_avg_sq, grad, beta_debias(group['betas'][1], k + 1), eps)
 
-            y, grad, exp_avg_sq, exp_avg = zip(
-                *[(p.data, p.grad.float(), self.state_(p)['exp_avg_sq'], self.state_(p)['exp_avg']) for p in active_p])
-
-            # Decay the first and second moment running average coefficient
-            torch._foreach_lerp_(exp_avg, grad, 1 - beta_debias(group['betas'][0], k + 1))
-            denom = exp_avg_sq_(exp_avg_sq, grad, beta_debias(group['betas'][1], k + 1), eps)
-
-            # Normalize grad in-place for memory efficiency
-            lr = -warmup(group['lr'], k + 1, group['warmup_steps'])
-            update_param_(y, exp_avg, lr, decay, lambda p, e, l: torch._foreach_addcdiv_(p, e, denom, l))
-            group['k'] = k + 1
-        return loss
+        # Normalize grad in-place for memory efficiency
+        lr = -warmup(group['lr'], k + 1, group['warmup_steps'])
+        update_param_(y, exp_avg, lr, decay, lambda p, e, l: torch._foreach_addcdiv_(p, e, denom, l))
+        group['k'] = k + 1
heavyball/foreach_adopt.py CHANGED
@@ -11,51 +11,41 @@ class ForeachADOPT(StatefulOptimizer):
                         lr_max=-1.0, weight_decay=weight_decay)
         super().__init__(params, defaults)
 
-    def step(self, closure=None):
-        """Performs a single optimization step.
-
-        Arguments:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        """
-
-        loss = None
-        if closure is not None:
-            loss = closure()
-
-        for group in self.param_groups:
-            eps = group['eps']
-            decay = group['weight_decay']
-            k = group['k']
-
-            if not group['train_mode']:
-                raise Exception("Not in train mode!")
-
-            active_p = [p for p in group['params'] if p.grad is not None]
-
-            for p in active_p:
-                if 'exp_avg' not in self.state_(p):
-                    self.state_(p)['exp_avg'] = torch.zeros_like(p.data, dtype=torch.float32)
-                    self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=torch.float32)
-
-            y, grad, exp_avg_sq, exp_avg = zip(
-                *[(p.data, p.grad.float(), self.state_(p)['exp_avg_sq'], self.state_(p)['exp_avg']) for p in active_p])
-
-            if k > 1:
-                lr = -warmup(group['lr'], k - 1, group['warmup_steps'])
-
-                update_param_(y, exp_avg, lr, decay)
-            if k > 0:
-                beta1 = beta_debias(group['betas'][0], k)
-                denom = torch._foreach_sqrt(exp_avg_sq)
-                torch._foreach_maximum_(denom, eps)
-                torch._foreach_mul_(exp_avg, beta1)
-                torch._foreach_addcdiv_(exp_avg, grad, denom, 1 - beta1)
-
-            beta2 = beta_debias(group['betas'][1], k + 1)
-            torch._foreach_mul_(exp_avg_sq, beta2)
-            torch._foreach_addcmul_(exp_avg_sq, grad, grad, value=1 - beta2)
-            del grad
-
-            group['k'] = k + 1
-        return loss
+    def _step(self, group):
+        eps = group['eps']
+        decay = group['weight_decay']
+        k = group['k']
+
+        if not group['train_mode']:
+            raise Exception("Not in train mode!")
+
+        active_p = [p for p in group['params'] if p.grad is not None]
+
+        if not active_p:
+            return
+
+        for p in active_p:
+            if 'exp_avg' not in self.state_(p):
+                self.state_(p)['exp_avg'] = torch.zeros_like(p.data, dtype=torch.float32)
+                self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=torch.float32)
+
+        y, grad, exp_avg_sq, exp_avg = zip(
+            *[(p.data, p.grad.float(), self.state_(p)['exp_avg_sq'], self.state_(p)['exp_avg']) for p in active_p])
+
+        if k > 1:
+            lr = -warmup(group['lr'], k - 1, group['warmup_steps'])
+
+            update_param_(y, exp_avg, lr, decay)
+        if k > 0:
+            beta1 = beta_debias(group['betas'][0], k)
+            denom = torch._foreach_sqrt(exp_avg_sq)
+            torch._foreach_maximum_(denom, eps)
+            torch._foreach_mul_(exp_avg, beta1)
+            torch._foreach_addcdiv_(exp_avg, grad, denom, 1 - beta1)
+
+        beta2 = beta_debias(group['betas'][1], k + 1)
+        torch._foreach_mul_(exp_avg_sq, beta2)
+        torch._foreach_addcmul_(exp_avg_sq, grad, grad, value=1 - beta2)
+        del grad
+
+        group['k'] = k + 1
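
The `k` gating above staggers ADOPT's updates so that every quantity is consumed one step after it is produced: step 0 only initialises the second moment, step 1 starts the (normalised) first moment, and parameters are first moved at step 2. A single-tensor sketch of that schedule, omitting heavyball's beta debiasing, warmup, and weight decay:

    # Simplified single-tensor illustration of the staggered schedule; not heavyball's API.
    import torch


    def adopt_like_step(param, grad, exp_avg, exp_avg_sq, k, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8):
        if k > 1:
            param.add_(exp_avg, alpha=-lr)  # uses statistics from earlier steps only
        if k > 0:
            denom = exp_avg_sq.sqrt().clamp_min_(eps)
            exp_avg.mul_(beta1).addcdiv_(grad, denom, value=1 - beta1)
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
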
heavyball/foreach_laprop.py CHANGED
@@ -11,46 +11,36 @@ class ForeachLaProp(StatefulOptimizer):
                         lr_max=-1.0, weight_decay=weight_decay)
         super().__init__(params, defaults)
 
-    def step(self, closure=None):
-        """Performs a single optimization step.
+    def _step(self, group):
+        eps = group['eps']
+        decay = group['weight_decay']
+        k = group['k']
 
-        Arguments:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        """
+        if not group['train_mode']:
+            raise Exception("Not in train mode!")
 
-        loss = None
-        if closure is not None:
-            loss = closure()
+        active_p = [p for p in group['params'] if p.grad is not None]
 
-        for group in self.param_groups:
-            eps = group['eps']
-            decay = group['weight_decay']
-            k = group['k']
+        if not active_p:
+            return
 
-            if not group['train_mode']:
-                raise Exception("Not in train mode!")
+        for p in active_p:
+            if 'exp_avg' not in self.state_(p):
+                self.state_(p)['exp_avg'] = torch.zeros_like(p.data, dtype=torch.float32)
+                self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=torch.float32)
 
-            active_p = [p for p in group['params'] if p.grad is not None]
+        y, grad, exp_avg_sq, exp_avg = zip(
+            *[(p.data, p.grad.float(), self.state_(p)['exp_avg_sq'], self.state_(p)['exp_avg']) for p in active_p])
 
-            for p in active_p:
-                if 'exp_avg' not in self.state_(p):
-                    self.state_(p)['exp_avg'] = torch.zeros_like(p.data, dtype=torch.float32)
-                    self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=torch.float32)
+        # Decay the first and second moment running average coefficient
+        denom = exp_avg_sq_(exp_avg_sq, grad, beta_debias(group['betas'][1], k + 1), eps)
+        beta1 = beta_debias(group['betas'][0], k + 1)
+        torch._foreach_mul_(exp_avg, beta1)
+        torch._foreach_addcdiv_(exp_avg, grad, denom, 1 - beta1)
+        del grad
 
-            y, grad, exp_avg_sq, exp_avg = zip(
-                *[(p.data, p.grad.float(), self.state_(p)['exp_avg_sq'], self.state_(p)['exp_avg']) for p in active_p])
+        # Normalize grad in-place for memory efficiency
+        lr = -warmup(group['lr'], k + 1, group['warmup_steps'])
+        update_param_(y, exp_avg, lr, decay)
 
-            # Decay the first and second moment running average coefficient
-            denom = exp_avg_sq_(exp_avg_sq, grad, beta_debias(group['betas'][1], k + 1), eps)
-            beta1 = beta_debias(group['betas'][0], k + 1)
-            torch._foreach_mul_(exp_avg, beta1)
-            torch._foreach_addcdiv_(exp_avg, grad, denom, 1 - beta1)
-            del grad
-
-            # Normalize grad in-place for memory efficiency
-            lr = -warmup(group['lr'], k + 1, group['warmup_steps'])
-            update_param_(y, exp_avg, lr, decay)
-
-            group['k'] = k + 1
-        return loss
+        group['k'] = k + 1
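
For comparison with the `ForeachAdamW` hunk above: LaProp computes the second-moment denominator first and feeds the normalised gradient into the momentum buffer (`_foreach_addcdiv_` into `exp_avg`), and `update_param_` then applies `exp_avg` without any further division, whereas AdamW accumulates the raw gradient and divides only at the parameter update. Informally, per element and ignoring bias correction:

    # Adam/AdamW:  m <- beta1 * m + (1 - beta1) * g            update ~ m / max(sqrt(v), eps)
    # LaProp:      m <- beta1 * m + (1 - beta1) * g / sqrt(v)   update ~ m
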
heavyball/foreach_sfadamw.py CHANGED
@@ -13,52 +13,42 @@ class ForeachSFAdamW(ScheduleFree):
                         foreach=foreach)
         super().__init__(params, defaults)
 
-    def step(self, closure=None):
-        """Performs a single optimization step.
+    def _step(self, group):
+        eps = group['eps']
+        decay = group['weight_decay']
+        k = group['k']
 
-        Arguments:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        """
+        if not group['train_mode']:
+            raise Exception("Not in train mode!")
 
-        loss = None
-        if closure is not None:
-            loss = closure()
+        active_p = [p for p in group['params'] if p.grad is not None]
 
-        for group in self.param_groups:
-            eps = group['eps']
-            decay = group['weight_decay']
-            k = group['k']
+        if not active_p:
+            return
 
-            if not group['train_mode']:
-                raise Exception("Not in train mode!")
+        for p in active_p:
+            if 'z' not in self.state_(p):
+                self.state_(p)['z'] = torch.clone(p.data)
+                self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=torch.float32)
 
-            active_p = [p for p in group['params'] if p.grad is not None]
+        y, grad, exp_avg_sq, z = zip(
+            *[(p.data, p.grad.float(), self.state_(p)['exp_avg_sq'], self.state_(p)['z']) for p in active_p])
 
-            for p in active_p:
-                if 'z' not in self.state_(p):
-                    self.state_(p)['z'] = torch.clone(p.data)
-                    self.state_(p)['exp_avg_sq'] = torch.zeros_like(p.data, dtype=torch.float32)
+        # Decay the first moment running average coefficient
+        old_debiased = beta_debias(group['betas'][1], k + 1)
 
-            y, grad, exp_avg_sq, z = zip(
-                *[(p.data, p.grad.float(), self.state_(p)['exp_avg_sq'], self.state_(p)['z']) for p in active_p])
+        # Decay the first and second moment running average coefficient
+        denom = exp_avg_sq_(exp_avg_sq, grad, old_debiased, eps)
 
-            # Decay the first moment running average coefficient
-            old_debiased = beta_debias(group['betas'][1], k + 1)
+        # Normalize grad in-place for memory efficiency
+        torch._foreach_div_(grad, denom)
 
-            # Decay the first and second moment running average coefficient
-            denom = exp_avg_sq_(exp_avg_sq, grad, old_debiased, eps)
+        # Weight decay calculated at y
+        if decay != 0:
+            torch._foreach_add_(grad, y, alpha=decay)
 
-            # Normalize grad in-place for memory efficiency
-            torch._foreach_div_(grad, denom)
+        lr = warmup(group['lr'], k + 1, group['warmup_steps'])
+        group['weight_sum'] = schedule_free_(lr, group['weight_lr_power'], group['weight_sum'], group['betas'][0],
+                                             y, z, grad, group['r'], k + 1)
 
-            # Weight decay calculated at y
-            if decay != 0:
-                torch._foreach_add_(grad, y, alpha=decay)
-
-            lr = warmup(group['lr'], k + 1, group['warmup_steps'])
-            group['weight_sum'] = schedule_free_(lr, group['weight_lr_power'], group['weight_sum'], group['betas'][0],
-                                                 y, z, grad, group['r'], k + 1)
-
-            group['k'] = k + 1
-        return loss
+        group['k'] = k + 1
heavyball/foreach_soap.py CHANGED
@@ -34,73 +34,59 @@ class ForeachSOAP(StatefulOptimizer):
         super().__init__(params, defaults)
         self._data_format = data_format
 
-    @torch.no_grad()
-    def step(self, closure=None):
-        """
-        Performs a single optimization step.
-
-        Arguments:
-            closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss.
-        """
-        if closure is None:
-            loss = None
-        else:
-            loss = closure()
-
-        for group in self.param_groups:
-            vals = []
-            step = 0
-
-            max_precond_dim = group['max_precond_dim']
-            precondition_1d = group['precondition_1d']
-
-            for p, g in split_p_and_g_in_group(group):
-                state = self.state_(p)
-                step = state['step'] = state.get("step", -1) + 1
-
-                if "exp_avg" not in state:
-                    state["exp_avg"] = torch.zeros_like(g, dtype=torch.float32)
-                    state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
-                    init_preconditioner(g, state, max_precond_dim, precondition_1d)
-                    update_preconditioner(g, state, max_precond_dim, precondition_1d, 0, True)
-                    continue  # first step is skipped so that we never use the current gradients in the projection.
-
-                # Projecting gradients to the eigenbases of Shampoo's preconditioner
-                # i.e. projecting to the eigenbases of matrices in state['GG']
-                grad_projected = project(g, state['Q'], False)
-                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
-                vals.append((p, g, grad_projected, exp_avg, exp_avg_sq))
-
-            if not vals:
-                continue
-
-            p_list, grad, grad_projected, exp_avg, exp_avg_sq = zip(*vals)
-            beta1, beta2 = group["betas"]
-
-            old_debiased1 = beta_debias(beta1, step)
-            old_debiased2 = beta_debias(beta2, step)
-
-            # Decay the first and second moment running average coefficient
-            # In-place operations to update the averages at the same time
-            torch._foreach_mul_(exp_avg, old_debiased1)
-            torch._foreach_add_(exp_avg, grad, alpha=1 - old_debiased1)
-            denom = exp_avg_sq_(exp_avg_sq, grad_projected, old_debiased2, group['eps'])
-
-            for p, g, ea, d in zip(p_list, grad, exp_avg, denom):
-                state = self.state_(p)
-                # Projecting the exponential moving average of gradients to the eigenbases of Shampoo's preconditioner
-                # i.e. projecting to the eigenbases of matrices in state['GG']
-                exp_avg_projected = project(ea, state['Q'], False)
-
-                # Projecting back the preconditioned (by Adam) exponential moving average of gradients
-                # to the original space
-                # CANT DO /= HERE AS EXP_AVG MAY POINT TO THE BUFFER
-                set_(d, project(exp_avg_projected / d, state['Q'], True))
-
-                update_preconditioner(g, state, max_precond_dim, precondition_1d, old_debiased2,
-                                      step > 0 and step % group['precondition_frequency'] == 0)
-
-            # Why does this have to be rebiased here?
-            step_size = -group["lr"] * min(step / group['warmup_steps'], 1)
-            update_param_(p_list, denom, step_size, group["weight_decay"])
-        return loss
+    def _step(self, group):
+        vals = []
+        step = 0
+
+        max_precond_dim = group['max_precond_dim']
+        precondition_1d = group['precondition_1d']
+
+        for p, g in split_p_and_g_in_group(group):
+            state = self.state_(p)
+            step = state['step'] = state.get("step", -1) + 1
+
+            if "exp_avg" not in state:
+                state["exp_avg"] = torch.zeros_like(g, dtype=torch.float32)
+                state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
+                init_preconditioner(g, state, max_precond_dim, precondition_1d)
+                update_preconditioner(g, state, max_precond_dim, precondition_1d, 0, True)
+                continue  # first step is skipped so that we never use the current gradients in the projection.
+
+            # Projecting gradients to the eigenbases of Shampoo's preconditioner
+            # i.e. projecting to the eigenbases of matrices in state['GG']
+            grad_projected = project(g, state['Q'], False)
+            exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
+            vals.append((p, g, grad_projected, exp_avg, exp_avg_sq))
+
+        if not vals:
+            return
+
+        p_list, grad, grad_projected, exp_avg, exp_avg_sq = zip(*vals)
+        beta1, beta2 = group["betas"]
+
+        old_debiased1 = beta_debias(beta1, step)
+        old_debiased2 = beta_debias(beta2, step)
+
+        # Decay the first and second moment running average coefficient
+        # In-place operations to update the averages at the same time
+        torch._foreach_mul_(exp_avg, old_debiased1)
+        torch._foreach_add_(exp_avg, grad, alpha=1 - old_debiased1)
+        denom = exp_avg_sq_(exp_avg_sq, grad_projected, old_debiased2, group['eps'])
+
+        for p, g, ea, d in zip(p_list, grad, exp_avg, denom):
+            state = self.state_(p)
+            # Projecting the exponential moving average of gradients to the eigenbases of Shampoo's preconditioner
+            # i.e. projecting to the eigenbases of matrices in state['GG']
+            exp_avg_projected = project(ea, state['Q'], False)
+
+            # Projecting back the preconditioned (by Adam) exponential moving average of gradients
+            # to the original space
+            # CANT DO /= HERE AS EXP_AVG MAY POINT TO THE BUFFER
+            set_(d, project(exp_avg_projected / d, state['Q'], True))
+
+            update_preconditioner(g, state, max_precond_dim, precondition_1d, old_debiased2,
+                                  step > 0 and step % group['precondition_frequency'] == 0)
+
+        # Why does this have to be rebiased here?
+        step_size = -group["lr"] * min(step / group['warmup_steps'], 1)
+        update_param_(p_list, denom, step_size, group["weight_decay"])