heavyball 0.14.7__py3-none-any.whl → 0.15.1__py3-none-any.whl

@@ -44,76 +44,61 @@ class PrecondSchedulePaLMForeachSOAP(StatefulOptimizer):
         self._data_format = data_format
         self.rng = random.Random(0x120983109)
 
-    @torch.no_grad()
-    def step(self, closure=None):
-        """
-        Performs a single optimization step.
-
-        Arguments:
-            closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss.
-        """
-        if closure is None:
-            loss = None
-        else:
-            loss = closure()
-
-        for group in self.param_groups:
-            vals = []
-            step = 0
-
-            max_precond_dim = group['max_precond_dim']
-            precondition_1d = group['precondition_1d']
-
-            for p, g in split_p_and_g_in_group(group):
-                state = self.state_(p)
-                step = state['step'] = state.get("step", -1) + 1
-
-                if "exp_avg" not in state:
-                    state["exp_avg"] = torch.zeros_like(g, dtype=torch.float32)
-                    state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
-                    init_preconditioner(g, state, max_precond_dim, precondition_1d)
-                    update_preconditioner(g, state, max_precond_dim, precondition_1d, 0, True)
-                    continue # first step is skipped so that we never use the current gradients in the projection.
-
-                # Projecting gradients to the eigenbases of Shampoo's preconditioner
-                # i.e. projecting to the eigenbases of matrices in state['GG']
-                grad_projected = project(g, state['Q'], False)
-                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
-                vals.append((p, g, grad_projected, exp_avg, exp_avg_sq))
-
-            if not vals:
-                continue
-
-            p_list, grad, grad_projected, exp_avg, exp_avg_sq = zip(*vals)
-            beta1 = group["beta"]
-
-            beta2 = 1 - max(step, 1) ** -group['beta2_scale']
-            old_debiased1 = beta_debias(beta1, step)
-            old_debiased2 = beta_debias(beta2, step)
-
-            # Decay the first and second moment running average coefficient
-            # In-place operations to update the averages at the same time
-            torch._foreach_mul_(exp_avg, old_debiased1)
-            torch._foreach_add_(exp_avg, grad, alpha=1 - old_debiased1)
-            denom = exp_avg_sq_(exp_avg_sq, grad_projected, old_debiased2, group['eps'])
-
-            update_precond = precond_schedule(step, group['precond_scheduler'], self.rng)
-            for p, g, ea, d in zip(p_list, grad, exp_avg, denom):
-                state = self.state_(p)
-                # Projecting the exponential moving average of gradients to the eigenbases of Shampoo's preconditioner
-                # i.e. projecting to the eigenbases of matrices in state['GG']
-                exp_avg_projected = project(ea, state['Q'], False)
-
-                # Projecting back the preconditioned (by Adam) exponential moving average of gradients
-                # to the original space
-                # CANT DO /= HERE AS EXP_AVG MAY POINT TO THE BUFFER
-                exp_avg_projected = exp_avg_projected / d
-                set_(d, project(exp_avg_projected, state['Q'], True))
-
-                update_preconditioner(g, state, max_precond_dim, precondition_1d, old_debiased2,
-                                      update_precond)
-
-            # Why does this have to be rebiased here?
-            step_size = -group["lr"] * min(step / group['warmup_steps'], 1)
-            update_param_(p_list, denom, step_size, group["weight_decay"])
-        return loss
+    def _step(self, group):
+        vals = []
+        step = 0
+
+        max_precond_dim = group['max_precond_dim']
+        precondition_1d = group['precondition_1d']
+
+        for p, g in split_p_and_g_in_group(group):
+            state = self.state_(p)
+            step = state['step'] = state.get("step", -1) + 1
+
+            if "exp_avg" not in state:
+                state["exp_avg"] = torch.zeros_like(g, dtype=torch.float32)
+                state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
+                init_preconditioner(g, state, max_precond_dim, precondition_1d)
+                update_preconditioner(g, state, max_precond_dim, precondition_1d, 0, True)
+                continue # first step is skipped so that we never use the current gradients in the projection.
+
+            # Projecting gradients to the eigenbases of Shampoo's preconditioner
+            # i.e. projecting to the eigenbases of matrices in state['GG']
+            grad_projected = project(g, state['Q'], False)
+            exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
+            vals.append((p, g, grad_projected, exp_avg, exp_avg_sq))
+
+        if not vals:
+            return
+
+        p_list, grad, grad_projected, exp_avg, exp_avg_sq = zip(*vals)
+        beta1 = group["beta"]
+
+        beta2 = 1 - max(step, 1) ** -group['beta2_scale']
+        old_debiased1 = beta_debias(beta1, step)
+        old_debiased2 = beta_debias(beta2, step)
+
+        # Decay the first and second moment running average coefficient
+        # In-place operations to update the averages at the same time
+        torch._foreach_mul_(exp_avg, old_debiased1)
+        torch._foreach_add_(exp_avg, grad, alpha=1 - old_debiased1)
+        denom = exp_avg_sq_(exp_avg_sq, grad_projected, old_debiased2, group['eps'])
+
+        update_precond = precond_schedule(step, group['precond_scheduler'], self.rng)
+        for p, g, ea, d in zip(p_list, grad, exp_avg, denom):
+            state = self.state_(p)
+            # Projecting the exponential moving average of gradients to the eigenbases of Shampoo's preconditioner
+            # i.e. projecting to the eigenbases of matrices in state['GG']
+            exp_avg_projected = project(ea, state['Q'], False)
+
+            # Projecting back the preconditioned (by Adam) exponential moving average of gradients
+            # to the original space
+            # CANT DO /= HERE AS EXP_AVG MAY POINT TO THE BUFFER
+            exp_avg_projected = exp_avg_projected / d
+            set_(d, project(exp_avg_projected, state['Q'], True))
+
+            update_preconditioner(g, state, max_precond_dim, precondition_1d, old_debiased2, update_precond)
+
+        # Why does this have to be rebiased here?
+        step_size = -group["lr"] * min(step / group['warmup_steps'], 1)
+        update_param_(p_list, denom, step_size, group["weight_decay"])
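The optimizer above (and the others in this diff) now implements only a per-group _step(group); the closure handling and the loop over param_groups that the deleted step() methods contained are presumably centralized in the shared base class. A minimal sketch of that assumed dispatch, with illustrative names that are not taken from this diff:

    import torch

    class _StepDispatchSketch:
        # Assumed shape of the shared base class; heavyball's actual base may differ.
        def __init__(self):
            self.param_groups = []          # populated as in torch.optim.Optimizer

        def _step(self, group):             # subclasses implement the per-group update
            raise NotImplementedError

        @torch.no_grad()
        def step(self, closure=None):
            loss = None
            if closure is not None:
                with torch.enable_grad():   # the closure re-runs forward/backward
                    loss = closure()
            for group in self.param_groups:
                self._step(group)           # per-group work, as in the hunks above
            return loss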
@@ -54,85 +54,73 @@ class PrecondScheduleSFPaLMSOAP(ScheduleFree):
         self._data_format = data_format
         self.rng = random.Random(0x120983109)
 
-    @torch.no_grad()
-    def step(self, closure=None):
-        """
-        Performs a single optimization step.
-
-        Arguments:
-            closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss.
-        """
-        if closure is None:
-            loss = None
-        else:
-            loss = closure()
-
-        for group in self.param_groups:
-            vals = []
-            max_precond_dim = group['max_precond_dim']
-            precondition_1d = group['precondition_1d']
-
-            step = group['step'] = group.get("step", -1) + 1
-
-            for p in group["params"]:
-                if p.grad is None:
-                    continue
-                grad = p.grad.float()
-                vals.append((p, grad))
-
-            p_list, grad = zip(*vals)
-            vals = []
-
-            # adaptive gradient clipping
-            adaptive_gradient_clipping_(p_list, grad, group["gradient_clip_val"], eps=group["eps"])
-
-            for p, g in split_p_and_g_in_group(group):
-                state = self.state_(p)
-
-                if "z" not in state:
-                    state["z"] = torch.clone(p.data)
-                    state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
-                    init_preconditioner(g, state, max_precond_dim, precondition_1d)
-                    update_preconditioner(g, state, max_precond_dim, precondition_1d, 0, True)
-                    continue # first step is skipped so that we never use the current gradients in the projection.
-
-                # Projecting gradients to the eigenbases of Shampoo's preconditioner
-                # i.e. projecting to the eigenbases of matrices in state['GG']
-                grad_projected = project(g, state['Q'], False)
-                z, exp_avg_sq = state["z"], state["exp_avg_sq"]
-                vals.append((p, g, grad_projected, z, exp_avg_sq))
-
-            if not vals:
+    def _step(self, group):
+        vals = []
+        max_precond_dim = group['max_precond_dim']
+        precondition_1d = group['precondition_1d']
+
+        step = group['step'] = group.get("step", -1) + 1
+
+        for p in group["params"]:
+            if p.grad is None:
                 continue
+            grad = p.grad.float()
+            vals.append((p, grad))
+
+        if not vals:
+            return
+
+        p_list, grad = zip(*vals)
+        vals = []
+
+        # adaptive gradient clipping
+        adaptive_gradient_clipping_(p_list, grad, group["gradient_clip_val"], eps=group["eps"])
+
+        for p, g in split_p_and_g_in_group(group):
+            state = self.state_(p)
+
+            if "z" not in state:
+                state["z"] = torch.clone(p.data)
+                state["exp_avg_sq"] = torch.zeros_like(g, dtype=torch.float32)
+                init_preconditioner(g, state, max_precond_dim, precondition_1d)
+                update_preconditioner(g, state, max_precond_dim, precondition_1d, 0, True)
+                continue # first step is skipped so that we never use the current gradients in the projection.
+
+            # Projecting gradients to the eigenbases of Shampoo's preconditioner
+            # i.e. projecting to the eigenbases of matrices in state['GG']
+            grad_projected = project(g, state['Q'], False)
+            z, exp_avg_sq = state["z"], state["exp_avg_sq"]
+            vals.append((p, g, grad_projected, z, exp_avg_sq))
 
-            p_list, grad, grad_projected, z, exp_avg_sq = zip(*vals)
-            del vals
+        if not vals:
+            return
 
-            beta2 = 1 - max(step, 1) ** -group['beta2_scale']
-            old_debiased2 = beta_debias(beta2, step)
+        p_list, grad, grad_projected, z, exp_avg_sq = zip(*vals)
+        del vals
 
-            # Decay the first and second moment running average coefficient
-            # In-place operations to update the averages at the same time
-            denom = exp_avg_sq_(exp_avg_sq, grad_projected, old_debiased2, group['eps'])
-            torch._foreach_div_(grad_projected, denom)
+        beta2 = 1 - max(step, 1) ** -group['beta2_scale']
+        old_debiased2 = beta_debias(beta2, step)
 
-            update_precond = precond_schedule(step, group['precond_scheduler'], self.rng)
+        # Decay the first and second moment running average coefficient
+        # In-place operations to update the averages at the same time
+        denom = exp_avg_sq_(exp_avg_sq, grad_projected, old_debiased2, group['eps'])
+        torch._foreach_div_(grad_projected, denom)
 
-            for p, g, gp in zip(p_list, grad, grad_projected):
-                state = self.state_(p)
-                # Projecting back the preconditioned (by Adam) exponential moving average of gradients
-                # to the original space
-                set_(gp, project(gp, state['Q'], back=True))
+        update_precond = precond_schedule(step, group['precond_scheduler'], self.rng)
 
-                update_preconditioner(g, state, max_precond_dim, precondition_1d, old_debiased2,
-                                      update_precond)
+        for p, g, gp in zip(p_list, grad, grad_projected):
+            state = self.state_(p)
+            # Projecting back the preconditioned (by Adam) exponential moving average of gradients
+            # to the original space
+            set_(gp, project(gp, state['Q'], back=True))
 
-            # Weight decay calculated at y
-            if group["weight_decay"] > 0:
-                torch._foreach_add_(grad, p_list, alpha=group["weight_decay"])
+            update_preconditioner(g, state, max_precond_dim, precondition_1d, old_debiased2,
+                                  update_precond)
 
-            lr = warmup(group['lr'], step, group['warmup_steps'])
-            group['weight_sum'] = schedule_free_(lr, group['weight_lr_power'], group['weight_sum'], group['beta'],
-                                                 p_list, z, grad_projected, group['r'], step)
+        # Weight decay calculated at y
+        if group["weight_decay"] > 0:
+            torch._foreach_add_(grad, p_list, alpha=group["weight_decay"])
 
-        return loss
+        lr = warmup(group['lr'], step, group['warmup_steps'])
+        group['weight_sum'] = schedule_free_(lr, group['weight_lr_power'], group['weight_sum'], group['beta'],
+                                             p_list, z, grad_projected, group['r'], step)
heavyball/psgd_kron.py CHANGED
@@ -4,9 +4,10 @@ Modified under Creative Commons Attribution 4.0 International
 Source available at https://github.com/evanatyourservice/kron_torch/blob/97a2b5ee8a1a4c29e4780bbf6c521e545189eff9/kron_torch/kron.py
 """
 
-import torch
 from typing import Optional
 
+import torch
+
 from .utils import update_param_, warmup, psgd_precond_grad, init_Q_exprs, trust_region_clip_, PSGDBase, \
     precond_update_prob_schedule, split_p_and_g_in_group, line_to_triu, triu_to_line, set_
 
@@ -37,7 +38,7 @@ class ForeachPSGDKron(PSGDBase):
     def __init__(self, params, lr=0.001, beta=0.9, weight_decay=0.0, preconditioner_update_probability=None,
                  max_size_triangular=2048, min_ndim_triangular=2, memory_save_mode=None,
                  momentum_into_precond_update=True, warmup_steps: int = 1, merge_dims: bool = False,
-                 split: bool = False, clip_fn: Optional[callable] = None):
+                 split: bool = False, clip_fn: Optional[callable] = None, store_triu_as_line: bool = True):
         if not 0.0 <= lr:
             raise ValueError(f"Invalid learning rate: {lr}")
         if not 0.0 <= beta < 1.0:
@@ -57,18 +58,13 @@ class ForeachPSGDKron(PSGDBase):
                         momentum_into_precond_update=momentum_into_precond_update, precond_lr=0.1,
                         # precond lr hardcoded to 0.1
                         precond_init_scale=1.0, # precond init scale hardcoded to 1.0
-                        step=0, warmup_steps=warmup_steps, merge_dims=merge_dims, split=split)
+                        step=0, warmup_steps=warmup_steps, merge_dims=merge_dims, split=split,
+                        store_triu_as_line=store_triu_as_line)
         super().__init__(params, defaults)
 
         self._prob_step = 0
 
-    @torch.no_grad()
-    def step(self, closure=None):
-        loss = None
-        if closure is not None:
-            with torch.enable_grad():
-                loss = closure()
-
+    def _step(self, group):
         # update preconditioners all together
         update_prob = self.preconditioner_update_probability
         if callable(update_prob):
@@ -76,54 +72,52 @@ class ForeachPSGDKron(PSGDBase):
         do_update = self.rng.random() < update_prob
         self._prob_step += 1
 
-        for group in self.param_groups:
-            momentum_into_precond_update = group.get("momentum_into_precond_update", True)
-            precond_init_scale = group['precond_init_scale']
-            max_size_triangular = group['max_size_triangular']
-            min_ndim_triangular = group['min_ndim_triangular']
-            memory_save_mode = group['memory_save_mode']
-            precond_lr = group['precond_lr']
-            weight_decay = group['weight_decay']
-            lr = group['lr']
-            beta = group['beta']
-
-            vals = []
+        momentum_into_precond_update = group.get("momentum_into_precond_update", True)
+        precond_init_scale = group['precond_init_scale']
+        max_size_triangular = group['max_size_triangular']
+        min_ndim_triangular = group['min_ndim_triangular']
+        memory_save_mode = group['memory_save_mode']
+        precond_lr = group['precond_lr']
+        weight_decay = group['weight_decay']
+        lr = group['lr']
+        beta = group['beta']
+        store_triu_as_line = group['store_triu_as_line']
 
-            for p, g in split_p_and_g_in_group(group):
-                state = self.state_(p)
+        vals = []
 
-            if 'Q' not in state:
-                    state["exp_avg"] = torch.zeros_like(g)
-                    Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular, min_ndim_triangular,
-                                                     memory_save_mode, dtype=g.dtype)
-                    state['Q'] = triu_to_line(Q)
+        for p, g in split_p_and_g_in_group(group):
+            state = self.state_(p)
 
-                vals.append((p, g, state["exp_avg"], state["Q"]))
+            if 'Q' not in state:
+                state["exp_avg"] = torch.zeros_like(g)
+                Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular, min_ndim_triangular,
+                                                 memory_save_mode, dtype=g.dtype)
+                state['Q'] = triu_to_line(Q) if store_triu_as_line else Q
 
-            if not vals:
-                continue
+            vals.append((p, g, state["exp_avg"], state["Q"]))
 
-            p_list, grad_list, exp_avg_list, Q_list = zip(*vals)
-            del vals
+        if not vals:
+            return
 
-            group["step"] += 1
+        p_list, grad_list, exp_avg_list, Q_list = zip(*vals)
+        del vals
 
-            torch._foreach_lerp_(exp_avg_list, grad_list, (1 - beta) / (1 - beta ** group["step"]))
+        group["step"] += 1
 
-            grad_list, Q_list, exp_avg_list = list(grad_list), list(Q_list), list(exp_avg_list)
-            for i, (p, g) in enumerate(zip(p_list, grad_list)):
-                q_orig = Q_list.pop(0)
-                ea = exp_avg_list.pop(0)
-                q = line_to_triu(q_orig)
+        torch._foreach_lerp_(exp_avg_list, grad_list, (1 - beta) / (1 - beta ** group["step"]))
 
-                self.balance(do_update, [g], [q])
-                if do_update:
-                    self.do_update([p], [ea if momentum_into_precond_update else g], [q], precond_lr, [q_orig])
-                set_(g, psgd_precond_grad(q, self.state_(p)["exprs"], ea))
+        grad_list, Q_list, exp_avg_list = list(grad_list), list(Q_list), list(exp_avg_list)
+        for i, (p, g) in enumerate(zip(p_list, grad_list)):
+            q_orig = Q_list.pop(0)
+            ea = exp_avg_list.pop(0)
+            q = line_to_triu(q_orig) if store_triu_as_line else q_orig
 
-            grad_list = self.clip_fn(grad_list)
+            if do_update:
+                self.balance([g], [q])
+                self.do_update([p], [ea if momentum_into_precond_update else g], [q], precond_lr, [q_orig] if store_triu_as_line else None)
+            set_(g, psgd_precond_grad(q, self.state_(p)["exprs"], ea))
 
-            lr = -warmup(lr, group['step'], group['warmup_steps'])
-            update_param_(p_list, grad_list, lr, weight_decay)
+        grad_list = self.clip_fn(grad_list)
 
-        return loss
+        lr = -warmup(lr, group['step'], group['warmup_steps'])
+        update_param_(p_list, grad_list, lr, weight_decay)
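The new store_triu_as_line option toggles whether each Kronecker factor is kept in the compact triu_to_line form (the default) or as a full triangular matrix that line_to_triu would otherwise rebuild every step. A brief usage sketch, assuming the class is still exported from the package top level as in earlier releases:

    import torch
    from heavyball import ForeachPSGDKron  # assumed top-level export

    model = torch.nn.Linear(64, 64)
    # store_triu_as_line=True (default): factors stored compactly and rebuilt via
    # line_to_triu each step; False keeps dense triangular factors and skips the round-trip.
    opt = ForeachPSGDKron(model.parameters(), lr=1e-3, store_triu_as_line=False)

    loss = model(torch.randn(8, 64)).square().mean()
    loss.backward()
    opt.step()
    opt.zero_grad()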
heavyball/pure_psgd.py CHANGED
@@ -36,7 +36,7 @@ class ForeachPurePSGD(PSGDBase):
     def __init__(self, params, lr=0.001, weight_decay=0.0, preconditioner_update_probability=None,
                  max_size_triangular=2048, min_ndim_triangular=2, memory_save_mode=None,
                  momentum_into_precond_update=True, warmup_steps: int = 1, merge_dims: bool = False,
-                 split: bool = False, clip_fn: callable = None):
+                 split: bool = False, clip_fn: callable = None, store_triu_as_line: bool = True):
         if not 0.0 <= lr:
             raise ValueError(f"Invalid learning rate: {lr}")
         if not 0.0 <= weight_decay:
@@ -54,18 +54,13 @@ class ForeachPurePSGD(PSGDBase):
                         momentum_into_precond_update=momentum_into_precond_update, precond_lr=0.1,
                         # precond lr hardcoded to 0.1
                         precond_init_scale=1.0, # precond init scale hardcoded to 1.0
-                        step=0, warmup_steps=warmup_steps, merge_dims=merge_dims, split=split)
+                        step=0, warmup_steps=warmup_steps, merge_dims=merge_dims, split=split,
+                        store_triu_as_line=store_triu_as_line)
         super().__init__(params, defaults)
 
         self._prob_step = 0
 
-    @torch.no_grad()
-    def step(self, closure=None):
-        loss = None
-        if closure is not None:
-            with torch.enable_grad():
-                loss = closure()
-
+    def _step(self, group):
         # update preconditioners all together
         update_prob = self.preconditioner_update_probability
         if callable(update_prob):
@@ -73,48 +68,46 @@ class ForeachPurePSGD(PSGDBase):
         do_update = self.rng.random() < update_prob
         self._prob_step += 1
 
-        for group in self.param_groups:
-            precond_init_scale = group['precond_init_scale']
-            max_size_triangular = group['max_size_triangular']
-            min_ndim_triangular = group['min_ndim_triangular']
-            memory_save_mode = group['memory_save_mode']
-            precond_lr = group['precond_lr']
-            weight_decay = group['weight_decay']
-            lr = group['lr']
-
-            vals = []
+        precond_init_scale = group['precond_init_scale']
+        max_size_triangular = group['max_size_triangular']
+        min_ndim_triangular = group['min_ndim_triangular']
+        memory_save_mode = group['memory_save_mode']
+        precond_lr = group['precond_lr']
+        weight_decay = group['weight_decay']
+        lr = group['lr']
+        store_triu_as_line = group['store_triu_as_line']
 
-            for p, g in split_p_and_g_in_group(group):
-                state = self.state_(p)
+        vals = []
 
-            if 'Q' not in state:
-                    Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular, min_ndim_triangular,
-                                                     memory_save_mode, dtype=g.dtype)
-                    state['Q'] = triu_to_line(Q)
+        for p, g in split_p_and_g_in_group(group):
+            state = self.state_(p)
 
-                vals.append((p, g, state["Q"]))
+            if 'Q' not in state:
+                Q, state["exprs"] = init_Q_exprs(p, precond_init_scale, max_size_triangular, min_ndim_triangular,
+                                                 memory_save_mode, dtype=g.dtype)
+                state['Q'] = triu_to_line(Q) if store_triu_as_line else Q
 
-            if not vals:
-                continue
+            vals.append((p, g, state["Q"]))
 
-            p_list, grad_list, Q_list = zip(*vals)
-            del vals
+        if not vals:
+            return
 
-            group["step"] += 1
+        p_list, grad_list, Q_list = zip(*vals)
+        del vals
 
-            Q_list = list(Q_list)
-            for i, (p, g) in enumerate(zip(p_list, grad_list)):
-                q_orig = Q_list.pop(0)
-                q = line_to_triu(q_orig)
+        group["step"] += 1
 
-                self.balance(do_update, [g], [q])
-                if do_update:
-                    self.do_update([p], [g], [q], precond_lr, [q_orig])
-                psgd_precond_grad(q, self.state_(p)["exprs"], g, inplace=True)
+        Q_list = list(Q_list)
+        for i, (p, g) in enumerate(zip(p_list, grad_list)):
+            q_orig = Q_list.pop(0)
+            q = line_to_triu(q_orig) if store_triu_as_line else q_orig
 
-            grad_list = self.clip_fn(grad_list)
+            if do_update:
+                self.balance([g], [q])
+                self.do_update([p], [g], [q], precond_lr, [q_orig] if store_triu_as_line else None)
+            psgd_precond_grad(q, self.state_(p)["exprs"], g, inplace=True)
 
-            lr = -warmup(lr, group['step'], group['warmup_steps'])
-            update_param_(p_list, grad_list, lr, weight_decay)
+        grad_list = self.clip_fn(grad_list)
 
-        return loss
+        lr = -warmup(lr, group['step'], group['warmup_steps'])
+        update_param_(p_list, grad_list, lr, weight_decay)
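For intuition only, here is a toy illustration of the storage trade-off that store_triu_as_line controls; to_line/from_line below are hypothetical stand-ins, not the package's triu_to_line/line_to_triu:

    import torch

    def to_line(q: torch.Tensor):
        # keep only the n*(n+1)/2 nonzero entries of an upper-triangular factor, row by row
        n = q.shape[0]
        return n, [q[i, i:].clone() for i in range(n)]

    def from_line(n, rows):
        # rebuild the dense triangular matrix when it is actually needed
        q = torch.zeros(n, n, dtype=rows[0].dtype)
        for i, row in enumerate(rows):
            q[i, i:] = row
        return q

    q = torch.triu(torch.randn(4, 4))
    n, rows = to_line(q)
    assert torch.equal(from_line(n, rows), q)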