PyPI - heavyball - Versions diffs - 1.2.1__py3-none-any.whl → 1.2.3__py3-none-any.whl - Mend

heavyball 1.2.1__py3-none-any.whl → 1.2.3__py3-none-any.whl

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

heavyball/chainable.py CHANGED Viewed

@@ -183,8 +183,8 @@ def update_by_laprop(group, update, grad, param, exp_avg, exp_avg_sq):
 @no_state
 def update_by_schedule_free(group, update, grad, param, z):
     group['weight_sum'] = utils.schedule_free_(group['lr'], group['weight_lr_power'], group.get('weight_sum', 0),
-                                               utils.get_beta1(group), param, z, update, group['r'], group['step'],
-                                               group['weight_decay'])
+                                               utils.get_beta1(group), param, z, update, grad, group['caution'],
+                                               group['r'], group['step'], group['weight_decay'])
     raise SkipUpdate
@@ -287,16 +287,19 @@ def heavyball_momentum(group, updates, grads, params, momentum):
     return utils.heavyball_momentum(momentum, updates, utils.get_beta1(group))
+_optim_fns = {'adam': utils.adam_, 'laprop': utils.laprop_}
 @zero_guard("exp_avg", "exp_avg_sq")
 @general_guard("Q", "GG", init_fn=_init_soap)
 @no_state
-def scale_by_soap(group, update, grad, param, exp_avg, exp_avg_sq, Q, GG):
+def scale_by_soap(group, update, grad, param, exp_avg, exp_avg_sq, Q, GG, inner: str = 'adam'):
     update = utils.promote(update)
     grad_projected = [utils.project(u, q, False) for u, q in zip(update, Q)]
-    precond = utils.adam_(exp_avg, exp_avg_sq, grad_projected, utils.get_beta1(group), utils.get_beta2(group),
-                          utils.scalar_guard(group['step'], exp_avg[0]))
-    precond = [utils.project(p, q, False) for p, q in zip(precond, Q)]
+    fn = _optim_fns[inner]
+    precond = fn(exp_avg, exp_avg_sq, grad_projected, utils.get_beta1(group), utils.get_beta2(group), group['step'])
+    precond = [utils.project(p, q, True) for p, q in zip(precond, Q)]
     for u, q, gg, eas in zip(update, Q, GG, exp_avg_sq):
         utils.update_preconditioner(u, q, gg, eas, group['max_precond_dim'], group['precondition_1d'],
@@ -355,8 +358,7 @@ def scale_by_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str
     update = update.to(memory_format=torch.contiguous_format)
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
     _update_psgd_precond(group, param, update, Q_mat, Q, exprs, prob)
-    out = _cached_psgd_precond_grad(cached, cache_expr, exprs, update, Q_mat, Q_cache)
-    return torch.as_strided(out, old.shape, old.stride())
+    return _cached_psgd_precond_grad(cached, cache_expr, exprs, update, Q_mat, Q_cache)
 @general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd)
@@ -512,7 +514,7 @@ class BaseOpt(ChainOpt):
         fns = tuple(fns)
-        self.compile_step =  default(compile_step, self.compile_step)
+        self.compile_step = default(compile_step, self.compile_step)
         if default(palm, self.palm):
             fns = (palm_beta2,) + fns
         if default(gradient_clipping, self.gradient_clipping) is not None:

heavyball/utils.py CHANGED Viewed

@@ -61,22 +61,25 @@ def warmup(lr: float, step: int, warmup_steps: int):
 @decorator_knowngood
-def _compilable_schedule_free_(p: List[Tensor], z: List[Tensor], ckp1: Tensor, grad: List[Tensor], lr: Tensor,
-                               beta1: Tensor, decay: float):
-    for op, oz, g_ in zip(p, z, grad):
-        g_ = g_.view_as(op)
-        p_, z_, g_ = map(promote, (op, oz, g_))
+def _compilable_schedule_free_(p: List[Tensor], z: List[Tensor], ckp1: Tensor, update: List[Tensor], lr: Tensor,
+                               beta1: Tensor, decay: float, grad: List[Tensor], caution):
+    for op, oz, u_, g_ in zip(p, z, update, grad):
+        u_ = u_.view_as(op)
+        p_, z_, u_ = map(promote, (op, oz, u_))
         if decay != 0:
-            g_ = g_ + p_ * decay
+            u_ = u_ + p_ * decay
+        if caution:
+            u_ = _compilable_cautioning(u_, g_)
         p_ = p_.lerp(z_, ckp1)
-        p_ = p_ + g_ * (lr * (beta1 * (1 - ckp1)) - lr)
-        z_ = z_ + g_ * -lr
+        p_ = p_ + u_ * (lr * (beta1 * (1 - ckp1)) - lr)
+        z_ = z_ + u_ * -lr
         copy_stochastic_(op, p_)
         copy_stochastic_(oz, z_)
 def schedule_free_(lr: float, weight_lr_power: float, weight_sum: float, beta1: float, parameters: List[Tensor],
-                   z: List[Tensor], grad: List[Tensor], r: float = 0.0, step: int = 0, decay: float = 0.0):
+                   z: List[Tensor], update: List[Tensor], grad: List[Tensor], caution: bool = False, r: float = 0.0,
+                   step: int = 0, decay: float = 0.0):
     weight = abs(lr) ** weight_lr_power * max(step, 1) ** r
     weight_sum = weight_sum + weight
@@ -85,9 +88,9 @@ def schedule_free_(lr: float, weight_lr_power: float, weight_sum: float, beta1:
     except ZeroDivisionError:
         ckp1 = 0
-    grad, parameters, z = list_guard(grad, parameters, z)
+    update, parameters, z = list_guard(update, parameters, z)
     lr, ckp1, beta1 = scalar_guard(lr, ckp1, beta1, grad[0])
-    _compilable_schedule_free_(parameters, z, ckp1, grad, lr, beta1, decay)
+    _compilable_schedule_free_(parameters, z, ckp1, update, lr, beta1, decay, grad, caution)
     return weight_sum
@@ -1220,13 +1223,16 @@ def update_triu_(q_state, materialised):
         assert shape0 == shape1
         copy_stochastic_(q, m)
 _warned = set()
 def warn_once(msg):
     if msg not in _warned:
         warnings.warn(msg)
         _warned.add(msg)
 def psgd_should_update(group, prob: Union[float, callable], rng: Optional[random.Random] = None,
                        name: str = 'cumulative_prob'):
     group[f'{name}_prob_step'] = group.get(f'{name}_prob_step', 0) + 1
@@ -1369,4 +1375,16 @@ def fused_hook(parameters, optimizer, *args, **kwargs):
             seen_params.clear()
     for p in parameters:
-        p.register_post_accumulate_grad_hook(_step)
+        p.register_post_accumulate_grad_hook(_step)
+@decorator_knowngood
+def _compilable_caution_no_scale(g: Tensor, update: Tensor):
+    mask = g.signbit() ^ update.signbit()  # "Mask if they point in different directions"
+    update = update.masked_fill(mask, 0)
+    return update
+def disable_caution_scaling():
+    global _compilable_cautioning
+    _compilable_cautioning = _compilable_caution_no_scale

{heavyball-1.2.1.dist-info → heavyball-1.2.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: heavyball
-Version: 1.2.1
+Version: 1.2.3
 Summary: Efficient optimizers
 Home-page: https://github.com/clashluke/heavyball
 Author: Lucas Nestler

heavyball-1.2.3.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,8 @@
+heavyball/__init__.py,sha256=miRgcXlzLWTNzojeRF5hEcg-x_GqfMHjRzOaiR_zO3U,10981
+heavyball/chainable.py,sha256=u9w2z_aSslcokWVCiiXQJ8GSPlOhgrOFUYAwt2JfTzI,21100
+heavyball/utils.py,sha256=I2zfiB_-EP35LYr-vLyxPNl8_uJo2se3Id0IWjZeVjg,47951
+heavyball-1.2.3.dist-info/LICENSE,sha256=CGdGJim64YifGmUVPaeyRsxkvyExtClswhRNIp8FY_U,1322
+heavyball-1.2.3.dist-info/METADATA,sha256=EtW_3QIUKrKpyYUfXmGQm3_EpZkr8oQyow7gAyC4Ges,12022
+heavyball-1.2.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+heavyball-1.2.3.dist-info/top_level.txt,sha256=SzCxSVg_qCUPA4kZObW3Zyo4v-d_mMOD-p7a-WXTl2E,10
+heavyball-1.2.3.dist-info/RECORD,,

heavyball-1.2.1.dist-info/RECORD DELETED Viewed

@@ -1,8 +0,0 @@
-heavyball/__init__.py,sha256=miRgcXlzLWTNzojeRF5hEcg-x_GqfMHjRzOaiR_zO3U,10981
-heavyball/chainable.py,sha256=5CrtSVaTI9DIgPPy0DD3WbWyVmc6-3jd2E5zM2frQlI,21092
-heavyball/utils.py,sha256=n80bsTZ2NUD4L3YERaC7ydKaQzW4kSDNoBTLF0DfE1g,47393
-heavyball-1.2.1.dist-info/LICENSE,sha256=CGdGJim64YifGmUVPaeyRsxkvyExtClswhRNIp8FY_U,1322
-heavyball-1.2.1.dist-info/METADATA,sha256=eBqL5rxjms6zSclp1IjQKI8vmv1OOgvrmdOroZnE1GQ,12022
-heavyball-1.2.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-heavyball-1.2.1.dist-info/top_level.txt,sha256=SzCxSVg_qCUPA4kZObW3Zyo4v-d_mMOD-p7a-WXTl2E,10
-heavyball-1.2.1.dist-info/RECORD,,

{heavyball-1.2.1.dist-info → heavyball-1.2.3.dist-info}/LICENSE RENAMED Viewed

File without changes

{heavyball-1.2.1.dist-info → heavyball-1.2.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{heavyball-1.2.1.dist-info → heavyball-1.2.3.dist-info}/top_level.txt RENAMED Viewed

File without changes

heavyball 1.2.1__py3-none-any.whl → 1.2.3__py3-none-any.whl

heavyball 1.2.1__py3-none-any.whl → 1.2.3__py3-none-any.whl