heavyball 1.2.2.tar.gz → 1.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {heavyball-1.2.2 → heavyball-1.3.0}/PKG-INFO +1 -1
  2. {heavyball-1.2.2 → heavyball-1.3.0}/heavyball/chainable.py +10 -9
  3. {heavyball-1.2.2 → heavyball-1.3.0}/heavyball/utils.py +35 -18
  4. {heavyball-1.2.2 → heavyball-1.3.0}/heavyball.egg-info/PKG-INFO +1 -1
  5. {heavyball-1.2.2 → heavyball-1.3.0}/setup.py +1 -1
  6. {heavyball-1.2.2 → heavyball-1.3.0}/LICENSE +0 -0
  7. {heavyball-1.2.2 → heavyball-1.3.0}/README.md +0 -0
  8. {heavyball-1.2.2 → heavyball-1.3.0}/heavyball/__init__.py +0 -0
  9. {heavyball-1.2.2 → heavyball-1.3.0}/heavyball.egg-info/SOURCES.txt +0 -0
  10. {heavyball-1.2.2 → heavyball-1.3.0}/heavyball.egg-info/dependency_links.txt +0 -0
  11. {heavyball-1.2.2 → heavyball-1.3.0}/heavyball.egg-info/requires.txt +0 -0
  12. {heavyball-1.2.2 → heavyball-1.3.0}/heavyball.egg-info/top_level.txt +0 -0
  13. {heavyball-1.2.2 → heavyball-1.3.0}/setup.cfg +0 -0
  14. {heavyball-1.2.2 → heavyball-1.3.0}/test/test_bf16_params.py +0 -0
  15. {heavyball-1.2.2 → heavyball-1.3.0}/test/test_bf16_q.py +0 -0
  16. {heavyball-1.2.2 → heavyball-1.3.0}/test/test_bf16_storage.py +0 -0
  17. {heavyball-1.2.2 → heavyball-1.3.0}/test/test_caution.py +0 -0
  18. {heavyball-1.2.2 → heavyball-1.3.0}/test/test_channels_last.py +0 -0
  19. {heavyball-1.2.2 → heavyball-1.3.0}/test/test_closure.py +0 -0
  20. {heavyball-1.2.2 → heavyball-1.3.0}/test/test_ema.py +0 -0
  21. {heavyball-1.2.2 → heavyball-1.3.0}/test/test_foreach.py +0 -0
  22. {heavyball-1.2.2 → heavyball-1.3.0}/test/test_hook.py +0 -0
  23. {heavyball-1.2.2 → heavyball-1.3.0}/test/test_mars.py +0 -0
  24. {heavyball-1.2.2 → heavyball-1.3.0}/test/test_memory.py +0 -0
  25. {heavyball-1.2.2 → heavyball-1.3.0}/test/test_merge.py +0 -0
  26. {heavyball-1.2.2 → heavyball-1.3.0}/test/test_no_grad.py +0 -0
  27. {heavyball-1.2.2 → heavyball-1.3.0}/test/test_psgd.py +0 -0
  28. {heavyball-1.2.2 → heavyball-1.3.0}/test/test_soap.py +0 -0
  29. {heavyball-1.2.2 → heavyball-1.3.0}/test/test_stochastic_updates.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: heavyball
3
- Version: 1.2.2
3
+ Version: 1.3.0
4
4
  Summary: Efficient optimizers
5
5
  Home-page: https://github.com/clashluke/heavyball
6
6
  Author: Lucas Nestler
@@ -183,8 +183,8 @@ def update_by_laprop(group, update, grad, param, exp_avg, exp_avg_sq):
183
183
  @no_state
184
184
  def update_by_schedule_free(group, update, grad, param, z):
185
185
  group['weight_sum'] = utils.schedule_free_(group['lr'], group['weight_lr_power'], group.get('weight_sum', 0),
186
- utils.get_beta1(group), param, z, update, group['r'], group['step'],
187
- group['weight_decay'])
186
+ utils.get_beta1(group), param, z, update, grad, group['caution'],
187
+ group['r'], group['step'], group['weight_decay'])
188
188
  raise SkipUpdate
189
189
 
190
190
 
@@ -438,14 +438,15 @@ class ChainOpt(utils.StatefulOptimizer):
438
438
 
439
439
  for param in p:
440
440
  state = self.state_(param)
441
- if 'step' not in state:
442
- if self.compile_step:
443
- step = utils.scalar_guard(0, param)
444
- state['step'] = step
445
- step = state['step'].add_(1)
441
+ if 'step' in state:
442
+ step = state['step']
443
+ elif self.compile_step:
444
+ step = utils.scalar_guard(0, param)
445
+ else:
446
+ step = 0
446
447
  break
447
448
 
448
- group['step'] = step
449
+ group['step'] = state['step'] = step = step + 1
449
450
 
450
451
  if group['warmup_steps'] and step < group['warmup_steps']:
451
452
  group['lr'] = group['base_lr'] * step / group['warmup_steps']
@@ -494,7 +495,7 @@ class BaseOpt(ChainOpt):
494
495
  auto_fuse: bool = True
495
496
 
496
497
  def __init__(self, params, defaults, foreach: bool, gradient_clipping: str_or_fn, update_clipping: str_or_fn,
497
- palm: bool = use_default, compile_step: bool = use_default, *fns):
498
+ palm: bool = use_default, *fns, compile_step: bool = use_default):
498
499
  if default(update_clipping, self.update_clipping) is None:
499
500
  if fns and self.auto_fuse:
500
501
  args, kwargs = None, None
@@ -61,22 +61,25 @@ def warmup(lr: float, step: int, warmup_steps: int):
61
61
 
62
62
 
63
63
  @decorator_knowngood
64
- def _compilable_schedule_free_(p: List[Tensor], z: List[Tensor], ckp1: Tensor, grad: List[Tensor], lr: Tensor,
65
- beta1: Tensor, decay: float):
66
- for op, oz, g_ in zip(p, z, grad):
67
- g_ = g_.view_as(op)
68
- p_, z_, g_ = map(promote, (op, oz, g_))
64
+ def _compilable_schedule_free_(p: List[Tensor], z: List[Tensor], ckp1: Tensor, update: List[Tensor], lr: Tensor,
65
+ beta1: Tensor, decay: float, grad: List[Tensor], caution):
66
+ for op, oz, u_, g_ in zip(p, z, update, grad):
67
+ u_ = u_.view_as(op)
68
+ p_, z_, u_ = map(promote, (op, oz, u_))
69
69
  if decay != 0:
70
- g_ = g_ + p_ * decay
70
+ u_ = u_ + p_ * decay
71
+ if caution:
72
+ u_ = _compilable_cautioning(u_, g_)
71
73
  p_ = p_.lerp(z_, ckp1)
72
- p_ = p_ + g_ * (lr * (beta1 * (1 - ckp1)) - lr)
73
- z_ = z_ + g_ * -lr
74
+ p_ = p_ + u_ * (lr * (beta1 * (1 - ckp1)) - lr)
75
+ z_ = z_ + u_ * -lr
74
76
  copy_stochastic_(op, p_)
75
77
  copy_stochastic_(oz, z_)
76
78
 
77
79
 
78
80
  def schedule_free_(lr: float, weight_lr_power: float, weight_sum: float, beta1: float, parameters: List[Tensor],
79
- z: List[Tensor], grad: List[Tensor], r: float = 0.0, step: int = 0, decay: float = 0.0):
81
+ z: List[Tensor], update: List[Tensor], grad: List[Tensor], caution: bool = False, r: float = 0.0,
82
+ step: int = 0, decay: float = 0.0):
80
83
  weight = abs(lr) ** weight_lr_power * max(step, 1) ** r
81
84
  weight_sum = weight_sum + weight
82
85
 
@@ -85,9 +88,9 @@ def schedule_free_(lr: float, weight_lr_power: float, weight_sum: float, beta1:
85
88
  except ZeroDivisionError:
86
89
  ckp1 = 0
87
90
 
88
- grad, parameters, z = list_guard(grad, parameters, z)
91
+ update, parameters, z, grad = list_guard(update, parameters, z, grad)
89
92
  lr, ckp1, beta1 = scalar_guard(lr, ckp1, beta1, grad[0])
90
- _compilable_schedule_free_(parameters, z, ckp1, grad, lr, beta1, decay)
93
+ _compilable_schedule_free_(parameters, z, ckp1, update, lr, beta1, decay, grad, caution)
91
94
  return weight_sum
92
95
 
93
96
 
@@ -909,13 +912,12 @@ def copy_stochastic_(target: Tensor, source: Tensor):
909
912
  @decorator_knowngood
910
913
  def _compilable_update_(p: List[Tensor], u: List[Tensor], decay: Tensor, lr: Tensor, caution: bool,
911
914
  g: List[Optional[Tensor]]):
912
- u = [u_.view_as(p_) for u_, p_ in zip(u, p)]
913
- p32, u32 = [list(map(promote, x)) for x in [p, u]]
914
-
915
- for p32_, u32_, g_, p_ in zip(p32, u32, g, p): # lr is data-dependent -> can't compile a foreach
915
+ for u_, g_, p_ in zip(u, g, p): # lr is data-dependent -> can't compile a foreach
916
+ u_ = promote(u_.view_as(p_))
917
+ p32_ = promote(p_)
916
918
  if caution:
917
- u32_ = _compilable_cautioning(promote(g_), u32_)
918
- p32_ = p32_ * (1 - decay * lr) + u32_ * -lr
919
+ u_ = _compilable_cautioning(promote(g_), u_)
920
+ p32_ = p32_ * (1 - decay * lr) + u_ * -lr
919
921
  copy_stochastic_(p_, p32_)
920
922
 
921
923
 
@@ -1220,13 +1222,16 @@ def update_triu_(q_state, materialised):
1220
1222
  assert shape0 == shape1
1221
1223
  copy_stochastic_(q, m)
1222
1224
 
1225
+
1223
1226
  _warned = set()
1224
1227
 
1228
+
1225
1229
  def warn_once(msg):
1226
1230
  if msg not in _warned:
1227
1231
  warnings.warn(msg)
1228
1232
  _warned.add(msg)
1229
1233
 
1234
+
1230
1235
  def psgd_should_update(group, prob: Union[float, callable], rng: Optional[random.Random] = None,
1231
1236
  name: str = 'cumulative_prob'):
1232
1237
  group[f'{name}_prob_step'] = group.get(f'{name}_prob_step', 0) + 1
@@ -1369,4 +1374,16 @@ def fused_hook(parameters, optimizer, *args, **kwargs):
1369
1374
  seen_params.clear()
1370
1375
 
1371
1376
  for p in parameters:
1372
- p.register_post_accumulate_grad_hook(_step)
1377
+ p.register_post_accumulate_grad_hook(_step)
1378
+
1379
+
1380
+ @decorator_knowngood
1381
+ def _compilable_caution_no_scale(g: Tensor, update: Tensor):
1382
+ mask = g.signbit() ^ update.signbit() # "Mask if they point in different directions"
1383
+ update = update.masked_fill(mask, 0)
1384
+ return update
1385
+
1386
+
1387
+ def disable_caution_scaling():
1388
+ global _compilable_cautioning
1389
+ _compilable_cautioning = _compilable_caution_no_scale
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: heavyball
3
- Version: 1.2.2
3
+ Version: 1.3.0
4
4
  Summary: Efficient optimizers
5
5
  Home-page: https://github.com/clashluke/heavyball
6
6
  Author: Lucas Nestler
@@ -10,7 +10,7 @@ setuptools.setup(
10
10
  name='heavyball',
11
11
  license='BSD',
12
12
  description='Efficient optimizers',
13
- version='1.2.2',
13
+ version='1.3.0',
14
14
  long_description=README,
15
15
  url='https://github.com/clashluke/heavyball',
16
16
  packages=setuptools.find_packages(),
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes