heavyball 1.7.0__tar.gz → 1.7.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {heavyball-1.7.0 → heavyball-1.7.2}/PKG-INFO +1 -1
- {heavyball-1.7.0 → heavyball-1.7.2}/heavyball/__init__.py +20 -1
- {heavyball-1.7.0 → heavyball-1.7.2}/heavyball/chainable.py +50 -8
- {heavyball-1.7.0 → heavyball-1.7.2}/heavyball/utils.py +589 -180
- {heavyball-1.7.0 → heavyball-1.7.2}/heavyball.egg-info/PKG-INFO +1 -1
- {heavyball-1.7.0 → heavyball-1.7.2}/heavyball.egg-info/SOURCES.txt +1 -0
- {heavyball-1.7.0 → heavyball-1.7.2}/pyproject.toml +1 -1
- {heavyball-1.7.0 → heavyball-1.7.2}/test/test_memory.py +12 -6
- heavyball-1.7.2/test/test_memory_leak.py +68 -0
- {heavyball-1.7.0 → heavyball-1.7.2}/LICENSE +0 -0
- {heavyball-1.7.0 → heavyball-1.7.2}/README.md +0 -0
- {heavyball-1.7.0 → heavyball-1.7.2}/heavyball.egg-info/dependency_links.txt +0 -0
- {heavyball-1.7.0 → heavyball-1.7.2}/heavyball.egg-info/requires.txt +0 -0
- {heavyball-1.7.0 → heavyball-1.7.2}/heavyball.egg-info/top_level.txt +0 -0
- {heavyball-1.7.0 → heavyball-1.7.2}/setup.cfg +0 -0
- {heavyball-1.7.0 → heavyball-1.7.2}/test/test_bf16_params.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.2}/test/test_bf16_q.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.2}/test/test_bf16_storage.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.2}/test/test_caution.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.2}/test/test_channels_last.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.2}/test/test_closure.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.2}/test/test_ema.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.2}/test/test_foreach.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.2}/test/test_hook.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.2}/test/test_mars.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.2}/test/test_merge.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.2}/test/test_no_grad.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.2}/test/test_soap.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.2}/test/test_stochastic_updates.py +0 -0
@@ -1,4 +1,5 @@
   1    1   import functools
        2 + import math
   2    3   from typing import Optional
   3    4
   4    5   from . import chainable as C
@@ -564,6 +565,10 @@ class ForeachCachedNewtonPSGD(ForeachCachedPSGDKron):
 564  565       hessian_approx = True
 565  566
 566  567
      568 + class NewtonHybrid2PSGDKron(ForeachCachedNewtonPSGD):
      569 +     hvp_interval = 2
      570 +
      571 +
 567  572   class ForeachPSGDLRA(C.BaseOpt):
 568  573       """
 569  574       Originally from Evan Walters and Omead Pooladzandi, 2024
@@ -582,7 +587,7 @@ class ForeachPSGDLRA(C.BaseOpt):
 582  587           weight_decay=0.0,
 583  588           preconditioner_update_probability=None,
 584  589           momentum_into_precond_update=True,
 585      -         rank: int =
      590 +         rank: Optional[int] = None,
 586  591           warmup_steps: int = 0,
 587  592           foreach: bool = True,
 588  593           q_dtype="float32",
@@ -608,6 +613,14 @@ class ForeachPSGDLRA(C.BaseOpt):
 608  613           )
 609  614           params = defaults.pop("params")
 610  615
      616 +         if rank is None:
      617 +             utils.warn_once(
      618 +                 f"{rank=}. It will be set to log2(param_count). This requires `params` to be of type list. Currently, {type(params)=}"
      619 +             )
      620 +             params = list(params)
      621 +             defaults["rank"] = round(math.log2(sum(p.numel() for p in params)))
      622 +             utils.warn_once(f"rank was set to {defaults['rank']}")
      623 +
 611  624           delayed = C.default(delayed, self.delayed)
 612  625           exp_avg_input = C.default(exp_avg_input, self.exp_avg_input)
 613  626           update_clipping = C.default(update_clipping, utils.trust_region_clip_)
@@ -632,6 +645,10 @@ class ForeachNewtonPSGDLRA(ForeachPSGDLRA):
 632  645       hessian_approx = True
 633  646
 634  647
      648 + class NewtonHybrid2PSGDLRA(ForeachNewtonPSGDLRA):
      649 +     hvp_interval = 2
      650 +
      651 +
 635  652   PalmForEachSoap = PaLMForeachSOAP
 636  653   PaLMSOAP = PaLMForeachSOAP
 637  654   PaLMSFAdamW = PaLMForeachSFAdamW
@@ -696,4 +713,6 @@ __all__ = [
 696  713       "DelayedPSGD",
 697  714       "PSGDLRA",
 698  715       "NewtonPSGDLRA",
      716 +     "NewtonHybrid2PSGDLRA",
      717 +     "NewtonHybrid2PSGDKron",
 699  718   ]
@@ -1,4 +1,5 @@
   1    1   import functools
        2 + import math
   2    3   import random
   3    4   from typing import List, Literal, Optional, Union
   4    5
@@ -43,7 +44,7 @@ class FunctionTransform:
  43   44           raise NotImplementedError
  44   45
  45   46       def get_fn(self):
  46      -         if
       47 +         if utils.hasattr_none(self.fn, "get_fn"):
  47   48               return self.fn.get_fn()
  48   49           return self.fn
  49   50
@@ -426,7 +427,7 @@ def _store_std(state, group, update, grad, param):
 426  427       state["init_std"] = torch.std(grad, dim=0)
 427  428
 428  429
 429      - @general_guard("init_std", init_fn=_store_std)
      430 + @general_guard("init_std", init_fn=_store_std, skip_first=False)
 430  431   @no_state
 431  432   def mup_approx(group, updates, grads, params, init_std):
 432  433       _updates = [(u, i) for u, i in zip(updates, init_std) if u.ndim > 1]
@@ -435,6 +436,40 @@ def mup_approx(group, updates, grads, params, init_std):
 435  436       return updates
 436  437
 437  438
      439 + def _init_delta(state, group, update, grad, param, log_space: bool):
      440 +     val = group["initial_d"]
      441 +     state["delta"] = torch.full((), math.log(val) if log_space else val, dtype=param.dtype, device=param.device)
      442 +
      443 +
      444 + def _init_full_delta(state, group, update, grad, param, log_space: bool):
      445 +     val = group["initial_d"]
      446 +     state["delta"] = torch.full_like(param, math.log(val) if log_space else val)
      447 +
      448 +
      449 + @zero_guard("state")
      450 + @general_guard("delta", init_fn=functools.partial(_init_delta, log_space=False), skip_first=False)
      451 + @no_state
      452 + def scale_by_d_adaptation(group, update, grad, param, state, delta):
      453 +     utils.d_adaptation(grad, update, state, delta)
      454 +     return update
      455 +
      456 +
      457 + @zero_guard("state")
      458 + @general_guard("delta", init_fn=functools.partial(_init_delta, log_space=True), skip_first=False)
      459 + @no_state
      460 + def scale_by_lr_adaptation(group, update, grad, param, state, delta):
      461 +     utils.lr_adaptation(grad, update, state, delta, group["lr_lr"])
      462 +     return update
      463 +
      464 +
      465 + @zero_guard("state")
      466 + @general_guard("delta", init_fn=functools.partial(_init_full_delta, log_space=True), skip_first=False)
      467 + @no_state
      468 + def scale_by_pointwise_lr_adaptation(group, update, grad, param, state, delta):
      469 +     utils.pointwise_lr_adaptation(grad, update, state, delta, group["lr_lr"])
      470 +     return update
      471 +
      472 +
 438  473   @zero_guard("momentum")
 439  474   @no_state
 440  475   def heavyball_momentum(group, updates, grads, params, momentum):
@@ -484,18 +519,22 @@ def _update_psgd_precond(cached, Q_cache, group, param, grad, Q_mat, Q, exprs, p
 484  519       if not group["is_preconditioning"]:
 485  520           return Q_mat
 486  521
      522 +     if utils.hasattr_none(param, "vector"):
      523 +         vector, hessian_vector = param.vector, param.hessian_vector
      524 +         del param.vector
      525 +         del param.hessian_vector
      526 +     else:
      527 +         vector, hessian_vector = utils.dampen_grad(grad)
      528 +
 487  529       utils.psgd_update_precond(
 488  530           Q_mat,
 489  531           exprs,
 490      -
      532 +         hessian_vector,
 491  533           group["precond_lr"],
 492  534           Q,
 493  535           group["store_triu_as_line"],
 494      -
      536 +         vector,
 495  537       )
 496      -     if hasattr(param, "vector"):
 497      -         del param.vector
 498      -         del param.hessian_vector
 499  538
 500  539       if grad.dim() > 1 and precond_schedule(group, balance_probability, f"balance_prob_{id(Q)}"):
 501  540           if group["store_triu_as_line"]:
@@ -566,9 +605,12 @@ def _update_lra(
 566  605       if not group["is_preconditioning"]:
 567  606           return utils.flatten(U, 1), utils.flatten(V, 1), utils.flatten(d)
 568  607
 569      -     if
      608 +     if utils.hasattr_none(params[0], "hessian_vector"):
 570  609           vector = utils.flatten([p.vector for p in params])
 571  610           hessian_vector = utils.flatten([p.hessian_vector for p in params])
      611 +         for p in params:
      612 +             del p.vector
      613 +             del p.hessian_vector
 572  614       else:
 573  615           vector, hessian_vector = utils.dampen_multiple(grads)
 574  616       return utils.update_lra_precond_(U, V, d, vector, hessian_vector, group["eps"], group["precond_lr"], delayed)