heavyball 2.1.2__py3-none-any.whl → 2.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- heavyball/__init__.py +56 -89
- heavyball/chainable.py +6 -4
- heavyball/helpers.py +127 -56
- heavyball/utils.py +74 -61
- {heavyball-2.1.2.dist-info → heavyball-2.1.4.dist-info}/METADATA +2 -1
- heavyball-2.1.4.dist-info/RECORD +9 -0
- heavyball-2.1.2.dist-info/RECORD +0 -9
- {heavyball-2.1.2.dist-info → heavyball-2.1.4.dist-info}/WHEEL +0 -0
- {heavyball-2.1.2.dist-info → heavyball-2.1.4.dist-info}/licenses/LICENSE +0 -0
- {heavyball-2.1.2.dist-info → heavyball-2.1.4.dist-info}/top_level.txt +0 -0
heavyball/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 import functools
 import math
-from typing import Optional
+from typing import Optional, Type, Union

 import torch.optim

@@ -8,39 +8,6 @@ from . import chainable as C
 from . import utils


-class SAMWrapper(torch.optim.Optimizer):
-    def __init__(self, params, wrapped_optimizer: utils.StatefulOptimizer, ball: float = 0.1):
-        if not isinstance(wrapped_optimizer, utils.StatefulOptimizer):
-            raise ValueError(f"{wrapped_optimizer.__class__.__name__} is not a HeavyBall optimizer")
-        super().__init__(params, {"ball": ball})
-        self.wrapped_optimizer = wrapped_optimizer
-
-    @torch.no_grad()
-    def step(self, closure=None):
-        if closure is None:
-            raise ValueError("SAM requires closure")
-        with torch.enable_grad():
-            closure()
-        old_params = [utils.sam_step(group["params"], group["ball"]) for group in self.param_groups]
-
-        originaL_handle_closure = self.wrapped_optimizer._handle_closure
-
-        def _handle_closure(closure):
-            originaL_handle_closure(closure)
-            for group, old in zip(self.param_groups, old_params):
-                utils.copy_stochastic_list_(group["params"], old)
-
-        try:
-            self.wrapped_optimizer._handle_closure = _handle_closure
-            loss = self.wrapped_optimizer.step(closure)
-        finally:
-            self.wrapped_optimizer._handle_closure = originaL_handle_closure
-        return loss
-
-    def zero_grad(self, set_to_none: bool = True):
-        self.wrapped_optimizer.zero_grad()
-
-
 class SGD(C.BaseOpt):
     def __init__(
         self,
@@ -778,7 +745,7 @@ class ForeachPSGDKron(C.BaseOpt):
         beta=None,
         betas=(0.9, 0.999),
         weight_decay=0.0,
-        preconditioner_update_probability=
+        preconditioner_update_probability=C.use_default,
         max_size_triangular=2048,
         min_ndim_triangular=2,
         memory_save_mode=None,
@@ -830,8 +797,8 @@ class ForeachPSGDKron(C.BaseOpt):
         if kwargs:
             utils.warn_once(f"Working with uncaptured keyword arguments: {kwargs}")

-        self.precond_schedule = (
-            defaults.pop("preconditioner_update_probability")
+        self.precond_schedule = C.default(
+            defaults.pop("preconditioner_update_probability"), utils.precond_update_prob_schedule()
         )
         params = defaults.pop("params")

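A hedged illustration of what the new default handling above means for callers (only names visible in this diff are assumed; `C.use_default` is an internal sentinel that `C.default(...)` replaces with `utils.precond_update_prob_schedule()`):

    import torch
    import heavyball

    params = [torch.nn.Parameter(torch.randn(64, 64))]

    # Omitting preconditioner_update_probability now resolves to the built-in schedule.
    opt = heavyball.ForeachPSGDKron(params, lr=1e-3)

    # An explicit float (or a callable taking the step count) still overrides it.
    opt_fixed = heavyball.ForeachPSGDKron(params, lr=1e-3, preconditioner_update_probability=0.1)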
@@ -890,7 +857,7 @@ class ForeachPSGDLRA(C.BaseOpt):
         lr=0.001,
         beta=0.9,
         weight_decay=0.0,
-        preconditioner_update_probability=
+        preconditioner_update_probability=C.use_default,
         momentum_into_precond_update=True,
         rank: Optional[int] = None,
         warmup_steps: int = 0,
@@ -924,8 +891,8 @@ class ForeachPSGDLRA(C.BaseOpt):
         if kwargs:
             utils.warn_once(f"Working with uncaptured keyword arguments: {kwargs}")

-        self.precond_schedule = (
-            defaults.pop("preconditioner_update_probability")
+        self.precond_schedule = C.default(
+            defaults.pop("preconditioner_update_probability"), utils.precond_update_prob_schedule()
         )
         params = defaults.pop("params")

@@ -960,6 +927,54 @@ class NewtonHybrid2PSGDLRA(ForeachNewtonPSGDLRA):
     hvp_interval = 2


+class SAMWrapper(torch.optim.Optimizer):
+    def __init__(
+        self,
+        params,
+        wrapped_optimizer: Union[utils.StatefulOptimizer, Type[utils.StatefulOptimizer]] = ForeachAdamW,
+        ball: float = 0.1,
+    ):
+        params = list(params)
+        super().__init__(params, {"ball": ball})
+
+        if isinstance(wrapped_optimizer, type):
+            if not issubclass(wrapped_optimizer, utils.StatefulOptimizer):
+                raise ValueError(f"{wrapped_optimizer.__name__} is not a HeavyBall optimizer")
+            wrapped_optimizer = wrapped_optimizer(params)
+        elif not isinstance(wrapped_optimizer, utils.StatefulOptimizer):
+            raise ValueError(f"{wrapped_optimizer.__class__.__name__} is not a HeavyBall optimizer")
+
+        self.wrapped_optimizer = wrapped_optimizer
+
+    @torch.no_grad()
+    def step(self, closure=None):
+        if closure is None:
+            raise ValueError("SAM requires closure")
+        with torch.enable_grad():
+            closure()
+        old_params = [utils.sam_step(group["params"], group["ball"]) for group in self.param_groups]
+
+        original_handle_closure = self.wrapped_optimizer._handle_closure
+
+        def _handle_closure(closure):
+            try:
+                _loss = original_handle_closure(closure)
+            finally:
+                for group, old in zip(self.param_groups, old_params):
+                    utils.copy_stochastic_list_(group["params"], old)
+            return _loss
+
+        try:
+            self.wrapped_optimizer._handle_closure = _handle_closure
+            loss = self.wrapped_optimizer.step(closure)
+        finally:
+            self.wrapped_optimizer._handle_closure = original_handle_closure
+        return loss
+
+    def zero_grad(self, set_to_none: bool = True):
+        self.wrapped_optimizer.zero_grad(set_to_none=set_to_none)
+
+
 PalmForEachSoap = PaLMForeachSOAP
 PaLMSOAP = PaLMForeachSOAP
 PaLMSFAdamW = PaLMForeachSFAdamW
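Because the relocated SAMWrapper now also accepts an optimizer class (defaulting to ForeachAdamW) and restores parameters even when the wrapped closure raises, a minimal usage sketch might look like the following; the model and loss are illustrative only:

    import torch
    import heavyball

    model = torch.nn.Linear(8, 1)
    opt = heavyball.SAMWrapper(model.parameters(), wrapped_optimizer=heavyball.ForeachAdamW, ball=0.05)

    def closure():
        opt.zero_grad()
        loss = model(torch.randn(16, 8)).square().mean()
        loss.backward()
        return loss

    loss = opt.step(closure)  # SAM ascent step of size `ball`, then the wrapped optimizer's update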
@@ -983,52 +998,4 @@ PSGDLRA = ForeachPSGDLRA
 NewtonPSGDLRA = ForeachNewtonPSGDLRA
 NewtonPSGDKron = ForeachCachedNewtonPSGD

-__all__ = [
-    "Muon",
-    "RMSprop",
-    "PrecondSchedulePaLMSOAP",
-    "PSGDKron",
-    "PurePSGD",
-    "DelayedPSGD",
-    "CachedPSGDKron",
-    "CachedDelayedPSGDKron",
-    "PalmForEachSoap",
-    "PaLMSOAP",
-    "PaLMSFAdamW",
-    "LaProp",
-    "ADOPT",
-    "PrecondScheduleSOAP",
-    "PrecondSchedulePaLMSOAP",
-    "RMSprop",
-    "MuonLaProp",
-    "ForeachSignLaProp",
-    "ForeachDelayedPSGDLRA",
-    "ForeachPSGDLRA",
-    "ForeachPSGDLRA",
-    "ForeachNewtonPSGDLRA",  #
-    "ForeachAdamW",
-    "ForeachSFAdamW",
-    "ForeachLaProp",
-    "ForeachADOPT",
-    "ForeachSOAP",
-    "ForeachPSGDKron",
-    "ForeachPurePSGD",
-    "ForeachDelayedPSGD",
-    "ForeachCachedPSGDKron",
-    "ForeachCachedDelayedPSGDKron",
-    "ForeachRMSprop",
-    "ForeachMuon",
-    "ForeachCachedNewtonPSGD",
-    "OrthoLaProp",
-    "LaPropOrtho",
-    "SignLaProp",
-    "DelayedPSGD",
-    "PSGDLRA",
-    "NewtonPSGDLRA",
-    "NewtonHybrid2PSGDLRA",
-    "NewtonHybrid2PSGDKron",
-    "MSAMLaProp",
-    "NewtonPSGDKron",
-    "ForeachAdamC",
-    "SGD",
-]
+__all__ = [k for k, v in globals().items() if isinstance(v, type) and issubclass(v, torch.optim.Optimizer)]
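A small, hedged check of the new dynamic export list (the class names used are ones that appear elsewhere in this diff):

    # __all__ now collects every torch.optim.Optimizer subclass bound in the module,
    # so aliases and newly added classes are exported without manual bookkeeping.
    import heavyball

    assert "ForeachAdamW" in heavyball.__all__
    assert "SAMWrapper" in heavyball.__all__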
heavyball/chainable.py
CHANGED
@@ -62,8 +62,6 @@ class FunctionTransform:
                 self._init(st, group, *a, **kwargs)
             except SkipUpdate:
                 skip_update = True
-            except:
-                raise
             finally:
                 if "is_initialized" not in st:
                     st["is_initialized"] = set()
@@ -499,6 +497,7 @@ def scale_by_suds(group, update, grad, param, exp_avg, exp_avg_sq, fisher_approx
         precond_update, _ = utils.eigvecs_product_rank1(precond_update.flatten(), fisher_approx.flatten(), w)

     new_approx = utils.oja_update(fisher_approx.flatten().to(update.dtype), update.flatten(), group["precond_lr"])
+    new_approx = new_approx.view_as(fisher_approx)
     utils.copy_stochastic_(fisher_approx, new_approx)
     return precond_update

@@ -565,7 +564,7 @@ def _init_psgd_kron(state, group, update, grad, param, cached: bool = False, pro
     )
     state["Q"] = utils.triu_to_line(Q) if group["store_triu_as_line"] else Q
     state["running_lower_bound"] = [torch.zeros((1,), device=q.device, dtype=torch.float64) for q in Q]
-    state["step"] = torch.zeros((), device=param.device, dtype=torch.
+    state["step"] = torch.zeros((), device=param.device, dtype=torch.float64)  # torch casts int to float in ckpt load
     if group["adaptive"]:
         state["velocity"] = [torch.zeros((), device=q.device, dtype=q.dtype) for q in Q]
     if not cached:
@@ -750,7 +749,9 @@ def _update_psgd_precond(
     if isinstance(prob, float):
         float_prob = prob
     else:
-
+        prob_step = group.get(f"cumulative_prob_{id(Q)}_prob_step", 1)
+        float_prob = prob(prob_step)
+        group[f"cumulative_prob_{id(Q)}_prob_step"] = prob_step + 1
     group["is_cached"] = should_use_cache = cached and float_prob < 0.5

     if precond is not None:
@@ -1086,6 +1087,7 @@ class ChainOpt(utils.StatefulOptimizer):
         if not group["foreach"] or len(p) == 1:
             for param, grad in zip(p, g):
                 chain(self.state_, group, [grad], [param], *self.fns)
+            group["caution"] = caution
         else:
             chain(self.state_, group, g, p, *self.fns)

heavyball/helpers.py
CHANGED
@@ -1,9 +1,9 @@
-
-
+import copy
 import functools
 import math
 import threading
-from
+from contextlib import contextmanager
+from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Sequence, Tuple, Union

 import numpy
 import numpy as np
@@ -11,23 +11,15 @@ import optuna
 import optunahub
 import pandas as pd
 import torch
-from botorch.utils.sampling import manual_seed
 from hebo.design_space.design_space import DesignSpace
 from hebo.optimizers.hebo import HEBO
-from optuna._transform import _SearchSpaceTransform
+from optuna._transform import _SearchSpaceTransform as SearchSpaceTransform
 from optuna.distributions import BaseDistribution, CategoricalDistribution, FloatDistribution, IntDistribution
 from optuna.samplers import BaseSampler, CmaEsSampler, RandomSampler
 from optuna.samplers._lazy_random_state import LazyRandomState
 from optuna.study import Study
 from optuna.study._study_direction import StudyDirection
 from optuna.trial import FrozenTrial, TrialState
-from optuna_integration.botorch import (
-    ehvi_candidates_func,
-    logei_candidates_func,
-    qehvi_candidates_func,
-    qei_candidates_func,
-    qparego_candidates_func,
-)
 from torch import Tensor
 from torch.nn import functional as F

@@ -37,12 +29,39 @@ _MAXINT32 = (1 << 31) - 1
 _SAMPLER_KEY = "auto:sampler"


+@contextmanager
+def manual_seed(seed: int | None = None) -> Generator[None, None, None]:
+    r"""
+    Contextmanager for manual setting the torch.random seed.
+
+    Args:
+        seed: The seed to set the random number generator to.
+
+    Returns:
+        Generator
+
+    Example:
+        >>> with manual_seed(1234):
+        >>>     X = torch.rand(3)
+
+    copied as-is from https://github.com/meta-pytorch/botorch/blob/a42cd65f9b704cdb6f2ee64db99a022eb15295d5/botorch/utils/sampling.py#L53C1-L75C50 under the MIT License
+    """
+    old_state = torch.random.get_rng_state()
+    try:
+        if seed is not None:
+            torch.random.manual_seed(seed)
+        yield
+    finally:
+        if seed is not None:
+            torch.random.set_rng_state(old_state)
+
+
 class SimpleAPIBaseSampler(BaseSampler):
     def __init__(
         self,
-        search_space: dict[str, BaseDistribution] = None,
+        search_space: Optional[dict[str, BaseDistribution]] = None,
     ):
-        self.search_space = search_space
+        self.search_space = {} if search_space is None else dict(search_space)

     def suggest_all(self, trial: FrozenTrial):
         return {k: trial._suggest(k, dist) for k, dist in self.search_space.items()}
@@ -65,6 +84,16 @@ def _get_default_candidates_func(
     """
     The original is available at https://github.com/optuna/optuna-integration/blob/156a8bc081322791015d2beefff9373ed7b24047/optuna_integration/botorch/botorch.py under the MIT License
     """
+
+    # lazy import
+    from optuna_integration.botorch import (
+        ehvi_candidates_func,
+        logei_candidates_func,
+        qehvi_candidates_func,
+        qei_candidates_func,
+        qparego_candidates_func,
+    )
+
     if n_objectives > 3 and not has_constraint and not consider_running_trials:
         return ehvi_candidates_func
     elif n_objectives > 3:
@@ -124,7 +153,7 @@ def _untransform_numerical_param_torch(


 @torch.no_grad()
-def untransform(self:
+def untransform(self: SearchSpaceTransform, trans_params: Tensor) -> dict[str, Any]:
     assert trans_params.shape == (self._raw_bounds.shape[0],)

     if self._transform_0_1:
@@ -152,29 +181,31 @@ class BoTorchSampler(SimpleAPIBaseSampler):

     def __init__(
         self,
-        search_space: dict[str, BaseDistribution] = None,
+        search_space: Optional[dict[str, BaseDistribution]] = None,
         *,
-        candidates_func:
-        constraints_func:
+        candidates_func: Optional[Callable[..., Tensor]] = None,
+        constraints_func: Optional[Callable[..., Tensor]] = None,
         n_startup_trials: int = 10,
         consider_running_trials: bool = False,
-        independent_sampler:
+        independent_sampler: Optional[BaseSampler] = None,
         seed: int | None = None,
         device: torch.device | str | None = None,
         trial_chunks: int = 128,
     ):
-
-
-
-
-
-
+        if constraints_func is not None:
+            raise NotImplementedError("constraints_func is currently not supported by BoTorchSampler.")
+        if consider_running_trials:
+            raise NotImplementedError("consider_running_trials is currently not supported by BoTorchSampler.")
+        if candidates_func is not None and not callable(candidates_func):
+            raise TypeError("candidates_func must be callable.")
+        self._candidates_func = candidates_func
+        self._independent_sampler = independent_sampler or RandomSampler(seed=seed)
         self._n_startup_trials = n_startup_trials
         self._seed = seed
         self.trial_chunks = trial_chunks

         self._study_id: int | None = None
-        self.search_space = search_space
+        self.search_space = {} if search_space is None else dict(search_space)
         if isinstance(device, str):
             device = torch.device(device)
         self._device = device or torch.device("cpu")
@@ -182,14 +213,24 @@ class BoTorchSampler(SimpleAPIBaseSampler):
         self._values = None
         self._params = None
         self._index = 0
+        self._bounds_dim: int | None = None

     def infer_relative_search_space(self, study: Study, trial: FrozenTrial) -> dict[str, BaseDistribution]:
         return self.search_space

     @torch.no_grad()
     def _preprocess_trials(
-        self, trans:
+        self, trans: SearchSpaceTransform, study: Study, trials: list[FrozenTrial]
     ) -> Tuple[int, Tensor, Tensor]:
+        bounds_dim = trans.bounds.shape[0]
+        if self._bounds_dim is not None and self._bounds_dim != bounds_dim:
+            self._values = None
+            self._params = None
+            self._index = 0
+            self.seen_trials = set()
+        if self._bounds_dim is None:
+            self._bounds_dim = bounds_dim
+
         new_trials = []
         for trial in trials:
             tid: int = trial._trial_id
@@ -200,6 +241,10 @@ class BoTorchSampler(SimpleAPIBaseSampler):

         n_objectives = len(study.directions)
         if not new_trials:
+            if self._values is None or self._params is None:
+                empty_values = torch.zeros((0, n_objectives), dtype=torch.float64, device=self._device)
+                empty_params = torch.zeros((0, bounds_dim), dtype=torch.float64, device=self._device)
+                return n_objectives, empty_values, empty_params
             return n_objectives, self._values[: self._index], self._params[: self._index]

         n_completed_trials = len(trials)
@@ -216,18 +261,28 @@ class BoTorchSampler(SimpleAPIBaseSampler):
             if direction == StudyDirection.MINIMIZE:  # BoTorch always assumes maximization.
                 values[:, obj_idx] *= -1

-
+        bounds_dim = trans.bounds.shape[0]
+        cache_stale = (
+            self._values is None
+            or self._params is None
+            or self._values.size(1) != n_objectives
+            or self._params.size(1) != bounds_dim
+        )
+        if cache_stale:
             self._values = torch.zeros((self.trial_chunks, n_objectives), dtype=torch.float64, device=self._device)
-            self._params = torch.zeros(
-
-            )
+            self._params = torch.zeros((self.trial_chunks, bounds_dim), dtype=torch.float64, device=self._device)
+            self._index = 0
+            self.seen_trials = set()
+            self._bounds_dim = bounds_dim
         spillage = (self._index + n_completed_trials) - self._values.size(0)
         if spillage > 0:
             pad = int(math.ceil(spillage / self.trial_chunks) * self.trial_chunks)
             self._values = F.pad(self._values, (0, 0, 0, pad))
             self._params = F.pad(self._params, (0, 0, 0, pad))
-
-
+        values_tensor = torch.from_numpy(values).to(self._device)
+        params_tensor = torch.from_numpy(params).to(self._device)
+        self._values[self._index : self._index + n_completed_trials] = values_tensor
+        self._params[self._index : self._index + n_completed_trials] = params_tensor
         self._index += n_completed_trials

         return n_objectives, self._values[: self._index], self._params[: self._index]
@@ -246,7 +301,7 @@ class BoTorchSampler(SimpleAPIBaseSampler):
         if n_completed_trials < self._n_startup_trials:
             return {}

-        trans =
+        trans = SearchSpaceTransform(search_space)
         n_objectives, values, params = self._preprocess_trials(trans, study, completed_trials)

         if self._candidates_func is None:
@@ -349,10 +404,10 @@ class HEBOSampler(optunahub.samplers.SimpleBaseSampler, SimpleAPIBaseSampler):
         independent_sampler: BaseSampler | None = None,
     ) -> None:
         super().__init__(search_space, seed)
-
-
+        if constant_liar:
+            raise NotImplementedError("constant_liar is not supported by HEBOSampler.")
         self._hebo = HEBO(_convert_to_hebo_design_space(search_space), scramble_seed=self._seed)
-        self._independent_sampler = optuna.samplers.RandomSampler(seed=seed)
+        self._independent_sampler = independent_sampler or optuna.samplers.RandomSampler(seed=seed)
         self._rng = np.random.default_rng(seed)

     def sample_relative(
@@ -421,10 +476,12 @@ class FastINGO:
         learning_rate: Optional[float] = None,
         last_n: int = 4096,
         loco_step_size: float = 0.1,
-        device=
+        device: str | None = None,
         batchnorm_decay: float = 0.99,
         score_decay: float = 0.99,
     ) -> None:
+        if device is None:
+            device = _use_cuda()
         n_dimension = len(mean)
         if population_size is None:
             population_size = 4 + int(np.floor(3 * np.log(n_dimension)))
@@ -491,8 +548,14 @@ class FastINGO:
         if y.numel() <= 2:
             return

-
-
+        min_y = y.min()
+        max_y = y.max()
+        if torch.isclose(max_y, min_y, rtol=0.0, atol=1e-12):
+            return
+
+        if min_y <= 0:
+            y = y + (1e-8 - min_y)
+        y = y.clamp_min_(1e-8).log()

         ema = -torch.arange(y.size(0), device=y.device, dtype=y.dtype)
         weight = self.batchnorm_decay**ema
@@ -553,7 +616,7 @@ class ImplicitNaturalGradientSampler(BaseSampler):
     def reseed_rng(self) -> None:
         self._independent_sampler.reseed_rng()
         if self._optimizer:
-            self._optimizer.
+            self._optimizer.generator.seed()

     def infer_relative_search_space(
         self, study: "optuna.Study", trial: "optuna.trial.FrozenTrial"
@@ -603,14 +666,11 @@ class ImplicitNaturalGradientSampler(BaseSampler):
             self._warn_independent_sampling = False
             return {}

-        trans =
+        trans = SearchSpaceTransform(search_space)

-        if self._optimizer is None:
+        if self._optimizer is None or self._optimizer.dim != len(trans.bounds):
             self._optimizer = self._init_optimizer(trans, population_size=self._population_size)
-
-        if self._optimizer.dim != len(trans.bounds):
-            self._warn_independent_sampling = False
-            return {}
+            self._param_queue.clear()

         solution_trials = [t for t in completed_trials if self._check_trial_is_generation(t)]
         for t in solution_trials:
@@ -621,7 +681,7 @@ class ImplicitNaturalGradientSampler(BaseSampler):

     def _init_optimizer(
         self,
-        trans:
+        trans: SearchSpaceTransform,
         population_size: Optional[int] = None,
     ) -> FastINGO:
         lower_bounds = trans.bounds[:, 0]
@@ -675,6 +735,7 @@ class ThreadLocalSampler(threading.local):


 def init_cmaes(study, seed, trials, search_space):
+    trials = copy.deepcopy(trials)
     trials.sort(key=lambda trial: trial.datetime_complete)
     return CmaEsSampler(seed=seed, source_trials=trials, lr_adapt=True)

@@ -686,8 +747,14 @@ def init_hebo(study, seed, trials, search_space):
     return sampler


+def _use_cuda():
+    return "cuda" if torch.cuda.is_available() else "cpu"
+
+
 def init_botorch(study, seed, trials, search_space):
-    return BoTorchSampler(
+    return BoTorchSampler(
+        search_space=search_space, seed=seed, device=_use_cuda()
+    )  # will automatically pull in latest data


 def init_nsgaii(study, seed, trials, search_space):
@@ -709,17 +776,20 @@ class AutoSampler(BaseSampler):
     def __init__(
         self,
         samplers: Iterable[Tuple[int, Callable]] | None = None,
-        search_space: dict[str, BaseDistribution] = None,
+        search_space: Optional[dict[str, BaseDistribution]] = None,
         *,
         seed: int | None = None,
-        constraints_func:
+        constraints_func: Optional[Callable[..., Any]] = None,
     ) -> None:
-
+        if constraints_func is not None:
+            raise NotImplementedError("constraints_func is not supported by AutoSampler.")
         if samplers is None:
+            if search_space is None:
+                raise ValueError("AutoSampler requires a search_space when using the default sampler schedule.")
             samplers = ((0, init_hebo), (100, init_nsgaii))
         self.sampler_indices = np.sort(np.array([x[0] for x in samplers], dtype=np.int32))
         self.samplers = [x[1] for x in sorted(samplers, key=lambda x: x[0])]
-        self.search_space = search_space
+        self.search_space = {} if search_space is None else dict(search_space)
         self._rng = LazyRandomState(seed)
         self._random_sampler = RandomSampler(seed=seed)
         self._thread_local_sampler = ThreadLocalSampler()
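Similarly, a hedged sketch of the new AutoSampler validation path (the objective is illustrative; per the defaults above, the schedule hands off from HEBO to NSGA-II after 100 completed trials):

    import optuna
    from optuna.distributions import FloatDistribution
    from heavyball.helpers import AutoSampler

    space = {"x": FloatDistribution(-5.0, 5.0)}
    sampler = AutoSampler(search_space=space, seed=0)  # omitting search_space now raises ValueError
    study = optuna.create_study(sampler=sampler)
    study.optimize(lambda t: t.suggest_float("x", -5.0, 5.0) ** 2, n_trials=10)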
@@ -762,7 +832,7 @@ class AutoSampler(BaseSampler):
         complete_trials = study._get_trials(deepcopy=False, states=(TrialState.COMPLETE,), use_cache=True)
         self._completed_trials = max(self._completed_trials, len(complete_trials))
         new_index = (self._completed_trials >= self.sampler_indices).sum() - 1
-        if new_index == self._current_index:
+        if new_index == self._current_index or new_index < 0:
             return
         self._current_index = new_index
         self._sampler = self.samplers[new_index](
@@ -775,7 +845,7 @@ class AutoSampler(BaseSampler):
     def sample_relative(
         self, study: Study, trial: FrozenTrial, search_space: dict[str, BaseDistribution]
     ) -> dict[str, Any]:
-        return self._sampler.sample_relative(study, trial, self.search_space)
+        return self._sampler.sample_relative(study, trial, search_space or self.search_space)

     def sample_independent(
         self,
@@ -804,5 +874,6 @@ class AutoSampler(BaseSampler):
         state: TrialState,
         values: Sequence[float] | None,
     ) -> None:
-
+        if state not in (TrialState.COMPLETE, TrialState.FAIL, TrialState.PRUNED):
+            raise ValueError(f"Unsupported trial state: {state}.")
         self._sampler.after_trial(study, trial, state, values)
heavyball/utils.py
CHANGED
@@ -47,7 +47,7 @@ _cudnn_double_backward_pattern = re.compile(
 )
 _torch_compile_double_backward_pattern = re.compile(r"compile.*does not currently support double backward")
 _fd_error = (
-    "You can accelerate startup by globally enabling finite_differences first "
+    "You can accelerate startup by globally enabling finite_differences first "
     "(via opt.finite_differences=True or by subclassing it)\n"
     "Original Error: "
 )
@@ -343,7 +343,8 @@ def set_(dst: Tensor, src: Tensor):


 def clean():
-    torch.cuda.
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
     gc.collect()


@@ -418,9 +419,13 @@ def zeropower_via_newtonschulz5(G, steps=5, eps=1e-7):


 ###### START
-#
+# Based on https://arxiv.org/pdf/2505.16932v3
+# and https://github.com/NoahAmsel/PolarExpress/blob/5454910920ca8c65afda28820cdf9e49b9436ed0/polar_express.py#L69-L82
+# and https://github.com/thinking-machines-lab/manifolds/blob/89dcae50f01af59f1e0570289474da3a2ecaa60b/src/msign.py#L47
+#
 # under the MIT License

+# Coefficients are from https://arxiv.org/pdf/2505.16932v3
 ABC_LIST: list[tuple[float, float, float]] = [
     (8.28721201814563, -23.595886519098837, 17.300387312530933),
     (4.107059111542203, -2.9478499167379106, 0.5448431082926601),
@@ -438,7 +443,7 @@ ABC_LIST_STABLE: list[tuple[float, float, float]] = [
 ] + [ABC_LIST[-1]]


-def msign(G: torch.Tensor, steps: int = 10) -> torch.Tensor:
+def msign(G: torch.Tensor, steps: int = 10, eps: float = 1e-7) -> torch.Tensor:
     """
     Polar Express algorithm for the matrix sign function:
     https://arxiv.org/abs/2505.16932
@@ -450,7 +455,9 @@ def msign(G: torch.Tensor, steps: int = 10) -> torch.Tensor:
     if should_transpose:
         x = x.mT

-    x
+    # x = x / (x.norm(dim=(-2, -1), keepdim=True) * 1.01 + eps)
+    stochastic_divide_with_eps_(x, x.norm(dim=(-2, -1)) * 1.01, eps)
+
     for step in range(steps):
         a, b, c = ABC_LIST_STABLE[step] if step < len(ABC_LIST_STABLE) else ABC_LIST_STABLE[-1]
         s = x @ x.mT
@@ -464,8 +471,7 @@ def msign(G: torch.Tensor, steps: int = 10) -> torch.Tensor:

     if should_transpose:
         x = x.mT
-    x
-    return x.float()
+    return x.to(G.dtype)


 ###### END
@@ -665,9 +671,9 @@ def get_orthogonal_matrix(mat, max_eps: float = 1e-3, min_eps: float = 1e-30):
             final.append(None)
             continue

+        device, dtype = m.device, m.dtype
         m = promote(m.data)

-        device, dtype = m.device, m.dtype
         eps = min_eps
         while True:
             try:
@@ -695,7 +701,6 @@ def get_orthogonal_matrix(mat, max_eps: float = 1e-3, min_eps: float = 1e-30):
                 raise
             clean()

-        eigvec = eigvec.to(device=m.device, dtype=m.dtype)
         eigvec = torch.flip(eigvec, [1])
         final.append(eigvec)

@@ -1048,13 +1053,15 @@ class StatefulOptimizer(torch.optim.Optimizer):
     def get_groups(self, group):
         return [group]

-    @functools.lru_cache(maxsize=None)
     def state_(self, arg: Tensor, fail: bool = True):
-
-
-        if _tensor_key(arg) not in self.mapping_inverse:
+        key = _tensor_key(arg)
+        if key not in self.mapping_inverse:
             self._init_mapping()
-
+        if key not in self.mapping_inverse:
+            if not fail:
+                return {}
+            raise KeyError("Tensor has no tracked state.")
+        state_param, index = self.mapping_inverse[key]
         if state_param not in self.state:
             self.state[state_param] = collections.defaultdict(dict)
         return self.state[state_param][index]
@@ -1142,7 +1149,7 @@ class StatefulOptimizer(torch.optim.Optimizer):
             active_p = [p for p in group["params"]]

             if not active_p:
-
+                continue

             k = group["ema_step"] = group.get("ema_step", -1) + 1
@@ -1159,7 +1166,7 @@ class StatefulOptimizer(torch.optim.Optimizer):
             active_p = [p for p in group["params"]]

             if not active_p:
-
+                continue

             for p in active_p:
                 if "param_ema" in self.state_(p):
@@ -1173,7 +1180,7 @@ class StatefulOptimizer(torch.optim.Optimizer):
             active_p = [p for p in group["params"]]

             if not active_p:
-
+                continue

             for p in active_p:
                 if "param_ema" in self.state_(p):
@@ -1202,7 +1209,7 @@ class StatefulOptimizer(torch.optim.Optimizer):
         for group in self.param_groups:
             for p, g in self.split_p_and_g_in_group(group, skip_none=True, raw=True):
                 p.grad = grads.pop(0)
-
+                stochastic_add_divide_(g, p.grad, -1, torch.finfo(p.dtype).eps ** 0.5)
                 p.hessian_vector = g
                 p.data.copy_(p.orig)
                 del p.orig
@@ -1292,6 +1299,8 @@ class StatefulOptimizer(torch.optim.Optimizer):
         self._is_preconditioning = psgd_should_update(self.inner_group, self.precond_schedule, self.precond_rng)
         loss = self._handle_closure(closure)

+        if self.use_ema:
+            self.ema_update()
         # we assume that parameters are constant and that there are no excessive recompiles
         with torch.no_grad(), torch._dynamo.utils.disable_cache_limit():
             for group in self.param_groups:
@@ -1299,8 +1308,6 @@ class StatefulOptimizer(torch.optim.Optimizer):
                 group["param_count"] = sum(p.numel() for p in group["params"])
                 group["is_preconditioning"] = self._is_preconditioning
                 self._step(group)
-                if self.use_ema:
-                    self.ema_update()
         for real, views in self.mapping.items():
             for tensor in (real, *views):
                 for key in ("grad", "vector", "hessian_vector", "orig"):
@@ -1564,18 +1571,20 @@ def stochastic_round_list_(ref: List[Tensor], source: List[Tensor]):

 @decorator_knowngood
 def stochastic_round_(ref: Tensor, source: Tensor | None = None):
-    if source is
-        if source.dtype == torch.bfloat16 or ref.dtype == source.dtype:
-            return source
-        if ref.dtype != torch.bfloat16:
-            return source.to(ref.dtype)
-    else:
+    if source is None:
         source = ref
-
-
-
-
-
+    if ref.dtype != torch.bfloat16:
+        return source.to(ref.dtype)
+    if source.dtype == torch.bfloat16:
+        return source
+    if source.dtype in (torch.float16, torch.float32, torch.float64):
+        source = source.to(torch.float32)
+        noise = torch.randint_like(source, dtype=torch.int32, low=0, high=(1 << 16))
+        bits = source.view(dtype=torch.int32)
+        bits.add_(noise)
+        bits.bitwise_and_(-65536)  # FFFF0000 mask, preserves sign+exp+7 mantissa bits
+        return bits.view(dtype=torch.float32).bfloat16()
+    return source.to(ref.dtype)


 @decorator_knowngood
@@ -1585,7 +1594,7 @@ def _compilable_copy_stochastic_(target: Tensor, source: Tensor):

 def copy_stochastic_(target: Tensor, source: Tensor):
     if target.dtype == torch.bfloat16 and source.dtype in (torch.float16, torch.float32, torch.float64):
-
+        source = stochastic_round_(target, source)
     set_(target, source)


@@ -1908,7 +1917,8 @@ def update_lra_precond_(

     # LU factorization to reuse computation
     try:
-
+        lu_matrix = promote(IpVtU)  # operate in fp32 when inputs are bf16/half
+        LU, pivots = torch.linalg.lu_factor(lu_matrix)
     except RuntimeError:
         # Error:
         # U[2,2] is zero and using it on lu_solve would result in a division by zero.
@@ -1918,8 +1928,13 @@ def update_lra_precond_(
         # So, we skip this step and reattempt on the next one
         return U.to(U_orig[0].dtype), V.to(V_orig[0].dtype), d.to(d_orig[0].dtype)

-
-
+    solve_dtype = LU.dtype
+    rhs = (U.T @ invQtv).view(-1, 1).to(solve_dtype)
+    correction = torch.linalg.lu_solve(LU, pivots, rhs, adjoint=True).to(V.dtype)
+    invQtv = invQtv - (V @ correction).flatten()
+    rhs = (V.T @ invQtv).view(-1, 1).to(solve_dtype)
+    solution = torch.linalg.lu_solve(LU, pivots, rhs).to(U.dtype)
+    invPv = (U @ solution).flatten()

     eps, step = scalar_guard(eps, step, vector)
     _compilable_d_step(d, d_orig, invQtv, vector, invPv, hessian_vector, Ph, eps, step, delayed)
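For orientation (this note is not part of the diff): `IpVtU` reads as I + VᵀU, which is exactly the small matrix whose factorization the Woodbury identity needs, so a single LU factorization can serve both solves above. The identity, for reference, in LaTeX:

    % Woodbury-style identity enabled by factoring (I + V^T U)
    (I + U V^\top)^{-1} \;=\; I \;-\; U\,(I + V^\top U)^{-1} V^\top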
@@ -2039,7 +2054,10 @@ def extract_from_flat_update(params: List[Tensor], update: Tensor):
 @decorator_knowngood
 def flatten(x: List[Tensor], remaining: int = 0) -> Tensor:
     last_dim = x[0].shape[-remaining:] if remaining else []
-
+    tensors = [i.reshape(-1, *last_dim) for i in x if i.numel()]
+    if not tensors:
+        return torch.zeros((), dtype=x[0].device, device=x[0].device)
+    return torch.cat(tensors, 0)


 @decorator_knowngood
@@ -2111,16 +2129,6 @@ def psgd_calc_A_and_conjB(G: Tensor, Q, conjB: Tensor | None):  # conjB ("V", "v
     return A, conjB


-@decorator_knowngood
-def _random_projection(x: Tensor, scale: Optional[Tensor]):
-    if scale is None:
-        scale = x.norm(float("inf")).clamp(min=1e-8)
-    k = 2 ** math.ceil(math.log2(math.log2(min(x.shape))))  # next-largest-power-of-2 of log2-of-size
-    norm = x.square().sum(0)
-    indices = torch.topk(norm, k, largest=True).indices
-    return x.index_select(1, indices).contiguous() / scale, scale
-
-
 def max_singular_value_exact(A, use_lobpcg: bool = False):
     try:
         if use_lobpcg:
@@ -2164,7 +2172,15 @@ def max_singular_value_cholesky(A: Tensor, max_abs: Optional[Tensor] = None):
     """
     Adapted from @evanatyourservice
     """
-
+    if max_abs is None:
+        max_abs = A.norm(float("inf")).clamp(min=1e-8)
+
+    # cholesky uses random projection, but this uses topk -- topk is a warm start, which may converge to a biased result
+    k = 2 ** math.ceil(math.log2(math.log2(min(A.shape))))  # next-largest-power-of-2 of log2-of-size
+    norm = A.square().sum(0)
+    indices = torch.topk(norm, k, largest=True).indices
+    Y = A.index_select(1, indices).contiguous() / max_abs
+
     Q = inplace_orthogonal_(Y, precise_zeroth_power_mode)
     Q = Q / max_abs
     Z = A.T @ Q
@@ -2412,10 +2428,11 @@ def bf16_matmul(x: Tensor, y: Tensor):
 def if_iscompiling(fn):
     base = getattr(torch, fn.__name__, None)

-
-
-
-
+    @functools.wraps(fn)
+    def _fn(*args, **kwargs):
+        if torch.compiler.is_compiling() and base is not None:
+            return base(*args, **kwargs)
+        return fn(*args, **kwargs)

     return _fn

@@ -2551,7 +2568,7 @@ def _psgd_quad_preconditioner_grad(GG: List[Tensor], Q: List[Tensor], numel: int
         else:
             scale = gg.size(0) / numel
             gg = 2 * torch.eye(gg.size(0), device=gg.device, dtype=gg.dtype) - gg * scale
-            update = q - gg
+            update = q - casted_einsum("ab,cd,bc", gg, gg, q)
         out.append(update + update.T)  # make matrix symmetric
     return out

@@ -3105,7 +3122,7 @@ def pointwise_lr_adaptation(
 ):
     grads, update, state, delta = list_guard(grads, update, state, delta)
     lr_lr = scalar_guard(lr_lr, grads[0])
-
+    _compilable_pointwise_lr_adapt_(grads, update, state, delta, lr_lr)


 def hook_optimizer_into_model(model, optimizer, *args, **kwargs):
@@ -3125,8 +3142,6 @@ def hook_optimizer_into_model(model, optimizer, *args, **kwargs):

 def fused_hook(parameters, optimizer, *args, **kwargs):
     parameters = list(parameters)
-    param_count = len(parameters)
-    seen_params = set()

     o = optimizer(parameters, *args, **kwargs)
     step_fn = o.step
@@ -3135,12 +3150,8 @@ def fused_hook(parameters, optimizer, *args, **kwargs):
     )

     def _step(p: Tensor):
-
-
-        if len(seen_params) < param_count:
-            step_fn()
-            o.zero_grad()
-            seen_params.clear()
+        step_fn()
+        o.zero_grad()

     for p in parameters:
         p.register_post_accumulate_grad_hook(_step)
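The simplified fused_hook now runs a step and zeroes gradients on every post-accumulate-grad hook rather than counting parameters first. A hedged usage sketch (model and hyperparameters are illustrative):

    import torch
    import heavyball
    from heavyball.utils import fused_hook

    model = torch.nn.Linear(4, 4)
    fused_hook(model.parameters(), heavyball.ForeachAdamW, lr=1e-3)

    loss = model(torch.randn(2, 4)).sum()
    loss.backward()  # each parameter's hook now triggers step() + zero_grad() on the fused optimizer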
@@ -3165,6 +3176,8 @@ def sam_step(parameters, ball_size, adaptive: bool = True):
     old_params = []
     for p in parameters:
         old_params.append(p.detach().clone())
+        if not hasattr_none(p, "grad"):
+            continue
         grad = promote(p.grad)
         if adaptive:
             grad = grad * promote(p).square()
{heavyball-2.1.2.dist-info → heavyball-2.1.4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: heavyball
-Version: 2.1.2
+Version: 2.1.4
 Summary: Efficient Optimizers
 Author-email: HeavyBall Authors <github.heavyball@nestler.sh>
 Project-URL: source, https://github.com/HomebrewML/HeavyBall
@@ -21,6 +21,7 @@ Requires-Dist: numpy<2.0.0
 Provides-Extra: dev
 Requires-Dist: pre-commit; extra == "dev"
 Requires-Dist: pytest; extra == "dev"
+Requires-Dist: hypothesis; extra == "dev"
 Requires-Dist: ruff; extra == "dev"
 Requires-Dist: matplotlib; extra == "dev"
 Requires-Dist: seaborn; extra == "dev"
heavyball-2.1.4.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+heavyball/__init__.py,sha256=9VgWebob-zO7hKg_KmQuSOB4Z_Rh-gCDs_V2TTfQKSo,30123
+heavyball/chainable.py,sha256=O8QiHJ-E5RD-fzo3iulSHgvKgtRZ1Lff2ls3iLmXcoI,42695
+heavyball/helpers.py,sha256=eiotfrJz4V6ewfF9ZboC_JEUi_TCmO195uT6sqqohTE,33429
+heavyball/utils.py,sha256=u4RFOdmYkhsjPE4M_N53oDnuh-vHbvRHc6OLQTEeq-c,105239
+heavyball-2.1.4.dist-info/licenses/LICENSE,sha256=G9fFZcNIVWjU7o6Pr_4sJBRCNDU5X-zelSxIJ2D48ms,1323
+heavyball-2.1.4.dist-info/METADATA,sha256=MxDWUcqFgMWmG3FXtf0UVzDy9qsAWea4tPgnDnx9wXQ,5088
+heavyball-2.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+heavyball-2.1.4.dist-info/top_level.txt,sha256=SzCxSVg_qCUPA4kZObW3Zyo4v-d_mMOD-p7a-WXTl2E,10
+heavyball-2.1.4.dist-info/RECORD,,
heavyball-2.1.2.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-heavyball/__init__.py,sha256=1BTb7G-VcfcMyS4EpuVnhE5DBp2fj_Zzs9EQr6slPzg,30491
-heavyball/chainable.py,sha256=8S-7QRZYiy_ARhQ8uDu5G0Eg3ouT9Vcfk-rxbKlp4zI,42510
-heavyball/helpers.py,sha256=zk_S84wpGcvO9P6kn4UeaQUIDowHxcbM9qQITEm2g5I,30267
-heavyball/utils.py,sha256=Lx9XlfkyQbfYMPtqiA0rNIz4PXQe_bpLqKFby3upHMw,104514
-heavyball-2.1.2.dist-info/licenses/LICENSE,sha256=G9fFZcNIVWjU7o6Pr_4sJBRCNDU5X-zelSxIJ2D48ms,1323
-heavyball-2.1.2.dist-info/METADATA,sha256=EMM0OI4cPeaQlMkts2j9CCp9KxhJm-o_9VDNLm4ySQg,5046
-heavyball-2.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-heavyball-2.1.2.dist-info/top_level.txt,sha256=SzCxSVg_qCUPA4kZObW3Zyo4v-d_mMOD-p7a-WXTl2E,10
-heavyball-2.1.2.dist-info/RECORD,,
{heavyball-2.1.2.dist-info → heavyball-2.1.4.dist-info}/WHEEL
File without changes
{heavyball-2.1.2.dist-info → heavyball-2.1.4.dist-info}/licenses/LICENSE
File without changes
{heavyball-2.1.2.dist-info → heavyball-2.1.4.dist-info}/top_level.txt
File without changes