heavyball 1.7.0__tar.gz → 1.7.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {heavyball-1.7.0 → heavyball-1.7.1}/PKG-INFO +1 -1
- {heavyball-1.7.0 → heavyball-1.7.1}/heavyball/__init__.py +20 -1
- {heavyball-1.7.0 → heavyball-1.7.1}/heavyball/chainable.py +50 -8
- heavyball-1.7.1/heavyball/optimizations/__init__.py +38 -0
- heavyball-1.7.1/heavyball/optimizations/integrator.py +169 -0
- heavyball-1.7.1/heavyball/optimizations/optimizations.py +329 -0
- {heavyball-1.7.0 → heavyball-1.7.1}/heavyball/utils.py +518 -162
- {heavyball-1.7.0 → heavyball-1.7.1}/heavyball.egg-info/PKG-INFO +1 -1
- {heavyball-1.7.0 → heavyball-1.7.1}/heavyball.egg-info/SOURCES.txt +11 -1
- {heavyball-1.7.0 → heavyball-1.7.1}/pyproject.toml +1 -1
- {heavyball-1.7.0 → heavyball-1.7.1}/test/test_memory.py +12 -6
- heavyball-1.7.1/test/test_memory_leak.py +68 -0
- heavyball-1.7.1/tests/test_psgd_kron_line_optim.py +141 -0
- heavyball-1.7.1/tests/test_psgd_kron_regression.py +46 -0
- heavyball-1.7.1/tests/test_psgd_lra_regression.py +87 -0
- heavyball-1.7.1/tests/test_psgd_optimization.py +190 -0
- heavyball-1.7.1/tests/test_psgd_optimizations.py +97 -0
- heavyball-1.7.1/tests/test_psgd_training_performance.py +253 -0
- {heavyball-1.7.0 → heavyball-1.7.1}/LICENSE +0 -0
- {heavyball-1.7.0 → heavyball-1.7.1}/README.md +0 -0
- {heavyball-1.7.0 → heavyball-1.7.1}/heavyball.egg-info/dependency_links.txt +0 -0
- {heavyball-1.7.0 → heavyball-1.7.1}/heavyball.egg-info/requires.txt +0 -0
- {heavyball-1.7.0 → heavyball-1.7.1}/heavyball.egg-info/top_level.txt +0 -0
- {heavyball-1.7.0 → heavyball-1.7.1}/setup.cfg +0 -0
- {heavyball-1.7.0 → heavyball-1.7.1}/test/test_bf16_params.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.1}/test/test_bf16_q.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.1}/test/test_bf16_storage.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.1}/test/test_caution.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.1}/test/test_channels_last.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.1}/test/test_closure.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.1}/test/test_ema.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.1}/test/test_foreach.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.1}/test/test_hook.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.1}/test/test_mars.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.1}/test/test_merge.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.1}/test/test_no_grad.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.1}/test/test_soap.py +0 -0
- {heavyball-1.7.0 → heavyball-1.7.1}/test/test_stochastic_updates.py +0 -0
{heavyball-1.7.0 → heavyball-1.7.1}/heavyball/__init__.py

@@ -1,4 +1,5 @@
 import functools
+import math
 from typing import Optional
 
 from . import chainable as C
@@ -564,6 +565,10 @@ class ForeachCachedNewtonPSGD(ForeachCachedPSGDKron):
     hessian_approx = True
 
 
+class NewtonHybrid2PSGDKron(ForeachCachedNewtonPSGD):
+    hvp_interval = 2
+
+
 class ForeachPSGDLRA(C.BaseOpt):
     """
     Originally from Evan Walters and Omead Pooladzandi, 2024
@@ -582,7 +587,7 @@ class ForeachPSGDLRA(C.BaseOpt):
         weight_decay=0.0,
         preconditioner_update_probability=None,
         momentum_into_precond_update=True,
-        rank: int =
+        rank: Optional[int] = None,
         warmup_steps: int = 0,
         foreach: bool = True,
         q_dtype="float32",
@@ -608,6 +613,14 @@ class ForeachPSGDLRA(C.BaseOpt):
         )
         params = defaults.pop("params")
 
+        if rank is None:
+            utils.warn_once(
+                f"{rank=}. It will be set to log2(param_count). This requires `params` to be of type list. Currently, {type(params)=}"
+            )
+            params = list(params)
+            defaults["rank"] = round(math.log2(sum(p.numel() for p in params)))
+            utils.warn_once(f"rank was set to {defaults['rank']}")
+
         delayed = C.default(delayed, self.delayed)
         exp_avg_input = C.default(exp_avg_input, self.exp_avg_input)
         update_clipping = C.default(update_clipping, utils.trust_region_clip_)
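Note on the new default: when `rank` is left as `None`, the hunk above sizes the LRA preconditioner as `round(log2(total parameter count))`. A standalone back-of-the-envelope check (plain Python, not heavyball code) of what that yields:

import math

# rank defaults to the rounded base-2 log of the total parameter count
for param_count in (10_000, 1_000_000, 100_000_000):
    print(param_count, "->", round(math.log2(param_count)))
# 10000 -> 13, 1000000 -> 20, 100000000 -> 27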
@@ -632,6 +645,10 @@ class ForeachNewtonPSGDLRA(ForeachPSGDLRA):
    hessian_approx = True
 
 
+class NewtonHybrid2PSGDLRA(ForeachNewtonPSGDLRA):
+    hvp_interval = 2
+
+
 PalmForEachSoap = PaLMForeachSOAP
 PaLMSOAP = PaLMForeachSOAP
 PaLMSFAdamW = PaLMForeachSFAdamW
@@ -696,4 +713,6 @@ __all__ = [
     "DelayedPSGD",
     "PSGDLRA",
     "NewtonPSGDLRA",
+    "NewtonHybrid2PSGDLRA",
+    "NewtonHybrid2PSGDKron",
 ]
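The two classes added above are also exported, so they can be used like any other heavyball optimizer. A minimal sketch, assuming constructor arguments beyond `params` and `lr` are inherited unchanged from the parent classes; like the existing Newton PSGD variants, these estimate Hessian-vector products and are therefore stepped with a closure:

import torch
import heavyball

model = torch.nn.Linear(16, 16)
data = torch.randn(4, 16)

# hvp_interval = 2: the Hessian-vector product is refreshed every other step.
opt = heavyball.NewtonHybrid2PSGDKron(model.parameters(), lr=1e-3)
# opt = heavyball.NewtonHybrid2PSGDLRA(model.parameters(), lr=1e-3)  # low-rank variant

def closure():
    opt.zero_grad()
    loss = model(data).square().mean()
    loss.backward()
    return loss

for _ in range(3):
    opt.step(closure)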
{heavyball-1.7.0 → heavyball-1.7.1}/heavyball/chainable.py

@@ -1,4 +1,5 @@
 import functools
+import math
 import random
 from typing import List, Literal, Optional, Union
 
@@ -43,7 +44,7 @@ class FunctionTransform:
         raise NotImplementedError
 
     def get_fn(self):
-        if
+        if utils.hasattr_none(self.fn, "get_fn"):
             return self.fn.get_fn()
         return self.fn
 
@@ -426,7 +427,7 @@ def _store_std(state, group, update, grad, param):
     state["init_std"] = torch.std(grad, dim=0)
 
 
-@general_guard("init_std", init_fn=_store_std)
+@general_guard("init_std", init_fn=_store_std, skip_first=False)
 @no_state
 def mup_approx(group, updates, grads, params, init_std):
     _updates = [(u, i) for u, i in zip(updates, init_std) if u.ndim > 1]
@@ -435,6 +436,40 @@ def mup_approx(group, updates, grads, params, init_std):
     return updates
 
 
+def _init_delta(state, group, update, grad, param, log_space: bool):
+    val = group["initial_d"]
+    state["delta"] = torch.full((), math.log(val) if log_space else val, dtype=param.dtype, device=param.device)
+
+
+def _init_full_delta(state, group, update, grad, param, log_space: bool):
+    val = group["initial_d"]
+    state["delta"] = torch.full_like(param, math.log(val) if log_space else val)
+
+
+@zero_guard("state")
+@general_guard("delta", init_fn=functools.partial(_init_delta, log_space=False), skip_first=False)
+@no_state
+def scale_by_d_adaptation(group, update, grad, param, state, delta):
+    utils.d_adaptation(grad, update, state, delta)
+    return update
+
+
+@zero_guard("state")
+@general_guard("delta", init_fn=functools.partial(_init_delta, log_space=True), skip_first=False)
+@no_state
+def scale_by_lr_adaptation(group, update, grad, param, state, delta):
+    utils.lr_adaptation(grad, update, state, delta, group["lr_lr"])
+    return update
+
+
+@zero_guard("state")
+@general_guard("delta", init_fn=functools.partial(_init_full_delta, log_space=True), skip_first=False)
+@no_state
+def scale_by_pointwise_lr_adaptation(group, update, grad, param, state, delta):
+    utils.pointwise_lr_adaptation(grad, update, state, delta, group["lr_lr"])
+    return update
+
+
 @zero_guard("momentum")
 @no_state
 def heavyball_momentum(group, updates, grads, params, momentum):
@@ -484,18 +519,22 @@ def _update_psgd_precond(cached, Q_cache, group, param, grad, Q_mat, Q, exprs, p
     if not group["is_preconditioning"]:
         return Q_mat
 
+    if utils.hasattr_none(param, "vector"):
+        vector, hessian_vector = param.vector, param.hessian_vector
+        del param.vector
+        del param.hessian_vector
+    else:
+        vector, hessian_vector = utils.dampen_grad(grad)
+
     utils.psgd_update_precond(
         Q_mat,
         exprs,
-
+        hessian_vector,
         group["precond_lr"],
         Q,
         group["store_triu_as_line"],
-
+        vector,
     )
-    if hasattr(param, "vector"):
-        del param.vector
-        del param.hessian_vector
 
     if grad.dim() > 1 and precond_schedule(group, balance_probability, f"balance_prob_{id(Q)}"):
         if group["store_triu_as_line"]:
@@ -566,9 +605,12 @@ def _update_lra(
     if not group["is_preconditioning"]:
         return utils.flatten(U, 1), utils.flatten(V, 1), utils.flatten(d)
 
-    if
+    if utils.hasattr_none(params[0], "hessian_vector"):
         vector = utils.flatten([p.vector for p in params])
         hessian_vector = utils.flatten([p.hessian_vector for p in params])
+        for p in params:
+            del p.vector
+            del p.hessian_vector
     else:
         vector, hessian_vector = utils.dampen_multiple(grads)
     return utils.update_lra_precond_(U, V, d, vector, hessian_vector, group["eps"], group["precond_lr"], delayed)
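Both PSGD hunks above converge on the same attach-and-consume pattern: a `vector` / `hessian_vector` pair left on a parameter by the HVP step is read exactly once and then deleted, so a stale pair can never be reused on a later update. A standalone illustration of that pattern (plain PyTorch, not heavyball internals; the dampened-gradient fallback is only sketched):

import torch

p = torch.nn.Parameter(torch.zeros(4))
# Pretend an earlier HVP pass attached a probe vector and its Hessian product.
p.vector = torch.randn(4)
p.hessian_vector = torch.randn(4)

if getattr(p, "hessian_vector", None) is not None:
    vector, hessian_vector = p.vector, p.hessian_vector
    del p.vector, p.hessian_vector  # consume once
else:
    vector = hessian_vector = None  # the real code falls back to a dampened gradient pair

print(hasattr(p, "vector"), hasattr(p, "hessian_vector"))  # False False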
heavyball-1.7.1/heavyball/optimizations/__init__.py

@@ -0,0 +1,38 @@
+"""
+PSGD optimization module - optimized implementations of PSGD functions
+to improve execution speed while maintaining numerical equivalence.
+"""
+
+# Import optimized functions
+# Import integrator API
+from .integrator import (
+    enable_optimizations,
+    get_optimization_status,
+    restore_original_functions,
+)
+from .optimizations import (
+    # LRA optimizations
+    low_rank_mm_optimized,
+    lra_precond_optimized,
+    precond_grad_cached_optimized,
+    # KRON optimizations
+    psgd_calc_A_and_conjB_optimized,
+    psgd_precond_grad_optimized,
+    psgd_update_precond_optimized,
+    update_lra_precond_optimized,
+)
+
+__all__ = [
+    # Optimized functions
+    "low_rank_mm_optimized",
+    "update_lra_precond_optimized",
+    "lra_precond_optimized",
+    "psgd_calc_A_and_conjB_optimized",
+    "psgd_update_precond_optimized",
+    "psgd_precond_grad_optimized",
+    "precond_grad_cached_optimized",
+    # Integrator API
+    "enable_optimizations",
+    "restore_original_functions",
+    "get_optimization_status",
+]
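Given the `__all__` above, the new subpackage is importable alongside the main package; a minimal sanity check (only the names shown in this diff are assumed to exist, and note that importing the subpackage triggers the integrator's auto-initialization described below):

from heavyball.optimizations import (
    enable_optimizations,
    get_optimization_status,
    restore_original_functions,
)

print(get_optimization_status())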
heavyball-1.7.1/heavyball/optimizations/integrator.py

@@ -0,0 +1,169 @@
+"""
+Integration module to selectively enable optimized implementations
+of PSGD functions while maintaining API compatibility.
+"""
+
+import os
+import sys
+from typing import Any, Dict
+
+import torch
+
+from . import optimizations
+from .. import utils
+
+# Store original function references
+_original_functions = {}
+_optimized_functions = {}
+
+# Mapping of original functions to their optimized versions
+OPTIMIZATION_MAP = {
+    # LRA functions
+    utils.update_lra_precond_: optimizations.update_lra_precond_optimized,
+    utils.lra_precond: optimizations.lra_precond_optimized,
+    # KRON functions
+    utils.psgd_update_precond: optimizations.psgd_update_precond_optimized,
+    utils.psgd_precond_grad: optimizations.psgd_precond_grad_optimized,
+    utils.precond_grad_cached_: optimizations.precond_grad_cached_optimized,
+}
+
+# Config for enabling/disabling optimizations
+_config = {
+    "enabled": os.environ.get("HEAVYBALL_OPTIMIZE", "1") == "1",
+    "torch_compile_allowed": os.environ.get("HEAVYBALL_USE_COMPILE", "1") == "1",
+    "enable_lra": True,
+    "enable_kron": True,
+    "verbose": os.environ.get("HEAVYBALL_VERBOSE", "0") == "1",
+}
+
+
+def _apply_monkey_patch(original_func, optimized_func):
+    """Monkey patch a function with its optimized version."""
+    if original_func not in _original_functions:
+        _original_functions[original_func] = original_func
+
+    # Store reference to the optimized function
+    _optimized_functions[original_func] = optimized_func
+
+    # Get the module where the original function is defined
+    module = original_func.__module__
+    func_name = original_func.__name__
+
+    # Replace the function in its module
+    if hasattr(sys.modules[module], func_name):
+        setattr(sys.modules[module], func_name, optimized_func)
+
+        if _config["verbose"]:
+            print(f"Replaced {module}.{func_name} with optimized version")
+    else:
+        if _config["verbose"]:
+            print(f"Warning: Could not find {func_name} in module {module}")
+
+
+def enable_optimizations(
+    enable: bool = True, lra: bool = True, kron: bool = True, torch_compile: bool = True, verbose: bool = False
+):
+    """
+    Enable or disable PSGD optimizations.
+
+    Args:
+        enable: Whether to enable optimizations at all
+        lra: Whether to enable LRA-specific optimizations
+        kron: Whether to enable Kron-specific optimizations
+        torch_compile: Whether to allow torch.compile optimizations
+        verbose: Whether to print optimization status messages
+    """
+    _config["enabled"] = enable
+    _config["enable_lra"] = lra
+    _config["enable_kron"] = kron
+    _config["torch_compile_allowed"] = torch_compile
+    _config["verbose"] = verbose
+
+    if verbose:
+        print(f"PSGD Optimizations: {'enabled' if enable else 'disabled'}")
+        print(f"  - LRA optimizations: {'enabled' if lra else 'disabled'}")
+        print(f"  - KRON optimizations: {'enabled' if kron else 'disabled'}")
+        print(f"  - torch.compile: {'allowed' if torch_compile else 'disabled'}")
+
+    if not enable:
+        # Restore original functions
+        restore_original_functions()
+        return
+
+    # Apply optimizations based on config
+    for orig_func, opt_func in OPTIMIZATION_MAP.items():
+        # Skip LRA functions if disabled
+        if not _config["enable_lra"] and orig_func in [utils.update_lra_precond_, utils.lra_precond]:
+            continue
+
+        # Skip KRON functions if disabled
+        if not _config["enable_kron"] and orig_func in [
+            utils.psgd_update_precond,
+            utils.psgd_precond_grad,
+            utils.precond_grad_cached_,
+        ]:
+            continue
+
+        _apply_monkey_patch(orig_func, opt_func)
+
+    # Disable torch.compile if not allowed
+    if not _config["torch_compile_allowed"]:
+        # Monkey patch torch.compile to be a no-op
+        def _noop_compile(fn, **kwargs):
+            return fn
+
+        if not hasattr(torch, "_original_compile"):
+            torch._original_compile = torch.compile
+            torch.compile = _noop_compile
+            if verbose:
+                print("Disabled torch.compile (replaced with no-op)")
+    else:
+        # Restore original torch.compile
+        if hasattr(torch, "_original_compile"):
+            torch.compile = torch._original_compile
+            del torch._original_compile
+            if verbose:
+                print("Restored original torch.compile")
+
+
+def restore_original_functions():
+    """Restore all original function implementations."""
+    for orig_func, func_ref in _original_functions.items():
+        module = orig_func.__module__
+        func_name = orig_func.__name__
+
+        if hasattr(sys.modules[module], func_name):
+            setattr(sys.modules[module], func_name, func_ref)
+
+            if _config["verbose"]:
+                print(f"Restored original implementation of {module}.{func_name}")
+
+    # Also restore torch.compile if it was modified
+    if hasattr(torch, "_original_compile"):
+        torch.compile = torch._original_compile
+        del torch._original_compile
+        if _config["verbose"]:
+            print("Restored original torch.compile")
+
+
+def get_optimization_status() -> Dict[str, Any]:
+    """Get current optimization status."""
+    return {
+        "enabled": _config["enabled"],
+        "lra_enabled": _config["enable_lra"],
+        "kron_enabled": _config["enable_kron"],
+        "torch_compile_allowed": _config["torch_compile_allowed"],
+        "optimized_functions": list(_optimized_functions.keys()),
+        "original_functions": list(_original_functions.keys()),
+    }
+
+
+# Auto-initialize optimizations based on environment
+if os.environ.get("HEAVYBALL_AUTO_OPTIMIZE", "1") == "1":
+    enable_optimizations(
+        enable=_config["enabled"],
+        lra=_config["enable_lra"],
+        kron=_config["enable_kron"],
+        torch_compile=_config["torch_compile_allowed"],
+        verbose=_config["verbose"],
+    )
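Taken together, the integrator applies its monkey patches at import time (controlled by the HEAVYBALL_AUTO_OPTIMIZE, HEAVYBALL_OPTIMIZE, HEAVYBALL_USE_COMPILE, and HEAVYBALL_VERBOSE environment variables) and can be toggled at runtime. A usage sketch restricted to the functions defined above; the environment variable must be set before the first heavyball import for the auto-initialization flag to take effect:

import os

# Opt out of the import-time patching, e.g. while checking numerical differences.
os.environ["HEAVYBALL_AUTO_OPTIMIZE"] = "0"

import heavyball.optimizations as hbo

# Apply the patches explicitly, keeping torch.compile enabled.
hbo.enable_optimizations(enable=True, lra=True, kron=True, torch_compile=True, verbose=True)
print(hbo.get_optimization_status()["enabled"])  # True

# Undo all monkey patches and restore torch.compile if it was replaced.
hbo.restore_original_functions()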