adv-optm 1.2.dev2__py3-none-any.whl → 1.2.dev4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

adv_optm/__init__.py CHANGED
@@ -6,6 +6,7 @@ from .optim import (
6
6
  Lion_adv,
7
7
  Lion_Prodigy_adv,
8
8
  Muon_adv,
9
+ AdaMuon_adv,
9
10
  )
10
11
 
11
12
  __all__ = [
@@ -16,6 +17,7 @@ __all__ = [
16
17
  "Lion_adv",
17
18
  "Lion_Prodigy_adv",
18
19
  "Muon_adv",
20
+ "AdaMuon_adv",
19
21
  ]
20
22
 
21
- __version__ = "1.2.dev2"
23
+ __version__ = "1.2.dev4"
adv_optm/optim/AdaMuon_adv.py ADDED
@@ -0,0 +1,465 @@
1
+ import torch
2
+ from typing import Optional, Callable
3
+
4
+ from .AdamW_adv import AdamW_adv
5
+ from ..util.MuonAdam_helper import MuonAdamHelper
6
+ from ..util.Kourkoutas import KourkoutasHelper
7
+
8
+ from ..util.BF16_Stochastic_Rounding import add_stochastic_
9
+ from ..util.Newton_Schulz import _newton_schulz_iteration
10
+ from ..util.Effective_Shape import _get_effective_shape
11
+ from ..util.NNMF import _nnmf,_unnmf
12
+ from ..util.One_Bit_Boolean import _pack_bools, _unpack_bools
13
+
14
+ class AdaMuon_adv(torch.optim.Optimizer):
15
+ """
16
+ Implements the AdaMuon optimizer algorithm.
17
+
18
+ AdaMuon combines the geometry-aware updates of Muon with the element-wise
19
+ adaptivity of Adam. It is designed for 2D parameters (e.g., linear layers)
20
+ and can handle higher-dimensional parameters by flattening.
21
+
22
+ The algorithm incorporates three key mechanisms:
23
+ 1. A sign-stabilized orthogonal update, where the sign of the momentum is
24
+ orthogonalized instead of the momentum itself.
25
+ 2. An element-wise second momentum estimator applied to the orthogonalized
26
+ update directions.
27
+ 3. An RMS-aligned rescaling strategy to match the update magnitude of Adam,
28
+ allowing for reuse of learning rate schedules.
29
+
30
+ It can also operate in a hybrid mode, using an auxiliary AdamW
31
+ optimizer for specific parameters (e.g., biases, norms, embeddings) as
32
+ defined by a `layer_key_fn`.
33
+
34
+ Args:
35
+ params (iterable): iterable of parameters to optimize or dicts defining
36
+ parameter groups.
37
+ lr (float): learning rate (default: 1e-3).
38
+ betas (tuple[float, float]): coefficients used for both first and second moment
39
+ estimation (default: (0.95, 0.95))
40
+ weight_decay (float): weight decay (L2 penalty) (default: 0.1).
41
+ eps (float): term added to the denominator for adaptive scaling to improve
42
+ numerical stability (default: 1e-8).
43
+ rms_target (float): The target Root-Mean-Square value for the final update
44
+ vector, used for RMS-aligned rescaling. Allows for the reuse of existing Adam
45
+ learning rate schedules. (default: 0.2).
46
+ ns_steps (int): number of Newton-Schulz iterations to perform (default: 5).
47
+ ns_eps (float): epsilon for Newton-Schulz normalization stability (default: 1e-7).
48
+ ns_coeffs (tuple[float, float, float]): The (a, b, c) coefficients for the
49
+ quintic polynomial in the Newton-Schulz iteration.
50
+ (default: (3.4445, -4.7750, 2.0315)).
51
+ stochastic_rounding (bool): whether to use stochastic rounding for
52
+ BF16 parameter updates (default: True).
53
+ nesterov (bool): enables Nesterov momentum (default: False).
54
+ use_atan2 (bool): whether to use the atan2 update rule. (default: False)
55
+ Simplified_AdEMAMix (bool): whether to use the Simplified AdEMAMix update rule.
56
+ This changes the update to `alpha_grad * grad + mt`, which can be
57
+ more responsive, especially for small batch sizes. (default: False)
58
+ alpha_grad (float): Mixing coefficient for the Simplified AdEMAMix update rule
59
+ (only used when `Simplified_AdEMAMix` is `True`). Controls the weight of the
60
+ current gradient. For small batch sizes, use high values (e.g., 10-100) to be
61
+ more responsive. For large batch sizes, use low values (e.g., 0-1) for
62
+ stability. (default: 100.0)
63
+ vector_reshape_muon (bool): whether to reshape 1D vectors into 2D
64
+ matrices for the Muon Newton-Schulz step (default: False).
65
+ vector_reshape (bool): whether to reshape 1D vectors into 2D
66
+ matrices to apply low-rank compression (default: False).
67
+ nnmf_factor (bool): whether to use NNMF factorization for the optimizer
68
+ states; if disabled, the uncompressed optimizer states are used. (default: False)
69
+ kourkoutas_beta (bool): whether to enable the layer-wise dynamic β₂ logic.
70
+ If `False`, the fixed `betas[1]` value is used throughout. (default: False)
71
+ beta2_min (float): The minimum value for dynamic β₂, used during periods of
72
+ high gradient variance ("sunspikes"). Must be less than `betas[1]`.
73
+ (default: 0.9)
74
+ ema_alpha (float): The decay rate for the Exponential Moving Average (EMA) of
75
+ the pooled gradient norms. Corresponds to `α` in the paper.
76
+ (default: 0.95)
77
+ tiny_spike (float): A small constant added to the denominator of the
78
+ "sunspike" ratio calculation to prevent division by zero. Corresponds
79
+ to `ε_spike` in the paper. (default: 1e-9)
80
+ k_warmup_steps (int): The number of initial steps during which β₂ is held
81
+ at the fixed `betas[1]` value before the
82
+ dynamic logic activates. (default: 0)
83
+ MuonWithAuxAdam (bool): If True, enables the hybrid optimizer mode.
84
+ Parameters designated by `layer_key_fn` will be optimized with
85
+ AdamW_adv instead of Muon. (default: False)
86
+ layer_key_fn (Optional[Callable]): A function that takes a parameter `p`
87
+ and returns a key. If the key is 'adam', the parameter is handled by
88
+ the auxiliary AdamW optimizer. All other keys are handled by Muon.
89
+ Only used when `MuonWithAuxAdam` is True. (default: None)
90
+ adam_kwargs (Optional[dict]): A dictionary of keyword arguments to pass
91
+ to the auxiliary AdamW_adv optimizer. Only used when
92
+ `MuonWithAuxAdam` is True. (default: None)
93
+ """
94
+
95
+ def __init__(
96
+ self,
97
+ params,
98
+ lr: float = 1e-3,
99
+ betas: tuple[float, float] = (0.95, 0.95),
100
+ weight_decay: float = 0.1,
101
+ eps: float = 1e-8,
102
+ rms_target: float = 0.2,
103
+ ns_steps: int = 5,
104
+ ns_eps: float = 1e-7,
105
+ ns_coeffs: tuple[float, float, float] = (3.4445, -4.7750, 2.0315),
106
+ stochastic_rounding: bool = True,
107
+ use_atan2: bool = False,
108
+ nesterov: bool = False,
109
+ Simplified_AdEMAMix: bool = False,
110
+ alpha_grad: float = 100.0,
111
+ vector_reshape_muon: bool = False,
112
+ vector_reshape: bool = False,
113
+ nnmf_factor: bool = False,
114
+ # K-b parameters
115
+ kourkoutas_beta: bool = False,
116
+ beta2_min: float = 0.9,
117
+ ema_alpha: float = 0.95,
118
+ tiny_spike: float = 1e-9,
119
+ k_warmup_steps: int = 0,
120
+ k_logging: int = 0,
121
+ layer_key_kb_fn: Optional[Callable] = None,
122
+ # hybrid optimizer mode
123
+ MuonWithAuxAdam: bool = False,
124
+ layer_key_fn: Optional[Callable] = None,
125
+ muon_adam_lr: float = 1e-4,
126
+ adam_kwargs: Optional[dict] = None,
127
+ ):
128
+ if not (lr >= 0.0):
129
+ raise ValueError(f"Learning-rate should be >= 0.0. Got {lr}")
130
+ if not (weight_decay >= 0.0):
131
+ raise ValueError(f"Weight-decay should be >= 0.0. Got {weight_decay}")
132
+ if not (ns_steps > 0):
133
+ raise ValueError(f"Newton-Schulz steps should be > 0. Got {ns_steps}")
134
+ if Simplified_AdEMAMix and nesterov:
135
+ print("Warning: nesterov is incompatible with Simplified_AdEMAMix, Disabling cautious.")
136
+ nesterov = False
137
+
138
+ muon_defaults = {
139
+ "lr": lr, "betas": betas, "weight_decay": weight_decay,
140
+ "eps": eps, "rms_target": rms_target, "ns_steps": ns_steps,
141
+ "ns_eps": ns_eps, "ns_coeffs": ns_coeffs, "nnmf_factor": nnmf_factor,
142
+ "vector_reshape": vector_reshape,
143
+ "vector_reshape_muon": vector_reshape_muon,
144
+ "nesterov":nesterov, "use_atan2":use_atan2,
145
+ "Simplified_AdEMAMix": Simplified_AdEMAMix, "alpha_grad": alpha_grad,
146
+ "_kourkoutas_beta": kourkoutas_beta, "beta2_min": beta2_min, "ema_alpha": ema_alpha,
147
+ "tiny_spike": tiny_spike, "k_warmup_steps": k_warmup_steps, "k_logging": k_logging,
148
+ }
149
+ self.stochastic_rounding = stochastic_rounding
150
+ self._kourkoutas_beta = kourkoutas_beta
151
+ self._kourkoutas_helper = None
152
+ self.layer_key_kb_fn = layer_key_kb_fn
153
+ self.MuonWithAuxAdam = MuonWithAuxAdam
154
+ self.helper = None
155
+ self.aux_adam = None
156
+
157
+ if not self.MuonWithAuxAdam:
158
+ super().__init__(params, muon_defaults)
159
+ return
160
+
161
+ # HYBRID OPTIMIZER LOGIC
162
+ adam_kwargs = adam_kwargs or {}
163
+ self.aux_adam = AdamW_adv(
164
+ [],
165
+ lr=muon_adam_lr,
166
+ **adam_kwargs,
167
+ _is_delegate=True
168
+ )
169
+ adam_defaults = self.aux_adam.defaults
170
+
171
+ final_param_groups = []
172
+ _layer_key_fn = layer_key_fn if layer_key_fn is not None else lambda p: 'muon'
173
+
174
+ for group in params:
175
+ # All params in a group are of the same type
176
+ first_param = group['params'][0]
177
+ key = _layer_key_fn(first_param)
178
+ optim_type = 'adam' if key == 'adam' else 'muon'
179
+
180
+ new_group = group.copy()
181
+ defaults_to_use = adam_defaults if optim_type == 'adam' else muon_defaults
182
+
183
+ for key, value in defaults_to_use.items():
184
+ new_group.setdefault(key, value)
185
+
186
+ final_param_groups.append(new_group)
187
+
188
+ super().__init__(final_param_groups, {})
189
+
190
+ # Now that self is initialized, create the helper
191
+ self.helper = MuonAdamHelper(self, layer_key_fn)
192
+
193
+
194
+ @property
195
+ def supports_fused_back_pass(self):
196
+ return True
197
+
198
+ @property
199
+ def supports_memory_efficient_fp16(self):
200
+ return True
201
+
202
+ @property
203
+ def supports_flat_params(self):
204
+ return False
205
+
206
+ @property
207
+ def kourkoutas_helper(self):
208
+ """
209
+ Exposes the kourkoutas_helper from the auxiliary AdamW optimizer,
210
+ if it exists. This allows external access for logging K-b.
211
+ """
212
+ if self.aux_adam and hasattr(self.aux_adam, 'kourkoutas_helper'):
213
+ return self.aux_adam.kourkoutas_helper
214
+ return None
215
+
216
+ @torch.no_grad()
217
+ def step_parameter(self, p: torch.Tensor, group: dict, i: int | None = None):
218
+ if group.get('_kourkoutas_beta') and self._kourkoutas_helper is None:
219
+ self._kourkoutas_helper = KourkoutasHelper(self)
220
+
221
+ if self.MuonWithAuxAdam:
222
+ optim_type = self.helper.get_optimizer_type(p)
223
+ if optim_type == 'adam':
224
+ # Delegate to the AdamW_adv optimizer's logic.
225
+ # We need to temporarily "lend" our state and param_groups
226
+ self.aux_adam.state = self.state
227
+ self.aux_adam.param_groups = self.param_groups
228
+
229
+ # Ensure the aux optimizer uses the same Kourkoutas helper instance.
230
+ if self._kourkoutas_helper is not None:
231
+ self.aux_adam.kourkoutas_helper = self._kourkoutas_helper
232
+
233
+ self.aux_adam.step_parameter(p, group, i)
234
+ return
235
+
236
+ if p.grad is None:
237
+ return
238
+
239
+ grad = p.grad
240
+ state = self.state[p]
241
+
242
+
243
+ # State Initialization
244
+ if 'step' not in state:
245
+ state['step'] = 0
246
+
247
+ should_factor = (
248
+ group['nnmf_factor'] and
249
+ not (len(p.shape) == 1 and not group['vector_reshape'])
250
+ )
251
+
252
+ state['factored'] = should_factor
253
+
254
+ state['reshaped_1d_muon'] = len(p.shape) == 1 and group['vector_reshape_muon']
255
+
256
+ dtype = torch.float32 if group['nnmf_factor'] else p.dtype
257
+ device = p.device
258
+ if state['factored'] or state['reshaped_1d_muon']:
259
+ state['effective_shape'] = _get_effective_shape(p.numel())
260
+ d1, d2 = state['effective_shape']
261
+ if state['factored']:
262
+ state['mu_m_nmf'] = torch.zeros(d1, device=device, dtype=dtype)
263
+ state['mv_m_nmf'] = torch.zeros(d2, device=device, dtype=dtype)
264
+ packed_d2 = (d2 + 7) // 8
265
+ state['sign'] = torch.zeros((d1, packed_d2), dtype=torch.uint8, device=device)
266
+ state['mu_v_nmf'] = torch.zeros(d1, device=device, dtype=dtype)
267
+ state['mv_v_nmf'] = torch.zeros(d2, device=device, dtype=dtype)
268
+ else:
269
+ if len(p.shape) >= 2:
270
+ state['momentum_buffer'] = torch.zeros_like(p)
271
+ state['second_momentum_buffer'] = torch.zeros_like(p)
272
+ if state['reshaped_1d_muon']:
273
+ state['momentum_buffer'] = torch.zeros((d1, d2), device=device, dtype=dtype)
274
+ state['second_momentum_buffer'] = torch.zeros((d1, d2), device=device, dtype=dtype)
275
+ elif len(p.shape) == 1:
276
+ state['momentum_buffer'] = torch.zeros_like(p)
277
+
278
+ # Retrieve hyperparameters
279
+ beta1, beta2 = group['betas']
280
+ current_step = state['step']
281
+ nesterov = group['nesterov']
282
+ Simplified_AdEMAMix = group['Simplified_AdEMAMix']
283
+ alpha_grad = group['alpha_grad']
284
+
285
+ if state['factored']: # Factored AdaMuon
286
+
287
+ # Reconstruct momentum from previous step's factors & sign
288
+ d1, d2 = state['effective_shape']
289
+ mt_buf = _unnmf((state['mu_m_nmf'], state['mv_m_nmf']))
290
+ unpacked_sign = _unpack_bools(state['sign'], original_m=d2)
291
+ torch.where(unpacked_sign, mt_buf, -mt_buf, out=mt_buf)
292
+ del unpacked_sign
293
+
294
+ # Update momentum in full-size
295
+ grad_reshaped = grad.view(d1, d2)
296
+ mt_buf.mul_(beta1).add_(grad_reshaped)
297
+
298
+ if nesterov:
299
+ signed_m_buf = torch.sign(grad_reshaped.add(mt_buf, alpha=beta1))
300
+ elif Simplified_AdEMAMix:
301
+ signed_m_buf = torch.sign(mt_buf.add(grad_reshaped, alpha=alpha_grad))
302
+ else:
303
+ signed_m_buf = torch.sign(mt_buf)
304
+ del grad_reshaped
305
+
306
+ update = _newton_schulz_iteration(
307
+ signed_m_buf,
308
+ steps=group['ns_steps'],
309
+ eps=group['ns_eps'],
310
+ coeffs=group['ns_coeffs'],
311
+ )
312
+
313
+ if group['_kourkoutas_beta']:
314
+ # Call prepare_step() once at the beginning of the step for all params
315
+ self._kourkoutas_helper.maybe_prepare_step(current_step)
316
+ # Accumulate current sign-stabilized orthogonal update's norm for the *next* step
317
+ self._kourkoutas_helper.accumulate_gradient_sq_norm(p, update.view(p.shape))
318
+ # Get the dynamic beta2 calculated in prepare_step()
319
+ beta2 = self._kourkoutas_helper.get_beta2(p, group, current_step)
320
+
321
+ # Reconstruct second momentum from previous step's factors
322
+ vt_buf = _unnmf((state['mu_v_nmf'], state['mv_v_nmf']))
323
+
324
+ # Update second momentum in full-size
325
+ vt_buf.mul_(beta2).addcmul_(update, update, value=1 - beta2)
326
+
327
+ # Apply second momentum update (adaptive scaling)
328
+ if group['use_atan2']:
329
+ a = 1.2732395
330
+ denom = vt_buf.sqrt()
331
+ update.atan2_(denom).mul_(a)
332
+ else:
333
+ denom = vt_buf.sqrt().add_(group['eps'])
334
+ update.div_(denom)
335
+ del denom
336
+
337
+ # RMS-aligned rescaling
338
+ rms_target = group['rms_target']
339
+ num_elements = update.numel()
340
+ scaling_factor = rms_target * (num_elements ** 0.5) / (update.norm())
341
+
342
+ update.mul_(scaling_factor)
343
+ update = update.view(p.shape).mul_(group['lr'])
344
+ del num_elements, scaling_factor
345
+
346
+ # Compress updated moments and store new factors
347
+ state['sign'] = _pack_bools(mt_buf > 0)
348
+ _nnmf(mt_buf.abs(), out=(state['mu_m_nmf'], state['mv_m_nmf']))
349
+ del mt_buf
350
+
351
+ _nnmf(vt_buf.abs(), out=(state['mu_v_nmf'], state['mv_v_nmf']))
352
+ del vt_buf
353
+
354
+ else: # Standard AdaMuon logic for non-factored tensors
355
+
356
+ if len(p.shape) >= 2 or state['reshaped_1d_muon']:
357
+
358
+ # Momentum update
359
+ mt_buf = state['momentum_buffer']
360
+ if state['reshaped_1d_muon']:
361
+ d1, d2 = state['effective_shape']
362
+ grad_reshaped = grad.view(d1, d2)
363
+ mt_buf.mul_(beta1).add_(grad_reshaped)
364
+ if nesterov:
365
+ signed_m_buf = torch.sign(grad_reshaped.add(mt_buf, alpha=beta1))
366
+ elif Simplified_AdEMAMix:
367
+ signed_m_buf = torch.sign(mt_buf.add(grad_reshaped, alpha=alpha_grad))
368
+ else:
369
+ signed_m_buf = torch.sign(mt_buf)
370
+ del grad_reshaped
371
+ else:
372
+ mt_buf.mul_(beta1).add_(grad)
373
+ if nesterov:
374
+ signed_m_buf = torch.sign(grad.add(mt_buf, alpha=beta1))
375
+ elif Simplified_AdEMAMix:
376
+ signed_m_buf = torch.sign(mt_buf.add(grad, alpha=alpha_grad))
377
+ else:
378
+ signed_m_buf = torch.sign(mt_buf)
379
+
380
+ # Flatten if necessary (e.g., for Conv layers)
381
+ if len(p.shape) > 2:
382
+ signed_m_buf = signed_m_buf.view(p.shape[0], -1)
383
+
384
+ # NewtonSchulz
385
+ update = _newton_schulz_iteration(
386
+ signed_m_buf,
387
+ steps=group['ns_steps'],
388
+ eps=group['ns_eps'],
389
+ coeffs=group['ns_coeffs'],
390
+ )
391
+
392
+ if len(p.shape) > 2 or state['reshaped_1d_muon']:
393
+ update = update.view(p.shape)
394
+
395
+ if group['_kourkoutas_beta']:
396
+ # Call prepare_step() once at the beginning of the step for all params
397
+ self._kourkoutas_helper.maybe_prepare_step(current_step)
398
+ # Accumulate current sign-stabilized orthogonal update's norm for the *next* step
399
+ self._kourkoutas_helper.accumulate_gradient_sq_norm(p, update)
400
+ # Get the dynamic beta2 calculated in prepare_step()
401
+ beta2 = self._kourkoutas_helper.get_beta2(p, group, current_step)
402
+
403
+ vt_buf = state['second_momentum_buffer']
404
+ vt_buf.mul_(beta2).addcmul_(update, update, value=1 - beta2)
405
+
406
+ # Apply second momentum update (adaptive scaling)
407
+ if group['use_atan2']:
408
+ a = 1.2732395
409
+ denom = vt_buf.sqrt()
410
+ update.atan2_(denom).mul_(a)
411
+ else:
412
+ denom = vt_buf.sqrt().add_(group['eps'])
413
+ update.div_(denom)
414
+ del denom
415
+
416
+ # RMS-aligned rescaling
417
+ rms_target = group['rms_target']
418
+ num_elements = update.numel()
419
+ scaling_factor = rms_target * (num_elements ** 0.5) / (update.norm())
420
+
421
+ update.mul_(scaling_factor)
422
+ del num_elements, scaling_factor
423
+
424
+ update.mul_(group['lr'])
425
+
426
+ else: # Fallback to standard SGD with momentum for 1D params (biases, etc.) when not reshaped
427
+ # Momentum update
428
+ mt_buf = state['momentum_buffer']
429
+ mt_buf.mul_(beta1).add_(grad)
430
+ if nesterov:
431
+ update = grad.add(mt_buf, alpha=beta1)
432
+ elif Simplified_AdEMAMix:
433
+ update = mt_buf.add(grad, alpha=alpha_grad)
434
+ else:
435
+ update = mt_buf.clone()
436
+ update.mul_(group['lr'])
437
+
438
+ # Decoupled weight decay
439
+ if group["weight_decay"] != 0:
440
+ if p.dtype == torch.bfloat16 and self.stochastic_rounding:
441
+ add_stochastic_(p.data, p.data, alpha=-group["weight_decay"] * group["lr"])
442
+ else:
443
+ p.data.add_(p.data, alpha=-group["weight_decay"] * group["lr"])
444
+
445
+ if p.dtype == torch.bfloat16 and self.stochastic_rounding:
446
+ add_stochastic_(p.data, -update)
447
+ else:
448
+ p.data.add_(-update)
449
+ del update
450
+
451
+ state['step'] += 1
452
+
453
+ @torch.no_grad()
454
+ def step(self, closure=None):
455
+ """Performs a single optimization step."""
456
+ loss = None
457
+ if closure is not None:
458
+ with torch.enable_grad():
459
+ loss = closure()
460
+
461
+ for group in self.param_groups:
462
+ for i, p in enumerate(group['params']):
463
+ self.step_parameter(p, group, i)
464
+
465
+ return loss
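The snippet below is an illustrative usage sketch for the newly added AdaMuon_adv class; it is not part of the packaged file above. The model and data are placeholders, and the keyword arguments mirror the defaults shown in the constructor.

import torch
from adv_optm import AdaMuon_adv

# Toy model: AdaMuon applies the orthogonalized, adaptively scaled update to the
# 2D weight and falls back to SGD-with-momentum for the 1D bias.
model = torch.nn.Linear(128, 64)

optimizer = AdaMuon_adv(
    model.parameters(),
    lr=1e-3,
    betas=(0.95, 0.95),
    weight_decay=0.1,
    rms_target=0.2,  # RMS-aligned rescaling, so Adam-style LR schedules can be reused
)

x = torch.randn(32, 128)
loss = model(x).pow(2).mean()
loss.backward()
optimizer.step()
optimizer.zero_grad()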
adv_optm/optim/AdamW_adv.py CHANGED
@@ -73,7 +73,7 @@ class AdamW_adv(torch.optim.Optimizer):
73
73
  logging of Kourkoutas-β statistics (min, max, mean of `β₂` across layers)
74
74
  every logging steps. Useful for debugging and tuning. Set to 0 to disable
75
75
  logging (default: 0).
76
- layer_key_fn (Optional[Callable]): A function that takes a parameter `p`
76
+ layer_key_kb_fn (Optional[Callable]): A function that takes a parameter `p`
77
77
  and returns a unique, hashable key representing its "layer" or "bucket".
78
78
  If `None`, parameters are bucketed by their memory ID (tensor-wise).
79
79
  (default: None)
@@ -105,7 +105,7 @@ class AdamW_adv(torch.optim.Optimizer):
105
105
  tiny_spike: float = 1e-9,
106
106
  k_warmup_steps: int = 0,
107
107
  k_logging: int = 0,
108
- layer_key_fn: Optional[Callable] = None,
108
+ layer_key_kb_fn: Optional[Callable] = None,
109
109
  nnmf_factor: bool = False,
110
110
  _is_delegate: bool = False,
111
111
  ):
@@ -137,7 +137,7 @@ class AdamW_adv(torch.optim.Optimizer):
137
137
  self.use_AdEMAMix = use_AdEMAMix
138
138
  self.factored = nnmf_factor
139
139
  self.kourkoutas_beta = kourkoutas_beta
140
- self.layer_key_fn = layer_key_fn
140
+ self.layer_key_kb_fn = layer_key_kb_fn
141
141
  if not _is_delegate:
142
142
  super().__init__(params, defaults)
143
143
  else:
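Because this release renames the Kourkoutas-β bucketing hook from `layer_key_fn` to `layer_key_kb_fn`, callers that pass a custom bucketing function now need the new keyword. A minimal sketch, assuming AdamW_adv exposes the `kourkoutas_beta` and `layer_key_kb_fn` arguments referenced in the hunks above; the shape-based bucketing lambda is a hypothetical example:

import torch
from adv_optm import AdamW_adv

model = torch.nn.Linear(128, 64)

optimizer = AdamW_adv(
    model.parameters(),
    lr=1e-3,
    kourkoutas_beta=True,                      # enable layer-wise dynamic beta2
    layer_key_kb_fn=lambda p: tuple(p.shape),  # bucket parameters by shape
)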
adv_optm/optim/Adopt_adv.py CHANGED
@@ -91,7 +91,7 @@ class Adopt_adv(torch.optim.Optimizer):
91
91
  logging of Kourkoutas-β statistics (min, max, mean of `β₂` across layers)
92
92
  every logging steps. Useful for debugging and tuning. Set to 0 to disable
93
93
  logging (default: 0).
94
- layer_key_fn (Optional[Callable]): A function that takes a parameter `p`
94
+ layer_key_kb_fn (Optional[Callable]): A function that takes a parameter `p`
95
95
  and returns a unique, hashable key representing its "layer" or "bucket".
96
96
  If `None`, parameters are bucketed by their memory ID (tensor-wise).
97
97
  (default: None)
@@ -125,7 +125,7 @@ class Adopt_adv(torch.optim.Optimizer):
125
125
  tiny_spike: float = 1e-9,
126
126
  k_warmup_steps: int = 0,
127
127
  k_logging: int = 0,
128
- layer_key_fn: Optional[Callable] = None,
128
+ layer_key_kb_fn: Optional[Callable] = None,
129
129
  nnmf_factor: bool = False,
130
130
  ):
131
131
  if not (lr >= 0.0):
@@ -148,9 +148,6 @@ class Adopt_adv(torch.optim.Optimizer):
148
148
  print("Warning: grams is incompatible with Simplified_AdEMAMix, Disabling grams.")
149
149
  if cautious_mask and Simplified_AdEMAMix:
150
150
  print("Warning: cautious is incompatible with Simplified_AdEMAMix, Disabling cautious.")
151
- if use_atan2 and Simplified_AdEMAMix:
152
- print("Warning: use_atan2 is incompatible with Simplified_AdEMAMix. Disabling use_atan2.")
153
- use_atan2 = False
154
151
 
155
152
  defaults = {
156
153
  "lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay,
@@ -169,7 +166,7 @@ class Adopt_adv(torch.optim.Optimizer):
169
166
  self.Simplified_AdEMAMix = Simplified_AdEMAMix
170
167
  self.factored = nnmf_factor
171
168
  self.kourkoutas_beta = kourkoutas_beta
172
- self.layer_key_fn = layer_key_fn
169
+ self.layer_key_kb_fn = layer_key_kb_fn
173
170
  super().__init__(params, defaults)
174
171
 
175
172
  if self.kourkoutas_beta:
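The hunk above removes the guard that disabled `use_atan2` together with `Simplified_AdEMAMix` in Adopt_adv, so the atan2 scaling can now be combined with that update rule. For reference, a standalone sketch of the two denominator variants used in this release (the constant 1.2732395 is roughly 4/pi, matching the AdaMuon_adv code above); the tensors are placeholders:

import torch

m = torch.randn(4, 4)   # numerator (momentum-style term)
v = torch.rand(4, 4)    # second-moment estimate (non-negative)

# eps-based adaptive scaling: m / (sqrt(v) + eps)
eps_update = m / (v.sqrt() + 1e-8)

# atan2-based scaling: bounded output, no eps term needed
atan2_update = torch.atan2(m, v.sqrt()) * 1.2732395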
adv_optm/optim/Muon_adv.py CHANGED
@@ -22,7 +22,7 @@ class Muon_adv(torch.optim.Optimizer):
22
22
  can handle other-dimensional parameters (e.g., 1D bias, 4D convolutional layers) by
23
23
  flattening/reshaping them.
24
24
 
25
- This version can also operate in a hybrid mode, using an auxiliary AdamW
25
+ It can also operate in a hybrid mode, using an auxiliary AdamW
26
26
  optimizer for specific parameters (e.g., biases, norms, embeddings) as
27
27
  defined by a `layer_key_fn`.
28
28
 
@@ -38,6 +38,14 @@ class Muon_adv(torch.optim.Optimizer):
38
38
  ns_coeffs (tuple[float, float, float]): The (a, b, c) coefficients for the
39
39
  quintic polynomial in the Newton-Schulz iteration.
40
40
  (default: (3.4445, -4.7750, 2.0315)).
41
+ Simplified_AdEMAMix (bool): whether to use the Simplified AdEMAMix update rule.
42
+ This changes the update to `alpha_grad * grad + mt`, which can be
43
+ more responsive, especially for small batch sizes. (default: False)
44
+ alpha_grad (float): Mixing coefficient for the Simplified AdEMAMix update rule
45
+ (only used when `Simplified_AdEMAMix` is `True`). Controls the weight of the
46
+ current gradient. For small batch sizes, use high values (e.g., 10-100) to be
47
+ more responsive. For large batch sizes, use low values (e.g., 0-1) for
48
+ stability. (default: 100.0)
41
49
  stochastic_rounding (bool): whether to use stochastic rounding for
42
50
  BF16 parameter updates (default: True).
43
51
  vector_reshape_muon (bool): whether to reshape 1D vectors into 2D
@@ -68,9 +76,11 @@ class Muon_adv(torch.optim.Optimizer):
68
76
  ns_steps: int = 5,
69
77
  ns_eps: float = 1e-7,
70
78
  ns_coeffs: tuple[float, float, float] = (3.4445, -4.7750, 2.0315),
79
+ Simplified_AdEMAMix: bool = False,
80
+ alpha_grad: float = 100.0,
71
81
  stochastic_rounding: bool = True,
72
82
  vector_reshape_muon: bool = False,
73
- vector_reshape: bool = True,
83
+ vector_reshape: bool = False,
74
84
  nnmf_factor: bool = False,
75
85
  # hybrid optimizer mode
76
86
  MuonWithAuxAdam: bool = False,
@@ -86,13 +96,17 @@ class Muon_adv(torch.optim.Optimizer):
86
96
  raise ValueError(f"Weight-decay should be >= 0.0. Got {weight_decay}")
87
97
  if not (ns_steps > 0):
88
98
  raise ValueError(f"Newton-Schulz steps should be > 0. Got {ns_steps}")
99
+ if Simplified_AdEMAMix and nesterov:
100
+ print("Warning: nesterov is incompatible with Simplified_AdEMAMix, Disabling cautious.")
101
+ nesterov = False
89
102
 
90
- defaults = {
103
+ muon_defaults = {
91
104
  "lr": lr, "beta1": beta1, "weight_decay": weight_decay,
92
105
  "nesterov": nesterov, "ns_steps": ns_steps, "ns_eps": ns_eps,
93
106
  "ns_coeffs": ns_coeffs, "nnmf_factor": nnmf_factor,
94
107
  "vector_reshape": vector_reshape,
95
108
  "vector_reshape_muon": vector_reshape_muon,
109
+ "Simplified_AdEMAMix": Simplified_AdEMAMix, "alpha_grad": alpha_grad,
96
110
  }
97
111
  self.stochastic_rounding = stochastic_rounding
98
112
 
@@ -100,23 +114,41 @@ class Muon_adv(torch.optim.Optimizer):
100
114
  self.helper = None
101
115
  self.aux_adam = None
102
116
 
103
- if self.MuonWithAuxAdam:
104
- adam_kwargs = adam_kwargs or {}
105
- # Create a delegate AdamW optimizer to get its default hyperparameters.
106
- self.aux_adam = AdamW_adv(
107
- [],
108
- lr=muon_adam_lr,
109
- **adam_kwargs,
110
- _is_delegate=True
111
- )
112
- # Update the defaults dictionary
113
- defaults.update(self.aux_adam.defaults)
114
-
115
- super().__init__(params, defaults)
117
+ if not self.MuonWithAuxAdam:
118
+ super().__init__(params, muon_defaults)
119
+ return
116
120
 
117
- if self.MuonWithAuxAdam:
118
- self.helper = MuonAdamHelper(self, layer_key_fn)
121
+ # HYBRID OPTIMIZER LOGIC
122
+ adam_kwargs = adam_kwargs or {}
123
+ self.aux_adam = AdamW_adv(
124
+ [],
125
+ lr=muon_adam_lr,
126
+ **adam_kwargs,
127
+ _is_delegate=True
128
+ )
129
+ adam_defaults = self.aux_adam.defaults
130
+
131
+ final_param_groups = []
132
+ _layer_key_fn = layer_key_fn if layer_key_fn is not None else lambda p: 'muon'
133
+
134
+ for group in params:
135
+ first_param = group['params'][0]
136
+ key = _layer_key_fn(first_param)
137
+ optim_type = 'adam' if key == 'adam' else 'muon'
138
+
139
+ new_group = group.copy()
140
+ defaults_to_use = adam_defaults if optim_type == 'adam' else muon_defaults
141
+
142
+ for key, value in defaults_to_use.items():
143
+ new_group.setdefault(key, value)
144
+
145
+ final_param_groups.append(new_group)
146
+
147
+ super().__init__(final_param_groups, {})
119
148
 
149
+ # Now that self is initialized, create the helper
150
+ self.helper = MuonAdamHelper(self, layer_key_fn)
151
+
120
152
 
121
153
  @property
122
154
  def supports_fused_back_pass(self):
@@ -130,6 +162,16 @@ class Muon_adv(torch.optim.Optimizer):
130
162
  def supports_flat_params(self):
131
163
  return False
132
164
 
165
+ @property
166
+ def kourkoutas_helper(self):
167
+ """
168
+ Exposes the kourkoutas_helper from the auxiliary AdamW optimizer,
169
+ if it exists. This allows external access for logging K-b.
170
+ """
171
+ if self.aux_adam and hasattr(self.aux_adam, 'kourkoutas_helper'):
172
+ return self.aux_adam.kourkoutas_helper
173
+ return None
174
+
133
175
  @torch.no_grad()
134
176
  def step_parameter(self, p: torch.Tensor, group: dict, i: int | None = None):
135
177
  if self.MuonWithAuxAdam:
@@ -165,7 +207,7 @@ class Muon_adv(torch.optim.Optimizer):
165
207
 
166
208
  dtype = torch.float32 if group['nnmf_factor'] else p.dtype
167
209
  device = p.device
168
- if group['vector_reshape'] or state['reshaped_1d_muon']:
210
+ if state['factored'] or state['reshaped_1d_muon']:
169
211
  state['effective_shape'] = _get_effective_shape(p.numel())
170
212
  d1, d2 = state['effective_shape']
171
213
  if state['factored']:
@@ -183,6 +225,8 @@ class Muon_adv(torch.optim.Optimizer):
183
225
 
184
226
  beta1 = group['beta1']
185
227
  nesterov = group['nesterov']
228
+ Simplified_AdEMAMix = group['Simplified_AdEMAMix']
229
+ alpha_grad = group['alpha_grad']
186
230
 
187
231
  if state['factored']: # Factored Muon
188
232
 
@@ -200,6 +244,8 @@ class Muon_adv(torch.optim.Optimizer):
200
244
  if nesterov:
201
245
  # Nesterov momentum
202
246
  update = grad_reshaped.add(mt_buf, alpha=beta1)
247
+ elif Simplified_AdEMAMix:
248
+ update = torch.add(mt_buf, grad_reshaped, alpha=alpha_grad)
203
249
  else:
204
250
  # Standard momentum
205
251
  update = mt_buf.clone()
@@ -238,6 +284,12 @@ class Muon_adv(torch.optim.Optimizer):
238
284
  del grad_reshaped
239
285
  else:
240
286
  update = grad.add(mt_buf, alpha=beta1)
287
+ elif Simplified_AdEMAMix:
288
+ if state['reshaped_1d_muon']:
289
+ update = torch.add(mt_buf, grad_reshaped, alpha=alpha_grad)
290
+ del grad_reshaped
291
+ else:
292
+ update = torch.add(mt_buf, grad, alpha=alpha_grad)
241
293
  else:
242
294
  # Standard momentum
243
295
  update = mt_buf.clone()
@@ -267,6 +319,8 @@ class Muon_adv(torch.optim.Optimizer):
267
319
  if nesterov:
268
320
  # Nesterov momentum
269
321
  update = grad.add(mt_buf, alpha=beta1)
322
+ elif Simplified_AdEMAMix:
323
+ update = torch.add(mt_buf, grad, alpha=alpha_grad)
270
324
  else:
271
325
  # Standard momentum
272
326
  update = mt_buf.clone()
@@ -299,4 +353,4 @@ class Muon_adv(torch.optim.Optimizer):
299
353
  for i, p in enumerate(group['params']):
300
354
  self.step_parameter(p, group, i)
301
355
 
302
- return loss
356
+ return loss
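A minimal sketch of the hybrid MuonWithAuxAdam mode described in the docstrings above. Because the hybrid constructor iterates `group['params']` and keys each group off its first parameter, parameters are passed here as one-param group dicts; the routing function is a hypothetical example that sends 1D tensors (biases, norm weights) to the auxiliary AdamW_adv:

import torch
from adv_optm import Muon_adv

model = torch.nn.Sequential(torch.nn.Linear(128, 64), torch.nn.LayerNorm(64))

def route(p: torch.Tensor) -> str:
    # 'adam' -> handled by the auxiliary AdamW_adv, anything else -> Muon
    return 'adam' if p.dim() == 1 else 'muon'

param_groups = [{'params': [p]} for p in model.parameters()]

optimizer = Muon_adv(
    param_groups,
    lr=1e-3,
    MuonWithAuxAdam=True,
    layer_key_fn=route,
    muon_adam_lr=1e-4,  # learning rate for the auxiliary AdamW_adv
)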
adv_optm/optim/Prodigy_adv.py CHANGED
@@ -109,7 +109,7 @@ class Prodigy_adv(torch.optim.Optimizer):
109
109
  logging of Kourkoutas-β statistics (min, max, mean of `β₂` across layers)
110
110
  every logging steps. Useful for debugging and tuning. Set to 0 to disable
111
111
  logging (default: 0).
112
- layer_key_fn (Optional[Callable]): A function that takes a parameter `p`
112
+ layer_key_kb_fn (Optional[Callable]): A function that takes a parameter `p`
113
113
  and returns a unique, hashable key representing its "layer" or "bucket".
114
114
  If `None`, parameters are bucketed by their memory ID (tensor-wise).
115
115
  (default: None)
@@ -152,7 +152,7 @@ class Prodigy_adv(torch.optim.Optimizer):
152
152
  tiny_spike: float = 1e-9,
153
153
  k_warmup_steps: int = 0,
154
154
  k_logging: int = 0,
155
- layer_key_fn: Optional[Callable] = None,
155
+ layer_key_kb_fn: Optional[Callable] = None,
156
156
  ):
157
157
  if not (lr >= 0.0):
158
158
  raise ValueError(f"Learning-rate should be >= 0.0. Got {lr}")
@@ -205,7 +205,7 @@ class Prodigy_adv(torch.optim.Optimizer):
205
205
  self.fsdp_in_use = fsdp_in_use
206
206
 
207
207
  self.kourkoutas_beta = kourkoutas_beta
208
- self.layer_key_fn = layer_key_fn
208
+ self.layer_key_kb_fn = layer_key_kb_fn
209
209
 
210
210
  super().__init__(params, defaults)
211
211
  if self.kourkoutas_beta:
adv_optm/optim/Simplified_AdEMAMix.py CHANGED
@@ -67,7 +67,7 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
67
67
  logging of Kourkoutas-β statistics (min, max, mean of `β₂` across layers)
68
68
  every logging steps. Useful for debugging and tuning. Set to 0 to disable
69
69
  logging (default: 0).
70
- layer_key_fn (Optional[Callable]): A function that takes a parameter `p`
70
+ layer_key_kb_fn (Optional[Callable]): A function that takes a parameter `p`
71
71
  and returns a unique, hashable key representing its "layer" or "bucket".
72
72
  If `None`, parameters are bucketed by their memory ID (tensor-wise).
73
73
  (default: None)
@@ -95,7 +95,7 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
95
95
  tiny_spike: float = 1e-9,
96
96
  k_warmup_steps: int = 0,
97
97
  k_logging: int = 0,
98
- layer_key_fn: Optional[Callable] = None,
98
+ layer_key_kb_fn: Optional[Callable] = None,
99
99
  nnmf_factor: bool = False,
100
100
  ):
101
101
  if not (lr >= 0.0):
@@ -121,7 +121,7 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
121
121
  self.stochastic_rounding = stochastic_rounding
122
122
  self.factored = nnmf_factor
123
123
  self.kourkoutas_beta = kourkoutas_beta
124
- self.layer_key_fn = layer_key_fn
124
+ self.layer_key_kb_fn = layer_key_kb_fn
125
125
  super().__init__(params, defaults)
126
126
 
127
127
  if self.kourkoutas_beta:
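The Simplified AdEMAMix option added to Muon_adv and AdaMuon_adv in this release is documented above as changing the update to `alpha_grad * grad + mt`. A standalone sketch of that mixing step, mirroring the `torch.add(mt_buf, grad, alpha=alpha_grad)` calls in the diff; the tensors and hyperparameters are placeholders:

import torch

beta1, alpha_grad = 0.95, 100.0
grad = torch.randn(64, 64)
mt_buf = torch.zeros(64, 64)

# accumulator-style first moment, as in the diff: mt <- beta1 * mt + grad
mt_buf.mul_(beta1).add_(grad)

# Simplified AdEMAMix mixing: mt + alpha_grad * grad. High alpha_grad weights
# the current gradient heavily (suited to small batches); low values favour
# the momentum buffer (suited to large batches).
update = torch.add(mt_buf, grad, alpha=alpha_grad)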
adv_optm/optim/__init__.py CHANGED
@@ -5,6 +5,7 @@ from .Simplified_AdEMAMix import Simplified_AdEMAMix
5
5
  from .Lion_adv import Lion_adv
6
6
  from .Lion_Prodigy_adv import Lion_Prodigy_adv
7
7
  from .Muon_adv import Muon_adv
8
+ from .AdaMuon_adv import AdaMuon_adv
8
9
 
9
10
  __all__ = [
10
11
  "AdamW_adv",
@@ -14,4 +15,5 @@ __all__ = [
14
15
  "Lion_adv",
15
16
  "Lion_Prodigy_adv",
16
17
  "Muon_adv",
18
+ "AdaMuon_adv",
17
19
  ]
adv_optm/util/Kourkoutas.py CHANGED
@@ -32,12 +32,12 @@ class KourkoutasHelper:
32
32
  if self._layer_info_built:
33
33
  return
34
34
 
35
- if hasattr(self.optimizer, 'layer_key_fn') and self.optimizer.layer_key_fn is not None:
35
+ if hasattr(self.optimizer, 'layer_key_kb_fn') and self.optimizer.layer_key_kb_fn is not None:
36
36
  # A custom key function was provided by the user. We will use it.
37
37
  pass
38
38
  else:
39
39
  # No key function was provided. Default to coarse, shape-based bucketing.
40
- self.optimizer.layer_key_fn = lambda p: \
40
+ self.optimizer.layer_key_kb_fn = lambda p: \
41
41
  (id(p),) if p.dim() == 2 and 1 <= p.shape[0] <= 10 and p.shape[1] in {768, 1280, 4096} \
42
42
  else tuple(p.shape)
43
43
  # This ensures that we won't mix embeddings with tokens (1 to 10)
@@ -46,7 +46,7 @@ class KourkoutasHelper:
46
46
  for group in self.optimizer.param_groups:
47
47
  for p in group['params']:
48
48
  # The mapping is static and should not depend on the presence of a gradient.
49
- layer_key = self.optimizer.layer_key_fn(p)
49
+ layer_key = self.optimizer.layer_key_kb_fn(p)
50
50
  if layer_key not in self.layer_info:
51
51
  self.layer_info[layer_key] = {'params': [], 'group_ref': group}
52
52
  self.layer_info[layer_key]['params'].append(p)
@@ -158,7 +158,7 @@ class KourkoutasHelper:
158
158
  """
159
159
  Accumulates the squared L2 norm of a single gradient for the next step's calculation.
160
160
  """
161
- layer_key = self.optimizer.layer_key_fn(p)
161
+ layer_key = self.optimizer.layer_key_kb_fn(p)
162
162
 
163
163
  if layer_key in self.layer_info:
164
164
  # Initialize the transient state for this layer if it's the first time in the step.
@@ -173,6 +173,6 @@ class KourkoutasHelper:
173
173
  """
174
174
  Gets the appropriate beta2 for the current parameter, handling warmup and dynamic value fetching.
175
175
  """
176
- layer_key = self.optimizer.layer_key_fn(p)
176
+ layer_key = self.optimizer.layer_key_kb_fn(p)
177
177
  # The default is the max value, which is correct for unmapped params or edge cases
178
178
  return self.layer_state.get(layer_key, {}).get('dynamic_beta2', group['betas'][1])
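To make the renamed hook concrete, the default shape-based bucketing that KourkoutasHelper installs as `layer_key_kb_fn` (copied from the hunk above) can be exercised on its own; the tensors below are placeholders:

import torch

# Default bucketing when no layer_key_kb_fn is supplied: small 2D tensors with
# common hidden sizes (768/1280/4096) get a unique per-tensor key, everything
# else is bucketed by its shape.
layer_key_kb_fn = lambda p: \
    (id(p),) if p.dim() == 2 and 1 <= p.shape[0] <= 10 and p.shape[1] in {768, 1280, 4096} \
    else tuple(p.shape)

w = torch.randn(4, 768)   # gets its own (id,) bucket
b = torch.randn(768)      # shares the (768,) bucket with same-shaped tensors
print(layer_key_kb_fn(w), layer_key_kb_fn(b))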
adv_optm/util/MuonAdam_helper.py CHANGED
@@ -1,3 +1,4 @@
1
+ import torch
1
2
  from torch.optim import Optimizer
2
3
  from typing import Callable, Optional
3
4
 
adv_optm-1.2.dev4.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: adv_optm
3
- Version: 1.2.dev2
3
+ Version: 1.2.dev4
4
4
  Summary: A family of highly efficient, lightweight yet powerful optimizers.
5
5
  Home-page: https://github.com/Koratahiu/Advanced_Optimizers
6
6
  Author: Koratahiu
adv_optm-1.2.dev4.dist-info/RECORD ADDED
@@ -0,0 +1,24 @@
1
+ adv_optm/__init__.py,sha256=bB7_VywKpvZbcGCjtZoF8giQgcUgoziISBgIaEUpcAw,379
2
+ adv_optm/optim/AdaMuon_adv.py,sha256=s5UkR2YJ_Z10SiBokT97eq4tCHc2D8BEOFDx5AOMryQ,20983
3
+ adv_optm/optim/AdamW_adv.py,sha256=7IvdD1rqYeHZwQCZU9X0H7x87MCKcHQ5M68GLuMCkvE,17702
4
+ adv_optm/optim/Adopt_adv.py,sha256=C2FsEZGvCk9q4YNKAj0qIxdZ5AfPlda-1lIpSX0a1nE,21256
5
+ adv_optm/optim/Lion_Prodigy_adv.py,sha256=LEA3UYJpPeFnmxeniLNv1u2LKKj4ufx3Bq_MLw-nWXk,14617
6
+ adv_optm/optim/Lion_adv.py,sha256=aGNAplZlyXYgVllYcV_s4bK8iC4fv6EizFoWIMNLdBc,8299
7
+ adv_optm/optim/Muon_adv.py,sha256=vB-Eeh0IqYMd3lkQvIPEbH256bTyYO73OgIzn0N2VCk,14985
8
+ adv_optm/optim/Prodigy_adv.py,sha256=bmwuO8GrJHH4NaEaqE-ffcR9wHhQ57457xoN-P6hyks,25909
9
+ adv_optm/optim/Simplified_AdEMAMix.py,sha256=sY-vThMVgADRh0ar9WHkrM2n8UcgQLQC1YV1Wx8uFz4,12983
10
+ adv_optm/optim/__init__.py,sha256=hpUWE6CKtt_rvMdgQVb3PtjhfZAvAxTq6hp8H8rIpBo,489
11
+ adv_optm/util/BF16_Stochastic_Rounding.py,sha256=Q5H0BcogmE4atP65dLoI21HKSf50lRdsBDfeF6v9Tbg,1548
12
+ adv_optm/util/Effective_Shape.py,sha256=TBvIk1V8IuTbbBsxuekJA4e_v8JlR5Nujtut8RTWAm4,318
13
+ adv_optm/util/Kourkoutas.py,sha256=MDQaNVH8jqzaefks2RShveo44dpYDz88WStwUJ3iF0s,8724
14
+ adv_optm/util/MuonAdam_helper.py,sha256=7rnNMujZVDaqo1g22QscMyPlZvIHQQSLHMED9_I8QWU,1250
15
+ adv_optm/util/NNMF.py,sha256=yRf5IP5Sjq0Uf0DxN0Q8NxEGSdD-f1ULziLVDOjY8K4,639
16
+ adv_optm/util/Newton_Schulz.py,sha256=wJ_sKRaGVIsOofQ737my4ng494qX_pfgOqlDDmYtnCg,1377
17
+ adv_optm/util/One_Bit_Boolean.py,sha256=Wat49esdwohuN-OHOFMW8D0aOQgV9cP5Rl8z6yfmpos,1068
18
+ adv_optm/util/OrthoGrad.py,sha256=NzInuBQGy_Ja__M1R9XbvqVaQ0fhGbtGgFE9YON7B3I,707
19
+ adv_optm/util/__init__.py,sha256=jAaUfaAjFrTJ6-Q915ezAbq0efRbpYjriW2OdeCbSzo,433
20
+ adv_optm-1.2.dev4.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
21
+ adv_optm-1.2.dev4.dist-info/METADATA,sha256=jNczVxIPq0LuusXuGrZ23CQ4CrMNOfJdBDpDQgulMUw,14022
22
+ adv_optm-1.2.dev4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
23
+ adv_optm-1.2.dev4.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
24
+ adv_optm-1.2.dev4.dist-info/RECORD,,
adv_optm-1.2.dev2.dist-info/RECORD DELETED
@@ -1,23 +0,0 @@
1
- adv_optm/__init__.py,sha256=THWhNF8-PI71K9Au4xAkuDs96YcEagJ-yT5r_g2-yKw,341
2
- adv_optm/optim/AdamW_adv.py,sha256=Zym0beeu0ye5_PgpAjpzcYghdPYFWs3gQzDmuPZVR80,17690
3
- adv_optm/optim/Adopt_adv.py,sha256=NXbtPrGm3tZr06cApi5oEHZ2F1zwss3tRi15SGnrYPc,21426
4
- adv_optm/optim/Lion_Prodigy_adv.py,sha256=LEA3UYJpPeFnmxeniLNv1u2LKKj4ufx3Bq_MLw-nWXk,14617
5
- adv_optm/optim/Lion_adv.py,sha256=aGNAplZlyXYgVllYcV_s4bK8iC4fv6EizFoWIMNLdBc,8299
6
- adv_optm/optim/Muon_adv.py,sha256=9K5YR3odaGfDDZzasletHRlqxG8xN9IXj6oiqx1CaEI,12423
7
- adv_optm/optim/Prodigy_adv.py,sha256=0_XG5YnMQTv-zJysJHlJniSo5kGYdX3p3o1e33HLt78,25897
8
- adv_optm/optim/Simplified_AdEMAMix.py,sha256=nEIA3yM11nBooKzHudB5l3x4UdFRBYRwiKVUkGmO0K8,12971
9
- adv_optm/optim/__init__.py,sha256=3o2XJ4J-PUq3rJM2mBnmuHwbKNb4LuW-Ig_9aBC0ycc,431
10
- adv_optm/util/BF16_Stochastic_Rounding.py,sha256=Q5H0BcogmE4atP65dLoI21HKSf50lRdsBDfeF6v9Tbg,1548
11
- adv_optm/util/Effective_Shape.py,sha256=TBvIk1V8IuTbbBsxuekJA4e_v8JlR5Nujtut8RTWAm4,318
12
- adv_optm/util/Kourkoutas.py,sha256=woyJfX7l4eieeg0pC5XrILBLvwECwbD3a6ou1K6qjKU,8706
13
- adv_optm/util/MuonAdam_helper.py,sha256=llPCc9MBFen_wodbY4G2E17tBZky8clDiJSZLHkMva8,1236
14
- adv_optm/util/NNMF.py,sha256=yRf5IP5Sjq0Uf0DxN0Q8NxEGSdD-f1ULziLVDOjY8K4,639
15
- adv_optm/util/Newton_Schulz.py,sha256=wJ_sKRaGVIsOofQ737my4ng494qX_pfgOqlDDmYtnCg,1377
16
- adv_optm/util/One_Bit_Boolean.py,sha256=Wat49esdwohuN-OHOFMW8D0aOQgV9cP5Rl8z6yfmpos,1068
17
- adv_optm/util/OrthoGrad.py,sha256=NzInuBQGy_Ja__M1R9XbvqVaQ0fhGbtGgFE9YON7B3I,707
18
- adv_optm/util/__init__.py,sha256=jAaUfaAjFrTJ6-Q915ezAbq0efRbpYjriW2OdeCbSzo,433
19
- adv_optm-1.2.dev2.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
20
- adv_optm-1.2.dev2.dist-info/METADATA,sha256=JTCPGBJUd4JR7DU26AhX8qSPzWrSVtEwv9Au7I3iEPY,14022
21
- adv_optm-1.2.dev2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
22
- adv_optm-1.2.dev2.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
23
- adv_optm-1.2.dev2.dist-info/RECORD,,