adv-optm 1.2.dev12__py3-none-any.whl → 1.2.dev14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

adv_optm/__init__.py CHANGED
@@ -20,4 +20,4 @@ __all__ = [
  "AdaMuon_adv",
  ]
 
- __version__ = "1.2.dev12"
+ __version__ = "1.2.dev14"
adv_optm/optim/AdaMuon_adv.py CHANGED
@@ -1,13 +1,12 @@
  import torch
- from typing import Optional
-
- from .AdamW_adv import AdamW_adv
 
  from ..util.BF16_Stochastic_Rounding import add_stochastic_
  from ..util.Newton_Schulz import _newton_schulz_iteration
  from ..util.Effective_Shape import _get_effective_shape
  from ..util.NNMF import _nnmf,_unnmf
  from ..util.One_Bit_Boolean import _pack_bools, _unpack_bools
+ from ..util.OrthoGrad import _orthogonalize_gradient
+ from ..util.Kourkoutas import KourkoutasHelper
 
  class AdaMuon_adv(torch.optim.Optimizer):
  """
@@ -25,6 +24,8 @@ class AdaMuon_adv(torch.optim.Optimizer):
  3. An RMS-aligned rescaling strategy to match the update magnitude of Adam,
  allowing for reuse of learning rate schedules.
 
+ When `MuonWithAuxAdam` is enabled, this single optimizer class handles both
+ 'muon' and 'adam' parameter groups, dispatching to the appropriate logic internally.
 
  Args:
  params (iterable): iterable of parameters to optimize or dicts defining
@@ -55,8 +56,6 @@ class AdaMuon_adv(torch.optim.Optimizer):
  current gradient. For small batch sizes, use high values (e.g., 10-100) to be
  more responsive. For large batch sizes, use low values (e.g., 0-1) for
  stability. (default: 100.0)
- vector_reshape_muon (bool): whether to reshape 1D vectors into 2D
- matrices for muon NewtonSchulz (default: False).
  vector_reshape (bool): whether to reshape 1D vectors into 2D
  matrices to apply low-rank compression (default: True).
  low_rank_ortho (bool): If True, enables low-rank orthogonalization, which
@@ -66,6 +65,19 @@ class AdaMuon_adv(torch.optim.Optimizer):
  (default: 128)
  nnmf_factor (bool): whether to use the factorization or disable it to use
  the uncompressed optimizer. (default: False)
+ --- Auxiliary AdamW_adv Parameters (used for 'adam' groups) ---
+ adam_betas (tuple[float, float]): Betas for the AdamW optimizer part.
+ adam_eps (float): Epsilon for the AdamW optimizer part.
+ adam_weight_decay (float): Weight decay for the AdamW optimizer part.
+ adam_use_bias_correction (bool): Bias correction for AdamW.
+ adam_use_atan2 (bool): Atan2 update rule for AdamW.
+ adam_cautious_mask (bool): Cautious masking for AdamW.
+ adam_grams_moment (bool): Grams-style updates for AdamW.
+ adam_orthogonal_gradient (bool): OrthoGrad for AdamW.
+ adam_use_AdEMAMix (bool): AdEMAMix for AdamW.
+ adam_beta3_ema (float): Beta3 for AdEMAMix.
+ adam_alpha (float): Alpha for AdEMAMix.
+ adam_kourkoutas_beta (bool): Kourkoutas-β for AdamW.
  """
 
  def __init__(
@@ -79,17 +91,36 @@ class AdaMuon_adv(torch.optim.Optimizer):
  ns_steps: int = 5,
  ns_eps: float = 1e-7,
  ns_coeffs: tuple[float, float, float] = (3.4445, -4.7750, 2.0315),
- stochastic_rounding: bool = True,
+ stochastic_rounding: bool = False,
  use_atan2: bool = False,
  nesterov: bool = False,
  Simplified_AdEMAMix: bool = False,
  alpha_grad: float = 100.0,
- vector_reshape_muon: bool = False,
  vector_reshape: bool = False,
  # Low-rank Muon
  low_rank_ortho: bool = False,
  ortho_rank: int = 128,
  nnmf_factor: bool = False,
+ # Compiled
+ compiled_optimizer: bool = False,
+ # --- AdamW_adv specific parameters ---
+ adam_betas: tuple[float, float] = (0.9, 0.99),
+ adam_eps: float = 1e-8,
+ adam_weight_decay: float = 0.0,
+ adam_use_bias_correction: bool = True,
+ adam_use_atan2: bool = False,
+ adam_cautious_mask: bool = False,
+ adam_grams_moment: bool = False,
+ adam_orthogonal_gradient: bool = False,
+ adam_use_AdEMAMix: bool = False,
+ adam_beta3_ema: float = 0.9999,
+ adam_alpha: float = 5.0,
+ adam_kourkoutas_beta: bool = False,
+ adam_beta2_min: float = 0.9,
+ adam_ema_alpha: float = 0.95,
+ adam_tiny_spike: float = 1e-9,
+ adam_k_warmup_steps: int = 0,
+ adam_nnmf_factor: bool = False,
  ):
  if not (lr >= 0.0):
  raise ValueError(f"Learning-rate should be >= 0.0. Got {lr}")
@@ -98,7 +129,7 @@ class AdaMuon_adv(torch.optim.Optimizer):
  if not (ns_steps > 0):
  raise ValueError(f"Newton-Schulz steps should be > 0. Got {ns_steps}")
  if Simplified_AdEMAMix and nesterov:
- print("Warning: nesterov is incompatible with Simplified_AdEMAMix, Disabling cautious.")
+ print("Warning: nesterov is incompatible with Simplified_AdEMAMix, Disabling nesterov.")
  nesterov = False
 
  defaults = {
@@ -106,16 +137,40 @@ class AdaMuon_adv(torch.optim.Optimizer):
  "eps": eps, "rms_target": rms_target, "ns_steps": ns_steps,
  "ns_eps": ns_eps, "ns_coeffs": ns_coeffs, "nnmf_factor": nnmf_factor,
  "vector_reshape": vector_reshape,
- "vector_reshape_muon": vector_reshape_muon,
  "nesterov":nesterov, "use_atan2":use_atan2,
  "Simplified_AdEMAMix": Simplified_AdEMAMix, "alpha_grad": alpha_grad,
  # Low-rank Ortho
  "low_rank_ortho": low_rank_ortho, "ortho_rank": ortho_rank,
+ "compiled_optimizer":compiled_optimizer,
+ # AdamW_adv defaults
+ "adam_betas": adam_betas, "adam_eps": adam_eps, "adam_weight_decay": adam_weight_decay,
+ "adam_use_bias_correction": adam_use_bias_correction, "adam_use_atan2": adam_use_atan2,
+ "adam_cautious_mask": adam_cautious_mask, "adam_grams_moment": adam_grams_moment,
+ "adam_orthogonal_gradient": adam_orthogonal_gradient,
+ "adam_use_AdEMAMix": adam_use_AdEMAMix, "adam_beta3_ema": adam_beta3_ema, "adam_alpha": adam_alpha,
+ "adam_kourkoutas_beta": adam_kourkoutas_beta, "adam_beta2_min": adam_beta2_min,
+ "adam_ema_alpha": adam_ema_alpha, "adam_tiny_spike": adam_tiny_spike,
+ "adam_k_warmup_steps": adam_k_warmup_steps, "adam_nnmf_factor": adam_nnmf_factor,
  }
  self.stochastic_rounding = stochastic_rounding
 
  super().__init__(params, defaults)
-
+
+ self.global_step = 0 # For Adam bias correction and Kourkoutas
+ self.kourkoutas_helper = None
+ if any(group.get('adam_kourkoutas_beta', False) for group in self.param_groups):
+ self.kourkoutas_helper = KourkoutasHelper(self)
+
+ self.init_step()
+
+ # Initialize compiled functions to None
+ self._compiled_muon_step = None
+ self._compiled_adam_step = None
+
+ if compiled_optimizer:
+ print("Compiling AdaMuon_adv optimizer paths...")
+ torch._dynamo.config.cache_size_limit = 8192
+ self.compile(fullgraph=True)
 
  @property
  def supports_fused_back_pass(self):
@@ -129,50 +184,80 @@ class AdaMuon_adv(torch.optim.Optimizer):
129
184
  def supports_flat_params(self):
130
185
  return False
131
186
 
132
- @torch.no_grad()
133
- def step_parameter(self, p: torch.Tensor, group: dict, i: int | None = None):
134
- if p.grad is None:
135
- return
187
+ def init_step(self):
188
+ for group in self.param_groups:
189
+ for i, p in enumerate(group['params']):
190
+ self.__init_state(p, group)
136
191
 
137
- grad = p.grad
192
+ @torch.no_grad()
193
+ def __init_state(self, p, group):
138
194
  state = self.state[p]
139
195
 
196
+ if len(state) > 0:
197
+ return
140
198
 
141
- # State Initialization
142
- if 'step' not in state:
143
- state['step'] = 0
199
+ optim_type = group.get('optim_type', 'muon')
144
200
 
145
- should_factor = (
201
+ if optim_type == 'muon':
202
+
203
+ state['factored'] = (
146
204
  group['nnmf_factor'] and
147
205
  not (len(p.shape) == 1 and not group['vector_reshape'])
148
206
  )
149
-
150
- state['factored'] = should_factor
151
-
152
- state['reshaped_1d_muon'] = len(p.shape) == 1 and group['vector_reshape_muon']
153
-
154
- dtype = torch.float32 if group['nnmf_factor'] else p.dtype
207
+ dtype = torch.float32 if state['factored'] else p.dtype
155
208
  device = p.device
156
- if state['factored'] or state['reshaped_1d_muon']:
209
+
210
+ if state['factored']:
157
211
  state['effective_shape'] = _get_effective_shape(p.numel())
158
212
  d1, d2 = state['effective_shape']
159
- if state['factored']:
160
- state['mu_m_nmf'] = torch.zeros(d1, device=device, dtype=dtype)
161
- state['mv_m_nmf'] = torch.zeros(d2, device=device, dtype=dtype)
213
+ state['mu_mbuf_nmf'] = torch.zeros(d1, device=device, dtype=dtype)
214
+ state['mv_mbuf_nmf'] = torch.zeros(d2, device=device, dtype=dtype)
162
215
  packed_d2 = (d2 + 7) // 8
163
- state['sign'] = torch.zeros((d1, packed_d2), dtype=torch.uint8, device=device)
164
- state['mu_v_nmf'] = torch.zeros(d1, device=device, dtype=dtype)
165
- state['mv_v_nmf'] = torch.zeros(d2, device=device, dtype=dtype)
216
+ state['sign_buf'] = torch.zeros((d1, packed_d2), dtype=torch.uint8, device=device)
217
+ state['mu_vbuf_nmf'] = torch.zeros(d1, device=device, dtype=dtype)
218
+ state['mv_vbuf_nmf'] = torch.zeros(d2, device=device, dtype=dtype)
166
219
  else:
167
220
  if len(p.shape) >= 2:
168
- state['momentum_buffer'] = torch.zeros_like(p)
169
221
  state['second_momentum_buffer'] = torch.zeros_like(p)
170
- if state['reshaped_1d_muon']:
171
- state['momentum_buffer'] = torch.zeros((d1, d2), device=device, dtype=dtype)
172
- state['second_momentum_buffer'] = torch.zeros((d1, d2), device=device, dtype=dtype)
173
- elif len(p.shape) == 1:
174
- state['momentum_buffer'] = torch.zeros_like(p)
222
+ state['momentum_buffer'] = torch.zeros_like(p)
223
+
224
+
225
+ elif optim_type == 'adam':
226
+
227
+ state['factored'] = (
228
+ group['adam_nnmf_factor'] and
229
+ not (len(p.shape) == 1 and not group['vector_reshape'])
230
+ )
231
+ dtype = torch.float32 if state['factored'] else p.dtype
232
+ device = p.device
175
233
 
234
+ if state['factored']:
235
+ state['effective_shape'] = _get_effective_shape(p.numel())
236
+ d1, d2 = state['effective_shape']
237
+ # First moment (m)
238
+ if group['adam_betas'][0] > 0:
239
+ state['mu_m_nmf'] = torch.zeros(d1, device=device, dtype=dtype)
240
+ state['mv_m_nmf'] = torch.zeros(d2, device=device, dtype=dtype)
241
+ if not group.get('adam_grams_moment'):
242
+ packed_d2 = (d2 + 7) // 8
243
+ state['sign'] = torch.zeros((d1, packed_d2), dtype=torch.uint8, device=device)
244
+ if group.get('adam_use_AdEMAMix'):
245
+ state['mu_m_slow_nmf'] = torch.zeros(d1, device=p.device, dtype=dtype)
246
+ state['mv_m_slow_nmf'] = torch.zeros(d2, device=p.device, dtype=dtype)
247
+ packed_d2 = (d2 + 7) // 8
248
+ state['sign_slow'] = torch.zeros((d1, packed_d2), dtype=torch.uint8, device=p.device)
249
+ # Second moment (v)
250
+ state['mu_v_nmf'] = torch.zeros(d1, device=device, dtype=dtype)
251
+ state['mv_v_nmf'] = torch.zeros(d2, device=device, dtype=dtype)
252
+ else: # Fallback to standard AdamW for non-factored tensors
253
+ if group['adam_betas'][0] > 0:
254
+ state['exp_avg'] = torch.zeros_like(p, device=device, dtype=dtype)
255
+ if group.get('adam_use_AdEMAMix'):
256
+ state['exp_avg_slow'] = torch.zeros_like(p, device=device, dtype=dtype)
257
+ state['exp_avg_sq'] = torch.zeros_like(p, device=device, dtype=dtype)
258
+
259
+ @torch.no_grad()
260
+ def _muon_step_parameter(self, p, grad, state, group, lr):
176
261
  # Retrieve hyperparameters
177
262
  beta1, beta2 = group['betas']
178
263
  nesterov = group['nesterov']
@@ -183,8 +268,8 @@ class AdaMuon_adv(torch.optim.Optimizer):
183
268
 
184
269
  # Reconstruct momentum from previous step's factors & sign
185
270
  d1, d2 = state['effective_shape']
186
- mt_buf = _unnmf((state['mu_m_nmf'], state['mv_m_nmf']))
187
- unpacked_sign = _unpack_bools(state['sign'], original_m=d2)
271
+ mt_buf = _unnmf((state['mu_mbuf_nmf'], state['mv_mbuf_nmf']))
272
+ unpacked_sign = _unpack_bools(state['sign_buf'], original_m=d2)
188
273
  torch.where(unpacked_sign, mt_buf, -mt_buf, out=mt_buf)
189
274
  del unpacked_sign
190
275
 
@@ -231,9 +316,10 @@ class AdaMuon_adv(torch.optim.Optimizer):
231
316
  eps=group['ns_eps'],
232
317
  coeffs=group['ns_coeffs'],
233
318
  )
319
+ del signed_m_buf
234
320
 
235
321
  # Reconstruct second momentum from previous step's factors
236
- vt_buf = _unnmf((state['mu_v_nmf'], state['mv_v_nmf']))
322
+ vt_buf = _unnmf((state['mu_vbuf_nmf'], state['mv_vbuf_nmf']))
237
323
 
238
324
  # Update second momentum in full-size
239
325
  vt_buf.mul_(beta2).addcmul_(update, update, value=1 - beta2)
@@ -252,49 +338,38 @@ class AdaMuon_adv(torch.optim.Optimizer):
252
338
  rms_target = group['rms_target']
253
339
  num_elements = update.numel()
254
340
  # Add eps to prevent division by zero
255
- scaling_factor = rms_target * (num_elements ** 0.5) / (update.norm() + group['eps'])
341
+ update.mul_(rms_target * (num_elements ** 0.5) / (update.norm() + group['eps']))
256
342
 
257
- update.mul_(scaling_factor)
258
- update = update.view(p.shape).mul_(group['lr'])
259
- del num_elements, scaling_factor
343
+ update = update.view(p.shape).mul_(lr)
344
+ del num_elements
260
345
 
261
346
  # Compress updated moments and store new factors
262
- state['sign'] = _pack_bools(mt_buf > 0)
263
- _nnmf(mt_buf.abs(), out=(state['mu_m_nmf'], state['mv_m_nmf']))
347
+ state['sign_buf'] = _pack_bools(mt_buf > 0)
348
+ _nnmf(mt_buf.abs(), out=(state['mu_mbuf_nmf'], state['mv_mbuf_nmf']))
264
349
  del mt_buf
265
350
 
266
- _nnmf(vt_buf.abs(), out=(state['mu_v_nmf'], state['mv_v_nmf']))
351
+ _nnmf(vt_buf.abs(), out=(state['mu_vbuf_nmf'], state['mv_vbuf_nmf']))
267
352
  del vt_buf
268
353
 
269
354
  else: # Standard AdaMuon logic for non-factored tensors
270
355
 
271
- if len(p.shape) >= 2 or state['reshaped_1d_muon']:
356
+ if len(p.shape) >= 2:
357
+
358
+ original_shape = p.shape
272
359
 
273
360
  # Momentum update
274
361
  mt_buf = state['momentum_buffer']
275
- if state['reshaped_1d_muon']:
276
- d1, d2 = state['effective_shape']
277
- grad_reshaped = grad.view(d1, d2)
278
- mt_buf.mul_(beta1).add_(grad_reshaped)
279
- if nesterov:
280
- signed_m_buf = torch.sign(grad_reshaped.add(mt_buf, alpha=beta1))
281
- elif Simplified_AdEMAMix:
282
- signed_m_buf = torch.sign(mt_buf.add(grad_reshaped, alpha=alpha_grad))
283
- else:
284
- signed_m_buf = torch.sign(mt_buf)
285
- del grad_reshaped
362
+ mt_buf.mul_(beta1).add_(grad)
363
+
364
+ if nesterov:
365
+ signed_m_buf = torch.sign(grad.add(mt_buf, alpha=beta1))
366
+ elif Simplified_AdEMAMix:
367
+ signed_m_buf = torch.sign(mt_buf.add(grad, alpha=alpha_grad))
286
368
  else:
287
- mt_buf.mul_(beta1).add_(grad)
288
- if nesterov:
289
- signed_m_buf = torch.sign(grad.add(mt_buf, alpha=beta1))
290
- elif Simplified_AdEMAMix:
291
- signed_m_buf = torch.sign(mt_buf.add(grad, alpha=alpha_grad))
292
- else:
293
- signed_m_buf = torch.sign(mt_buf)
369
+ signed_m_buf = torch.sign(mt_buf)
294
370
 
295
371
  # Flatten if necessary (e.g., for Conv layers)
296
- if len(p.shape) > 2:
297
- signed_m_buf = signed_m_buf.view(p.shape[0], -1)
372
+ signed_m_buf = signed_m_buf.view(original_shape[0], -1)
298
373
 
299
374
  # Orthogonalization step
300
375
  if group['low_rank_ortho']:
@@ -327,9 +402,9 @@ class AdaMuon_adv(torch.optim.Optimizer):
327
402
  eps=group['ns_eps'],
328
403
  coeffs=group['ns_coeffs'],
329
404
  )
405
+ del signed_m_buf
330
406
 
331
- if len(p.shape) > 2 or state['reshaped_1d_muon']:
332
- update = update.view(p.shape)
407
+ update = update.view(original_shape)
333
408
 
334
409
  vt_buf = state['second_momentum_buffer']
335
410
  vt_buf.mul_(beta2).addcmul_(update, update, value=1 - beta2)
@@ -348,31 +423,167 @@ class AdaMuon_adv(torch.optim.Optimizer):
348
423
  rms_target = group['rms_target']
349
424
  num_elements = update.numel()
350
425
  # Add eps to prevent division by zero
351
- scaling_factor = rms_target * (num_elements ** 0.5) / (update.norm() + group['eps'])
426
+ update.mul_(rms_target * (num_elements ** 0.5) / (update.norm() + group['eps']))
352
427
 
353
- update.mul_(scaling_factor)
354
- del num_elements, scaling_factor
428
+ del num_elements
355
429
 
356
- update.mul_(group['lr'])
430
+ update.mul_(lr)
357
431
 
358
- else: # Fallback to standard SGD with momentum for 1D params (biases, etc.) when not reshaped
432
+ else: # Fallback to standard SGD with momentum for 1D params (biases, etc.)
359
433
  # Momentum update
360
434
  mt_buf = state['momentum_buffer']
361
435
  mt_buf.mul_(beta1).add_(grad)
362
436
  if nesterov:
437
+ # Nesterov momentum
363
438
  update = grad.add(mt_buf, alpha=beta1)
364
- elif Simplified_AdEMAMix:
365
- signed_m_buf = torch.sign(mt_buf.add(grad, alpha=alpha_grad))
439
+ # elif Simplified_AdEMAMix: # TODO, it will break SGD since it requires x100 lower LR
440
+ # update = mt_buf.add(grad, alpha=alpha_grad)
366
441
  else:
367
442
  update = mt_buf.clone()
368
- update.mul_(group['lr'])
443
+ update.mul_(lr)
369
444
 
370
445
  # Decoupled weight decay
371
446
  if group["weight_decay"] != 0:
372
447
  if p.dtype == torch.bfloat16 and self.stochastic_rounding:
373
- add_stochastic_(p.data, p.data, alpha=-group["weight_decay"] * group["lr"])
448
+ add_stochastic_(p.data, p.data, alpha=-group["weight_decay"] * lr)
449
+ else:
450
+ p.data.add_(p.data, alpha=-group["weight_decay"] * lr)
451
+
452
+ if p.dtype == torch.bfloat16 and self.stochastic_rounding:
453
+ add_stochastic_(p.data, -update)
454
+ else:
455
+ p.data.add_(-update)
456
+ del update
457
+
458
+ @torch.no_grad()
459
+ def _adam_step_parameter(self, p, grad, state, group, lr, bias_correction1, bias_correction2):
460
+ if grad.dtype != torch.float32 and state.get('factored', False):
461
+ grad = grad.float()
462
+ if group.get("adam_orthogonal_gradient"):
463
+ grad = _orthogonalize_gradient(p, grad)
464
+
465
+ beta1_adam, beta2_adam = group['adam_betas']
466
+
467
+ if group.get('adam_kourkoutas_beta', False):
468
+ # Accumulate current grad's norm for the *next* step
469
+ self.kourkoutas_helper.accumulate_gradient_sq_norm(p, grad)
470
+ # Get the dynamic beta2_adam calculated in prepare_step()
471
+ beta2_adam = self.kourkoutas_helper.get_beta2(p, group)
472
+
473
+ step_size = lr / bias_correction1
474
+
475
+ if group.get('adam_use_AdEMAMix'):
476
+ beta3_ema = group['adam_beta3_ema']
477
+ alpha = group['adam_alpha']
478
+
479
+ if state['factored']:
480
+ d1, d2 = state['effective_shape']
481
+ grad_reshaped = grad.view(d1, d2)
482
+
483
+ # Reconstruct momentum from previous step's factors
484
+ if beta1_adam > 0:
485
+ mt = _unnmf((state['mu_m_nmf'], state['mv_m_nmf']))
486
+ if not group.get('adam_grams_moment'):
487
+ unpacked_sign = _unpack_bools(state['sign'], original_m=d2)
488
+ torch.where(unpacked_sign, mt, -mt, out=mt)
489
+ del unpacked_sign
490
+ # Update momentum in full-size
491
+ mt.mul_(beta1_adam).add_(grad_reshaped, alpha=1.0 - beta1_adam)
492
+ if group.get('adam_grams_moment'):
493
+ mt = (grad_reshaped.sign().mul_(mt.abs()))
494
+ elif group.get('adam_cautious_mask'):
495
+ mask = (mt * grad_reshaped > 0).to(grad_reshaped.dtype)
496
+ mask.div_(mask.mean().clamp_(min=1e-3))
497
+ mt.mul_(mask)
498
+ del mask
499
+
500
+ vt = _unnmf((state['mu_v_nmf'], state['mv_v_nmf']))
501
+ vt.mul_(beta2_adam).addcmul_(grad_reshaped, grad_reshaped, value=1.0 - beta2_adam)
502
+
503
+ if group.get('adam_use_AdEMAMix'):
504
+ mt_slow = _unnmf((state['mu_m_slow_nmf'], state['mv_m_slow_nmf']))
505
+ if state['sign_slow'].dtype != torch.uint8:
506
+ state['sign_slow'] = state['sign_slow'].to(torch.uint8)
507
+ unpacked_sign_slow = _unpack_bools(state['sign_slow'], original_m=d2)
508
+ torch.where(unpacked_sign_slow, mt_slow, -mt_slow, out=mt_slow)
509
+ del unpacked_sign_slow
510
+
511
+ mt_slow.mul_(beta3_ema).add_(grad_reshaped, alpha=1.0 - beta3_ema)
512
+ if beta1_adam > 0:
513
+ update = torch.add(mt, mt_slow, alpha=alpha)
514
+ else:
515
+ update = torch.add(grad_reshaped, mt_slow, alpha=alpha)
516
+ else:
517
+ update = mt.clone() if beta1_adam > 0 else grad_reshaped.clone()
518
+ del grad_reshaped
519
+
520
+ if group['adam_use_atan2']:
521
+ a = 1.2732395
522
+ denom = (vt.sqrt() / (bias_correction2**0.5))
523
+ update.atan2_(denom).mul_(a)
524
+ else:
525
+ denom = (vt.sqrt() / (bias_correction2**0.5)).add_(group['adam_eps'])
526
+ update.div_(denom)
527
+ del denom
528
+
529
+ update = update.view(p.shape).mul_(step_size)
530
+
531
+ # Compress updated moments and store new factors
532
+ if beta1_adam > 0:
533
+ if not group.get('adam_grams_moment'):
534
+ state['sign'] = _pack_bools(mt > 0)
535
+ _nnmf(mt.abs(), out=(state['mu_m_nmf'], state['mv_m_nmf']))
536
+ del mt
537
+ if group.get('adam_use_AdEMAMix'):
538
+ state['sign_slow'] = _pack_bools(mt_slow > 0)
539
+ _nnmf(mt_slow.abs(), out=(state['mu_m_slow_nmf'], state['mv_m_slow_nmf']))
540
+ del mt_slow
541
+ _nnmf(vt, out=(state['mu_v_nmf'], state['mv_v_nmf']))
542
+ del vt
543
+
544
+ else: # Standard AdamW logic for non-factored tensors
545
+ exp_avg_sq = state['exp_avg_sq']
546
+
547
+ if beta1_adam > 0:
548
+ exp_avg = state['exp_avg']
549
+ exp_avg.mul_(beta1_adam).add_(grad, alpha=1 - beta1_adam)
550
+ if group.get('adam_grams_moment'):
551
+ exp_avg = grad.sign().mul_(exp_avg.abs())
552
+ elif group.get('adam_cautious_mask'):
553
+ mask = (exp_avg * grad > 0).to(grad.dtype)
554
+ mask.div_(mask.mean().clamp_(min=1e-3))
555
+ exp_avg.mul_(mask)
556
+ del mask
557
+
558
+ if group.get('adam_use_AdEMAMix'):
559
+ exp_avg_slow = state['exp_avg_slow']
560
+ exp_avg_slow.mul_(beta3_ema).add_(grad, alpha=1 - beta3_ema)
561
+ if beta1_adam > 0:
562
+ update = torch.add(exp_avg, exp_avg_slow, alpha=alpha)
563
+ else:
564
+ update = torch.add(grad, exp_avg_slow, alpha=alpha)
565
+ else:
566
+ update = exp_avg.clone() if beta1_adam > 0 else grad.clone()
567
+
568
+ exp_avg_sq.mul_(beta2_adam).addcmul_(grad, grad.conj(), value=1 - beta2_adam)
569
+
570
+ if group.get('adam_use_atan2'):
571
+ a = 1.2732395
572
+ denom = (exp_avg_sq.sqrt() / (bias_correction2**0.5))
573
+ update.atan2_(denom).mul_(a)
574
+ else:
575
+ denom = (exp_avg_sq.sqrt() / (bias_correction2**0.5)).add_(group['adam_eps'])
576
+ update.div_(denom)
577
+ del denom
578
+
579
+ update.mul_(step_size)
580
+
581
+ # Decoupled weight decay
582
+ if group["adam_weight_decay"] != 0:
583
+ if p.dtype == torch.bfloat16 and self.stochastic_rounding:
584
+ add_stochastic_(p.data, p.data, alpha=-group["adam_weight_decay"] * lr)
374
585
  else:
375
- p.data.add_(p.data, alpha=-group["weight_decay"] * group["lr"])
586
+ p.data.add_(p.data, alpha=-group["adam_weight_decay"] * lr)
376
587
 
377
588
  if p.dtype == torch.bfloat16 and self.stochastic_rounding:
378
589
  add_stochastic_(p.data, -update)
@@ -380,7 +591,61 @@ class AdaMuon_adv(torch.optim.Optimizer):
380
591
  p.data.add_(-update)
381
592
  del update
382
593
 
383
- state['step'] += 1
594
+ @torch.no_grad()
595
+ def step_parameter(self, p: torch.Tensor, group: dict, i: int | None = None):
596
+ grad = p.grad
597
+ if grad is None:
598
+ return
599
+ state = self.state[p]
600
+
601
+
602
+
603
+ # Determine if using Adam or Muon based on state keys
604
+ # We can use optm_type but I see this as a safer way.
605
+ if 'momentum_buffer' in state or 'mu_mbuf_nmf' in state:
606
+ use_adam = False
607
+ else:
608
+ use_adam = True
609
+
610
+ lr = group['lr']
611
+ is_compiled = group.get('compiled_optimizer', False)
612
+
613
+ if use_adam:
614
+ if self.kourkoutas_helper:
615
+ # Prepare Kourkoutas-β once per optimizer step.
616
+ self.kourkoutas_helper.maybe_prepare_step(self.global_step)
617
+ # Adam-specific setup (bias correction)
618
+ if group['adam_use_bias_correction']:
619
+ current_step = self.global_step + 1
620
+ beta1_adam, beta2_adam = group['adam_betas']
621
+ bias_correction1 = 1.0 - beta1_adam ** current_step
622
+ bias_correction2 = 1.0 - beta2_adam ** current_step
623
+ else:
624
+ bias_correction1 = 1.0
625
+ bias_correction2 = 1.0
626
+ # Dispatch to compiled or uncompiled Adam step
627
+ if is_compiled and self._compiled_adam_step is not None:
628
+ # Tensors must be used for compiled functions
629
+ lr_tensor = torch.tensor(lr, device=p.device)
630
+ bc1_tensor = torch.tensor(bias_correction1, device=p.device)
631
+ bc2_tensor = torch.tensor(bias_correction2, device=p.device)
632
+ self._compiled_adam_step(p, grad, state, group, lr_tensor, bc1_tensor, bc2_tensor)
633
+ else:
634
+ self._adam_step_parameter(p, grad, state, group, lr, bias_correction1, bias_correction2)
635
+ else: # Muon path
636
+ # Dispatch to compiled or uncompiled Muon step
637
+ if is_compiled and self._compiled_muon_step is not None:
638
+ lr_tensor = torch.tensor(lr, device=p.device)
639
+ self._compiled_muon_step(p, grad, state, group, lr_tensor)
640
+ else:
641
+ self._muon_step_parameter(p, grad, state, group, lr)
642
+
643
+
644
+ def compile(self, *args, **kwargs):
645
+ print("Compiling AdaMuon step path...")
646
+ self._compiled_muon_step = torch.compile(self._muon_step_parameter, *args, **kwargs)
647
+ print("Compiling AuxAdam step path...")
648
+ self._compiled_adam_step = torch.compile(self._adam_step_parameter, *args, **kwargs)
384
649
 
385
650
  @torch.no_grad()
386
651
  def step(self, closure=None):
@@ -394,4 +659,6 @@ class AdaMuon_adv(torch.optim.Optimizer):
  for i, p in enumerate(group['params']):
  self.step_parameter(p, group, i)
 
+ self.global_step += 1
+
  return loss
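The dual-path dispatch added above keys off an `optim_type` entry in each parameter group (`group.get('optim_type', 'muon')` in `__init_state`). A minimal usage sketch, assuming the public `adv_optm` import shown in `__init__.py`; the muon/adam parameter split and all hyperparameter values are illustrative placeholders, not recommendations from this release:

import torch
from adv_optm import AdaMuon_adv  # exported via adv_optm/__init__.py in this release

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.Linear(64, 10))

# Illustrative split: matrices go to the Muon path, 1D params to the auxiliary Adam path.
muon_params = [p for p in model.parameters() if p.ndim >= 2]
adam_params = [p for p in model.parameters() if p.ndim < 2]

optimizer = AdaMuon_adv(
    [
        {"params": muon_params, "optim_type": "muon"},
        {"params": adam_params, "optim_type": "adam"},
    ],
    lr=1e-3,                    # placeholder value
    adam_betas=(0.9, 0.99),     # auxiliary AdamW_adv settings introduced in this version
    adam_weight_decay=0.01,
    compiled_optimizer=False,   # True would torch.compile both step paths in __init__
)

loss = model(torch.randn(8, 64)).sum()
loss.backward()
optimizer.step()   # dispatches per group to _muon_step_parameter / _adam_step_parameter
optimizer.zero_grad()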
adv_optm/optim/Muon_adv.py CHANGED
@@ -1,28 +1,23 @@
  import torch
- from typing import Optional
- from .AdamW_adv import AdamW_adv
 
  from ..util.BF16_Stochastic_Rounding import add_stochastic_
  from ..util.Newton_Schulz import _newton_schulz_iteration
  from ..util.Effective_Shape import _get_effective_shape
  from ..util.NNMF import _nnmf,_unnmf
  from ..util.One_Bit_Boolean import _pack_bools, _unpack_bools
+ from ..util.OrthoGrad import _orthogonalize_gradient
+ from ..util.Kourkoutas import KourkoutasHelper
 
  class Muon_adv(torch.optim.Optimizer):
  """
- Implements an advanced Muon algorithm.
+ Implements an advanced Muon algorithm, with an integrated auxiliary AdamW optimizer.
 
  Muon (MomentUm Orthogonalized by Newton-Schulz) is an optimizer designed for
  the hidden layers of neural networks. It applies SGD with momentum and then
  orthogonalizes the resulting update matrix using a Newton-Schulz iteration.
 
- NorMuon (Neuron-wise Normalized Muon) extends this by adding neuron-level
- adaptive learning rates, combining the benefits of orthogonalization with
- second-order momentum statistics.
-
- This implementation is designed for 2D parameters (e.g., linear layers) and
- can handle other-dimensional parameters (e.g., 1D bias, 4D convolutional layers) by
- flattening/reshaping them.
+ When `MuonWithAuxAdam` is enabled, this single optimizer class handles both
+ 'muon' and 'adam' parameter groups, dispatching to the appropriate logic internally.
 
  Args:
  params (iterable): iterable of parameters to optimize or dicts defining
@@ -65,6 +60,19 @@ class Muon_adv(torch.optim.Optimizer):
  normuon_lr_scale (float): Scaling factor for the NorMuon learning rate.
  (default: 0.2)
  normuon_atan2 (bool): whether to use the atan2 for NorMuon. (default: False)
+ --- Auxiliary AdamW_adv Parameters (used for 'adam' groups) ---
+ adam_betas (tuple[float, float]): Betas for the AdamW optimizer part.
+ adam_eps (float): Epsilon for the AdamW optimizer part.
+ adam_weight_decay (float): Weight decay for the AdamW optimizer part.
+ adam_use_bias_correction (bool): Bias correction for AdamW.
+ adam_use_atan2 (bool): Atan2 update rule for AdamW.
+ adam_cautious_mask (bool): Cautious masking for AdamW.
+ adam_grams_moment (bool): Grams-style updates for AdamW.
+ adam_orthogonal_gradient (bool): OrthoGrad for AdamW.
+ adam_use_AdEMAMix (bool): AdEMAMix for AdamW.
+ adam_beta3_ema (float): Beta3 for AdEMAMix.
+ adam_alpha (float): Alpha for AdEMAMix.
+ adam_kourkoutas_beta (bool): Kourkoutas-β for AdamW.
  """
 
  def __init__(
@@ -92,6 +100,25 @@ class Muon_adv(torch.optim.Optimizer):
  normuon_eps: float = 1e-8,
  normuon_lr_scale: float = 0.2,
  normuon_atan2: bool = False,
+ # Compiled
+ compiled_optimizer: bool = False,
+ # --- AdamW_adv specific parameters ---
+ adam_betas: tuple[float, float] = (0.9, 0.99),
+ adam_eps: float = 1e-8,
+ adam_weight_decay: float = 0.0,
+ adam_use_bias_correction: bool = True,
+ adam_use_atan2: bool = False,
+ adam_cautious_mask: bool = False,
+ adam_grams_moment: bool = False,
+ adam_orthogonal_gradient: bool = False,
+ adam_use_AdEMAMix: bool = False,
+ adam_beta3_ema: float = 0.9999,
+ adam_alpha: float = 5.0,
+ adam_kourkoutas_beta: bool = False,
+ adam_beta2_min: float = 0.9,
+ adam_ema_alpha: float = 0.95,
+ adam_tiny_spike: float = 1e-9,
+ adam_k_warmup_steps: int = 0,
  ):
  if not (lr >= 0.0):
  raise ValueError(f"Learning-rate should be >= 0.0. Got {lr}")
@@ -104,7 +131,7 @@ class Muon_adv(torch.optim.Optimizer):
  if not (ns_steps > 0):
  raise ValueError(f"Newton-Schulz steps should be > 0. Got {ns_steps}")
  if Simplified_AdEMAMix and nesterov:
- print("Warning: nesterov is incompatible with Simplified_AdEMAMix, Disabling cautious.")
+ print("Warning: nesterov is incompatible with Simplified_AdEMAMix, Disabling nesterov.")
  nesterov = False
 
  defaults = {
@@ -114,17 +141,43 @@ class Muon_adv(torch.optim.Optimizer):
  "vector_reshape": vector_reshape,
  "vector_reshape_muon": vector_reshape_muon,
  "Simplified_AdEMAMix": Simplified_AdEMAMix, "alpha_grad": alpha_grad,
+ 'compiled_optimizer': compiled_optimizer,
  # Low-rank Ortho
  "low_rank_ortho": low_rank_ortho, "ortho_rank": ortho_rank,
  # NorMuon
  "normuon_variant": normuon_variant, "beta2_normuon": beta2_normuon,
  "normuon_eps": normuon_eps, "normuon_lr_scale": normuon_lr_scale,
  "normuon_atan2": normuon_atan2,
+ # AdamW_adv defaults
+ "adam_betas": adam_betas, "adam_eps": adam_eps, "adam_weight_decay": adam_weight_decay,
+ "adam_use_bias_correction": adam_use_bias_correction, "adam_use_atan2": adam_use_atan2,
+ "adam_cautious_mask": adam_cautious_mask, "adam_grams_moment": adam_grams_moment,
+ "adam_orthogonal_gradient": adam_orthogonal_gradient,
+ "adam_use_AdEMAMix": adam_use_AdEMAMix, "adam_beta3_ema": adam_beta3_ema, "adam_alpha": adam_alpha,
+ "adam_kourkoutas_beta": adam_kourkoutas_beta, "adam_beta2_min": adam_beta2_min,
+ "adam_ema_alpha": adam_ema_alpha, "adam_tiny_spike": adam_tiny_spike,
+ "adam_k_warmup_steps": adam_k_warmup_steps,
  }
  self.stochastic_rounding = stochastic_rounding
+ self.compiled_optimizer = compiled_optimizer
 
  super().__init__(params, defaults)
 
+ self.global_step = 0 # For Adam bias correction and Kourkoutas
+ self.kourkoutas_helper = None
+ if any(group.get('adam_kourkoutas_beta', False) for group in self.param_groups):
+ self.kourkoutas_helper = KourkoutasHelper(self)
+
+ self.init_step()
+
+ # Initialize compiled functions to None
+ self._compiled_muon_step = None
+ self._compiled_adam_step = None
+
+ if compiled_optimizer:
+ print("Compiling Muon_adv optimizer paths...")
+ torch._dynamo.config.cache_size_limit = 8192
+ self.compile(fullgraph=True)
 
  @property
  def supports_fused_back_pass(self):
@@ -138,53 +191,90 @@ class Muon_adv(torch.optim.Optimizer):
138
191
  def supports_flat_params(self):
139
192
  return False
140
193
 
194
+ def init_step(self):
195
+ for group in self.param_groups:
196
+ for i, p in enumerate(group['params']):
197
+ self.__init_state(p, group)
198
+
141
199
  @torch.no_grad()
142
- def step_parameter(self, p: torch.Tensor, group: dict, i: int | None = None):
143
- if p.grad is None:
200
+ def __init_state(self, p, group):
201
+ state = self.state[p]
202
+
203
+ if len(state) > 0:
144
204
  return
145
205
 
146
- grad = p.grad
147
- state = self.state[p]
206
+ optim_type = group.get('optim_type', 'muon')
148
207
 
149
- # State Initialization
150
- if 'step' not in state:
151
- state['step'] = 0
208
+ state['factored'] = (
209
+ group['nnmf_factor'] and
210
+ not (len(p.shape) == 1 and not group['vector_reshape'])
211
+ )
212
+ dtype = torch.float32 if state['factored'] else p.dtype
213
+ device = p.device
152
214
 
153
- should_factor = (
215
+ if optim_type == 'muon':
216
+
217
+ state['factored'] = (
154
218
  group['nnmf_factor'] and
155
219
  not (len(p.shape) == 1 and not group['vector_reshape'])
156
220
  )
157
-
158
- state['factored'] = should_factor
159
-
160
- state['reshaped_1d_muon'] = len(p.shape) == 1 and group['vector_reshape_muon']
161
-
162
- dtype = torch.float32 if group['nnmf_factor'] else p.dtype
221
+ dtype = torch.float32 if state['factored'] else p.dtype
163
222
  device = p.device
164
- if state['factored'] or state['reshaped_1d_muon']:
223
+
224
+ if state['factored']:
165
225
  state['effective_shape'] = _get_effective_shape(p.numel())
166
226
  d1, d2 = state['effective_shape']
167
- if state['factored']:
168
- state['mu_m_nmf'] = torch.zeros(d1, device=device, dtype=dtype)
169
- state['mv_m_nmf'] = torch.zeros(d2, device=device, dtype=dtype)
227
+ state['mu_mbuf_nmf'] = torch.zeros(d1, device=device, dtype=dtype)
228
+ state['mv_mbuf_nmf'] = torch.zeros(d2, device=device, dtype=dtype)
170
229
  packed_d2 = (d2 + 7) // 8
171
- state['sign'] = torch.zeros((d1, packed_d2), dtype=torch.uint8, device=device)
230
+ state['sign_buf'] = torch.zeros((d1, packed_d2), dtype=torch.uint8, device=device)
172
231
  else:
173
- if len(p.shape) >= 2:
174
- state['momentum_buffer'] = torch.zeros_like(p)
175
- if state['reshaped_1d_muon']:
176
- state['momentum_buffer'] = torch.zeros((d1, d2), device=device, dtype=dtype)
177
- elif len(p.shape) == 1:
178
- state['momentum_buffer'] = torch.zeros_like(p)
232
+ state['momentum_buffer'] = torch.zeros_like(p)
179
233
 
180
234
  # NorMuon state initialization
181
235
  if group['normuon_variant']:
182
236
  if state['factored']:
183
237
  state['normuon_v'] = torch.zeros(d1, device=p.device, dtype=torch.float32)
184
- elif len(p.shape) >= 2 or state['reshaped_1d_muon']:
185
- num_rows = p.shape[0] if len(p.shape) >= 2 else state['effective_shape'][0]
186
- state['normuon_v'] = torch.zeros(num_rows, device=p.device, dtype=torch.float32)
238
+ elif len(p.shape) >= 2:
239
+ state['normuon_v'] = torch.zeros(p.shape[0], device=p.device, dtype=torch.float32)
240
+
241
+
242
+ elif optim_type == 'adam':
243
+
244
+ state['factored'] = (
245
+ group['adam_nnmf_factor'] and
246
+ not (len(p.shape) == 1 and not group['vector_reshape'])
247
+ )
248
+ dtype = torch.float32 if state['factored'] else p.dtype
249
+ device = p.device
250
+
251
+ if state['factored']:
252
+ state['effective_shape'] = _get_effective_shape(p.numel())
253
+ d1, d2 = state['effective_shape']
254
+ # First moment (m)
255
+ if group['adam_betas'][0] > 0:
256
+ state['mu_m_nmf'] = torch.zeros(d1, device=device, dtype=dtype)
257
+ state['mv_m_nmf'] = torch.zeros(d2, device=device, dtype=dtype)
258
+ if not group.get('adam_grams_moment'):
259
+ packed_d2 = (d2 + 7) // 8
260
+ state['sign'] = torch.zeros((d1, packed_d2), dtype=torch.uint8, device=device)
261
+ if group.get('adam_use_AdEMAMix'):
262
+ state['mu_m_slow_nmf'] = torch.zeros(d1, device=p.device, dtype=dtype)
263
+ state['mv_m_slow_nmf'] = torch.zeros(d2, device=p.device, dtype=dtype)
264
+ packed_d2 = (d2 + 7) // 8
265
+ state['sign_slow'] = torch.zeros((d1, packed_d2), dtype=torch.uint8, device=p.device)
266
+ # Second moment (v)
267
+ state['mu_v_nmf'] = torch.zeros(d1, device=device, dtype=dtype)
268
+ state['mv_v_nmf'] = torch.zeros(d2, device=device, dtype=dtype)
269
+ else: # Fallback to standard AdamW for non-factored tensors
270
+ if group['adam_betas'][0] > 0:
271
+ state['exp_avg'] = torch.zeros_like(p, device=device, dtype=dtype)
272
+ if group.get('adam_use_AdEMAMix'):
273
+ state['exp_avg_slow'] = torch.zeros_like(p, device=device, dtype=dtype)
274
+ state['exp_avg_sq'] = torch.zeros_like(p, device=device, dtype=dtype)
187
275
 
276
+ @torch.no_grad()
277
+ def _muon_step_parameter(self, p, grad, state, group, lr):
188
278
  beta1 = group['beta1']
189
279
  nesterov = group['nesterov']
190
280
  Simplified_AdEMAMix = group['Simplified_AdEMAMix']
@@ -194,8 +284,8 @@ class Muon_adv(torch.optim.Optimizer):
194
284
 
195
285
  # Reconstruct momentum from previous step's factors & sign
196
286
  d1, d2 = state['effective_shape']
197
- mt_buf = _unnmf((state['mu_m_nmf'], state['mv_m_nmf']))
198
- unpacked_sign = _unpack_bools(state['sign'], original_m=d2)
287
+ mt_buf = _unnmf((state['mu_mbuf_nmf'], state['mv_mbuf_nmf']))
288
+ unpacked_sign = _unpack_bools(state['sign_buf'], original_m=d2)
199
289
  torch.where(unpacked_sign, mt_buf, -mt_buf, out=mt_buf)
200
290
  del unpacked_sign
201
291
 
@@ -260,51 +350,41 @@ class Muon_adv(torch.optim.Optimizer):
260
350
  update.div_(v_t.sqrt().unsqueeze(1).add_(group['normuon_eps']))
261
351
  # Scale learning rate
262
352
  update_norm = torch.linalg.vector_norm(update)
263
- if update_norm > 1e-12:
264
- scaled_lr = group['normuon_lr_scale'] * group['lr'] * (p.numel()**0.5) / update_norm
265
- else:
266
- scaled_lr = 0.0
353
+
354
+ scaled_lr = group['normuon_lr_scale'] * lr * (p.numel()**0.5) / update_norm.add_(group['normuon_eps'])
355
+
267
356
  update = update.view(p.shape).mul_(scaled_lr)
357
+ del update_norm, scaled_lr
268
358
  else: # Original Muon learning rate application
269
- update = update.view(p.shape).mul_(group['lr'])
359
+ update = update.view(p.shape).mul_(lr)
270
360
 
271
- state['sign'] = _pack_bools(mt_buf > 0)
272
- _nnmf(mt_buf.abs(), out=(state['mu_m_nmf'], state['mv_m_nmf']))
361
+ state['sign_buf'] = _pack_bools(mt_buf > 0)
362
+ _nnmf(mt_buf.abs(), out=(state['mu_mbuf_nmf'], state['mv_mbuf_nmf']))
273
363
  del mt_buf
274
364
 
275
365
  else: # Standard Muon logic for non-factored tensors
276
366
 
277
- if len(p.shape) >= 2 or state['reshaped_1d_muon']:
367
+ if len(p.shape) >= 2:
368
+
369
+ original_shape = p.shape
278
370
 
279
371
  # Momentum update
280
372
  mt_buf = state['momentum_buffer']
281
- if state['reshaped_1d_muon']:
282
- d1, d2 = state['effective_shape']
283
- grad_reshaped = grad.view(d1, d2)
284
- mt_buf.mul_(beta1).add_(grad_reshaped)
285
- else:
286
- mt_buf.mul_(beta1).add_(grad)
373
+ mt_buf.mul_(beta1).add_(grad)
287
374
 
288
375
  if nesterov:
289
376
  # Nesterov momentum
290
- if state['reshaped_1d_muon']:
291
- update = grad_reshaped.add(mt_buf, alpha=beta1)
292
- del grad_reshaped
293
- else:
294
- update = grad.add(mt_buf, alpha=beta1)
377
+ update = grad.add(mt_buf, alpha=beta1)
295
378
  elif Simplified_AdEMAMix:
296
- if state['reshaped_1d_muon']:
297
- update = torch.add(mt_buf, grad_reshaped, alpha=alpha_grad)
298
- del grad_reshaped
299
- else:
300
- update = torch.add(mt_buf, grad, alpha=alpha_grad)
379
+ update = torch.add(mt_buf, grad, alpha=alpha_grad)
301
380
  else:
302
381
  # Standard momentum
303
382
  update = mt_buf.clone()
304
383
 
305
- # For Conv layers (4D) or other high-dim tensors, flatten to 2D
306
- if len(p.shape) > 2:
307
- update = update.view(p.shape[0], -1)
384
+ # flatten to 2D for orthogonalization.
385
+ # This is a no-op for 2D tensors and correctly flattens 4D+ tensors.
386
+ # This removes the dynamic control flow that breaks torch.compile.
387
+ update = update.view(original_shape[0], -1)
308
388
 
309
389
  # Orthogonalization step
310
390
  if group['low_rank_ortho']:
@@ -316,7 +396,7 @@ class Muon_adv(torch.optim.Optimizer):
316
396
  # 1. Sketch the matrix
317
397
  G_sketch = torch.randn(M.shape[1], r, device=M.device, dtype=M.dtype)
318
398
  MG = M @ G_sketch
319
-
399
+
320
400
  # 2. QR decomposition to get orthogonal basis Q
321
401
  if MG.dtype != torch.float32:
322
402
  MG_dtype = M.dtype
@@ -324,10 +404,10 @@ class Muon_adv(torch.optim.Optimizer):
324
404
  Q = Q.to(MG_dtype)
325
405
  else:
326
406
  Q, _ = torch.linalg.qr(MG)
327
-
407
+
328
408
  # 3. Project M onto the basis
329
409
  projected_M = Q.T @ M
330
-
410
+
331
411
  # 4. Orthogonalize the smaller projected matrix
332
412
  ortho_projected_M = _newton_schulz_iteration(
333
413
  projected_M,
@@ -335,7 +415,7 @@ class Muon_adv(torch.optim.Optimizer):
335
415
  eps=group['ns_eps'],
336
416
  coeffs=group['ns_coeffs'],
337
417
  )
338
-
418
+
339
419
  # 5. Project back to the original space
340
420
  update = Q @ ortho_projected_M
341
421
  else: # Fallback for invalid rank
@@ -369,19 +449,16 @@ class Muon_adv(torch.optim.Optimizer):
369
449
  update.div_(v_t.sqrt().unsqueeze(1).add_(group['normuon_eps']))
370
450
  # Scale learning rate
371
451
  update_norm = torch.linalg.vector_norm(update)
372
- if update_norm > 1e-12:
373
- scaled_lr = group['normuon_lr_scale'] * group['lr'] * (p.numel()**0.5) / update_norm
374
- else:
375
- scaled_lr = 0.0
452
+ scaled_lr = group['normuon_lr_scale'] * lr * (p.numel()**0.5) / update_norm.add_(group['normuon_eps'])
376
453
  update.mul_(scaled_lr)
377
454
  else: # Original Muon learning rate application
378
- update.mul_(group['lr'])
455
+ update.mul_(lr)
456
+
457
+ # reshape back to the original shape.
458
+ update = update.view(original_shape)
379
459
 
380
- # Reshape back to original if we flattened or reshaped
381
- if len(p.shape) > 2 or state['reshaped_1d_muon']:
382
- update = update.view(p.shape)
383
-
384
- else: # Fallback to standard SGD with momentum for 1D params (biases, etc.) when not reshaped
460
+
461
+ else: # Fallback to standard SGD with momentum for 1D params (biases, etc.)
385
462
  # Momentum update
386
463
  mt_buf = state['momentum_buffer']
387
464
  mt_buf.mul_(beta1).add_(grad)
@@ -393,14 +470,14 @@ class Muon_adv(torch.optim.Optimizer):
393
470
  else:
394
471
  # Standard momentum
395
472
  update = mt_buf.clone()
396
- update.mul_(group['lr'])
473
+ update.mul_(lr)
397
474
 
398
475
  # Decoupled weight decay
399
476
  if group["weight_decay"] != 0:
400
477
  if p.dtype == torch.bfloat16 and self.stochastic_rounding:
401
- add_stochastic_(p.data, p.data, alpha=-group["weight_decay"] * group["lr"])
478
+ add_stochastic_(p.data, p.data, alpha=-group["weight_decay"] * lr)
402
479
  else:
403
- p.data.add_(p.data, alpha=-group["weight_decay"] * group["lr"])
480
+ p.data.add_(p.data, alpha=-group["weight_decay"] * lr)
404
481
 
405
482
  if p.dtype == torch.bfloat16 and self.stochastic_rounding:
406
483
  add_stochastic_(p.data, -update)
@@ -408,7 +485,198 @@ class Muon_adv(torch.optim.Optimizer):
408
485
  p.data.add_(-update)
409
486
  del update
410
487
 
411
- state['step'] += 1
488
+ @torch.no_grad()
489
+ def _adam_step_parameter(self, p, grad, state, group, lr, bias_correction1, bias_correction2):
490
+ if grad.dtype != torch.float32 and state.get('factored', False):
491
+ grad = grad.float()
492
+ if group.get("adam_orthogonal_gradient"):
493
+ grad = _orthogonalize_gradient(p, grad)
494
+
495
+ beta1_adam, beta2_adam = group['adam_betas']
496
+
497
+ if group.get('adam_kourkoutas_beta', False):
498
+ # Accumulate current grad's norm for the *next* step
499
+ self.kourkoutas_helper.accumulate_gradient_sq_norm(p, grad)
500
+ # Get the dynamic beta2_adam calculated in prepare_step()
501
+ beta2_adam = self.kourkoutas_helper.get_beta2(p, group)
502
+
503
+ step_size = lr / bias_correction1
504
+
505
+ if group.get('adam_use_AdEMAMix'):
506
+ beta3_ema = group['adam_beta3_ema']
507
+ alpha = group['adam_alpha']
508
+
509
+ if state['factored']:
510
+ d1, d2 = state['effective_shape']
511
+ grad_reshaped = grad.view(d1, d2)
512
+
513
+ # Reconstruct momentum from previous step's factors
514
+ if beta1_adam > 0:
515
+ mt = _unnmf((state['mu_m_nmf'], state['mv_m_nmf']))
516
+ if not group.get('adam_grams_moment'):
517
+ unpacked_sign = _unpack_bools(state['sign'], original_m=d2)
518
+ torch.where(unpacked_sign, mt, -mt, out=mt)
519
+ del unpacked_sign
520
+ # Update momentum in full-size
521
+ mt.mul_(beta1_adam).add_(grad_reshaped, alpha=1.0 - beta1_adam)
522
+ if group.get('adam_grams_moment'):
523
+ mt = (grad_reshaped.sign().mul_(mt.abs()))
524
+ elif group.get('adam_cautious_mask'):
525
+ mask = (mt * grad_reshaped > 0).to(grad_reshaped.dtype)
526
+ mask.div_(mask.mean().clamp_(min=1e-3))
527
+ mt.mul_(mask)
528
+ del mask
529
+
530
+ vt = _unnmf((state['mu_v_nmf'], state['mv_v_nmf']))
531
+ vt.mul_(beta2_adam).addcmul_(grad_reshaped, grad_reshaped, value=1.0 - beta2_adam)
532
+
533
+ if group.get('adam_use_AdEMAMix'):
534
+ mt_slow = _unnmf((state['mu_m_slow_nmf'], state['mv_m_slow_nmf']))
535
+ if state['sign_slow'].dtype != torch.uint8:
536
+ state['sign_slow'] = state['sign_slow'].to(torch.uint8)
537
+ unpacked_sign_slow = _unpack_bools(state['sign_slow'], original_m=d2)
538
+ torch.where(unpacked_sign_slow, mt_slow, -mt_slow, out=mt_slow)
539
+ del unpacked_sign_slow
540
+
541
+ mt_slow.mul_(beta3_ema).add_(grad_reshaped, alpha=1.0 - beta3_ema)
542
+ if beta1_adam > 0:
543
+ update = torch.add(mt, mt_slow, alpha=alpha)
544
+ else:
545
+ update = torch.add(grad_reshaped, mt_slow, alpha=alpha)
546
+ else:
547
+ update = mt.clone() if beta1_adam > 0 else grad_reshaped.clone()
548
+ del grad_reshaped
549
+
550
+ if group['adam_use_atan2']:
551
+ a = 1.2732395
552
+ denom = (vt.sqrt() / (bias_correction2**0.5))
553
+ update.atan2_(denom).mul_(a)
554
+ else:
555
+ denom = (vt.sqrt() / (bias_correction2**0.5)).add_(group['adam_eps'])
556
+ update.div_(denom)
557
+ del denom
558
+
559
+ update = update.view(p.shape).mul_(step_size)
560
+
561
+ # Compress updated moments and store new factors
562
+ if beta1_adam > 0:
563
+ if not group.get('adam_grams_moment'):
564
+ state['sign'] = _pack_bools(mt > 0)
565
+ _nnmf(mt.abs(), out=(state['mu_m_nmf'], state['mv_m_nmf']))
566
+ del mt
567
+ if group.get('adam_use_AdEMAMix'):
568
+ state['sign_slow'] = _pack_bools(mt_slow > 0)
569
+ _nnmf(mt_slow.abs(), out=(state['mu_m_slow_nmf'], state['mv_m_slow_nmf']))
570
+ del mt_slow
571
+ _nnmf(vt, out=(state['mu_v_nmf'], state['mv_v_nmf']))
572
+ del vt
573
+
574
+ else: # Standard AdamW logic for non-factored tensors
575
+ exp_avg_sq = state['exp_avg_sq']
576
+
577
+ if beta1_adam > 0:
578
+ exp_avg = state['exp_avg']
579
+ exp_avg.mul_(beta1_adam).add_(grad, alpha=1 - beta1_adam)
580
+ if group.get('adam_grams_moment'):
581
+ exp_avg = grad.sign().mul_(exp_avg.abs())
582
+ elif group.get('adam_cautious_mask'):
583
+ mask = (exp_avg * grad > 0).to(grad.dtype)
584
+ mask.div_(mask.mean().clamp_(min=1e-3))
585
+ exp_avg.mul_(mask)
586
+ del mask
587
+
588
+ if group.get('adam_use_AdEMAMix'):
589
+ exp_avg_slow = state['exp_avg_slow']
590
+ exp_avg_slow.mul_(beta3_ema).add_(grad, alpha=1 - beta3_ema)
591
+ if beta1_adam > 0:
592
+ update = torch.add(exp_avg, exp_avg_slow, alpha=alpha)
593
+ else:
594
+ update = torch.add(grad, exp_avg_slow, alpha=alpha)
595
+ else:
596
+ update = exp_avg.clone() if beta1_adam > 0 else grad.clone()
597
+
598
+ exp_avg_sq.mul_(beta2_adam).addcmul_(grad, grad.conj(), value=1 - beta2_adam)
599
+
600
+ if group.get('adam_use_atan2'):
601
+ a = 1.2732395
602
+ denom = (exp_avg_sq.sqrt() / (bias_correction2**0.5))
603
+ update.atan2_(denom).mul_(a)
604
+ else:
605
+ denom = (exp_avg_sq.sqrt() / (bias_correction2**0.5)).add_(group['adam_eps'])
606
+ update.div_(denom)
607
+ del denom
608
+
609
+ update.mul_(step_size)
610
+
611
+ # Decoupled weight decay
612
+ if group["adam_weight_decay"] != 0:
613
+ if p.dtype == torch.bfloat16 and self.stochastic_rounding:
614
+ add_stochastic_(p.data, p.data, alpha=-group["adam_weight_decay"] * lr)
615
+ else:
616
+ p.data.add_(p.data, alpha=-group["adam_weight_decay"] * lr)
617
+
618
+ if p.dtype == torch.bfloat16 and self.stochastic_rounding:
619
+ add_stochastic_(p.data, -update)
620
+ else:
621
+ p.data.add_(-update)
622
+ del update
623
+
624
+ @torch.no_grad()
625
+ def step_parameter(self, p: torch.Tensor, group: dict, i: int | None = None):
626
+ grad = p.grad
627
+ if grad is None:
628
+ return
629
+
630
+ state = self.state[p]
631
+
632
+ if self.kourkoutas_helper:
633
+ # Prepare Kourkoutas-β once per optimizer step.
634
+ self.kourkoutas_helper.maybe_prepare_step(self.global_step)
635
+
636
+ # Determine if using Adam or Muon based on state keys
637
+ # We can use optm_type but I see this as a safer way.
638
+ if 'momentum_buffer' in state or 'mu_mbuf_nmf' in state:
639
+ use_adam = False
640
+ else:
641
+ use_adam = True
642
+
643
+ lr = group['lr']
644
+ is_compiled = group.get('compiled_optimizer', False)
645
+
646
+ if use_adam:
647
+ # Adam-specific setup (bias correction)
648
+ if group['adam_use_bias_correction']:
649
+ current_step = self.global_step + 1
650
+ beta1_adam, beta2_adam = group['adam_betas']
651
+ bias_correction1 = 1.0 - beta1_adam ** current_step
652
+ bias_correction2 = 1.0 - beta2_adam ** current_step
653
+ else:
654
+ bias_correction1 = 1.0
655
+ bias_correction2 = 1.0
656
+
657
+ # Dispatch to compiled or uncompiled Adam step
658
+ if is_compiled and self._compiled_adam_step is not None:
659
+ # Tensors must be used for compiled functions
660
+ lr_tensor = torch.tensor(lr, device=p.device)
661
+ bc1_tensor = torch.tensor(bias_correction1, device=p.device)
662
+ bc2_tensor = torch.tensor(bias_correction2, device=p.device)
663
+ self._compiled_adam_step(p, grad, state, group, lr_tensor, bc1_tensor, bc2_tensor)
664
+ else:
665
+ self._adam_step_parameter(p, grad, state, group, lr, bias_correction1, bias_correction2)
666
+ else: # Muon path
667
+ # Dispatch to compiled or uncompiled Muon step
668
+ if is_compiled and self._compiled_muon_step is not None:
669
+ lr_tensor = torch.tensor(lr, device=p.device)
670
+ self._compiled_muon_step(p, grad, state, group, lr_tensor)
671
+ else:
672
+ self._muon_step_parameter(p, grad, state, group, lr)
673
+
674
+
675
+ def compile(self, *args, **kwargs):
676
+ print("Compiling Muon step path...")
677
+ self._compiled_muon_step = torch.compile(self._muon_step_parameter, *args, **kwargs)
678
+ print("Compiling AuxAdam step path...")
679
+ self._compiled_adam_step = torch.compile(self._adam_step_parameter, *args, **kwargs)
412
680
 
413
681
  @torch.no_grad()
414
682
  def step(self, closure=None):
@@ -422,4 +690,6 @@ class Muon_adv(torch.optim.Optimizer):
  for i, p in enumerate(group['params']):
  self.step_parameter(p, group, i)
 
- return loss
+ self.global_step += 1
+
+ return loss
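The same group-level dispatch applies to Muon_adv. A hedged sketch of the per-group Kourkoutas-β switch, which is what makes `__init__` build `KourkoutasHelper(self)`; note that this release's Muon_adv defaults do not appear to define `adam_nnmf_factor` (which `__init_state` reads for 'adam' groups), so the sketch supplies it at group level. Import path and values are assumptions for illustration:

import torch
from adv_optm import Muon_adv  # assumed to be exported like the other optimizers

model = torch.nn.Linear(64, 10)
matrices = [p for p in model.parameters() if p.ndim >= 2]
vectors = [p for p in model.parameters() if p.ndim < 2]

optimizer = Muon_adv(
    [
        {"params": matrices, "optim_type": "muon"},
        {
            "params": vectors,
            "optim_type": "adam",
            "adam_kourkoutas_beta": True,  # per-group override; triggers KourkoutasHelper
            "adam_nnmf_factor": False,     # not in Muon_adv's defaults dict, so set explicitly
        },
    ],
    lr=1e-3,                  # placeholder value
    adam_betas=(0.9, 0.99),
    adam_k_warmup_steps=100,  # assumed meaning: steps before the dynamic beta2 takes effect
)
# Because one group enables adam_kourkoutas_beta, __init__ creates KourkoutasHelper(self) and
# _adam_step_parameter queries it for a dynamic beta2 on every step; training then proceeds
# with the usual loss.backward() / optimizer.step() / optimizer.zero_grad() loop.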
adv_optm/util/Kourkoutas.py CHANGED
@@ -71,7 +71,7 @@ class KourkoutasHelper:
  for layer_key, info in self.layer_info.items():
  params, group = info['params'], info['group_ref']
 
- if not group.get('kourkoutas_beta', False):
+ if not group.get('kourkoutas_beta', False) and not group.get('adam_kourkoutas_beta', False):
  continue
 
  first_param_in_layer = info['params'][0]
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: adv_optm
- Version: 1.2.dev12
+ Version: 1.2.dev14
  Summary: A family of highly efficient, lightweight yet powerful optimizers.
  Home-page: https://github.com/Koratahiu/Advanced_Optimizers
  Author: Koratahiu
@@ -1,23 +1,23 @@
1
- adv_optm/__init__.py,sha256=eREtFkRqgGhb8_duC4ZROpRguYIQZiJfws1MJbrBn8c,380
2
- adv_optm/optim/AdaMuon_adv.py,sha256=828WtdsaKXJqlZqFXE2yrsxY3Erxn-6N7CxV9jBXiaI,17880
1
+ adv_optm/__init__.py,sha256=D5arg90L2AukHVLCuo7eEbYCh1KtUMOnCwrxsBQgA18,380
2
+ adv_optm/optim/AdaMuon_adv.py,sha256=-UBw_mJj8JzDAi3zQ0nLnSOgzsTzl7b7kVksDRUziEE,30582
3
3
  adv_optm/optim/AdamW_adv.py,sha256=KL9SCJWZ_ckAQEApB6ofbndVYjancN-v7Us7hJLFf54,17475
4
4
  adv_optm/optim/Adopt_adv.py,sha256=S8XI2YA7683jsW8p7igc2YcU30lsN0H18qL02Kpvj8E,21244
5
5
  adv_optm/optim/Lion_Prodigy_adv.py,sha256=LEA3UYJpPeFnmxeniLNv1u2LKKj4ufx3Bq_MLw-nWXk,14617
6
6
  adv_optm/optim/Lion_adv.py,sha256=aGNAplZlyXYgVllYcV_s4bK8iC4fv6EizFoWIMNLdBc,8299
7
- adv_optm/optim/Muon_adv.py,sha256=uo9CBUI_5hZuuKbEmqHqozvS1_d3rKK0NKlyuv_0XxU,19518
7
+ adv_optm/optim/Muon_adv.py,sha256=8d99NcXzLyxTbxVVXC8mHyeW7wM8jjK59QoXVTLScQA,32112
8
8
  adv_optm/optim/Prodigy_adv.py,sha256=lEjbtuQbomsCX39DnTPeI8Z5YG0f2aZPXN_E7-nGgWw,26060
9
9
  adv_optm/optim/Simplified_AdEMAMix.py,sha256=nEIA3yM11nBooKzHudB5l3x4UdFRBYRwiKVUkGmO0K8,12971
10
10
  adv_optm/optim/__init__.py,sha256=hpUWE6CKtt_rvMdgQVb3PtjhfZAvAxTq6hp8H8rIpBo,489
11
11
  adv_optm/util/BF16_Stochastic_Rounding.py,sha256=Q5H0BcogmE4atP65dLoI21HKSf50lRdsBDfeF6v9Tbg,1548
12
12
  adv_optm/util/Effective_Shape.py,sha256=TBvIk1V8IuTbbBsxuekJA4e_v8JlR5Nujtut8RTWAm4,318
13
- adv_optm/util/Kourkoutas.py,sha256=_fq2glPqKmzgWpLedfwq5EqIJAxICUK2fmUP-cdcgq0,7467
13
+ adv_optm/util/Kourkoutas.py,sha256=SSzhe0B6Zb2AXGwCKpVTLr0aaFfspcFBNZCZG3azI9k,7516
14
14
  adv_optm/util/NNMF.py,sha256=yRf5IP5Sjq0Uf0DxN0Q8NxEGSdD-f1ULziLVDOjY8K4,639
15
15
  adv_optm/util/Newton_Schulz.py,sha256=wJ_sKRaGVIsOofQ737my4ng494qX_pfgOqlDDmYtnCg,1377
16
16
  adv_optm/util/One_Bit_Boolean.py,sha256=Wat49esdwohuN-OHOFMW8D0aOQgV9cP5Rl8z6yfmpos,1068
17
17
  adv_optm/util/OrthoGrad.py,sha256=NzInuBQGy_Ja__M1R9XbvqVaQ0fhGbtGgFE9YON7B3I,707
18
18
  adv_optm/util/__init__.py,sha256=CXzS703GB4gil85khZi7sgKOnbzXGBOltshIOSPqj18,435
19
- adv_optm-1.2.dev12.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
20
- adv_optm-1.2.dev12.dist-info/METADATA,sha256=RC88vvcjd7LgqF6wuHsBLQyoyDwm87ECoTkat7NOs5Y,14023
21
- adv_optm-1.2.dev12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
22
- adv_optm-1.2.dev12.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
23
- adv_optm-1.2.dev12.dist-info/RECORD,,
19
+ adv_optm-1.2.dev14.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
20
+ adv_optm-1.2.dev14.dist-info/METADATA,sha256=g017hnuxrm1a34pjnXUDZlnUify9xQtX_ZkrbMEXLLY,14023
21
+ adv_optm-1.2.dev14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
22
+ adv_optm-1.2.dev14.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
23
+ adv_optm-1.2.dev14.dist-info/RECORD,,