adv-optm 1.2.dev14__tar.gz → 1.2.dev16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/PKG-INFO +1 -1
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm/__init__.py +1 -1
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm/optim/AdaMuon_adv.py +132 -66
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm/optim/Muon_adv.py +36 -12
- adv_optm-1.2.dev16/adv_optm/util/Newton_Schulz.py +87 -0
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm.egg-info/PKG-INFO +1 -1
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/setup.py +1 -1
- adv_optm-1.2.dev14/adv_optm/util/Newton_Schulz.py +0 -48
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/LICENSE +0 -0
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/README.md +0 -0
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm/optim/AdamW_adv.py +0 -0
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm/optim/Adopt_adv.py +0 -0
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm/optim/Lion_Prodigy_adv.py +0 -0
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm/optim/Lion_adv.py +0 -0
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm/optim/Prodigy_adv.py +0 -0
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm/optim/Simplified_AdEMAMix.py +0 -0
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm/optim/__init__.py +0 -0
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm/util/BF16_Stochastic_Rounding.py +0 -0
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm/util/Effective_Shape.py +0 -0
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm/util/Kourkoutas.py +0 -0
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm/util/NNMF.py +0 -0
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm/util/One_Bit_Boolean.py +0 -0
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm/util/OrthoGrad.py +0 -0
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm/util/__init__.py +0 -0
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm.egg-info/SOURCES.txt +0 -0
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm.egg-info/dependency_links.txt +0 -0
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm.egg-info/requires.txt +0 -0
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/adv_optm.egg-info/top_level.txt +0 -0
- {adv_optm-1.2.dev14 → adv_optm-1.2.dev16}/setup.cfg +0 -0
adv_optm/optim/AdaMuon_adv.py

@@ -63,6 +63,9 @@ class AdaMuon_adv(torch.optim.Optimizer):
             (default: False)
         ortho_rank (int): The rank for low-rank orthogonalization.
             (default: 128)
+        accelerated_ns (bool): If True, enables Chebyshev-accelerated Newton-Schulz, which
+            dynamically calculates optimal 3rd-order polynomial coefficients. (default: False)
+        cns_a_bound (float): Initial lower bound for singular values for CANS. (default: 1e-4)
         nnmf_factor (bool): whether to use the factorization or disable it to use
             the uncompressed optimizer. (default: False)
         --- Auxiliary AdamW_adv Parameters (used for 'adam' groups) ---
@@ -96,11 +99,16 @@ class AdaMuon_adv(torch.optim.Optimizer):
         nesterov: bool = False,
         Simplified_AdEMAMix: bool = False,
         alpha_grad: float = 100.0,
-
+        normuon_variant: bool = False,
         # Low-rank Muon
         low_rank_ortho: bool = False,
         ortho_rank: int = 128,
+        # Factored
+        vector_reshape: bool = False,
         nnmf_factor: bool = False,
+        # CANS
+        accelerated_ns: bool = False,
+        cns_a_bound: float = 1e-4,
         # Compiled
         compiled_optimizer: bool = False,
         # --- AdamW_adv specific parameters ---
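The two new CANS knobs thread through the whole optimizer. A minimal usage sketch (the module path comes from the file list above; only accelerated_ns and cns_a_bound are confirmed by this diff, lr and the rest are assumed to keep their usual meaning):

import torch
from adv_optm.optim.AdaMuon_adv import AdaMuon_adv  # module path taken from the file list

model = torch.nn.Linear(512, 512)
opt = AdaMuon_adv(
    model.parameters(),
    lr=1e-3,              # illustrative value, not from this diff
    accelerated_ns=True,  # switch Newton-Schulz to the Chebyshev-accelerated variant
    cns_a_bound=1e-4,     # initial lower bound on the singular values
)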
@@ -139,9 +147,12 @@ class AdaMuon_adv(torch.optim.Optimizer):
             "vector_reshape": vector_reshape,
             "nesterov":nesterov, "use_atan2":use_atan2,
             "Simplified_AdEMAMix": Simplified_AdEMAMix, "alpha_grad": alpha_grad,
+            "normuon_variant": normuon_variant,
             # Low-rank Ortho
             "low_rank_ortho": low_rank_ortho, "ortho_rank": ortho_rank,
             "compiled_optimizer":compiled_optimizer,
+            # CANS
+            "accelerated_ns": accelerated_ns, "cns_a_bound": cns_a_bound,
             # AdamW_adv defaults
             "adam_betas": adam_betas, "adam_eps": adam_eps, "adam_weight_decay": adam_weight_decay,
             "adam_use_bias_correction": adam_use_bias_correction, "adam_use_atan2": adam_use_atan2,
@@ -156,7 +167,6 @@ class AdaMuon_adv(torch.optim.Optimizer):
 
         super().__init__(params, defaults)
 
-        self.global_step = 0 # For Adam bias correction and Kourkoutas
         self.kourkoutas_helper = None
         if any(group.get('adam_kourkoutas_beta', False) for group in self.param_groups):
             self.kourkoutas_helper = KourkoutasHelper(self)
@@ -214,16 +224,25 @@ class AdaMuon_adv(torch.optim.Optimizer):
                     state['mv_mbuf_nmf'] = torch.zeros(d2, device=device, dtype=dtype)
                     packed_d2 = (d2 + 7) // 8
                     state['sign_buf'] = torch.zeros((d1, packed_d2), dtype=torch.uint8, device=device)
-
-
+                    if not group['normuon_variant']:
+                        state['mu_vbuf_nmf'] = torch.zeros(d1, device=device, dtype=dtype)
+                        state['mv_vbuf_nmf'] = torch.zeros(d2, device=device, dtype=dtype)
                 else:
-                    if len(p.shape) >= 2:
+                    if len(p.shape) >= 2 and not group['normuon_variant']:
                         state['second_momentum_buffer'] = torch.zeros_like(p)
                     state['momentum_buffer'] = torch.zeros_like(p)
 
+                # NorMuon state initialization
+                if group['normuon_variant']:
+                    if state['factored']:
+                        state['normuon_v'] = torch.zeros(d1, device=p.device, dtype=torch.float32)
+                    elif len(p.shape) >= 2:
+                        state['normuon_v'] = torch.zeros(p.shape[0], device=p.device, dtype=torch.float32)
 
             elif optim_type == 'adam':
 
+                state['step'] = 0
+
                 state['factored'] = (
                     group['adam_nnmf_factor'] and
                     not (len(p.shape) == 1 and not group['vector_reshape'])
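On both paths, NorMuon keeps a single float32 accumulator per output row: size d1 for factored parameters, p.shape[0] otherwise. A standalone sketch of the per-row second-moment update this buffer feeds (illustrative shapes and beta2, not the package's code):

import torch

d1, d2, beta2 = 4, 8, 0.95                        # illustrative values only
update = torch.randn(d1, d2)
normuon_v = torch.zeros(d1, dtype=torch.float32)  # one accumulator per output row
normuon_v.mul_(beta2).add_(update.square().mean(dim=1), alpha=1 - beta2)
print(normuon_v.shape)                            # torch.Size([4])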
@@ -301,12 +320,12 @@ class AdaMuon_adv(torch.optim.Optimizer):
                     Q, _ = torch.linalg.qr(MG)
                     projected_M = Q.T @ M
                     ortho_projected_M = _newton_schulz_iteration(
-                        projected_M, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs']
+                        projected_M, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs'], cns=group['accelerated_ns'], cns_a_bound=group['cns_a_bound']
                     )
                     update = Q @ ortho_projected_M
                 else: # Fallback for invalid rank
                     update = _newton_schulz_iteration(
-                        signed_m_buf, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs']
+                        signed_m_buf, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs'], cns=group['accelerated_ns'], cns_a_bound=group['cns_a_bound']
                     )
             else:
                 # Original full Newton-Schulz
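The low-rank path orthogonalizes an ortho_rank x d2 projection instead of the full matrix. A minimal sketch of the pattern (names M, MG and ortho_rank follow the diff; how MG is built is outside this hunk, so a random stand-in is used purely for illustration):

import torch

d1, d2, ortho_rank = 1024, 512, 128
M = torch.randn(d1, d2)
MG = torch.randn(d1, ortho_rank)   # placeholder for the rank-r sketch used by the code
Q, _ = torch.linalg.qr(MG)         # d1 x r orthonormal basis
projected_M = Q.T @ M              # r x d2: Newton-Schulz now runs on a small matrix
# ortho_projected_M = _newton_schulz_iteration(projected_M, ...)
# update = Q @ ortho_projected_M   # lift the orthogonalized factor back to d1 x d2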
@@ -315,41 +334,61 @@ class AdaMuon_adv(torch.optim.Optimizer):
                     steps=group['ns_steps'],
                     eps=group['ns_eps'],
                     coeffs=group['ns_coeffs'],
+                    cns=group['accelerated_ns'],
+                    cns_a_bound=group['cns_a_bound'],
                 )
             del signed_m_buf
 
-
-
+            if group['normuon_variant']:
+                v_t = state['normuon_v']
+                # Update 2nd moment estimate
+                mean_squared_update = torch.mean(update.square(), dim=1)
+                v_t.mul_(beta2).add_(mean_squared_update, alpha=1 - beta2)
+                # Normalize update
+                if group['use_atan2']:
+                    a = 1.2732395
+                    update.atan2_(v_t.sqrt().unsqueeze(1)).mul_(a)
+                else:
+                    update.div_(v_t.sqrt().unsqueeze(1).add_(group['eps']))
+                # Scale learning rate
+                update_norm = torch.linalg.vector_norm(update)
+                scaled_lr = group['rms_target'] * lr * (p.numel()**0.5) / update_norm.add_(group['eps'])
+                update = update.view(p.shape).mul_(scaled_lr)
+                del mean_squared_update, update_norm, scaled_lr
+            else:
+                # Reconstruct second momentum from previous step's factors
+                vt_buf = _unnmf((state['mu_vbuf_nmf'], state['mv_vbuf_nmf']))
 
-
-
+                # Update second momentum in full-size
+                vt_buf.mul_(beta2).addcmul_(update, update, value=1 - beta2)
 
-
-
-
-
-
-
-
-
-
+                # Apply second momentum update (adaptive scaling)
+                if group['use_atan2']:
+                    a = 1.2732395
+                    denom = vt_buf.sqrt()
+                    update.atan2_(denom).mul_(a)
+                else:
+                    denom = vt_buf.sqrt().add_(group['eps'])
+                    update.div_(denom)
+                del denom
 
-
-
-
-
-
+                # RMS-aligned rescaling
+                rms_target = group['rms_target']
+                num_elements = update.numel()
+                # Add eps to prevent division by zero
+                update.mul_(rms_target * (num_elements ** 0.5) / (update.norm().add_(group['eps'])))
 
-
-
+                update = update.view(p.shape).mul_(lr)
+                del num_elements
 
             # Compress updated moments and store new factors
             state['sign_buf'] = _pack_bools(mt_buf > 0)
             _nnmf(mt_buf.abs(), out=(state['mu_mbuf_nmf'], state['mv_mbuf_nmf']))
             del mt_buf
 
-
-
+            if not group['normuon_variant']:
+                _nnmf(vt_buf.abs(), out=(state['mu_vbuf_nmf'], state['mv_vbuf_nmf']))
+                del vt_buf
 
         else: # Standard AdaMuon logic for non-factored tensors
 
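The constant 1.2732395 that appears in both normalization paths is 4/pi. With use_atan2, the division m / (sqrt(v) + eps) is replaced by atan2(m, sqrt(v)) * (4/pi): when |m| is much smaller than sqrt(v) the two agree up to that factor, and the output stays bounded in (-2, 2) even when v underflows, which removes the need for eps. A standalone sketch of the effect:

import math
import torch

m = torch.tensor([1e-4, 1.0, 1e4])
v_sqrt = torch.tensor([1.0, 1.0, 1e-8])
print(torch.atan2(m, v_sqrt) * (4 / math.pi))  # bounded; no epsilon required
print(m / (v_sqrt + 1e-8))                     # blows up as v approaches zero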
@@ -387,12 +426,12 @@ class AdaMuon_adv(torch.optim.Optimizer):
                     Q, _ = torch.linalg.qr(MG)
                     projected_M = Q.T @ M
                     ortho_projected_M = _newton_schulz_iteration(
-                        projected_M, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs']
+                        projected_M, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs'], cns=group['accelerated_ns'], cns_a_bound=group['cns_a_bound']
                     )
                     update = Q @ ortho_projected_M
                 else: # Fallback for invalid rank
                     update = _newton_schulz_iteration(
-                        signed_m_buf, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs']
+                        signed_m_buf, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs'], cns=group['accelerated_ns'], cns_a_bound=group['cns_a_bound']
                     )
             else:
                 # Original full Newton-Schulz
@@ -401,42 +440,59 @@ class AdaMuon_adv(torch.optim.Optimizer):
                     steps=group['ns_steps'],
                     eps=group['ns_eps'],
                     coeffs=group['ns_coeffs'],
+                    cns=group['accelerated_ns'],
+                    cns_a_bound=group['cns_a_bound'],
                 )
             del signed_m_buf
 
             update = update.view(original_shape)
 
-
-
-
-
-
-
-
-
+            if group['normuon_variant']:
+                # NorMuon Logic
+                v_t = state['normuon_v']
+                # Update 2nd moment estimate
+                mean_squared_update = torch.mean(update.square(), dim=1)
+                v_t.mul_(beta2).add_(mean_squared_update, alpha=1 - beta2)
+                # Normalize update
+                if group['use_atan2']:
+                    a = 1.2732395
+                    update.atan2_(v_t.sqrt().unsqueeze(1)).mul_(a)
+                else:
+                    update.div_(v_t.sqrt().unsqueeze(1).add_(group['eps']))
+                # Scale learning rate
+                update_norm = torch.linalg.vector_norm(update)
+                scaled_lr = group['rms_target'] * lr * (p.numel()**0.5) / update_norm.add_(group['eps'])
+                update.mul_(scaled_lr)
+                del mean_squared_update, update_norm, scaled_lr
             else:
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # Original AdaMuon Logic
+                vt_buf = state['second_momentum_buffer']
+                vt_buf.mul_(beta2).addcmul_(update, update, value=1 - beta2)
+                # Apply second momentum update (adaptive scaling)
+                if group['use_atan2']:
+                    a = 1.2732395
+                    denom = vt_buf.sqrt()
+                    update.atan2_(denom).mul_(a)
+                else:
+                    denom = vt_buf.sqrt().add_(group['eps'])
+                    update.div_(denom)
+                del denom
+                # RMS-aligned rescaling
+                rms_target = group['rms_target']
+                num_elements = update.numel()
+                # Add eps to prevent division by zero
+                update.mul_(rms_target * (num_elements ** 0.5) / (update.norm().add_(group['eps'])))
+                del num_elements
+                update.mul_(lr)
 
         else: # Fallback to standard SGD with momentum for 1D params (biases, etc.)
             # Momentum update
             mt_buf = state['momentum_buffer']
             mt_buf.mul_(beta1).add_(grad)
             if nesterov:
-                # Nesterov momentum
                 update = grad.add(mt_buf, alpha=beta1)
-            #
+            # FIXME, Simplified_AdEMAMix will break SGD since it requires x100 lower LR
+            # elif Simplified_AdEMAMix:
             # update = mt_buf.add(grad, alpha=alpha_grad)
             else:
                 update = mt_buf.clone()
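The "RMS-aligned rescaling" in both branches relies on the identity ||u||_2 = rms(u) * sqrt(numel(u)), so multiplying by rms_target * sqrt(numel) / ||u|| pins the update's root-mean-square at rms_target before the learning rate is applied. A standalone sketch (illustrative values):

import torch

rms_target = 0.2                 # illustrative; the real value comes from group['rms_target']
u = torch.randn(64, 64)
u.mul_(rms_target * (u.numel() ** 0.5) / (u.norm() + 1e-8))
print(u.square().mean().sqrt())  # approximately rms_target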
@@ -599,7 +655,6 @@ class AdaMuon_adv(torch.optim.Optimizer):
         state = self.state[p]
 
 
-
         # Determine if using Adam or Muon based on state keys
         # We can use optm_type but I see this as a safer way.
         if 'momentum_buffer' in state or 'mu_mbuf_nmf' in state:
@@ -611,32 +666,41 @@ class AdaMuon_adv(torch.optim.Optimizer):
         is_compiled = group.get('compiled_optimizer', False)
 
         if use_adam:
+            step = state['step']
+
             if self.kourkoutas_helper:
                 # Prepare Kourkoutas-β once per optimizer step.
-                self.kourkoutas_helper.maybe_prepare_step(
+                self.kourkoutas_helper.maybe_prepare_step(step)
+
             # Adam-specific setup (bias correction)
             if group['adam_use_bias_correction']:
-                current_step =
+                current_step = step + 1
                 beta1_adam, beta2_adam = group['adam_betas']
                 bias_correction1 = 1.0 - beta1_adam ** current_step
                 bias_correction2 = 1.0 - beta2_adam ** current_step
             else:
                 bias_correction1 = 1.0
                 bias_correction2 = 1.0
+
+            self.state[p]['step'] += 1
+
             # Dispatch to compiled or uncompiled Adam step
             if is_compiled and self._compiled_adam_step is not None:
-                #
-
-
-
-
+                # convert to tensors for compiled path once a step
+                if not hasattr(self, 'lr_adam_tensor') or self.lr_adam_tensor is None:
+                    self.lr_adam_tensor = torch.tensor(group['lr'])
+                    self.bc1 = torch.tensor(bias_correction1)
+                    self.bc2 = torch.tensor(bias_correction2)
+                self._compiled_adam_step(p, grad, state, group, self.lr_adam_tensor, self.bc1, self.bc2)
             else:
                 self._adam_step_parameter(p, grad, state, group, lr, bias_correction1, bias_correction2)
         else: # Muon path
             # Dispatch to compiled or uncompiled Muon step
             if is_compiled and self._compiled_muon_step is not None:
-
-                self
+                # convert to tensors for compiled path once a step
+                if not hasattr(self, 'lr_tensor') or self.lr_tensor is None:
+                    self.lr_tensor = torch.tensor(group['lr'])
+                self._compiled_muon_step(p, grad, state, group, self.lr_tensor)
             else:
                 self._muon_step_parameter(p, grad, state, group, lr)
 
@@ -659,6 +723,8 @@ class AdaMuon_adv(torch.optim.Optimizer):
             for i, p in enumerate(group['params']):
                 self.step_parameter(p, group, i)
 
-        self.
-
+        if self.param_groups[0].get('compiled_optimizer', False):
+            # Reset compile tensors once a step
+            self.lr_tensor = None
+            self.lr_adam_tensor = None
         return loss
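Both compiled paths now cache lr (and the bias corrections) in tensors that are rebuilt once per step and reset at the end of step(). A plausible motivation, sketched below: torch.compile specializes on Python scalars, so passing a changing float lr would force recompilation, while a tensor is treated as data whose value may change freely (a sketch under these assumptions, not the package's code):

import torch

@torch.compile
def apply_update(p, grad, lr):
    p.sub_(grad * lr)  # lr enters the graph as data, not as a baked-in constant

p, g = torch.ones(4), torch.full((4,), 0.5)
lr_t = torch.tensor(1e-3)
apply_update(p, g, lr_t)
lr_t.fill_(5e-4)          # e.g. after a scheduler step
apply_update(p, g, lr_t)  # reuses the compiled graph instead of recompiling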
adv_optm/optim/Muon_adv.py

@@ -60,6 +60,9 @@ class Muon_adv(torch.optim.Optimizer):
         normuon_lr_scale (float): Scaling factor for the NorMuon learning rate.
             (default: 0.2)
         normuon_atan2 (bool): whether to use the atan2 for NorMuon. (default: False)
+        accelerated_ns (bool): If True, enables Chebyshev-accelerated Newton-Schulz, which
+            dynamically calculates optimal 3rd-order polynomial coefficients. (default: False)
+        cns_a_bound (float): Initial lower bound for singular values for CANS. (default: 1e-4)
         --- Auxiliary AdamW_adv Parameters (used for 'adam' groups) ---
         adam_betas (tuple[float, float]): Betas for the AdamW optimizer part.
         adam_eps (float): Epsilon for the AdamW optimizer part.
@@ -73,6 +76,7 @@ class Muon_adv(torch.optim.Optimizer):
         adam_beta3_ema (float): Beta3 for AdEMAMix.
         adam_alpha (float): Alpha for AdEMAMix.
         adam_kourkoutas_beta (bool): Kourkoutas-β for AdamW.
+        adam_nnmf_factor (bool): 1-bit factored for AdamW.
     """
 
     def __init__(
@@ -100,6 +104,9 @@ class Muon_adv(torch.optim.Optimizer):
         normuon_eps: float = 1e-8,
         normuon_lr_scale: float = 0.2,
         normuon_atan2: bool = False,
+        # CANS
+        accelerated_ns: bool = False,
+        cns_a_bound: float = 1e-4,
         # Compiled
         compiled_optimizer: bool = False,
         # --- AdamW_adv specific parameters ---
@@ -119,6 +126,7 @@ class Muon_adv(torch.optim.Optimizer):
         adam_ema_alpha: float = 0.95,
         adam_tiny_spike: float = 1e-9,
         adam_k_warmup_steps: int = 0,
+        adam_nnmf_factor: bool = False,
     ):
         if not (lr >= 0.0):
             raise ValueError(f"Learning-rate should be >= 0.0. Got {lr}")
@@ -148,6 +156,8 @@ class Muon_adv(torch.optim.Optimizer):
             "normuon_variant": normuon_variant, "beta2_normuon": beta2_normuon,
             "normuon_eps": normuon_eps, "normuon_lr_scale": normuon_lr_scale,
             "normuon_atan2": normuon_atan2,
+            # CANS
+            "accelerated_ns": accelerated_ns, "cns_a_bound": cns_a_bound,
             # AdamW_adv defaults
             "adam_betas": adam_betas, "adam_eps": adam_eps, "adam_weight_decay": adam_weight_decay,
             "adam_use_bias_correction": adam_use_bias_correction, "adam_use_atan2": adam_use_atan2,
@@ -157,13 +167,13 @@ class Muon_adv(torch.optim.Optimizer):
             "adam_kourkoutas_beta": adam_kourkoutas_beta, "adam_beta2_min": adam_beta2_min,
             "adam_ema_alpha": adam_ema_alpha, "adam_tiny_spike": adam_tiny_spike,
             "adam_k_warmup_steps": adam_k_warmup_steps,
+            "adam_nnmf_factor":adam_nnmf_factor,
         }
         self.stochastic_rounding = stochastic_rounding
         self.compiled_optimizer = compiled_optimizer
 
         super().__init__(params, defaults)
 
-        self.global_step = 0 # For Adam bias correction and Kourkoutas
         self.kourkoutas_helper = None
         if any(group.get('adam_kourkoutas_beta', False) for group in self.param_groups):
             self.kourkoutas_helper = KourkoutasHelper(self)
@@ -214,6 +224,7 @@ class Muon_adv(torch.optim.Optimizer):
 
             if optim_type == 'muon':
 
+
                 state['factored'] = (
                     group['nnmf_factor'] and
                     not (len(p.shape) == 1 and not group['vector_reshape'])
@@ -238,9 +249,12 @@ class Muon_adv(torch.optim.Optimizer):
                     elif len(p.shape) >= 2:
                         state['normuon_v'] = torch.zeros(p.shape[0], device=p.device, dtype=torch.float32)
 
+                group['adam_kourkoutas_beta'] = False
 
             elif optim_type == 'adam':
 
+                state['step'] = 0
+
                 state['factored'] = (
                     group['adam_nnmf_factor'] and
                     not (len(p.shape) == 1 and not group['vector_reshape'])
@@ -319,12 +333,12 @@ class Muon_adv(torch.optim.Optimizer):
                     Q, _ = torch.linalg.qr(MG)
                     projected_M = Q.T @ M
                     ortho_projected_M = _newton_schulz_iteration(
-                        projected_M, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs']
+                        projected_M, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs'], cns=group['accelerated_ns'], cns_a_bound=group['cns_a_bound']
                     )
                     update = Q @ ortho_projected_M
                 else: # Fallback for invalid rank
                     update = _newton_schulz_iteration(
-                        update, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs']
+                        update, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs'], cns=group['accelerated_ns'], cns_a_bound=group['cns_a_bound']
                     )
             else:
                 # Original full Newton-Schulz
@@ -333,10 +347,12 @@ class Muon_adv(torch.optim.Optimizer):
                     steps=group['ns_steps'],
                     eps=group['ns_eps'],
                     coeffs=group['ns_coeffs'],
+                    cns=group['accelerated_ns'],
+                    cns_a_bound=group['cns_a_bound'],
                 )
 
 
-            if group['normuon_variant']
+            if group['normuon_variant']:
                 v_t = state['normuon_v']
                 beta2_normuon = group['beta2_normuon']
                 # Update 2nd moment estimate
@@ -414,6 +430,8 @@ class Muon_adv(torch.optim.Optimizer):
                         steps=group['ns_steps'],
                         eps=group['ns_eps'],
                         coeffs=group['ns_coeffs'],
+                        cns=group['accelerated_ns'],
+                        cns_a_bound=group['cns_a_bound'],
                     )
 
                     # 5. Project back to the original space
@@ -424,6 +442,8 @@ class Muon_adv(torch.optim.Optimizer):
                         steps=group['ns_steps'],
                         eps=group['ns_eps'],
                         coeffs=group['ns_coeffs'],
+                        cns=group['accelerated_ns'],
+                        cns_a_bound=group['cns_a_bound'],
                     )
                 else:
                     # Original NewtonSchulz
@@ -432,10 +452,12 @@ class Muon_adv(torch.optim.Optimizer):
                     steps=group['ns_steps'],
                     eps=group['ns_eps'],
                     coeffs=group['ns_coeffs'],
+                    cns=group['accelerated_ns'],
+                    cns_a_bound=group['cns_a_bound'],
                 )
 
             # NorMuon Logic
-            if group['normuon_variant']
+            if group['normuon_variant']:
                 v_t = state['normuon_v']
                 beta2_normuon = group['beta2_normuon']
                 # Update 2nd moment estimate
@@ -629,10 +651,6 @@ class Muon_adv(torch.optim.Optimizer):
 
         state = self.state[p]
 
-        if self.kourkoutas_helper:
-            # Prepare Kourkoutas-β once per optimizer step.
-            self.kourkoutas_helper.maybe_prepare_step(self.global_step)
-
         # Determine if using Adam or Muon based on state keys
         # We can use optm_type but I see this as a safer way.
         if 'momentum_buffer' in state or 'mu_mbuf_nmf' in state:
@@ -644,9 +662,15 @@ class Muon_adv(torch.optim.Optimizer):
         is_compiled = group.get('compiled_optimizer', False)
 
         if use_adam:
+            step = state['step']
+
+            if self.kourkoutas_helper:
+                # Prepare Kourkoutas-β once per optimizer step.
+                self.kourkoutas_helper.maybe_prepare_step(step)
+
             # Adam-specific setup (bias correction)
             if group['adam_use_bias_correction']:
-                current_step =
+                current_step = step + 1
                 beta1_adam, beta2_adam = group['adam_betas']
                 bias_correction1 = 1.0 - beta1_adam ** current_step
                 bias_correction2 = 1.0 - beta2_adam ** current_step
@@ -654,6 +678,8 @@ class Muon_adv(torch.optim.Optimizer):
                 bias_correction1 = 1.0
                 bias_correction2 = 1.0
 
+            self.state[p]['step'] += 1
+
             # Dispatch to compiled or uncompiled Adam step
             if is_compiled and self._compiled_adam_step is not None:
                 # Tensors must be used for compiled functions
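Moving the counter from a self.global_step attribute into state['step'] also makes it survive checkpointing: Optimizer.state_dict() serializes self.state and self.param_groups but not custom attributes. A small demonstration of that behavior (illustrative only, using plain SGD):

import torch

p = torch.nn.Parameter(torch.zeros(2))
opt = torch.optim.SGD([p], lr=0.1)
opt.state[p]['step'] = 41                    # per-parameter state, like the diff uses
snapshot = opt.state_dict()                  # includes the 'step' entry

opt2 = torch.optim.SGD([p], lr=0.1)
opt2.load_state_dict(snapshot)
print(list(opt2.state.values())[0]['step'])  # 41, restored; a plain attribute would reset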
@@ -690,6 +716,4 @@ class Muon_adv(torch.optim.Optimizer):
             for i, p in enumerate(group['params']):
                 self.step_parameter(p, group, i)
 
-        self.global_step += 1
-
         return loss
adv_optm/util/Newton_Schulz.py (new file in 1.2.dev16)

@@ -0,0 +1,87 @@
+import torch
+
+@torch.no_grad()
+def _newton_schulz_iteration(
+    G: torch.Tensor,
+    steps: int = 5,
+    eps: float = 1e-7,
+    coeffs: tuple[float, float, float] = (3.4445, -4.7750, 2.0315),
+    cns: bool = False,
+    cns_a_bound: float = 1e-4,
+) -> torch.Tensor:
+    """
+    Performs the Newton-Schulz iteration to find the nearest orthogonal matrix.
+    This is the core computation of the Muon optimizer.
+
+    Args:
+        G (torch.Tensor): The 2D input matrix (momentum-accumulated gradient).
+        steps (int): The number of iterations to run.
+        eps (float): Small constant for numerical stability during normalization.
+        coeffs (tuple[float, float, float]): The (a, b, c) coefficients for the
+            quintic polynomial update.
+        cns (bool): If True, enables Chebyshev-accelerated Newton-Schulz (CANS)
+            using an iterative 3rd-order polynomial with optimal coefficients
+            derived at each step.
+        cns_a_bound (float): The initial lower bound for singular values when
+            using CANS. The upper bound is assumed to be 1.0 after normalization.
+    Returns:
+        torch.Tensor: The orthogonalized matrix.
+    """
+    assert G.ndim >= 2
+
+    a, b, c = coeffs
+
+    X = G.to(torch.bfloat16)
+
+    transposed = G.size(-2) > G.size(-1)
+    if transposed:
+        X = X.mT
+
+    X = X / (X.norm(dim=(-2, -1), keepdim=True) + eps)
+
+    if cns:
+        # Chebyshev-accelerated Newton-Schulz (CANS) from
+        # "Accelerating Newton-Schulz Iteration for Orthogonalization via Chebyshev-type Polynomials"
+        # This implements the iterative scheme from Algorithm 1, using the
+        # closed-form 3rd-order polynomial from Proposition 2.
+        lower_bound = cns_a_bound
+        upper_bound = 1.0  # Matrix is normalized, so largest singular value is approx 1.
+
+        for _ in range(steps):
+            # Calculate optimal 3rd-order coefficients c1, c3 for p(x) = c1*x + c3*x^3
+            # based on the current singular value bounds [lower_bound, upper_bound].
+            # Formulas are derived from Proposition 2 and its proof in Appendix B of the paper.
+            a_bound, b_bound = lower_bound, upper_bound
+            term = a_bound*a_bound + a_bound*b_bound + b_bound*b_bound
+            e_sq = term / 3.0
+
+            # Calculate alpha, which scales the polynomial
+            common_den_part = 2.0 * (e_sq**1.5)
+            ab_part = a_bound*a_bound*b_bound + b_bound*b_bound*a_bound
+            alpha_den = common_den_part + ab_part
+            alpha = 6.0 / alpha_den
+
+            c1 = alpha * e_sq
+            c3 = -alpha / 3.0
+
+            # Apply the 3rd-order Newton-Schulz update
+            A = X @ X.mT
+            X = c1 * X + c3 * (A @ X)
+
+            # Update the singular value bounds for the next iteration based on the error
+            eps_num = common_den_part - ab_part
+            eps_val = eps_num / alpha_den
+            lower_bound = 1.0 - eps_val
+            upper_bound = 1.0 + eps_val
+    else:
+        # Perform the iterative updates
+        for _ in range(steps):
+            A = X @ X.mT
+            B = b * A + c * (A @ A)
+            X = a * X + B @ X
+
+    # Transpose back if necessary
+    if transposed:
+        X = X.mT
+
+    return X.to(G.dtype)
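In the CANS branch, the closed-form coefficients make p(x) = c1*x + c3*x^3 an optimal degree-3 approximation of 1 on the current singular-value interval [a, b]: with e^2 = (a^2 + ab + b^2)/3, the code sets alpha = 6 / (2e^3 + a^2 b + a b^2), c1 = alpha*e^2, c3 = -alpha/3, and the interval then contracts to [1 - eps_val, 1 + eps_val] for the next step. A quick sanity sketch (illustrative only; the import path is inferred from the file list): both settings should drive the singular values of a random matrix toward 1.

import torch
from adv_optm.util.Newton_Schulz import _newton_schulz_iteration

G = torch.randn(64, 128)
for cns in (False, True):
    X = _newton_schulz_iteration(G, steps=5, cns=cns)
    sv = torch.linalg.svdvals(X.float())
    print(f"cns={cns}: singular values in [{sv.min():.3f}, {sv.max():.3f}]")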
adv_optm/util/Newton_Schulz.py (removed from 1.2.dev14)

@@ -1,48 +0,0 @@
-import torch
-
-@torch.no_grad()
-def _newton_schulz_iteration(
-    G: torch.Tensor,
-    steps: int = 5,
-    eps: float = 1e-7,
-    coeffs: tuple[float, float, float] = (3.4445, -4.7750, 2.0315)
-) -> torch.Tensor:
-    """
-    Performs the Newton-Schulz iteration to find the nearest orthogonal matrix.
-    This is the core computation of the Muon optimizer.
-
-    Args:
-        G (torch.Tensor): The 2D input matrix (momentum-accumulated gradient).
-        steps (int): The number of iterations to run.
-        eps (float): Small constant for numerical stability during normalization.
-        coeffs (tuple[float, float, float]): The (a, b, c) coefficients for the
-            quintic polynomial update.
-
-    Returns:
-        torch.Tensor: The orthogonalized matrix.
-    """
-    assert G.ndim == 2, "Newton-Schulz iteration only supports 2D matrices."
-
-    a, b, c = coeffs
-
-    X = G.to(torch.bfloat16)
-
-    # Normalize the matrix
-    X.div_(X.norm() + eps)
-
-    # Handle non-square matrices by transposing the taller one
-    transposed = G.size(0) > G.size(1)
-    if transposed:
-        X = X.T
-
-    # Perform the iterative updates
-    for _ in range(steps):
-        A = X @ X.T
-        B = b * A + c * (A @ A)
-        X = a * X + B @ X
-
-    # Transpose back if necessary
-    if transposed:
-        X = X.T
-
-    return X.to(G.dtype)