adv-optm 1.2.dev13-py3-none-any.whl → 1.2.dev15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of adv-optm might be problematic.

adv_optm/__init__.py CHANGED
@@ -20,4 +20,4 @@ __all__ = [
  "AdaMuon_adv",
  ]

- __version__ = "1.2.dev13"
+ __version__ = "1.2.dev15"
adv_optm/optim/AdaMuon_adv.py CHANGED

@@ -63,6 +63,9 @@ class AdaMuon_adv(torch.optim.Optimizer):
  (default: False)
  ortho_rank (int): The rank for low-rank orthogonalization.
  (default: 128)
+ accelerated_ns (bool): If True, enables Chebyshev-accelerated Newton-Schulz, which
+ dynamically calculates optimal 3rd-order polynomial coefficients. (default: False)
+ cns_a_bound (float): Initial lower bound for singular values for CANS. (default: 1e-4)
  nnmf_factor (bool): whether to use the factorization or disable it to use
  the uncompressed optimizer. (default: False)
  --- Auxiliary AdamW_adv Parameters (used for 'adam' groups) ---
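For orientation, a minimal usage sketch of the new flags documented above. This is hypothetical: it assumes the usual torch.optim constructor pattern and an lr argument, neither of which this diff shows (the optimizer may instead expect explicit param groups, e.g. with an optim_type key):

```python
# Hypothetical sketch: enabling CANS in AdaMuon_adv. Only accelerated_ns
# and cns_a_bound are confirmed by the diff above; lr and the bare
# parameter list are assumptions.
import torch
from adv_optm import AdaMuon_adv

model = torch.nn.Linear(512, 512)
opt = AdaMuon_adv(
    model.parameters(),
    lr=1e-3,               # assumed argument
    accelerated_ns=True,   # Chebyshev-accelerated Newton-Schulz
    cns_a_bound=1e-4,      # initial lower bound on singular values
)

loss = model(torch.randn(8, 512)).sum()
loss.backward()
opt.step()
opt.zero_grad()
```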
@@ -96,11 +99,16 @@ class AdaMuon_adv(torch.optim.Optimizer):
  nesterov: bool = False,
  Simplified_AdEMAMix: bool = False,
  alpha_grad: float = 100.0,
- vector_reshape: bool = False,
+ normuon_variant: bool = False,
  # Low-rank Muon
  low_rank_ortho: bool = False,
  ortho_rank: int = 128,
+ # Factored
+ vector_reshape: bool = False,
  nnmf_factor: bool = False,
+ # CANS
+ accelerated_ns: bool = False,
+ cns_a_bound: float = 1e-4,
  # Compiled
  compiled_optimizer: bool = False,
  # --- AdamW_adv specific parameters ---
@@ -139,9 +147,12 @@ class AdaMuon_adv(torch.optim.Optimizer):
  "vector_reshape": vector_reshape,
  "nesterov":nesterov, "use_atan2":use_atan2,
  "Simplified_AdEMAMix": Simplified_AdEMAMix, "alpha_grad": alpha_grad,
+ "normuon_variant": normuon_variant,
  # Low-rank Ortho
  "low_rank_ortho": low_rank_ortho, "ortho_rank": ortho_rank,
  "compiled_optimizer":compiled_optimizer,
+ # CANS
+ "accelerated_ns": accelerated_ns, "cns_a_bound": cns_a_bound,
  # AdamW_adv defaults
  "adam_betas": adam_betas, "adam_eps": adam_eps, "adam_weight_decay": adam_weight_decay,
  "adam_use_bias_correction": adam_use_bias_correction, "adam_use_atan2": adam_use_atan2,
@@ -156,7 +167,6 @@ class AdaMuon_adv(torch.optim.Optimizer):

  super().__init__(params, defaults)

- self.global_step = 0 # For Adam bias correction and Kourkoutas
  self.kourkoutas_helper = None
  if any(group.get('adam_kourkoutas_beta', False) for group in self.param_groups):
  self.kourkoutas_helper = KourkoutasHelper(self)
@@ -214,16 +224,25 @@ class AdaMuon_adv(torch.optim.Optimizer):
  state['mv_mbuf_nmf'] = torch.zeros(d2, device=device, dtype=dtype)
  packed_d2 = (d2 + 7) // 8
  state['sign_buf'] = torch.zeros((d1, packed_d2), dtype=torch.uint8, device=device)
- state['mu_vbuf_nmf'] = torch.zeros(d1, device=device, dtype=dtype)
- state['mv_vbuf_nmf'] = torch.zeros(d2, device=device, dtype=dtype)
+ if not group['normuon_variant']:
+ state['mu_vbuf_nmf'] = torch.zeros(d1, device=device, dtype=dtype)
+ state['mv_vbuf_nmf'] = torch.zeros(d2, device=device, dtype=dtype)
  else:
- if len(p.shape) >= 2:
+ if len(p.shape) >= 2 and not group['normuon_variant']:
  state['second_momentum_buffer'] = torch.zeros_like(p)
  state['momentum_buffer'] = torch.zeros_like(p)

+ # NorMuon state initialization
+ if group['normuon_variant']:
+ if state['factored']:
+ state['normuon_v'] = torch.zeros(d1, device=p.device, dtype=torch.float32)
+ elif len(p.shape) >= 2:
+ state['normuon_v'] = torch.zeros(p.shape[0], device=p.device, dtype=torch.float32)

  elif optim_type == 'adam':

+ state['step'] = 0
+
  state['factored'] = (
  group['adam_nnmf_factor'] and
  not (len(p.shape) == 1 and not group['vector_reshape'])
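The buffers initialized above imply the factored storage scheme: a full momentum matrix is kept as packed sign bits (sign_buf) plus a nonnegative rank-1 factorization of its magnitude (the mu_*/mv_* vectors). A rough standalone sketch of that idea follows; the package's _pack_bools/_nnmf may differ in detail:

```python
# Rough sketch (not the package's code) of storing a matrix M as 1-bit
# signs plus rank-1 nonnegative factors, matching the buffer shapes above.
import torch

def pack_signs(mask: torch.Tensor) -> torch.Tensor:
    # Pack a (d1, d2) boolean matrix into uint8, 8 columns per byte,
    # giving the (d1, (d2 + 7) // 8) shape used for sign_buf.
    d1, d2 = mask.shape
    pad = (-d2) % 8
    bits = torch.nn.functional.pad(mask.to(torch.uint8), (0, pad)).view(d1, -1, 8)
    weights = torch.tensor([1, 2, 4, 8, 16, 32, 64, 128], dtype=torch.uint8)
    return (bits * weights).sum(dim=-1, dtype=torch.uint8)

def rank1_factors(A: torch.Tensor):
    # One-shot rank-1 fit of a nonnegative matrix A: mu from row means,
    # mv from a least-squares column fit against mu. _nnmf may use a
    # different scheme; this only illustrates the storage cost.
    mu = A.mean(dim=1)
    mv = (A * mu.unsqueeze(1)).sum(dim=0) / (mu.square().sum() + 1e-30)
    return mu, mv

M = torch.randn(4, 6)                    # stand-in momentum matrix
signs = pack_signs(M > 0)                # (4, 1) uint8
mu, mv = rank1_factors(M.abs())          # (4,) and (6,) vectors
M_approx = torch.outer(mu, mv) * torch.where(M > 0, 1.0, -1.0)
```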
@@ -301,12 +320,12 @@ class AdaMuon_adv(torch.optim.Optimizer):
  Q, _ = torch.linalg.qr(MG)
  projected_M = Q.T @ M
  ortho_projected_M = _newton_schulz_iteration(
- projected_M, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs']
+ projected_M, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs'], cns=group['accelerated_ns'], cns_a_bound=group['cns_a_bound']
  )
  update = Q @ ortho_projected_M
  else: # Fallback for invalid rank
  update = _newton_schulz_iteration(
- signed_m_buf, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs']
+ signed_m_buf, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs'], cns=group['accelerated_ns'], cns_a_bound=group['cns_a_bound']
  )
  else:
  # Original full Newton-Schulz
@@ -315,41 +334,61 @@ class AdaMuon_adv(torch.optim.Optimizer):
  steps=group['ns_steps'],
  eps=group['ns_eps'],
  coeffs=group['ns_coeffs'],
+ cns=group['accelerated_ns'],
+ cns_a_bound=group['cns_a_bound'],
  )
  del signed_m_buf

- # Reconstruct second momentum from previous step's factors
- vt_buf = _unnmf((state['mu_vbuf_nmf'], state['mv_vbuf_nmf']))
+ if group['normuon_variant']:
+ v_t = state['normuon_v']
+ # Update 2nd moment estimate
+ mean_squared_update = torch.mean(update.square(), dim=1)
+ v_t.mul_(beta2).add_(mean_squared_update, alpha=1 - beta2)
+ # Normalize update
+ if group['use_atan2']:
+ a = 1.2732395
+ update.atan2_(v_t.sqrt().unsqueeze(1)).mul_(a)
+ else:
+ update.div_(v_t.sqrt().unsqueeze(1).add_(group['eps']))
+ # Scale learning rate
+ update_norm = torch.linalg.vector_norm(update)
+ scaled_lr = group['rms_target'] * lr * (p.numel()**0.5) / update_norm.add_(group['eps'])
+ update = update.view(p.shape).mul_(scaled_lr)
+ del mean_squared_update, update_norm, scaled_lr
+ else:
+ # Reconstruct second momentum from previous step's factors
+ vt_buf = _unnmf((state['mu_vbuf_nmf'], state['mv_vbuf_nmf']))

- # Update second momentum in full-size
- vt_buf.mul_(beta2).addcmul_(update, update, value=1 - beta2)
+ # Update second momentum in full-size
+ vt_buf.mul_(beta2).addcmul_(update, update, value=1 - beta2)

- # Apply second momentum update (adaptive scaling)
- if group['use_atan2']:
- a = 1.2732395
- denom = vt_buf.sqrt()
- update.atan2_(denom).mul_(a)
- else:
- denom = vt_buf.sqrt().add_(group['eps'])
- update.div_(denom)
- del denom
+ # Apply second momentum update (adaptive scaling)
+ if group['use_atan2']:
+ a = 1.2732395
+ denom = vt_buf.sqrt()
+ update.atan2_(denom).mul_(a)
+ else:
+ denom = vt_buf.sqrt().add_(group['eps'])
+ update.div_(denom)
+ del denom

- # RMS-aligned rescaling
- rms_target = group['rms_target']
- num_elements = update.numel()
- # Add eps to prevent division by zero
- update.mul_(rms_target * (num_elements ** 0.5) / (update.norm() + group['eps']))
+ # RMS-aligned rescaling
+ rms_target = group['rms_target']
+ num_elements = update.numel()
+ # Add eps to prevent division by zero
+ update.mul_(rms_target * (num_elements ** 0.5) / (update.norm().add_(group['eps'])))

- update = update.view(p.shape).mul_(lr)
- del num_elements
+ update = update.view(p.shape).mul_(lr)
+ del num_elements

  # Compress updated moments and store new factors
  state['sign_buf'] = _pack_bools(mt_buf > 0)
  _nnmf(mt_buf.abs(), out=(state['mu_mbuf_nmf'], state['mv_mbuf_nmf']))
  del mt_buf

- _nnmf(vt_buf.abs(), out=(state['mu_vbuf_nmf'], state['mv_vbuf_nmf']))
- del vt_buf
+ if not group['normuon_variant']:
+ _nnmf(vt_buf.abs(), out=(state['mu_vbuf_nmf'], state['mv_vbuf_nmf']))
+ del vt_buf

  else: # Standard AdaMuon logic for non-factored tensors

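The NorMuon branch above keeps one second-moment entry per output row instead of a full matrix. A standalone sketch of that normalization with illustrative names (non-atan2 branch), written out-of-place where the optimizer works in-place:

```python
# Sketch of the NorMuon-style normalization used above: a per-row
# second-moment EMA of the orthogonalized update, then a rescale so the
# step's RMS matches rms_target times lr. Names are illustrative.
import torch

def normuon_normalize(update, v, beta2=0.95, eps=1e-8, rms_target=0.2, lr=1e-3):
    v.mul_(beta2).add_(update.square().mean(dim=1), alpha=1 - beta2)
    update = update / (v.sqrt().unsqueeze(1) + eps)      # row-wise normalize
    scale = rms_target * lr * update.numel() ** 0.5 / (update.norm() + eps)
    return update * scale                                # RMS ~= rms_target * lr

u = torch.randn(64, 128)      # orthogonalized momentum for a 64x128 weight
v = torch.zeros(64)           # one entry per output row, like normuon_v
step = normuon_normalize(u, v)
```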
@@ -387,12 +426,12 @@ class AdaMuon_adv(torch.optim.Optimizer):
  Q, _ = torch.linalg.qr(MG)
  projected_M = Q.T @ M
  ortho_projected_M = _newton_schulz_iteration(
- projected_M, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs']
+ projected_M, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs'], cns=group['accelerated_ns'], cns_a_bound=group['cns_a_bound']
  )
  update = Q @ ortho_projected_M
  else: # Fallback for invalid rank
  update = _newton_schulz_iteration(
- signed_m_buf, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs']
+ signed_m_buf, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs'], cns=group['accelerated_ns'], cns_a_bound=group['cns_a_bound']
  )
  else:
  # Original full Newton-Schulz
@@ -401,42 +440,59 @@ class AdaMuon_adv(torch.optim.Optimizer):
  steps=group['ns_steps'],
  eps=group['ns_eps'],
  coeffs=group['ns_coeffs'],
+ cns=group['accelerated_ns'],
+ cns_a_bound=group['cns_a_bound'],
  )
  del signed_m_buf

  update = update.view(original_shape)

- vt_buf = state['second_momentum_buffer']
- vt_buf.mul_(beta2).addcmul_(update, update, value=1 - beta2)
-
- # Apply second momentum update (adaptive scaling)
- if group['use_atan2']:
- a = 1.2732395
- denom = vt_buf.sqrt()
- update.atan2_(denom).mul_(a)
+ if group['normuon_variant']:
+ # NorMuon Logic
+ v_t = state['normuon_v']
+ # Update 2nd moment estimate
+ mean_squared_update = torch.mean(update.square(), dim=1)
+ v_t.mul_(beta2).add_(mean_squared_update, alpha=1 - beta2)
+ # Normalize update
+ if group['use_atan2']:
+ a = 1.2732395
+ update.atan2_(v_t.sqrt().unsqueeze(1)).mul_(a)
+ else:
+ update.div_(v_t.sqrt().unsqueeze(1).add_(group['eps']))
+ # Scale learning rate
+ update_norm = torch.linalg.vector_norm(update)
+ scaled_lr = group['rms_target'] * lr * (p.numel()**0.5) / update_norm.add_(group['eps'])
+ update.mul_(scaled_lr)
+ del mean_squared_update, update_norm, scaled_lr
  else:
- denom = vt_buf.sqrt().add_(group['eps'])
- update.div_(denom)
- del denom
-
- # RMS-aligned rescaling
- rms_target = group['rms_target']
- num_elements = update.numel()
- # Add eps to prevent division by zero
- update.mul_(rms_target * (num_elements ** 0.5) / (update.norm() + group['eps']))
-
- del num_elements
-
- update.mul_(lr)
+ # Original AdaMuon Logic
+ vt_buf = state['second_momentum_buffer']
+ vt_buf.mul_(beta2).addcmul_(update, update, value=1 - beta2)
+ # Apply second momentum update (adaptive scaling)
+ if group['use_atan2']:
+ a = 1.2732395
+ denom = vt_buf.sqrt()
+ update.atan2_(denom).mul_(a)
+ else:
+ denom = vt_buf.sqrt().add_(group['eps'])
+ update.div_(denom)
+ del denom
+ # RMS-aligned rescaling
+ rms_target = group['rms_target']
+ num_elements = update.numel()
+ # Add eps to prevent division by zero
+ update.mul_(rms_target * (num_elements ** 0.5) / (update.norm().add_(group['eps'])))
+ del num_elements
+ update.mul_(lr)

  else: # Fallback to standard SGD with momentum for 1D params (biases, etc.)
  # Momentum update
  mt_buf = state['momentum_buffer']
  mt_buf.mul_(beta1).add_(grad)
  if nesterov:
- # Nesterov momentum
  update = grad.add(mt_buf, alpha=beta1)
- # elif Simplified_AdEMAMix: # TODO, it will break SGD since it requires x100 lower LR
+ # FIXME, Simplified_AdEMAMix will break SGD since it requires x100 lower LR
+ # elif Simplified_AdEMAMix:
  # update = mt_buf.add(grad, alpha=alpha_grad)
  else:
  update = mt_buf.clone()
@@ -599,7 +655,6 @@ class AdaMuon_adv(torch.optim.Optimizer):
  state = self.state[p]


-
  # Determine if using Adam or Muon based on state keys
  # We can use optm_type but I see this as a safer way.
  if 'momentum_buffer' in state or 'mu_mbuf_nmf' in state:
@@ -611,32 +666,41 @@ class AdaMuon_adv(torch.optim.Optimizer):
  is_compiled = group.get('compiled_optimizer', False)

  if use_adam:
+ step = state['step']
+
  if self.kourkoutas_helper:
  # Prepare Kourkoutas-β once per optimizer step.
- self.kourkoutas_helper.maybe_prepare_step(self.global_step)
+ self.kourkoutas_helper.maybe_prepare_step(step)
+
  # Adam-specific setup (bias correction)
  if group['adam_use_bias_correction']:
- current_step = self.global_step + 1
+ current_step = step + 1
  beta1_adam, beta2_adam = group['adam_betas']
  bias_correction1 = 1.0 - beta1_adam ** current_step
  bias_correction2 = 1.0 - beta2_adam ** current_step
  else:
  bias_correction1 = 1.0
  bias_correction2 = 1.0
+
+ self.state[p]['step'] += 1
+
  # Dispatch to compiled or uncompiled Adam step
  if is_compiled and self._compiled_adam_step is not None:
- # Tensors must be used for compiled functions
- lr_tensor = torch.tensor(lr, device=p.device)
- bc1_tensor = torch.tensor(bias_correction1, device=p.device)
- bc2_tensor = torch.tensor(bias_correction2, device=p.device)
- self._compiled_adam_step(p, grad, state, group, lr_tensor, bc1_tensor, bc2_tensor)
+ # convert to tensors for compiled path once a step
+ if not hasattr(self, 'lr_adam_tensor') or self.lr_adam_tensor is None:
+ self.lr_adam_tensor = torch.tensor(group['lr'])
+ self.bc1 = torch.tensor(bias_correction1)
+ self.bc2 = torch.tensor(bias_correction2)
+ self._compiled_adam_step(p, grad, state, group, self.lr_adam_tensor, self.bc1, self.bc2)
  else:
  self._adam_step_parameter(p, grad, state, group, lr, bias_correction1, bias_correction2)
  else: # Muon path
  # Dispatch to compiled or uncompiled Muon step
  if is_compiled and self._compiled_muon_step is not None:
- lr_tensor = torch.tensor(lr, device=p.device)
- self._compiled_muon_step(p, grad, state, group, lr_tensor)
+ # convert to tensors for compiled path once a step
+ if not hasattr(self, 'lr_tensor') or self.lr_tensor is None:
+ self.lr_tensor = torch.tensor(group['lr'])
+ self._compiled_muon_step(p, grad, state, group, self.lr_tensor)
  else:
  self._muon_step_parameter(p, grad, state, group, lr)

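The compiled-path change above caches the lr and bias-correction tensors on the optimizer once per step instead of allocating fresh ones per parameter. The motivation, sketched under the assumption that torch.compile specializes on Python scalars (so a changing float argument can force recompiles, while a 0-dim tensor is treated as an input):

```python
# Illustrative sketch, not the package's code: pass lr as a tensor so the
# compiled function is not re-specialized each time the value changes.
import torch

@torch.compile
def sgd_step(p: torch.Tensor, g: torch.Tensor, lr: torch.Tensor):
    p.sub_(g * lr)            # toy stand-in for _compiled_adam/_muon_step

p, g = torch.randn(4), torch.randn(4)
lr = torch.tensor(1e-3)       # built once per optimizer step, reused per param
sgd_step(p, g, lr)
lr.fill_(5e-4)                # new value, same tensor: no recompilation
sgd_step(p, g, lr)
```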
@@ -659,6 +723,8 @@ class AdaMuon_adv(torch.optim.Optimizer):
  for i, p in enumerate(group['params']):
  self.step_parameter(p, group, i)

- self.global_step += 1
-
+ if self.param_groups[0].get('compiled_optimizer', False):
+ # Reset compile tensors once a step
+ self.lr_tensor = None
+ self.lr_adam_tensor = None
  return loss
adv_optm/optim/Muon_adv.py CHANGED

@@ -60,6 +60,9 @@ class Muon_adv(torch.optim.Optimizer):
  normuon_lr_scale (float): Scaling factor for the NorMuon learning rate.
  (default: 0.2)
  normuon_atan2 (bool): whether to use the atan2 for NorMuon. (default: False)
+ accelerated_ns (bool): If True, enables Chebyshev-accelerated Newton-Schulz, which
+ dynamically calculates optimal 3rd-order polynomial coefficients. (default: False)
+ cns_a_bound (float): Initial lower bound for singular values for CANS. (default: 1e-4)
  --- Auxiliary AdamW_adv Parameters (used for 'adam' groups) ---
  adam_betas (tuple[float, float]): Betas for the AdamW optimizer part.
  adam_eps (float): Epsilon for the AdamW optimizer part.
@@ -100,6 +103,9 @@ class Muon_adv(torch.optim.Optimizer):
  normuon_eps: float = 1e-8,
  normuon_lr_scale: float = 0.2,
  normuon_atan2: bool = False,
+ # CANS
+ accelerated_ns: bool = False,
+ cns_a_bound: float = 1e-4,
  # Compiled
  compiled_optimizer: bool = False,
  # --- AdamW_adv specific parameters ---
@@ -148,6 +154,8 @@ class Muon_adv(torch.optim.Optimizer):
  "normuon_variant": normuon_variant, "beta2_normuon": beta2_normuon,
  "normuon_eps": normuon_eps, "normuon_lr_scale": normuon_lr_scale,
  "normuon_atan2": normuon_atan2,
+ # CANS
+ "accelerated_ns": accelerated_ns, "cns_a_bound": cns_a_bound,
  # AdamW_adv defaults
  "adam_betas": adam_betas, "adam_eps": adam_eps, "adam_weight_decay": adam_weight_decay,
  "adam_use_bias_correction": adam_use_bias_correction, "adam_use_atan2": adam_use_atan2,
@@ -163,7 +171,6 @@ class Muon_adv(torch.optim.Optimizer):

  super().__init__(params, defaults)

- self.global_step = 0 # For Adam bias correction and Kourkoutas
  self.kourkoutas_helper = None
  if any(group.get('adam_kourkoutas_beta', False) for group in self.param_groups):
  self.kourkoutas_helper = KourkoutasHelper(self)
@@ -214,6 +221,7 @@ class Muon_adv(torch.optim.Optimizer):

  if optim_type == 'muon':

+
  state['factored'] = (
  group['nnmf_factor'] and
  not (len(p.shape) == 1 and not group['vector_reshape'])
@@ -238,9 +246,12 @@ class Muon_adv(torch.optim.Optimizer):
  elif len(p.shape) >= 2:
  state['normuon_v'] = torch.zeros(p.shape[0], device=p.device, dtype=torch.float32)

+ group['adam_kourkoutas_beta'] = False

  elif optim_type == 'adam':

+ state['step'] = 0
+
  state['factored'] = (
  group['adam_nnmf_factor'] and
  not (len(p.shape) == 1 and not group['vector_reshape'])
@@ -319,12 +330,12 @@ class Muon_adv(torch.optim.Optimizer):
  Q, _ = torch.linalg.qr(MG)
  projected_M = Q.T @ M
  ortho_projected_M = _newton_schulz_iteration(
- projected_M, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs']
+ projected_M, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs'], cns=group['accelerated_ns'], cns_a_bound=group['cns_a_bound']
  )
  update = Q @ ortho_projected_M
  else: # Fallback for invalid rank
  update = _newton_schulz_iteration(
- update, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs']
+ update, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs'], cns=group['accelerated_ns'], cns_a_bound=group['cns_a_bound']
  )
  else:
  # Original full Newton-Schulz
@@ -333,10 +344,12 @@ class Muon_adv(torch.optim.Optimizer):
  steps=group['ns_steps'],
  eps=group['ns_eps'],
  coeffs=group['ns_coeffs'],
+ cns=group['accelerated_ns'],
+ cns_a_bound=group['cns_a_bound'],
  )


- if group['normuon_variant'] and 'normuon_v' in state:
+ if group['normuon_variant']:
  v_t = state['normuon_v']
  beta2_normuon = group['beta2_normuon']
  # Update 2nd moment estimate
@@ -414,6 +427,8 @@ class Muon_adv(torch.optim.Optimizer):
  steps=group['ns_steps'],
  eps=group['ns_eps'],
  coeffs=group['ns_coeffs'],
+ cns=group['accelerated_ns'],
+ cns_a_bound=group['cns_a_bound'],
  )

  # 5. Project back to the original space
@@ -424,6 +439,8 @@ class Muon_adv(torch.optim.Optimizer):
  steps=group['ns_steps'],
  eps=group['ns_eps'],
  coeffs=group['ns_coeffs'],
+ cns=group['accelerated_ns'],
+ cns_a_bound=group['cns_a_bound'],
  )
  else:
  # Original NewtonSchulz
@@ -432,10 +449,12 @@ class Muon_adv(torch.optim.Optimizer):
  steps=group['ns_steps'],
  eps=group['ns_eps'],
  coeffs=group['ns_coeffs'],
+ cns=group['accelerated_ns'],
+ cns_a_bound=group['cns_a_bound'],
  )

  # NorMuon Logic
- if group['normuon_variant'] and 'normuon_v' in state:
+ if group['normuon_variant']:
  v_t = state['normuon_v']
  beta2_normuon = group['beta2_normuon']
  # Update 2nd moment estimate
@@ -629,10 +648,6 @@ class Muon_adv(torch.optim.Optimizer):

  state = self.state[p]

- if self.kourkoutas_helper:
- # Prepare Kourkoutas-β once per optimizer step.
- self.kourkoutas_helper.maybe_prepare_step(self.global_step)
-
  # Determine if using Adam or Muon based on state keys
  # We can use optm_type but I see this as a safer way.
  if 'momentum_buffer' in state or 'mu_mbuf_nmf' in state:
@@ -644,9 +659,15 @@ class Muon_adv(torch.optim.Optimizer):
  is_compiled = group.get('compiled_optimizer', False)

  if use_adam:
+ step = state['step']
+
+ if self.kourkoutas_helper:
+ # Prepare Kourkoutas-β once per optimizer step.
+ self.kourkoutas_helper.maybe_prepare_step(step)
+
  # Adam-specific setup (bias correction)
  if group['adam_use_bias_correction']:
- current_step = self.global_step + 1
+ current_step = step + 1
  beta1_adam, beta2_adam = group['adam_betas']
  bias_correction1 = 1.0 - beta1_adam ** current_step
  bias_correction2 = 1.0 - beta2_adam ** current_step
@@ -654,6 +675,8 @@ class Muon_adv(torch.optim.Optimizer):
  bias_correction1 = 1.0
  bias_correction2 = 1.0

+ self.state[p]['step'] += 1
+
  # Dispatch to compiled or uncompiled Adam step
  if is_compiled and self._compiled_adam_step is not None:
  # Tensors must be used for compiled functions
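As in AdaMuon_adv, the release moves Adam bias correction off the removed optimizer-wide global_step and onto a per-parameter state['step'] counter. A minimal sketch of bias correction driven by such a counter, with illustrative names:

```python
# Minimal per-parameter Adam bias-correction sketch (illustrative names,
# not the package's _adam_step_parameter).
import torch

def adam_update(p, g, state, lr=1e-3, betas=(0.9, 0.999), eps=1e-8):
    m, v = state['m'], state['v']
    m.mul_(betas[0]).add_(g, alpha=1 - betas[0])
    v.mul_(betas[1]).addcmul_(g, g, value=1 - betas[1])
    state['step'] += 1
    t = state['step']                    # this parameter's own step count
    bc1 = 1.0 - betas[0] ** t
    bc2 = 1.0 - betas[1] ** t
    p.sub_(lr * (m / bc1) / ((v / bc2).sqrt() + eps))

p = torch.randn(10)
state = {'m': torch.zeros(10), 'v': torch.zeros(10), 'step': 0}
adam_update(p, torch.randn(10), state)
```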
@@ -690,6 +713,4 @@ class Muon_adv(torch.optim.Optimizer):
  for i, p in enumerate(group['params']):
  self.step_parameter(p, group, i)

- self.global_step += 1
-
  return loss
adv_optm/util/Kourkoutas.py CHANGED

@@ -71,7 +71,7 @@ class KourkoutasHelper:
  for layer_key, info in self.layer_info.items():
  params, group = info['params'], info['group_ref']

- if not group.get('kourkoutas_beta', False):
+ if not group.get('kourkoutas_beta', False) and not group.get('adam_kourkoutas_beta', False):
  continue

  first_param_in_layer = info['params'][0]
adv_optm/util/Newton_Schulz.py CHANGED

@@ -5,7 +5,9 @@ def _newton_schulz_iteration(
  G: torch.Tensor,
  steps: int = 5,
  eps: float = 1e-7,
- coeffs: tuple[float, float, float] = (3.4445, -4.7750, 2.0315)
+ coeffs: tuple[float, float, float] = (3.4445, -4.7750, 2.0315),
+ cns: bool = False,
+ cns_a_bound: float = 1e-4,
  ) -> torch.Tensor:
  """
  Performs the Newton-Schulz iteration to find the nearest orthogonal matrix.
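Reading ahead to the body diff below: for singular values assumed to lie in $[a, b]$, each CANS step applies the cubic $p(x) = c_1 x + c_3 x^3$ with

$$
e^2 = \frac{a^2 + ab + b^2}{3}, \qquad
\alpha = \frac{6}{2e^3 + a^2 b + a b^2}, \qquad
c_1 = \alpha e^2, \qquad
c_3 = -\frac{\alpha}{3},
$$

after which the bounds contract to $[\,1 - \varepsilon,\ 1 + \varepsilon\,]$ with

$$
\varepsilon = \frac{2e^3 - a^2 b - a b^2}{2e^3 + a^2 b + a b^2}.
$$

These formulas are transcribed directly from the hunk that follows.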
@@ -17,32 +19,69 @@ def _newton_schulz_iteration(
  eps (float): Small constant for numerical stability during normalization.
  coeffs (tuple[float, float, float]): The (a, b, c) coefficients for the
  quintic polynomial update.
-
+ cns (bool): If True, enables Chebyshev-accelerated Newton-Schulz (CANS)
+ using an iterative 3rd-order polynomial with optimal coefficients
+ derived at each step.
+ cns_a_bound (float): The initial lower bound for singular values when
+ using CANS. The upper bound is assumed to be 1.0 after normalization.
  Returns:
  torch.Tensor: The orthogonalized matrix.
  """
- assert G.ndim == 2, "Newton-Schulz iteration only supports 2D matrices."
+ assert G.ndim >= 2

  a, b, c = coeffs

  X = G.to(torch.bfloat16)

- # Normalize the matrix
- X.div_(X.norm() + eps)
-
- # Handle non-square matrices by transposing the taller one
- transposed = G.size(0) > G.size(1)
+ transposed = G.size(-2) > G.size(-1)
  if transposed:
- X = X.T
+ X = X.mT
+
+ X = X / (X.norm(dim=(-2, -1), keepdim=True) + eps)
+
+ if cns:
+ # Chebyshev-accelerated Newton-Schulz (CANS) from
+ # "Accelerating Newton-Schulz Iteration for Orthogonalization via Chebyshev-type Polynomials"
+ # This implements the iterative scheme from Algorithm 1, using the
+ # closed-form 3rd-order polynomial from Proposition 2.
+ lower_bound = cns_a_bound
+ upper_bound = 1.0 # Matrix is normalized, so largest singular value is approx 1.
+
+ for _ in range(steps):
+ # Calculate optimal 3rd-order coefficients c1, c3 for p(x) = c1*x + c3*x^3
+ # based on the current singular value bounds [lower_bound, upper_bound].
+ # Formulas are derived from Proposition 2 and its proof in Appendix B of the paper.
+ a_bound, b_bound = lower_bound, upper_bound
+ term = a_bound*a_bound + a_bound*b_bound + b_bound*b_bound
+ e_sq = term / 3.0
+
+ # Calculate alpha, which scales the polynomial
+ common_den_part = 2.0 * (e_sq**1.5)
+ ab_part = a_bound*a_bound*b_bound + b_bound*b_bound*a_bound
+ alpha_den = common_den_part + ab_part
+ alpha = 6.0 / alpha_den
+
+ c1 = alpha * e_sq
+ c3 = -alpha / 3.0
+
+ # Apply the 3rd-order Newton-Schulz update
+ A = X @ X.mT
+ X = c1 * X + c3 * (A @ X)

- # Perform the iterative updates
- for _ in range(steps):
- A = X @ X.T
- B = b * A + c * (A @ A)
- X = a * X + B @ X
+ # Update the singular value bounds for the next iteration based on the error
+ eps_num = common_den_part - ab_part
+ eps_val = eps_num / alpha_den
+ lower_bound = 1.0 - eps_val
+ upper_bound = 1.0 + eps_val
+ else:
+ # Perform the iterative updates
+ for _ in range(steps):
+ A = X @ X.mT
+ B = b * A + c * (A @ A)
+ X = a * X + B @ X

  # Transpose back if necessary
  if transposed:
- X = X.T
+ X = X.mT

- return X.to(G.dtype)
+ return X.to(G.dtype)
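A quick sanity check of the new path (a sketch, not part of the package; the private import path is inferred from the RECORD listing further below): orthogonalize a tall random matrix with cns=True and measure how far XᵀX is from the identity. Expect a modest value rather than a tiny one, since the iteration runs in bfloat16 for a few steps:

```python
# Hedged sanity check: CANS output should be near-orthogonal.
import torch
from adv_optm.util.Newton_Schulz import _newton_schulz_iteration

G = torch.randn(256, 128)
X = _newton_schulz_iteration(G, steps=5, cns=True, cns_a_bound=1e-4)

gram = X.float().T @ X.float()                 # tall input: X^T X ~ I
err = (gram - torch.eye(128)).abs().max().item()
print(f"max |X^T X - I| entry: {err:.3f}")
```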
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: adv_optm
- Version: 1.2.dev13
+ Version: 1.2.dev15
  Summary: A family of highly efficient, lightweight yet powerful optimizers.
  Home-page: https://github.com/Koratahiu/Advanced_Optimizers
  Author: Koratahiu
@@ -1,23 +1,23 @@
- adv_optm/__init__.py,sha256=b2cBBXd4W_tBKGcxO1SHA5SaQrCl_cjxlHdoZSICo3E,380
- adv_optm/optim/AdaMuon_adv.py,sha256=-UBw_mJj8JzDAi3zQ0nLnSOgzsTzl7b7kVksDRUziEE,30582
+ adv_optm/__init__.py,sha256=I9iRXHonvg_82dEmyKXqt9PyN04Ez8TVbMb1uZgRZAc,380
+ adv_optm/optim/AdaMuon_adv.py,sha256=zjZHFS7ng5KwemQzePjFiGtNZlcgbzmmnqF6A80h_Tg,34652
  adv_optm/optim/AdamW_adv.py,sha256=KL9SCJWZ_ckAQEApB6ofbndVYjancN-v7Us7hJLFf54,17475
  adv_optm/optim/Adopt_adv.py,sha256=S8XI2YA7683jsW8p7igc2YcU30lsN0H18qL02Kpvj8E,21244
  adv_optm/optim/Lion_Prodigy_adv.py,sha256=LEA3UYJpPeFnmxeniLNv1u2LKKj4ufx3Bq_MLw-nWXk,14617
  adv_optm/optim/Lion_adv.py,sha256=aGNAplZlyXYgVllYcV_s4bK8iC4fv6EizFoWIMNLdBc,8299
- adv_optm/optim/Muon_adv.py,sha256=8d99NcXzLyxTbxVVXC8mHyeW7wM8jjK59QoXVTLScQA,32112
+ adv_optm/optim/Muon_adv.py,sha256=QutgiRkDS36O5BQNdcwdIcYBCKPy7U07YYVQT6dq8tc,33165
  adv_optm/optim/Prodigy_adv.py,sha256=lEjbtuQbomsCX39DnTPeI8Z5YG0f2aZPXN_E7-nGgWw,26060
  adv_optm/optim/Simplified_AdEMAMix.py,sha256=nEIA3yM11nBooKzHudB5l3x4UdFRBYRwiKVUkGmO0K8,12971
  adv_optm/optim/__init__.py,sha256=hpUWE6CKtt_rvMdgQVb3PtjhfZAvAxTq6hp8H8rIpBo,489
  adv_optm/util/BF16_Stochastic_Rounding.py,sha256=Q5H0BcogmE4atP65dLoI21HKSf50lRdsBDfeF6v9Tbg,1548
  adv_optm/util/Effective_Shape.py,sha256=TBvIk1V8IuTbbBsxuekJA4e_v8JlR5Nujtut8RTWAm4,318
- adv_optm/util/Kourkoutas.py,sha256=_fq2glPqKmzgWpLedfwq5EqIJAxICUK2fmUP-cdcgq0,7467
+ adv_optm/util/Kourkoutas.py,sha256=SSzhe0B6Zb2AXGwCKpVTLr0aaFfspcFBNZCZG3azI9k,7516
  adv_optm/util/NNMF.py,sha256=yRf5IP5Sjq0Uf0DxN0Q8NxEGSdD-f1ULziLVDOjY8K4,639
- adv_optm/util/Newton_Schulz.py,sha256=wJ_sKRaGVIsOofQ737my4ng494qX_pfgOqlDDmYtnCg,1377
+ adv_optm/util/Newton_Schulz.py,sha256=bBboYw_jm5_FMf0Citl79uqNedkHOTjQnUI7rZgLBmY,3341
  adv_optm/util/One_Bit_Boolean.py,sha256=Wat49esdwohuN-OHOFMW8D0aOQgV9cP5Rl8z6yfmpos,1068
  adv_optm/util/OrthoGrad.py,sha256=NzInuBQGy_Ja__M1R9XbvqVaQ0fhGbtGgFE9YON7B3I,707
  adv_optm/util/__init__.py,sha256=CXzS703GB4gil85khZi7sgKOnbzXGBOltshIOSPqj18,435
- adv_optm-1.2.dev13.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
- adv_optm-1.2.dev13.dist-info/METADATA,sha256=NW2SU3Uw-ow_Nn7B7VURZ4LmWKsem5KebbDrYystfU4,14023
- adv_optm-1.2.dev13.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- adv_optm-1.2.dev13.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
- adv_optm-1.2.dev13.dist-info/RECORD,,
+ adv_optm-1.2.dev15.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+ adv_optm-1.2.dev15.dist-info/METADATA,sha256=CH8IxEUd-TSH5dVzXKR-rl54pTIIU_JTN_MkviBWprs,14023
+ adv_optm-1.2.dev15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ adv_optm-1.2.dev15.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
+ adv_optm-1.2.dev15.dist-info/RECORD,,