PyPI - adv-optm - Versions diffs - 2.5.7__tar.gz → 2.6.dev1__tar.gz - Mend

adv-optm 2.5.7tar.gz → 2.6.dev1tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

{adv_optm-2.5.7 → adv_optm-2.6.dev1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: adv_optm
-Version: 2.5.7
+Version: 2.6.dev1
 Summary: A family of highly efficient, lightweight yet powerful optimizers.
 Home-page: https://github.com/Koratahiu/Advanced_Optimizers
 Author: Koratahiu

{adv_optm-2.5.7 → adv_optm-2.6.dev1}/adv_optm/__init__.py RENAMED Viewed

@@ -20,4 +20,4 @@ __all__ = [
     "SinkSGD_adv",
 ]
-__version__ = "2.5.7"
+__version__ = "2.6.dev1"

{adv_optm-2.5.7 → adv_optm-2.6.dev1}/adv_optm/optim/AdaMuon_adv.py RENAMED Viewed

@@ -106,7 +106,6 @@ class AdaMuon_adv(torch.optim.Optimizer):
             low-rank compression while keeping the first moment (momentum_buffer)
             dense. Ignored when `nnmf_factor=True` (full SMMF) or `normuon_variant=True`.
             Combines well with `state_precision` on the first moment. (default: False)
-        n_layers (int): The depth of the network (L). Required for optimal epsilon scaling. (default: 1)
         spectral_normalization (bool): Enable explicit spectral normalization using power iteration. (default: False)
         --- Auxiliary AdamW_adv Parameters (used for 'adam' groups) ---
         adam_betas (tuple[float, float]): Betas for the AdamW optimizer part.
@@ -177,8 +176,9 @@ class AdaMuon_adv(torch.optim.Optimizer):
         approx_mars: bool = False,
         mars_gamma: float = 0.025,
         # Spectral Normalization
-        n_layers: int = 1,
         spectral_normalization: bool = False,
+        # Orthogonalize the weights (Matrix Sign - MSign) every x steps
+        MSign_interval: int | None = None,
         # Centered WD
         centered_wd: float = 0.0,
         centered_wd_mode: str = 'float8',
@@ -249,7 +249,7 @@ class AdaMuon_adv(torch.optim.Optimizer):
             # MARS-M
             "approx_mars": approx_mars, "mars_gamma": mars_gamma,
             # Spectral Normalization
-            "n_layers": n_layers, "spectral_normalization": spectral_normalization,
+            "spectral_normalization": spectral_normalization, "MSign_interval": MSign_interval,
             # Centered WD
             "centered_wd": centered_wd,
             "centered_wd_mode": centered_wd_mode,

{adv_optm-2.5.7 → adv_optm-2.6.dev1}/adv_optm/optim/AdamW_adv.py RENAMED Viewed

@@ -121,6 +121,8 @@ class AdamW_adv(torch.optim.Optimizer):
         layer_key_fn: Optional[Callable] = None,
         # Spectral Normed Optimizer
         spectral_normalization: bool = False,
+        # Orthogonalize the weights (Matrix Sign - MSign) every x steps
+        MSign_interval: int | None = None,
         # Centered WD
         centered_wd: float = 0.0,
         centered_wd_mode: str = 'float8',
@@ -163,7 +165,7 @@ class AdamW_adv(torch.optim.Optimizer):
             "compiled_optimizer": compiled_optimizer,
             "kourkoutas_beta": kourkoutas_beta, "beta2_min": beta2_min, "ema_alpha": ema_alpha,
             "tiny_spike": tiny_spike, "k_warmup_steps": k_warmup_steps, "k_logging": k_logging,
-            "spectral_normalization": spectral_normalization,
+            "spectral_normalization": spectral_normalization, "MSign_interval": MSign_interval,
             "centered_wd": centered_wd, "centered_wd_mode": centered_wd_mode,
             "state_precision": state_precision,
             "nnmf_factor": nnmf_factor, "vector_reshape": vector_reshape, "factored_2nd": factored_2nd

{adv_optm-2.5.7 → adv_optm-2.6.dev1}/adv_optm/optim/Adopt_adv.py RENAMED Viewed

@@ -122,6 +122,8 @@ class Adopt_adv(torch.optim.Optimizer):
         layer_key_fn: Optional[Callable] = None,
         # Spectral Normed Optimizer
         spectral_normalization: bool = False,
+        # Orthogonalize the weights (Matrix Sign - MSign) every x steps
+        MSign_interval: int | None = None,
         # Centered WD
         centered_wd: float = 0.0,
         centered_wd_mode: str = 'float8',
@@ -162,7 +164,7 @@ class Adopt_adv(torch.optim.Optimizer):
             "nesterov": nesterov, "nesterov_coef": nesterov_coef,
             "kourkoutas_beta": kourkoutas_beta, "beta2_min": beta2_min, "ema_alpha": ema_alpha,
             "tiny_spike": tiny_spike, "k_warmup_steps": k_warmup_steps, "k_logging": k_logging,
-            "spectral_normalization": spectral_normalization,
+            "spectral_normalization": spectral_normalization, "MSign_interval": MSign_interval,
             "centered_wd": centered_wd,
             "centered_wd_mode": centered_wd_mode,
             "state_precision": state_precision,

{adv_optm-2.5.7 → adv_optm-2.6.dev1}/adv_optm/optim/Lion_adv.py RENAMED Viewed

@@ -78,6 +78,8 @@ class Lion_adv(torch.optim.Optimizer):
         centered_wd_mode: str = 'float8',
         # Spectral Normed Optimizer
         spectral_normalization: bool = False,
+        # Orthogonalize the weights (Matrix Sign - MSign) every x steps
+        MSign_interval: int | None = None,
         # SMMF factorization
         nnmf_factor: bool = False,
         vector_reshape: bool = False,
@@ -102,6 +104,7 @@ class Lion_adv(torch.optim.Optimizer):
             auto_kappa_p=auto_kappa_p,
             stochastic_sign=stochastic_sign,
             spectral_normalization=spectral_normalization,
+            MSign_interval=MSign_interval,
             nnmf_factor=nnmf_factor,
             centered_wd= centered_wd,
             centered_wd_mode= centered_wd_mode,

{adv_optm-2.5.7 → adv_optm-2.6.dev1}/adv_optm/optim/Muon_adv.py RENAMED Viewed

@@ -81,7 +81,6 @@ class Muon_adv(torch.optim.Optimizer):
             'float8': Uses torch.float8_e4m3fn for a balance of precision and memory.
             'int8': Uses 8-bit block-wise quantization (block size 128).
             'int4': Uses 4-bit block-wise quantization (block size 32).
-        n_layers (int): The depth of the network (L). Required for optimal epsilon scaling. (default: 1)
         spectral_normalization (bool): Enable explicit spectral normalization using power iteration. (default: False)
         --- Auxiliary AdamW_adv Parameters (used for 'adam' groups) ---
         adam_betas (tuple[float, float]): Betas for the AdamW optimizer part.
@@ -146,8 +145,9 @@ class Muon_adv(torch.optim.Optimizer):
         approx_mars: bool = False,
         mars_gamma: float = 0.025,
         # Spectral Normalization
-        n_layers: int = 1,
         spectral_normalization: bool = False,
+        # Orthogonalize the weights (Matrix Sign - MSign) every x steps
+        MSign_interval: int | None = None,
         # Centered WD
         centered_wd: float = 0.0,
         centered_wd_mode: str = 'float8',
@@ -220,7 +220,7 @@ class Muon_adv(torch.optim.Optimizer):
             # MARS-M
             "approx_mars": approx_mars, "mars_gamma": mars_gamma,
             # Spectral Normalization
-            "n_layers": n_layers, "spectral_normalization": spectral_normalization,
+            "spectral_normalization": spectral_normalization, "MSign_interval": MSign_interval,
             # Centered WD
             "centered_wd": centered_wd,
             "centered_wd_mode": centered_wd_mode,

{adv_optm-2.5.7 → adv_optm-2.6.dev1}/adv_optm/optim/Prodigy_adv.py RENAMED Viewed

@@ -154,6 +154,10 @@ class Prodigy_adv(torch.optim.Optimizer):
         # Centered WD
         centered_wd: float = 0.0,
         centered_wd_mode: str = 'float8',
+        # Spectral Normalization
+        spectral_normalization: bool = False,
+        # Orthogonalize the weights (Matrix Sign - MSign) every x steps
+        MSign_interval: int | None = None,
     ):
         if not (lr >= 0.0):
             raise ValueError(f"Learning-rate should be >= 0.0. Got {lr}")
@@ -190,7 +194,8 @@ class Prodigy_adv(torch.optim.Optimizer):
             "kourkoutas_beta": kourkoutas_beta, "beta2_min": beta2_min, "ema_alpha": ema_alpha,
             "tiny_spike": tiny_spike, "k_warmup_steps": k_warmup_steps, "k_logging": k_logging,
             "centered_wd": centered_wd, "centered_wd_mode": centered_wd_mode,
-            "nnmf_factor": nnmf_factor, "vector_reshape": vector_reshape, "factored_2nd": factored_2nd
+            "nnmf_factor": nnmf_factor, "vector_reshape": vector_reshape, "factored_2nd": factored_2nd,
+            "spectral_normalization": spectral_normalization, "MSign_interval": MSign_interval,
         }
         self.stochastic_rounding = stochastic_rounding
         self.fsdp_in_use = fsdp_in_use

{adv_optm-2.5.7 → adv_optm-2.6.dev1}/adv_optm/optim/SignSGD_adv.py RENAMED Viewed

@@ -79,6 +79,8 @@ class SignSGD_adv(torch.optim.Optimizer):
         state_precision: str = "auto", # 'fp32', 'factored', 'bf16_sr', 'int8_sr'.
         # Spectral Normed Optimizer
         spectral_normalization: bool = False,
+        # Orthogonalize the weights (Matrix Sign - MSign) every x steps
+        MSign_interval: int | None = None,
         # SMMF factorization
         nnmf_factor: bool = False,
         vector_reshape: bool = False,
@@ -117,6 +119,7 @@ class SignSGD_adv(torch.optim.Optimizer):
             normed_momentum=normed_momentum,
             snr_cond=snr_cond,
             spectral_normalization=spectral_normalization,
+            MSign_interval=MSign_interval,
             centered_wd= centered_wd,
             centered_wd_mode= centered_wd_mode,
             state_precision=state_precision,

{adv_optm-2.5.7 → adv_optm-2.6.dev1}/adv_optm/optim/SinkSGD_adv.py RENAMED Viewed

@@ -72,6 +72,8 @@ class SinkSGD_adv(torch.optim.Optimizer):
         orthogonal_gradient: str = 'disabled', # 'flattened', 'iterative'
         # Spectral Normed Optimizer
         spectral_normalization: bool = False,
+        # Orthogonalize the weights (Matrix Sign - MSign) every x steps
+        MSign_interval: int | None = None,
         # Centered WD
         centered_wd: float = 0.0,
         centered_wd_mode: str = 'float8',
@@ -108,7 +110,7 @@ class SinkSGD_adv(torch.optim.Optimizer):
             "compiled_optimizer": compiled_optimizer,
             "sinkhorn_iterations": sinkhorn_iterations,
             "orthogonal_sinkhorn": orthogonal_sinkhorn,
-            "spectral_normalization": spectral_normalization,
+            "spectral_normalization": spectral_normalization, "MSign_interval": MSign_interval,
             "centered_wd": centered_wd, "centered_wd_mode": centered_wd_mode,
             "state_precision": state_precision,
             "nnmf_factor": nnmf_factor, "vector_reshape": vector_reshape

{adv_optm-2.5.7 → adv_optm-2.6.dev1}/adv_optm/util/Kourkoutas.py RENAMED Viewed

@@ -168,8 +168,6 @@ class KourkoutasHelper:
             # Update the persistent EMA tensor in-place.
             r_ema_tensor.mul_(ema_alpha).add_(pooled_grad_norm, alpha=1.0 - ema_alpha)
-            tiny_spike = scale_tiny_spike(group, info['params'], tiny_spike)
             # Calculate Beta2
             raw = pooled_grad_norm / (r_ema_tensor + tiny_spike)
             sun = raw / (1.0 + raw)
@@ -255,29 +253,3 @@ class KourkoutasHelper:
         # The default is the max value, which is correct for unmapped params or edge cases
         beta2_default = group.get('betas', group.get('adam_betas'))[1] if group.get('betas', group.get('adam_betas')) else 0.999
         return self.layer_state.get(layer_key, {}).get('dynamic_beta2', beta2_default)
-def scale_tiny_spike(group: dict, layer_params: list, tiny_spike: float) -> float:
-    """
-    Derives scale-invariant tiny_spike from the EMA tensor's effective numel.
-    """
-    if not group.get('spectral_normalization', False):
-        return tiny_spike
-    p0 = layer_params[0]
-    if getattr(p0, '_is_lora_A', False) or p0.ndim < 2:
-        # No depth scaling for:
-        # - lora_A: non-zero init, different gradient dynamics than B
-        # - 1D params (biases, norms, DoRA scales): additive, don't compound through depth.
-        L = 1
-    else:
-        L = group['n_layers']
-    if getattr(p0, '_is_lora_A', False) or getattr(p0, '_is_oft', False):
-        ema_numel = p0.shape[1] # (1, in_features)
-    elif getattr(p0, '_is_lora_B', False):
-        ema_numel = p0.shape[0] # (out_features, 1)
-    else:
-        ema_numel = sum(p.numel() for p in layer_params) # scalar EMA
-    return 1.0 / (L * math.sqrt(ema_numel))

{adv_optm-2.5.7 → adv_optm-2.6.dev1}/adv_optm/util/Muon_AuxAdam.py RENAMED Viewed

@@ -66,6 +66,7 @@ def _init_auxadam_state(self, p, group):
     _init_anchor(p, state, group)
     _init_fisher_wd_scaler(group, state, p)
+    group["MSign_interval"] = None
 @torch.no_grad()
 def _adam_step_parameter(self, p, grad, state, group, beta1_adam, beta2_adam, sqrt_bias_correction2, step_size, random_int_tensor, random_int_state_tensor=None):

adv_optm-2.6.dev1/adv_optm/util/msign.py ADDED Viewed

@@ -0,0 +1,114 @@
+import torch
+import math
+@torch.no_grad()
+def msign_ortho_precond_(p):
+    """
+    Applies an orthogonal preconditioner to a parameter tensor in-place using Matrix Sign.
+    Originally proposed in the paper:
+    "MSign: An Optimizer Preventing Training Instability in Large Language Models via Stable Rank Restoration"(arxiv:2602.01734)
+    Modified to use CANS instead of SVD.
+    Args:
+        p (torch.Tensor): The parameter tensor to be preconditioned. Must be a dense
+            tensor. Vectors and biases should generally be excluded before calling this.
+            The operation modifies `p` in-place.
+    Returns:
+        None. `p` is overwritten with the approximate matrix sign of its 2D view,
+        rescaled to the norm `p` had on entry.
+    Note:
+        - The result is rescaled so its Frobenius norm matches the one measured on
+          entry (up to floating-point error; both norms are clamped to at least 1e-12
+          before the ratio is formed).
+        - High-dimensional tensors (e.g., Conv2D weights) are reshaped into
+          `(out_channels, in_channels * ...)` automatically before computation.
+        - The 2D reshape uses `Tensor.view`, so `p` presumably needs a view-compatible
+          (e.g. contiguous) memory layout -- TODO confirm callers guarantee this.
+    """
+    # Record the original Frobenius norm of the weight matrix.
+    # This must happen before the iteration below: `p_2d` aliases `p`, and the
+    # Newton-Schulz helper normalizes its input in place.
+    orig_norm = torch.linalg.vector_norm(p, ord=2).clamp_min_(1e-12)
+    # Reshape parameter to 2D for the matrix operation (a view, not a copy)
+    p_2d = p.view(p.shape[0], -1)
+    # Approximate the matrix sign UV^T using CA Newton-Schulz
+    p_sign = _cans_newton_schulz_iteration(
+        p_2d,
+        steps=15,
+        eps=1e-7,
+        cns_a_bound=None, # auto
+    )
+    # Calculate the Frobenius norm of the sign-projected matrix
+    sign_norm = torch.linalg.vector_norm(p_sign, ord=2).clamp_min_(1e-12)
+    # Restore original Frobenius norm scale and write back in-place.
+    # With steps >= 1 the helper returns a freshly allocated tensor, so `mul_`
+    # only touches that temporary; `p` itself changes at `copy_`.
+    p.copy_(p_sign.mul_(orig_norm / sign_norm).view_as(p))
+@torch.no_grad()
+def _cans_newton_schulz_iteration(
+    G: torch.Tensor,
+    steps: int = 15,
+    eps: float = 1e-7,
+    cns_a_bound: float | None = None,
+) -> torch.Tensor:
+    """
+    Computes the matrix sign function using Chebyshev-Optimized Newton-Schulz (CANS) iterations.
+    This function iteratively approximates the orthogonalized version of a matrix (i.e., UV^T from SVD)
+    using purely matrix multiplications. This is highly optimized for Tensor Cores on modern GPUs
+    and is significantly faster than computing an exact SVD.
+    Args:
+        G (torch.Tensor): The input matrix/tensor to be orthogonalized. Must be at least 2D.
+            NOTE: `G` is divided by its norm in place before iterating, so the
+            caller's tensor is modified as a side effect.
+        steps (int, optional): Number of Newton-Schulz iterations. Deep learning models
+            usually only require 5 or 6 steps for sufficient orthogonal preconditioner
+            precision due to quadratic convergence. Defaults to 15.
+        eps (float, optional): A small epsilon value used to prevent division by zero
+            during the initial norm normalization. Defaults to 1e-7.
+        cns_a_bound (float | None, optional): The lower bound for singular values after
+            normalization. If None, it is automatically calculated based on the Marchenko-Pastur
+            theoretical minimum and baseline L2 bounds. Defaults to None.
+    Returns:
+        torch.Tensor: The approximate matrix sign of G (orthogonalized matrix), preserving
+        the original shape and device of G.
+    """
+    # X aliases G (no copy is made); the in-place division below therefore writes into G.
+    X = G
+    # Transpose if needed so that rows <= columns, which keeps the Gram matrix
+    # A = X @ X.mT at the smaller of the two possible sizes.
+    transposed = X.size(-2) > X.size(-1)
+    if transposed:
+        X = X.mT
+    # Divide in place by the norm over the last two dims (Frobenius by default),
+    # which bounds the spectral norm by 1.
+    X.div_(X.norm(dim=(-2, -1), keepdim=True).clamp_min_(eps))
+    if cns_a_bound is None:
+        # Uses G's own (untransposed) shape; both formulas are symmetric in M and N.
+        M = G.shape[-2]
+        N = G.shape[-1]
+        # baseline L2 norm bound (for square matrices)
+        baseline_bound = 1.0 / math.sqrt(M * N)
+        # Marchenko-Pastur theoretical minimum (for rectangular matrices)
+        mp_bound = abs(1.0 / math.sqrt(N) - 1.0 / math.sqrt(M))
+        # The optimal bound is safely the maximum of the two
+        cns_a_bound = max(baseline_bound, mp_bound)
+    # Assumed interval [lower_bound, upper_bound] containing the singular values of X;
+    # it is tightened analytically after every step (plain Python floats, no tensor ops).
+    lower_bound = cns_a_bound
+    upper_bound = 1.0
+    for _ in range(steps):
+        lb, ub = lower_bound, upper_bound
+        lb_ub = lb * ub
+        # Calculate Mean Square Error term
+        e_sq = (lb**2 + lb_ub + ub**2) / 3.0
+        # Calculate components for alpha and bounds update
+        K = 2.0 * e_sq**1.5
+        L = lb_ub * (lb + ub)
+        denom = K + L
+        alpha = 6.0 / denom
+        # Coefficients of the odd cubic applied to X: c1 * X + c3 * (X X^T X)
+        c1 = alpha * e_sq
+        c3 = -alpha / 3.0
+        # Apply the 3rd-order Newton-Schulz update (rebinds X to a new tensor,
+        # so from the first step onward G is no longer written to)
+        A = X @ X.mT
+        X = c1 * X + c3 * (A @ X)
+        # Update the singular value bounds for the next iteration based on the error
+        eps_val = (K - L) / denom
+        lower_bound, upper_bound = 1.0 - eps_val, 1.0 + eps_val
+    # Transpose back if necessary
+    if transposed:
+        X = X.mT
+    return X

{adv_optm-2.5.7 → adv_optm-2.6.dev1}/adv_optm/util/param_update.py RENAMED Viewed

@@ -8,6 +8,7 @@ from typing import Dict, Any
 from .scaled_optm import adjust_wds
 from .centered_decay import dequantize_anchor
+from .msign import msign_ortho_precond_
 _generators: Dict[torch.device, torch.Generator] = {}
@@ -120,6 +121,11 @@ def apply_parameter_update(
     state = self.state[p]
+    ortho_interval = group.get('MSign_interval', None)
+    if ortho_interval is not None:
+        is_vector = p.ndim < 2 or getattr(p, '_is_dora_scale', False) or getattr(p, 'is_vector', False)
+        is_ortho_step = (state['step'] % ortho_interval == 0) and not is_vector
     # Compute full update in float32 if using bfloat16 with stochastic rounding
     if p.dtype == torch.bfloat16 and self.stochastic_rounding:
         p_fp32 = p.float()
@@ -135,6 +141,9 @@ def apply_parameter_update(
         # Apply main update
         p_fp32.add_(-update_fp32)
+        if is_ortho_step:
+            msign_ortho_precond_(p_fp32)
         # Single stochastic rounding at the end
         if random_int_tensor is not None:
             # Compiled path: use the pre-computed random tensor
@@ -153,6 +162,9 @@ def apply_parameter_update(
         # Apply main update
         p.add_(-update)
+        if is_ortho_step:
+            msign_ortho_precond_(p)
     del update

{adv_optm-2.5.7 → adv_optm-2.6.dev1}/adv_optm.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: adv_optm
-Version: 2.5.7
+Version: 2.6.dev1
 Summary: A family of highly efficient, lightweight yet powerful optimizers.
 Home-page: https://github.com/Koratahiu/Advanced_Optimizers
 Author: Koratahiu

{adv_optm-2.5.7 → adv_optm-2.6.dev1}/adv_optm.egg-info/SOURCES.txt RENAMED Viewed

@@ -24,6 +24,7 @@ adv_optm/util/__init__.py
 adv_optm/util/centered_decay.py
 adv_optm/util/factorization_util.py
 adv_optm/util/lion_k.py
+adv_optm/util/msign.py
 adv_optm/util/param_update.py
 adv_optm/util/scaled_optm.py
 adv_optm/util/signed_util.py

{adv_optm-2.5.7 → adv_optm-2.6.dev1}/setup.py RENAMED Viewed

@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 setup(
     name="adv_optm",
-    version="2.5.7",
+    version="2.6.dev1",
     author="Koratahiu",
     author_email="hiuhonor@gmail.com",
     license='Apache 2.0',