PyPI - adv-optm - Versions diffs - 2.4.dev16__tar.gz → 2.4.dev18__tar.gz - Mend

adv-optm 2.4.dev16__tar.gz → 2.4.dev18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: adv_optm
-Version: 2.4.dev16
+Version: 2.4.dev18
 Summary: A family of highly efficient, lightweight yet powerful optimizers.
 Home-page: https://github.com/Koratahiu/Advanced_Optimizers
 Author: Koratahiu

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/__init__.py RENAMED Viewed

@@ -24,4 +24,4 @@ __all__ = [
     "SinkSGD_adv",
 ]
-__version__ = "2.4.dev16"
+__version__ = "2.4.dev18"

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/optim/AdamW_adv.py RENAMED Viewed

@@ -63,6 +63,7 @@ class AdamW_adv(torch.optim.Optimizer):
             before it is added to the fast momentum term (`update = mt + alpha * mt_slow`).
             A higher value increases the stabilizing influence of the slow
             momentum. (default: 5.0)
+        normed_momentum (bool): whether to compute the first moment on the normalized gradient. (default: False)
         kourkoutas_beta (bool): whether to enable the layer-wise dynamic β₂ logic.
             If `False`, the optimizer behaves as standard AdamW. (default: False)
         beta2_min (float): The minimum value for dynamic β₂, used during periods of
@@ -131,6 +132,8 @@ class AdamW_adv(torch.optim.Optimizer):
         # Nesterov momentum
         nesterov: bool = False,
         nesterov_coef: float | None = None,
+        # Normalization then Momentum
+        normed_momentum: bool = False,
         # K-b (adaptive beta2)
         kourkoutas_beta: bool = False,
         beta2_min: float = 0.9,
@@ -181,6 +184,7 @@ class AdamW_adv(torch.optim.Optimizer):
             "lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay,
             "fisher_wd": fisher_wd, "cautious_wd": cautious_wd,
             "use_atan2": use_atan2, "nesterov": nesterov, "nesterov_coef": nesterov_coef,
+            "normed_momentum": normed_momentum,
             "orthogonal_gradient": orthogonal_gradient, "use_bias_correction": use_bias_correction,
             "beta3_ema": beta3_ema, "alpha": alpha, "compiled_optimizer": compiled_optimizer,
             "kourkoutas_beta": kourkoutas_beta, "beta2_min": beta2_min, "ema_alpha": ema_alpha,
@@ -383,6 +387,27 @@ class AdamW_adv(torch.optim.Optimizer):
             d1, d2 = state['effective_shape']
             grad_reshaped = grad.view(d1, d2)
+            vt = _reconstruct_state((state['mu_v_nmf'], state['mv_v_nmf']), signed=False)
+            if isinstance(beta2, torch.Tensor) and beta2.dim() > 0:
+                vt.mul_(beta2).addcmul_(grad_reshaped, grad_reshaped * (1.0 - beta2))
+            else:
+                vt.mul_(beta2).addcmul_(grad_reshaped, grad_reshaped, value=1.0 - beta2)
+            # Factorize
+            state['mu_v_nmf'], state['mv_v_nmf'] = _factorize_state(vt, signed=False)
+            if group['use_atan2']:
+                denom = vt.sqrt_()
+                denom.div_(sqrt_bias_correction2)
+                if group.get('normed_momentum', False):
+                    grad_reshaped.atan2_(denom)
+            else:
+                denom = vt.sqrt_()
+                denom.div_(sqrt_bias_correction2).add_(adaptive_eps)
+                if group.get('normed_momentum', False):
+                    grad_reshaped.div_(denom)
             # Reconstruct momentum from previous step's factors
             if use_mt:
                 mt = _reconstruct_state((state['mu_m_nmf'], state['mv_m_nmf'], state['sign'], d2), signed=True)
@@ -404,13 +429,6 @@ class AdamW_adv(torch.optim.Optimizer):
                     nv_coef = beta1 if nesterov_coef is None else nesterov_coef
                     update_mt = update_mt.lerp_(grad_reshaped, 1-nv_coef)
-            vt = _reconstruct_state((state['mu_v_nmf'], state['mv_v_nmf']), signed=False)
-            if isinstance(beta2, torch.Tensor) and beta2.dim() > 0:
-                vt.mul_(beta2).addcmul_(grad_reshaped, grad_reshaped * (1.0 - beta2))
-            else:
-                vt.mul_(beta2).addcmul_(grad_reshaped, grad_reshaped, value=1.0 - beta2)
             if self.use_AdEMAMix:
                 mt_slow = _reconstruct_state((state['mu_m_slow_nmf'], state['mv_m_slow_nmf'], state['sign_slow'], d2), signed=True)
@@ -430,17 +448,11 @@ class AdamW_adv(torch.optim.Optimizer):
                 else:
                     update = grad_reshaped.clone()
-            # Factorize
-            state['mu_v_nmf'], state['mv_v_nmf'] = _factorize_state(vt, signed=False)
-            if group['use_atan2']:
-                denom = vt.sqrt_()
-                denom.div_(sqrt_bias_correction2)
-                update.atan2_(denom)
-            else:
-                denom = vt.sqrt_()
-                denom.div_(sqrt_bias_correction2).add_(adaptive_eps)
-                update.div_(denom)
+            if not group.get('normed_momentum', False):
+                if group['use_atan2']:
+                    update.atan2_(denom)
+                else:
+                    update.div_(denom)
             wd_scaler = _get_fisher_wd_scaler(group, state.get("wd_scaler"), p, denom, group['use_atan2'])
@@ -452,6 +464,36 @@ class AdamW_adv(torch.optim.Optimizer):
             actual_precision = group['actual_state_precision']
             factored_2nd = state.get('factored_2nd', False)
+            if factored_2nd:
+                d1, d2 = state['effective_shape']
+                exp_avg_sq = _reconstruct_state((state['mu_v_nmf'], state['mv_v_nmf']), signed=False)
+                exp_avg_sq = exp_avg_sq.view(p.shape)
+            else:
+                exp_avg_sq = get_state(state, 'exp_avg_sq', actual_precision)
+            grad_vt = grad.float() if factored_2nd else grad
+            if isinstance(beta2, torch.Tensor) and beta2.dim() > 0:
+                exp_avg_sq.mul_(beta2).addcmul_(grad_vt, grad_vt * (1.0 - beta2))
+            else:
+                exp_avg_sq.mul_(beta2).addcmul_(grad_vt, grad_vt, value=1.0 - beta2)
+            if factored_2nd:
+                state['mu_v_nmf'], state['mv_v_nmf'] = _factorize_state(exp_avg_sq.view(d1, d2), signed=False)
+            else:
+                set_state(state, 'exp_avg_sq', exp_avg_sq, actual_precision, random_int_state_tensor, non_neg=True)
+            if group['use_atan2']:
+                denom = exp_avg_sq.sqrt()
+                denom.div_(sqrt_bias_correction2)
+                if group.get('normed_momentum', False):
+                    grad.atan2_(denom.to(grad.dtype))
+            else:
+                denom = exp_avg_sq.sqrt()
+                denom.div_(sqrt_bias_correction2).add_(adaptive_eps)
+                if group.get('normed_momentum', False):
+                    grad.div_(denom.to(grad.dtype))
             if use_mt:
                 exp_avg = get_state(state, 'exp_avg', actual_precision)
                 exp_avg.lerp_(grad, 1.0 - beta1)
@@ -481,38 +523,15 @@ class AdamW_adv(torch.optim.Optimizer):
             else:
                 update = update_mt if use_mt else grad.clone()
-            if factored_2nd:
-                d1, d2 = state['effective_shape']
-                exp_avg_sq = _reconstruct_state((state['mu_v_nmf'], state['mv_v_nmf']), signed=False)
-                exp_avg_sq = exp_avg_sq.view(p.shape)
-            else:
-                exp_avg_sq = get_state(state, 'exp_avg_sq', actual_precision)
-            grad_vt = grad.float() if factored_2nd else grad
-            if isinstance(beta2, torch.Tensor) and beta2.dim() > 0:
-                exp_avg_sq.mul_(beta2).addcmul_(grad_vt, grad_vt * (1.0 - beta2))
-            else:
-                exp_avg_sq.mul_(beta2).addcmul_(grad_vt, grad_vt, value=1.0 - beta2)
-            if factored_2nd:
-                state['mu_v_nmf'], state['mv_v_nmf'] = _factorize_state(exp_avg_sq.view(d1, d2), signed=False)
-            else:
-                set_state(state, 'exp_avg_sq', exp_avg_sq, actual_precision, random_int_state_tensor, non_neg=True)
-            del random_int_state_tensor
-            if group['use_atan2']:
-                denom = exp_avg_sq.sqrt()
-                denom.div_(sqrt_bias_correction2)
-                update.atan2_(denom.to(update.dtype))
-            else:
-                denom = exp_avg_sq.sqrt()
-                denom.div_(sqrt_bias_correction2).add_(adaptive_eps)
-                update.div_(denom.to(update.dtype))
+            if not group.get('normed_momentum', False):
+                if group['use_atan2']:
+                    update.atan2_(denom.to(update.dtype))
+                else:
+                    update.div_(denom.to(update.dtype))
             wd_scaler = _get_fisher_wd_scaler(group, state.get("wd_scaler"), p, denom, group['use_atan2'])
-            del denom
+            del denom, random_int_state_tensor
         update_scaling = step_size * A if group['use_atan2'] else step_size
         if group.get('spectral_normalization', False):

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/optim/SinkSGD_adv.py RENAMED Viewed

@@ -1,6 +1,6 @@
 import torch
-from typing import Optional, Callable
+import math
 from ..util import param_update
 from ..util.factorization_util import _get_effective_shape, _reconstruct_state, _factorize_state
@@ -9,7 +9,7 @@ from ..util.OrthoGrad import _orthogonalize_gradient
 from ..util.scaled_optm import scale_update, is_spectral, init_spectral_norm
 from ..util.centered_decay import _init_anchor
 from ..util.state_util import init_state_tensor, get_state, set_state, upcast_grad_for_precision
-from ..util.sinkhorn import apply_sr_sinkhorn
+from ..util.sinkhorn import apply_sr_sinkhorn, _sinkhorn_sq_grad, get_sinkhorn_wd_scaler
 from ..util.signed_util import apply_stochastic_sign_
 class SinkSGD_adv(torch.optim.Optimizer):
@@ -26,8 +26,6 @@ class SinkSGD_adv(torch.optim.Optimizer):
         weight_decay (float): weight decay (L2 penalty or decoupled) (default: 0).
         nesterov (bool): enables Nesterov momentum. Only applicable when momentum
             is non-zero. (default: False)
-        decoupled_wd (bool): whether to apply decoupled weight decay (like AdamW)
-            instead of standard L2 penalty. (default: False)
         cautious_wd (bool): Enables Cautious Weight Decay. If True, weight decay is
             applied only to parameter coordinates where the sign of the parameter
             and the sign of the optimizer update align (default: False).
@@ -61,11 +59,13 @@ class SinkSGD_adv(torch.optim.Optimizer):
         orthogonal_sinkhorn: bool = False,
         # Normalization then Momentum
         normed_momentum: bool = False,
+        # Centered Variance Precondition
+        centered_vt: bool = False,
         # Nesterov Momentum
         nesterov: bool = False,
         nesterov_coef: float | None = None,
-        # Decoupled/cautious weight decay
-        decoupled_wd: bool = False,
+        # weight decay features
+        geometric_wd: bool = False,
         cautious_wd: bool = False,
         # Stochastic Rounding for BF16
         stochastic_rounding: bool = True,
@@ -101,8 +101,8 @@ class SinkSGD_adv(torch.optim.Optimizer):
         defaults = {
             "lr": lr, "momentum": momentum,
-            "weight_decay": weight_decay, "nesterov": nesterov, "nesterov_coef": nesterov_coef, "normed_momentum": normed_momentum,
-            "decoupled_wd": decoupled_wd, "cautious_wd": cautious_wd,
+            "weight_decay": weight_decay, "nesterov": nesterov, "nesterov_coef": nesterov_coef, "normed_momentum": normed_momentum, "centered_vt": centered_vt,
+            "geometric_wd": geometric_wd, "cautious_wd": cautious_wd,
             "orthogonal_gradient": orthogonal_gradient,
             "compiled_optimizer": compiled_optimizer,
             "sinkhorn_iterations": sinkhorn_iterations,
@@ -182,6 +182,11 @@ class SinkSGD_adv(torch.optim.Optimizer):
                     if group['momentum'] != 0:
                         init_state_tensor(state, 'momentum_buffer', p.shape, actual_precision, p.device, dtype)
+                if group.get('centered_vt', False):
+                    p_shape = p.shape
+                    state['vt_row'] = torch.zeros(p_shape[:-1], device=device, dtype=torch.float32)
+                    state['vt_col'] = torch.zeros(p_shape[:-2] + p_shape[-1:], device=device, dtype=torch.float32)
             if group.get('spectral_normalization', False) and is_spectral(p):
                 init_spectral_norm(state, p)
@@ -237,7 +242,7 @@ class SinkSGD_adv(torch.optim.Optimizer):
         if group.get('normed_momentum', False):
             if not is_vector:
                 # Sinkhorn iterative normalization
-                grad = apply_sr_sinkhorn(grad, p, ortho_project=orthogonal_sinkhorn, iters=sinkhorn_iterations)
+                grad = apply_sr_sinkhorn(grad, iters=sinkhorn_iterations, p=p, ortho_project=orthogonal_sinkhorn)
             else:
                 # For vectors, apply adaptive stochastic sign
                 grad = apply_stochastic_sign_(grad, sign_noise, is_vector=is_vector)
@@ -271,6 +276,24 @@ class SinkSGD_adv(torch.optim.Optimizer):
             if momentum != 0:
                 buf = get_state(state, 'momentum_buffer', actual_precision)
+                if group.get('centered_vt', False):
+                    vt_row, vt_col = state['vt_row'], state['vt_col']
+                    grad_vt = grad - buf
+                    grad_vt_sq = grad_vt * grad_vt
+                    mean_row_grad = grad_vt_sq.mean(dim=-1)
+                    mean_col_grad = grad_vt_sq.mean(dim=-2)
+                    vt_row.mul_(momentum).add_(mean_row_grad, alpha=1.0 - momentum)
+                    vt_col.mul_(momentum).add_(mean_col_grad, alpha=1.0 - momentum)
+                    if nesterov:
+                        nv_coef = momentum if nesterov_coef is None else nesterov_coef
+                        vt_row = vt_row.lerp(mean_row_grad, 1.0 - nv_coef)
+                        vt_col = vt_col.lerp(mean_col_grad, 1.0 - nv_coef)
+                    vt = _sinkhorn_sq_grad(vt_row, vt_col)
+                else:
+                    vt_row = None
+                    vt_col = None
                 buf.lerp_(grad, 1 - momentum)
                 set_state(state, 'momentum_buffer', buf, actual_precision, random_int_state_tensor)
@@ -285,21 +308,34 @@ class SinkSGD_adv(torch.optim.Optimizer):
             del random_int_state_tensor
+        if group.get('centered_vt', False):
+            denom = vt
+            update.atan2_(denom)
+        else:
+            denom = None
         if not group.get('normed_momentum', False):
             if not is_vector:
                 # Sinkhorn iterative normalization
-                update = apply_sr_sinkhorn(update, p, ortho_project=orthogonal_sinkhorn, iters=sinkhorn_iterations)
+                update = apply_sr_sinkhorn(update, iters=sinkhorn_iterations, p=p, ortho_project=orthogonal_sinkhorn)
             else:
                 # For vectors, apply adaptive stochastic sign
                 update = apply_stochastic_sign_(update, sign_noise, is_vector=is_vector)
+        if group.get('geometric_wd', False):
+            wd_scaler = get_sinkhorn_wd_scaler(p, row_denom=vt_row, col_denom=vt_col)
+        else:
+            wd_scaler = None
         update_scaling = step_size
         if group.get('spectral_normalization', False):
             update = scale_update(p, update, update_scaling, state=state)
         else:
+            if group.get('centered_vt', False):
+                update_scaling = update_scaling * (4/math.pi)
             update.mul_(update_scaling)
-        param_update.apply_parameter_update(self, p, group, update, step_size, random_int_tensor=random_int_tensor)
+        param_update.apply_parameter_update(self, p, group, update, step_size, random_int_tensor=random_int_tensor, wd_scaler=wd_scaler)
     def compile(self, *args, **kwargs):
         self._compiled_step_parameter = torch.compile(self._step_parameter, *args, **kwargs)

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/util/signed_util.py RENAMED Viewed

@@ -4,15 +4,19 @@ from . import param_update
 def apply_stochastic_sign_(update: torch.Tensor, noise: torch.Tensor | None, is_vector: bool = False) -> torch.Tensor:
     """
-    Applies the Stochastic Sign operator S_R(v).
+    Applies the Iterative L-infinity Stochastic Sign operator.
     Uses uniform noise injection to compute the stochastic sign
     """
     if update.dim() >= 2 and not is_vector:
-        update_abs = update.abs()
-        # Calculate row and col maximums
-        R_col = update_abs.amax(dim=0, keepdim=True) # Shape: (1, cols)
-        R_row = update_abs.amax(dim=1, keepdim=True) # Shape: (rows, 1)
-        R = torch.minimum(R_row, R_col)
+        # Iterative L-infinity Sinkhorn algorithm
+        # This converges in just one iteration
+        # Step 1: Row Max (every row max is 1.0, all values <= 1.0)
+        R_row = torch.linalg.vector_norm(update, ord=float('inf'), dim=1, keepdim=True).clamp_min_(1e-12)
+        update.div_(R_row)
+        # Step 2: Col Max (every col max is 1.0 and every row max stays 1.0)
+        R_col = torch.linalg.vector_norm(update, ord=float('inf'), dim=0, keepdim=True).clamp_min_(1e-12)
+        update.div_(R_col)
     else:
         # Fallback for 1D tensors (e.g., biases, layernorm)
         # Block-wise scaling to protect against outliers
@@ -21,7 +25,8 @@ def apply_stochastic_sign_(update: torch.Tensor, noise: torch.Tensor | None, is_
         if numel <= block_size:
             # Too small to chunk, just use global max
-            R = update.abs().max()
+            R = update.abs().max().clamp_min_(1e-12)
+            update.div_(R)
         else:
             # Calculate how much padding we need to make it divisible by block_size
             remainder = numel % block_size
@@ -41,13 +46,11 @@ def apply_stochastic_sign_(update: torch.Tensor, noise: torch.Tensor | None, is_
             R_blocks = blocks.abs().max(dim=1, keepdim=True).values
             # Broadcast R_blocks back to the padded shape, slice off padding, and restore original shape
-            R = R_blocks.expand_as(blocks).reshape(-1)[:numel].view_as(update)
-    # Prevent division by zero
-    R = R.clamp_min(1e-12)
+            R = R_blocks.expand_as(blocks).reshape(-1)[:numel].view_as(update).clamp_min(1e-12)
+            update.div_(R)
     if noise is None:
         noise = param_update._get_random_noise_for_sso(update)
-    # Chain inplace operations: torch.sign(update / R + noise)
-    return update.div_(R).add_(noise).sign_()
+    # Final stochastic step: sign(v + U[-1, 1])
+    return update.add_(noise).sign_()

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/util/sinkhorn.py RENAMED Viewed

@@ -1,7 +1,7 @@
 import math
 import torch
-def apply_sr_sinkhorn(update: torch.Tensor, p: torch.Tensor, ortho_project: bool, iters: int = 5) -> torch.Tensor:
+def apply_sr_sinkhorn(update: torch.Tensor, iters: int = 5, p: torch.Tensor | None = None, ortho_project: bool = False) -> torch.Tensor:
     """
     Applies Square-Root Sinkhorn (SR-Sinkhorn) multi-normalization.
     As described in 'Gradient Multi-Normalization for Efficient LLM Training'.
@@ -47,13 +47,16 @@ def apply_sr_sinkhorn(update: torch.Tensor, p: torch.Tensor, ortho_project: bool
     # In-place alternating Sinkhorn normalization steps
     for _ in range(iters):
         # First normalization step
-        norm1 = update_2d.norm(p=2, dim=dim, keepdim=True).clamp_min_(1e-12)
+        # Stability floor: equivalent to a single-element vector norm lower bound (lb)
+        norm1_lb = 1 / math.sqrt(update_2d.shape[dim])
+        norm1 = update_2d.norm(p=2, dim=dim, keepdim=True).clamp_min_(norm1_lb)
         update_2d.mul_(scale_first / norm1)
         if ortho_project:
             update_2d = ortho_normed(param_2d, update_2d, p_norm_sq_dim, dim, scale_first)
         # Second normalization step
-        norm2 = update_2d.norm(p=2, dim=1-dim, keepdim=True).clamp_min_(1e-12)
+        norm2_lb = 1 / math.sqrt(update_2d.shape[1-dim])
+        norm2 = update_2d.norm(p=2, dim=1-dim, keepdim=True).clamp_min_(norm2_lb)
         update_2d.mul_(scale_second / norm2)
         if ortho_project:
             update_2d = ortho_normed(param_2d, update_2d, p_norm_sq_adim, 1-dim, scale_second)
@@ -72,6 +75,69 @@ def ortho_normed(p_2d, update_2d, p_norm_sq, dim, target_norm):
     update_2d.addcmul_(proj, p_2d, value=-1.0)
     # Magnitude Preservation
-    g_orth_norm = update_2d.norm(p=2, dim=dim, keepdim=True).clamp_min_(1e-12)
+    norm_lb = 1 / math.sqrt(update_2d.shape[dim])
+    g_orth_norm = update_2d.norm(p=2, dim=dim, keepdim=True).clamp_min_(norm_lb)
     scale_factor = target_norm / g_orth_norm
     return update_2d.mul_(scale_factor)
+def _sinkhorn_sq_grad(
+    vt_row: torch.Tensor,
+    vt_col: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Reconstructs the variance precondition from its rank-1 factors.
+    Modified from:
+    https://github.com/jettify/pytorch-optimizer/blob/master/torch_optimizer/adafactor.py
+    """
+    r_factor = (
+        (vt_row / vt_row.mean(dim=-1).clamp_min_(1e-30))
+        .sqrt_()
+        .unsqueeze(-1)
+    )
+    c_factor = vt_col.unsqueeze(-2).sqrt()
+    return torch.mul(r_factor, c_factor)
+def get_sinkhorn_wd_scaler(
+    p: torch.Tensor,
+    row_denom: torch.Tensor | None = None,
+    col_denom: torch.Tensor | None = None
+):
+    """
+    Computes a structural weight decay multiplier.
+    Penalizes parameters belonging to dominant rows/columns more heavily,
+    while protecting parameters in under-utilized/noisy rows/columns from decay.
+    """
+    if p.ndim < 2:
+        return 1.0
+    p_2d = p.view(p.shape[0], -1)
+    # Lower bounds based on the effective 2D shapes
+    row_lb = 1 / math.sqrt(p_2d.shape[1])
+    col_lb = 1 / math.sqrt(p_2d.shape[0])
+    # Get the norms
+    row_norms = torch.linalg.vector_norm(p_2d, ord=2, dim=1, keepdim=True).clamp_min_(row_lb)
+    col_norms = torch.linalg.vector_norm(p_2d, ord=2, dim=0, keepdim=True).clamp_min_(col_lb)
+    # Compute the structural scaler
+    row_factor = row_norms.sqrt_()
+    col_factor = col_norms.sqrt_()
+    if row_denom is not None and col_denom is not None:
+        # Reshape denominators to ensure safe in-place broadcasting
+        row_denom = row_denom.view(p_2d.shape[0], 1)
+        col_denom = col_denom.view(1, p_2d.shape[1])
+        # High denom (noise) -> smaller angle (protects weights)
+        # Low denom (confident) -> larger angle (decays weights)
+        row_factor.atan2_(row_denom)
+        col_factor.atan2_(col_denom)
+    # Outer product: merges the row and column confidences into a 2D matrix
+    wd_scaler = row_factor * col_factor
+    # Normalize the scaler so its mean is exactly 1.0
+    wd_scaler.div_(wd_scaler.mean().clamp_min_(1e-12))
+    return wd_scaler.view_as(p)

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: adv_optm
-Version: 2.4.dev16
+Version: 2.4.dev18
 Summary: A family of highly efficient, lightweight yet powerful optimizers.
 Home-page: https://github.com/Koratahiu/Advanced_Optimizers
 Author: Koratahiu

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/setup.py RENAMED Viewed

@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 setup(
     name="adv_optm",
-    version="2.4.dev16",
+    version="2.4.dev18",
     author="Koratahiu",
     author_email="hiuhonor@gmail.com",
     license='Apache 2.0',

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/LICENSE RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/README.md RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/optim/AdaMuon_adv.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/optim/Adopt_adv.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/optim/Lion_Prodigy_adv.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/optim/Lion_adv.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/optim/Muon_adv.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/optim/Prodigy_adv.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/optim/SignSGD_adv.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/optim/Simplified_AdEMAMix.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/optim/__init__.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/util/Kourkoutas.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/util/Muon_AuxAdam.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/util/Muon_util.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/util/OrthoGrad.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/util/__init__.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/util/centered_decay.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/util/factorization_util.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/util/lion_k.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/util/param_update.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/util/scaled_optm.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/util/state_util.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm/util/update_util.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm.egg-info/SOURCES.txt RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm.egg-info/requires.txt RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/adv_optm.egg-info/top_level.txt RENAMED Viewed

File without changes

{adv_optm-2.4.dev16 → adv_optm-2.4.dev18}/setup.cfg RENAMED Viewed

File without changes

adv-optm 2.4.dev16__tar.gz → 2.4.dev18__tar.gz

adv-optm 2.4.dev16__tar.gz → 2.4.dev18__tar.gz