PyPI - adv-optm - Version diff - 2.4.dev2.tar.gz → 2.4.dev4.tar.gz - Mend

adv-optm 2.4.dev2.tar.gz → 2.4.dev4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34)

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: adv_optm
-Version: 2.4.dev2
+Version: 2.4.dev4
 Summary: A family of highly efficient, lightweight yet powerful optimizers.
 Home-page: https://github.com/Koratahiu/Advanced_Optimizers
 Author: Koratahiu

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm/__init__.py RENAMED Viewed

@@ -22,4 +22,4 @@ __all__ = [
     "SignSGD_adv",
 ]
-__version__ = "2.4.dev2"
+__version__ = "2.4.dev4"

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm/optim/AdaMuon_adv.py RENAMED Viewed

@@ -280,8 +280,8 @@ class AdaMuon_adv(torch.optim.Optimizer):
     def load_state_dict(self, state_dict: dict) -> None:
         """
         Overrides default load_state_dict to implement a workaround for PyTorch's
-        automatic dtype casting. It ensures factorized states remain float32 for
-        stability, preserves integer/float8 quantized anchor states, and forces
+        automatic dtype casting. It ensures factorized states remain float32 for
+        stability, preserves integer/float8 quantized anchor states, and forces
         standard states onto the parameter's current dtype/device.
         """
         super().load_state_dict(state_dict)

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm/optim/AdamW_adv.py RENAMED Viewed

@@ -91,7 +91,7 @@ class AdamW_adv(torch.optim.Optimizer):
             'int4': Uses 4-bit block-wise quantization (block size 32).
         nnmf_factor (bool): whether to use the factorization or disable it to use
             the uncompressed optimizer. (default: False)
-        factored_2nd (bool): whether to keep the first moment uncompressed (dense)
+        factored_2nd (bool): whether to keep the first moment uncompressed (dense)
             while only factorizing the second moment. (default: True)
     """
@@ -192,8 +192,8 @@ class AdamW_adv(torch.optim.Optimizer):
     def load_state_dict(self, state_dict: dict) -> None:
         """
         Overrides default load_state_dict to implement a workaround for PyTorch's
-        automatic dtype casting. It ensures factorized states remain float32 for
-        stability, preserves integer/float8 quantized anchor states, and forces
+        automatic dtype casting. It ensures factorized states remain float32 for
+        stability, preserves integer/float8 quantized anchor states, and forces
         standard states onto the parameter's current dtype/device.
         """
         super().load_state_dict(state_dict)
@@ -349,7 +349,11 @@ class AdamW_adv(torch.optim.Optimizer):
                     update_mt = mt if not factored_2nd else mt.clone()
             vt = _reconstruct_state((state['mu_v_nmf'], state['mv_v_nmf']), signed=False)
-            vt.mul_(beta2).addcmul_(grad_reshaped, grad_reshaped, value=1.0 - beta2)
+            if isinstance(beta2, torch.Tensor) and beta2.dim() > 0:
+                vt.mul_(beta2).addcmul_(grad_reshaped, grad_reshaped * (1.0 - beta2))
+            else:
+                vt.mul_(beta2).addcmul_(grad_reshaped, grad_reshaped, value=1.0 - beta2)
             if self.use_AdEMAMix:
                 if factored_2nd:
@@ -363,7 +367,7 @@ class AdamW_adv(torch.optim.Optimizer):
                     update = update_mt.add_(mt_slow, alpha=alpha)
                 else:
                     update = grad_reshaped.add(mt_slow, alpha=alpha)
                 if not factored_2nd:
                     # Factorize
                     state['mu_m_slow_nmf'], state['mv_m_slow_nmf'], state['sign_slow'] = _factorize_state(mt_slow, signed=True)
@@ -413,7 +417,10 @@ class AdamW_adv(torch.optim.Optimizer):
                 update = update_mt if beta1 > 0 else grad.clone()
             exp_avg_sq = state['exp_avg_sq']
-            exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
+            if isinstance(beta2, torch.Tensor) and beta2.dim() > 0:
+                exp_avg_sq.mul_(beta2).addcmul_(grad, grad * (1.0 - beta2))
+            else:
+                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)
             if group['use_atan2']:
                 denom = exp_avg_sq.sqrt()

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm/optim/Adopt_adv.py RENAMED Viewed

@@ -107,7 +107,7 @@ class Adopt_adv(torch.optim.Optimizer):
             'int4': Uses 4-bit block-wise quantization (block size 32).
         nnmf_factor (bool): whether to use the factorization or disable it to use
             the uncompressed optimizer. (default: False)
-        factored_2nd (bool): whether to keep the first moment uncompressed (dense)
+        factored_2nd (bool): whether to keep the first moment uncompressed (dense)
             while only factorizing the second moment. (default: True)
     """
@@ -189,7 +189,7 @@ class Adopt_adv(torch.optim.Optimizer):
             "scaled_optm": scaled_optm,
             "centered_wd": centered_wd,
             "centered_wd_mode": centered_wd_mode,
-            "nnmf_factor": nnmf_factor, "vector_reshape": vector_reshape, "factored_2nd": factored_2nd,
+            "nnmf_factor": nnmf_factor, "vector_reshape": vector_reshape, "factored_2nd": factored_2nd,
             "compiled_optimizer": compiled_optimizer,
         }
         self.clip_lambda = clip_lambda
@@ -222,8 +222,8 @@ class Adopt_adv(torch.optim.Optimizer):
     def load_state_dict(self, state_dict: dict) -> None:
         """
         Overrides default load_state_dict to implement a workaround for PyTorch's
-        automatic dtype casting. It ensures factorized states remain float32 for
-        stability, preserves integer/float8 quantized anchor states, and forces
+        automatic dtype casting. It ensures factorized states remain float32 for
+        stability, preserves integer/float8 quantized anchor states, and forces
         standard states onto the parameter's current dtype/device.
         """
         super().load_state_dict(state_dict)
@@ -244,6 +244,19 @@ class Adopt_adv(torch.optim.Optimizer):
         grad = p.grad
         state = self.state[p]
+        beta1, beta2 = group['betas']
+        if group.get('kourkoutas_beta', False):
+            if 'step' not in state:
+                current_step = 0
+            else:
+                current_step = state['step']
+            # Call prepare_step() once at the beginning of the step for all params
+            self.kourkoutas_helper.maybe_prepare_step(current_step, p.device)
+            # Get the dynamic beta2 calculated in prepare_step()
+            beta2 = self.kourkoutas_helper.get_beta2(p, group)
         # State Initialization
         if 'step' not in state:
             state['step'] = 0
@@ -256,6 +269,12 @@ class Adopt_adv(torch.optim.Optimizer):
             dtype = torch.float32 if state['factored'] else p.dtype
+            vt_init = grad.pow(2).to(dtype)
+            if isinstance(beta2, torch.Tensor) and beta2.dim() > 0:
+                vt_init.mul_(beta2).addcmul_(grad.to(dtype), grad.to(dtype) * (1.0 - beta2))
+            else:
+                vt_init.mul_(beta2).addcmul_(grad.to(dtype), grad.to(dtype), value=1.0 - beta2)
             if state['factored']:
                 state['effective_shape'] = _get_effective_shape(p.numel())
                 d1, d2 = state['effective_shape']
@@ -279,33 +298,21 @@ class Adopt_adv(torch.optim.Optimizer):
                     if self.use_AdEMAMix:
                         state['exp_avg_slow'] = torch.zeros_like(p, device=p.device, dtype=dtype)
                 # Second moment (v)
-                vt_init = grad.to(dtype).view(d1, d2).square()
-                # Allocate NMF factors for vt
-                state['mu_v_nmf'] = torch.zeros(d1, device=p.device, dtype=dtype)
-                state['mv_v_nmf'] = torch.zeros(d2, device=p.device, dtype=dtype)
-                # Initialize v_0
-                state['mu_v_nmf'], state['mv_v_nmf'] = _nnmf(vt_init)
+                state['mu_v_nmf'], state['mv_v_nmf'] = _nnmf(vt_init.view(d1, d2))
                 del vt_init
             else: # Fallback for non-factored tensors
                 if group['betas'][0] > 0:
                     state['exp_avg'] = torch.zeros_like(p, device=p.device, dtype=dtype)
                 if self.use_AdEMAMix:
                     state['exp_avg_slow'] = torch.zeros_like(p, device=p.device, dtype=dtype)
-                state['exp_avg_sq'] = grad.to(dtype).square()
+                state['exp_avg_sq'] = vt_init
             if group.get('scaled_optm', False) and is_spectral(p):
                 init_spectral_norm(group, state, p)
             _init_anchor(p, state, group)
-        beta1, beta2 = group['betas']
         current_step = state['step']
-        if group.get('kourkoutas_beta', False):
-            # Call prepare_step() once at the beginning of the step for all params
-            self.kourkoutas_helper.maybe_prepare_step(current_step, p.device)
-            # Get the dynamic beta2 calculated in prepare_step()
-            beta2 = self.kourkoutas_helper.get_beta2(p, group)
         # The first step is for initialization only (skip when use_atan2 as it's scale invariant).
         if state['step'] == 0 and not self.use_atan2:
@@ -361,7 +368,10 @@ class Adopt_adv(torch.optim.Optimizer):
             denom = vt.sqrt()
             # Update second moment v_t for the *next* step using raw g_t
-            vt.mul_(beta2).addcmul_(grad_reshaped, grad_reshaped, value=1.0 - beta2)
+            if isinstance(beta2, torch.Tensor) and beta2.dim() > 0:
+                vt.mul_(beta2).addcmul_(grad_reshaped, grad_reshaped * (1.0 - beta2))
+            else:
+                vt.mul_(beta2).addcmul_(grad_reshaped, grad_reshaped, value=1.0 - beta2)
             # Factorize
             state['mu_v_nmf'], state['mv_v_nmf'] = _factorize_state(vt, signed=False)
             del vt
@@ -475,9 +485,11 @@ class Adopt_adv(torch.optim.Optimizer):
                 else:
                     update = normalized_grad
             # Update second moment v_t for the next step using raw g_t
-            vt.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
+            if isinstance(beta2, torch.Tensor) and beta2.dim() > 0:
+                vt.mul_(beta2).addcmul_(grad, grad * (1.0 - beta2))
+            else:
+                vt.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
         update_scaling = lr * A if self.use_atan2 else lr

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm/optim/Lion_adv.py RENAMED Viewed

@@ -8,6 +8,7 @@ from ..util.factorization_util import _get_effective_shape, _reconstruct_state,
 from ..util.lion_k import _get_lion_k_update
 from ..util.scaled_optm import scale_update, is_spectral, init_spectral_norm
 from ..util.centered_decay import _init_anchor
+from ..util.update_util import _get_l1_adaptive_lr
 class Lion_adv(torch.optim.Optimizer):
@@ -46,7 +47,7 @@ class Lion_adv(torch.optim.Optimizer):
             updates. Overrides explicit kappa_p value. (default: False).
         freeze_on_flip (bool): Projected SignGD One-hit freeze. Masks updates for
             coordinates where the gradient sign flips compared to the previous step. (default: False)
-        l1_adaptive (bool): Scales learning rate dynamically
+        l1_adaptive (bool): Scales learning rate dynamically
             by the L1 norm of the gradient to handle gradient heterogeneity. (default: False).
         centered_wd (float): Centered Weight Decay coefficient. Instead of decaying weights
             toward zero, they are decayed toward their initial values (anchors). This
@@ -137,8 +138,8 @@ class Lion_adv(torch.optim.Optimizer):
     def load_state_dict(self, state_dict: dict) -> None:
         """
         Overrides default load_state_dict to implement a workaround for PyTorch's
-        automatic dtype casting. It ensures factorized states remain float32 for
-        stability, preserves integer/float8 quantized anchor states, and forces
+        automatic dtype casting. It ensures factorized states remain float32 for
+        stability, preserves integer/float8 quantized anchor states, and forces
         standard states onto the parameter's current dtype/device.
         """
         super().load_state_dict(state_dict)
@@ -251,8 +252,7 @@ class Lion_adv(torch.optim.Optimizer):
             # Compute update term c_t
             update = torch.lerp(grad_reshaped, exp_avg, beta1)
-            if group.get("l1_adaptive", False) and kappa_p == 1:
-                lr = lr * (update.norm(p=1))
+            l1_mean = _get_l1_adaptive_lr(p, update, state, group, kappa_p)
             # Standard Lion momentum update
             # m_t = beta2 * m_{t-1} + (1-beta2) * g_t
@@ -286,8 +286,7 @@ class Lion_adv(torch.optim.Optimizer):
             # Compute update term
             update = torch.lerp(grad, exp_avg, beta1)
-            if group.get("l1_adaptive", False) and kappa_p == 1:
-                lr = lr * (update.norm(p=1))
+            l1_mean = _get_l1_adaptive_lr(p, update, state, group, kappa_p)
             update = _get_lion_k_update(update, kappa_p)
@@ -305,6 +304,9 @@ class Lion_adv(torch.optim.Optimizer):
                 update = torch.where(current_sign == state['prev_sign'], update, 0.0)
                 state['prev_sign'] = current_sign
+        if l1_mean is not None:
+            update.mul_(l1_mean)
         if group.get('scaled_optm', False):
             update = scale_update(p, update, lr, vector_state=state.get('spectral_v'))
         else:

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm/optim/Muon_adv.py RENAMED Viewed

@@ -259,8 +259,8 @@ class Muon_adv(torch.optim.Optimizer):
     def load_state_dict(self, state_dict: dict) -> None:
         """
         Overrides default load_state_dict to implement a workaround for PyTorch's
-        automatic dtype casting. It ensures factorized states remain float32 for
-        stability, preserves integer/float8 quantized anchor states, and forces
+        automatic dtype casting. It ensures factorized states remain float32 for
+        stability, preserves integer/float8 quantized anchor states, and forces
         standard states onto the parameter's current dtype/device.
         """
         super().load_state_dict(state_dict)

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm/optim/Prodigy_adv.py RENAMED Viewed

@@ -67,7 +67,7 @@ class Prodigy_adv(torch.optim.Optimizer):
             stability. (default: 100.0)
         nnmf_factor (bool): whether to use the factorization or disable it to use
             the uncompressed optimizer. (default: False)
-        factored_2nd (bool): whether to keep the first moment uncompressed (dense)
+        factored_2nd (bool): whether to keep the first moment uncompressed (dense)
             while only factorizing the second moment. (default: True)
         d0 (float):
             Initial D estimate for D-adaptation (default 1e-6). Rarely needs changing.
@@ -255,8 +255,8 @@ class Prodigy_adv(torch.optim.Optimizer):
     def load_state_dict(self, state_dict: dict) -> None:
         """
         Overrides default load_state_dict to implement a workaround for PyTorch's
-        automatic dtype casting. It ensures factorized states remain float32 for
-        stability, preserves integer/float8 quantized anchor states, and forces
+        automatic dtype casting. It ensures factorized states remain float32 for
+        stability, preserves integer/float8 quantized anchor states, and forces
         standard states onto the parameter's current dtype/device.
         """
         super().load_state_dict(state_dict)
@@ -440,7 +440,10 @@ class Prodigy_adv(torch.optim.Optimizer):
                     update_mt = mt if not factored_2nd else mt.clone()
             vt = _reconstruct_state((state['mu_v_nmf'], state['mv_v_nmf']), signed=False)
-            vt.mul_(beta2).addcmul_(grad_reshaped, grad_reshaped, value=d * d * (1.0 - beta2))
+            if isinstance(beta2, torch.Tensor) and beta2.dim() > 0:
+                vt.mul_(beta2).addcmul_(grad_reshaped, grad_reshaped * (d * d * (1.0 - beta2)))
+            else:
+                vt.mul_(beta2).addcmul_(grad_reshaped, grad_reshaped, value=d * d * (1.0 - beta2))
             if self.use_AdEMAMix:
                 if factored_2nd:
@@ -453,7 +456,7 @@ class Prodigy_adv(torch.optim.Optimizer):
                     update = update_mt.add_(mt_slow, alpha=alpha)
                 else:
                     update = grad_reshaped.mul(d).add_(mt_slow, alpha=alpha)
                 if not factored_2nd:
                     # Factorize
                     state['mu_m_slow_nmf'], state['mv_m_slow_nmf'], state['sign_slow'] = _factorize_state(mt_slow, signed=True)
@@ -514,7 +517,10 @@ class Prodigy_adv(torch.optim.Optimizer):
                     update = grad.mul(d)
             exp_avg_sq = state['exp_avg_sq']
-            exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=d * d * (1.0 - beta2))
+            if isinstance(beta2, torch.Tensor) and beta2.dim() > 0:
+                exp_avg_sq.mul_(beta2).addcmul_(grad, grad * (d * d * (1.0 - beta2)))
+            else:
+                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=d * d * (1.0 - beta2))
             if group['use_atan2']:
                 denom = exp_avg_sq.sqrt()
@@ -608,4 +614,4 @@ class Prodigy_adv(torch.optim.Optimizer):
         # Increment step counter for all groups, regardless of whether d was updated
         for group in self.param_groups:
-            group['k'] += 1
+            group['k'] += 1

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm/optim/SignSGD_adv.py RENAMED Viewed

@@ -6,8 +6,8 @@ from ..util import param_update
 from ..util.OrthoGrad import _orthogonalize_gradient
 from ..util.factorization_util import _get_effective_shape, _reconstruct_state, _factorize_state, _pack_bools, _unpack_bools
 from ..util.lion_k import _get_lion_k_update
+from ..util.update_util import _get_l1_adaptive_lr
 from ..util.scaled_optm import scale_update, is_spectral, init_spectral_norm
-from ..util.update_util import _scale_sim_AdEMAMix_update
 from ..util.centered_decay import _init_anchor
@@ -49,8 +49,8 @@ class SignSGD_adv(torch.optim.Optimizer):
             stability. (default: 100.0)
         freeze_on_flip (bool): Projected SignGD One-hit freeze. Masks updates for
             coordinates where the gradient sign flips compared to the previous step. (default: False)
-        l1_adaptive (bool): Scales learning rate dynamically.
-            by the L1 norm of the gradient to handle gradient heterogeneity. (default: False)
+        l1_adaptive (bool): Scales the update step magnitude dynamically
+            by the mean L1 norm of the momentum/gradient to handle gradient heterogeneity.(default: False)
         centered_wd (float): Centered Weight Decay coefficient. Instead of decaying weights
             toward zero, they are decayed toward their initial values (anchors). This
             can be used together with standard weight decay. (default: 0.0)
@@ -140,8 +140,8 @@ class SignSGD_adv(torch.optim.Optimizer):
     def load_state_dict(self, state_dict: dict) -> None:
         """
         Overrides default load_state_dict to implement a workaround for PyTorch's
-        automatic dtype casting. It ensures factorized states remain float32 for
-        stability, preserves integer/float8 quantized anchor states, and forces
+        automatic dtype casting. It ensures factorized states remain float32 for
+        stability, preserves integer/float8 quantized anchor states, and forces
         standard states onto the parameter's current dtype/device.
         """
         super().load_state_dict(state_dict)
@@ -269,9 +269,7 @@ class SignSGD_adv(torch.optim.Optimizer):
                 if freeze_on_flip:
                     state['sign'] = _pack_bools(raw_update > 0)
-            if group.get("l1_adaptive", False) and kappa_p == 1:
-                scale_factor = 1 / _scale_sim_AdEMAMix_update(momentum, state["step"] + 1, alpha_grad, 1, False)
-                lr = lr * (raw_update.norm(p=1)/scale_factor)
+            l1_mean = _get_l1_adaptive_lr(p, raw_update, state, group, kappa_p)
             update = _get_lion_k_update(raw_update, kappa_p)
             update = update.view(p.shape)
@@ -296,9 +294,7 @@ class SignSGD_adv(torch.optim.Optimizer):
             else:
                 raw_update = grad.clone()
-            if group.get("l1_adaptive", False) and kappa_p == 1:
-                scale_factor = 1 / _scale_sim_AdEMAMix_update(momentum, state["step"] + 1, alpha_grad, 1, False)
-                lr = lr * (raw_update.norm(p=1)/scale_factor)
+            l1_mean = _get_l1_adaptive_lr(p, raw_update, state, group, kappa_p)
             update = _get_lion_k_update(raw_update, kappa_p)
@@ -307,6 +303,9 @@ class SignSGD_adv(torch.optim.Optimizer):
                 update = torch.where(current_sign == state['prev_sign'], update, 0.0)
                 state['prev_sign'] = current_sign
+        if l1_mean is not None:
+            update.mul_(l1_mean)
         if group.get('scaled_optm', False):
             update = scale_update(p, update, lr, vector_state=state.get('spectral_v'))
         else:

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm/optim/Simplified_AdEMAMix.py RENAMED Viewed

@@ -86,7 +86,7 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
             'int4': Uses 4-bit block-wise quantization (block size 32).
         nnmf_factor (bool): whether to use the factorization or disable it to use
             the uncompressed optimizer. (default: False)
-        factored_2nd (bool): whether to keep the first moment uncompressed (dense)
+        factored_2nd (bool): whether to keep the first moment uncompressed (dense)
             while only factorizing the second moment. (default: True)
     """
@@ -176,8 +176,8 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
     def load_state_dict(self, state_dict: dict) -> None:
         """
         Overrides default load_state_dict to implement a workaround for PyTorch's
-        automatic dtype casting. It ensures factorized states remain float32 for
-        stability, preserves integer/float8 quantized anchor states, and forces
+        automatic dtype casting. It ensures factorized states remain float32 for
+        stability, preserves integer/float8 quantized anchor states, and forces
         standard states onto the parameter's current dtype/device.
         """
         super().load_state_dict(state_dict)
@@ -320,7 +320,10 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
             mt.mul_(beta1).add_(grad_reshaped)
             vt = _reconstruct_state((state['mu_v_nmf'], state['mv_v_nmf']), signed=False)
-            vt.mul_(beta2).addcmul_(grad_reshaped, grad_reshaped, value=1.0 - beta2)
+            if isinstance(beta2, torch.Tensor) and beta2.dim() > 0:
+                vt.mul_(beta2).addcmul_(grad_reshaped, grad_reshaped * (1.0 - beta2))
+            else:
+                vt.mul_(beta2).addcmul_(grad_reshaped, grad_reshaped, value=1.0 - beta2)
             # update = mt + (grad_reshaped * alpha_grad)
             update = torch.add(mt, grad_reshaped, alpha=alpha_grad)
@@ -347,7 +350,10 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
             update = torch.add(exp_avg, grad, alpha=alpha_grad)
-            exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
+            if isinstance(beta2, torch.Tensor) and beta2.dim() > 0:
+                exp_avg_sq.mul_(beta2).addcmul_(grad, grad * (1.0 - beta2))
+            else:
+                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)
             denom = exp_avg_sq.sqrt().add_(sqrt_den_eps)
             update.div_(denom)

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm/util/Kourkoutas.py RENAMED Viewed

@@ -34,8 +34,12 @@ class KourkoutasHelper:
         else:
             # No key function was provided. Default to coarse, shape-based bucketing.
             self.optimizer.layer_key_fn = lambda p: \
-                (id(p),) if p.dim() == 2 and 1 <= p.shape[0] <= 10 and p.shape[1] in {768, 1280, 4096} \
-                else tuple(p.shape)
+                (id(p),) if (
+                    getattr(p, '_is_oft', False) or
+                    getattr(p, '_is_lora_A', False) or
+                    getattr(p, '_is_lora_B', False) or
+                    getattr(p, '_is_dora_scale', False)
+                ) else tuple(p.shape)
             # This ensures that we won't mix embeddings with tokens (1 to 10)
             # TODO find a better way to safeguard the embeddings
@@ -55,13 +59,21 @@ class KourkoutasHelper:
     def _get_or_init_layer_ema_tensor(self, layer_key, layer_params, device):
         """
         Retrieves the EMA tensor for this layer.
-        It handles synchronization between the internal layer_state and
+        It handles synchronization between the internal layer_state and
         the external optimizer.state (which is required for state_dict saving/loading).
         """
         # Initialize container in layer_state if missing
         if layer_key not in self.layer_state:
+            p = layer_params[0]
+            if getattr(p, '_is_oft', False) or getattr(p, '_is_lora_A', False):
+                shape = (p.shape[0], 1)
+            elif getattr(p, '_is_lora_B', False):
+                shape = (1, p.shape[1])
+            else:
+                shape = ()
             self.layer_state[layer_key] = {
-                'sum_sq_accumulator': torch.tensor(0.0, device=device, dtype=torch.float32)
+                'sum_sq_accumulator': torch.zeros(shape, device=device, dtype=torch.float32)
             }
         internal_ema = self.layer_state[layer_key].get('kourkoutas_r_ema')
@@ -87,7 +99,15 @@ class KourkoutasHelper:
         # Case B: No state anywhere. Create new.
         if internal_ema is None:
-            new_ema = torch.tensor(0.0, device=device, dtype=torch.float32)
+            p = layer_params[0]
+            if getattr(p, '_is_oft', False) or getattr(p, '_is_lora_A', False):
+                shape = (p.shape[0], 1)
+            elif getattr(p, '_is_lora_B', False):
+                shape = (1, p.shape[1])
+            else:
+                shape = ()
+            new_ema = torch.zeros(shape, device=device, dtype=torch.float32)
             self.layer_state[layer_key]['kourkoutas_r_ema'] = new_ema
             # Register this tensor in optimizer.state for ALL params so it gets saved
@@ -107,7 +127,7 @@ class KourkoutasHelper:
     def prepare_step(self, current_step: int, device):
         """
-        Calculates dynamic beta2 for all layers using the completed scalar accumulators
+        Calculates dynamic beta2 for all layers using the completed accumulators
         from the PREVIOUS step. Should be called once at the start of an optimizer step.
         """
         beta2_log = []
@@ -154,7 +174,10 @@ class KourkoutasHelper:
                 beta2 = beta2_max - (beta2_max - beta2_min) * sun
             # Store the final calculated beta2 in the helper's transient state for this step.
-            self.layer_state[layer_key]['dynamic_beta2'] = beta2.item() if isinstance(beta2, torch.Tensor) and not group.get('compiled_optimizer', False) else beta2
+            if isinstance(beta2, torch.Tensor) and beta2.numel() == 1 and not group.get('compiled_optimizer', False):
+                self.layer_state[layer_key]['dynamic_beta2'] = beta2.item()
+            else:
+                self.layer_state[layer_key]['dynamic_beta2'] = beta2
             # Reset the accumulator for the next optimizer step.
             accumulator.zero_()
@@ -163,10 +186,11 @@ class KourkoutasHelper:
         # Compute stats for TensorBoard
         if beta2_log:
-            beta2_tensor = torch.as_tensor(beta2_log, device='cpu')
+            # Handles lists containing both standard floats and heterogeneous tensors
+            means = [b.mean().item() if isinstance(b, torch.Tensor) else float(b) for b in beta2_log]
             self.last_beta2_stats = {
-                'mean': beta2_tensor.mean().item()
-                }
+                'mean': sum(means) / len(means)
+            }
     def maybe_prepare_step(self, current_step: int, device):
         """
@@ -184,9 +208,16 @@ class KourkoutasHelper:
         if layer_key in self.layer_info and layer_key in self.layer_state:
             # Accumulate for the *next* step's prepare_step call
-            self.layer_state[layer_key]['sum_sq_accumulator'] += torch.sum(grad.detach().pow(2)).float()
+            if getattr(p, '_is_oft', False) or getattr(p, '_is_lora_A', False):
+                sq_norm = torch.sum(grad.detach().pow(2), dim=1, keepdim=True).float()
+            elif getattr(p, '_is_lora_B', False):
+                sq_norm = torch.sum(grad.detach().pow(2), dim=0, keepdim=True).float()
+            else:
+                sq_norm = torch.sum(grad.detach().pow(2)).float()
+            self.layer_state[layer_key]['sum_sq_accumulator'] += sq_norm
-    def get_beta2(self, p: torch.Tensor, group: dict) -> float:
+    def get_beta2(self, p: torch.Tensor, group: dict) -> float | torch.Tensor:
         """
         Gets the appropriate beta2 for the current parameter, handling warmup and dynamic value fetching.
         """

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm/util/Muon_AuxAdam.py RENAMED Viewed

@@ -87,7 +87,10 @@ def _adam_step_parameter(self, p, grad, state, group, beta1_adam, beta2_adam, sq
                 update_mt = mt
         vt = _reconstruct_state((state['mu_v_nmf'], state['mv_v_nmf']), signed=False)
-        vt.mul_(beta2_adam).addcmul_(grad_reshaped, grad_reshaped, value=1.0 - beta2_adam)
+        if isinstance(beta2_adam, torch.Tensor) and beta2_adam.dim() > 0:
+            vt.mul_(beta2_adam).addcmul_(grad_reshaped, grad_reshaped * (1.0 - beta2_adam))
+        else:
+            vt.mul_(beta2_adam).addcmul_(grad_reshaped, grad_reshaped, value=1.0 - beta2_adam)
         if group.get('adam_use_AdEMAMix'):
             mt_slow = _reconstruct_state((state['mu_m_slow_nmf'], state['mv_m_slow_nmf'], state['sign_slow'], d2), signed=True)
@@ -148,7 +151,10 @@ def _adam_step_parameter(self, p, grad, state, group, beta1_adam, beta2_adam, sq
             update = update_mt if beta1_adam > 0 else grad.clone()
         exp_avg_sq = state['exp_avg_sq']
-        exp_avg_sq.mul_(beta2_adam).addcmul_(grad, grad, value=1 - beta2_adam)
+        if isinstance(beta2_adam, torch.Tensor) and beta2_adam.dim() > 0:
+            exp_avg_sq.mul_(beta2_adam).addcmul_(grad, grad * (1.0 - beta2_adam))
+        else:
+            exp_avg_sq.mul_(beta2_adam).addcmul_(grad, grad, value=1.0 - beta2_adam)
         if group.get('adam_use_atan2'):
             denom = exp_avg_sq.sqrt()

adv_optm-2.4.dev4/adv_optm/util/OrthoGrad.py ADDED Viewed

@@ -0,0 +1,50 @@
+import torch
+def _orthogonalize_gradient(p: torch.Tensor, grad: torch.Tensor) -> torch.Tensor:
+    """
+    Projects the gradient `grad` to be orthogonal to the parameter `p`.
+    Modified from:
+    https://github.com/LucasPrietoAl/grokking-at-the-edge-of-numerical-stability/blob/720d2444df12b851d6cb417ab08cf125c822b2ae/orthograd.py
+    """
+    if getattr(p, '_is_oft', False) or getattr(p, '_is_lora_A', False):
+        return _orthogonalize_gradient_granular(p, grad, dim=1)
+    elif getattr(p, '_is_lora_B', False):
+        return _orthogonalize_gradient_granular(p, grad, dim=0)
+    original_shape = grad.shape
+    original_dtype = grad.dtype
+    w = p.view(-1).float()
+    g = grad.view(-1).float()
+    w_norm_sq = torch.dot(w, w).add_(1e-30)
+    proj = torch.dot(w, g) / w_norm_sq
+    g_orth = g.sub(w * proj)
+    g_norm = g.norm(2)
+    g_orth_norm = g_orth.norm(2).add_(1e-30)
+    g_orth_scaled = g_orth * (g_norm / g_orth_norm)
+    return g_orth_scaled.view(original_shape).to(original_dtype)
+def _orthogonalize_gradient_granular(p: torch.Tensor, grad: torch.Tensor, dim: int = 1, eps: float = 1e-30) -> torch.Tensor:
+    """
+    Projects the gradient `grad` to be orthogonal to the parameter `p` row/col-wise,
+    while preserving the original norm of the gradient for each row/col.
+    """
+    original_dtype = grad.dtype
+    p_f32 = p.float()
+    grad_f32 = grad.float()
+    # Calculate the dot product <p, grad> for each row/col
+    dot_prod = torch.sum(p_f32 * grad_f32, dim=dim, keepdim=True)
+    # Calculate ||p||^2 for each row/col
+    p_norm_sq = torch.sum(p_f32 * p_f32, dim=dim, keepdim=True).add_(eps)
+    # Project: g_orth = g - (p * <p, g> / ||p||^2)
+    proj = dot_prod / p_norm_sq
+    grad_orth = grad_f32 - (proj * p_f32)
+    # Magnitude Preservation
+    g_norm = torch.norm(grad_f32, p=2, dim=dim, keepdim=True)
+    g_orth_norm = torch.norm(grad_orth, p=2, dim=dim, keepdim=True).add_(eps)
+    grad_orth_scaled = grad_orth * (g_norm / g_orth_norm)
+    return grad_orth_scaled.to(original_dtype)

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm/util/centered_decay.py RENAMED Viewed

@@ -109,4 +109,4 @@ def dequantize_anchor(p, state, group, dtype):
     anchor_blocks = quantized_blocks.to(dtype) * scales.unsqueeze(1) + mins.unsqueeze(1)
     # Flatten, truncate any padding added during quantization, and reshape
-    return anchor_blocks.view(-1)[:orig_numel].view(orig_shape)
+    return anchor_blocks.view(-1)[:orig_numel].view(orig_shape)

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm/util/param_update.py RENAMED Viewed

@@ -138,7 +138,7 @@ def set_seed(device: torch.device):
 def get_generator(device: torch.device) -> torch.Generator:
     """
-    Retrieves (and initializes if necessary) the deterministic generator
+    Retrieves (and initializes if necessary) the deterministic generator
     for the specified device.
     """
     if device not in _generators:
@@ -241,9 +241,9 @@ def post_process_loaded_state(optimizer: Optimizer) -> None:
             # Deterministically check if this parameter skipped quantization
             numel = p.numel()
             is_skipped = (
-                numel == 0 or
-                (mode in ['int8', 'int4'] and numel < 10000) or
-                p.ndim == 1 or
+                numel == 0 or
+                (mode in ['int8', 'int4'] and numel < 10000) or
+                p.ndim == 1 or
                 getattr(p, '_is_dora_scale', False)
             )
@@ -283,4 +283,4 @@ def post_process_loaded_state(optimizer: Optimizer) -> None:
                 # Ensure device match
                 if state[key].device != p.device:
-                    state[key] = state[key].to(p.device)
+                    state[key] = state[key].to(p.device)

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm/util/scaled_optm.py RENAMED Viewed

@@ -9,7 +9,7 @@ def scale_update(
     vector_state: torch.Tensor | None = None
 ) -> torch.Tensor:
     """
-    Applies adaptive scaling to the parameter update based on the parameter's
+    Applies adaptive scaling to the parameter update based on the parameter's
     role (DoRA, OFT, or LoRA/Full Finetuning).
     Args:
@@ -28,11 +28,15 @@ def scale_update(
     if is_dora_scale or p.ndim == 1:
         return rms_normalization(update, dim=None, lr=lr)
-    # Orthogonal Fine-Tuning (OFT)
-    # RMS normalization (dim=1 normalizes per block)
+    # Orthogonal Fine-Tuning (OFT)
     # This guarantees O(1) update complexity scaling, independent of block sizes.
     if is_oft:
-        return rms_normalization(update, dim=1, lr=lr)
+        n = update.shape[1]
+        # Calculate block size (b)
+        b = (1 + (1 + 8 * n) ** 0.5) / 2
+        target_norm = (b / 8) ** 0.5
+        scale = target_norm / (n ** 0.5)
+        return rms_normalization(update, dim=1, lr=lr * scale)
     # LoRA Factors or Full Finetuning weights
     # Scales update to maintain consistent spectral norm across different layer sizes and ranks.
@@ -44,7 +48,7 @@ def scale_update(
 def scale_wds(wd: float, cwd: float, p: torch.Tensor) -> tuple[float, float]:
     """
-    Adjusts standard weight decay and centered weight decay based on the parameter's
+    Adjusts standard weight decay and centered weight decay based on the parameter's
     shape and type to maintain effective regularization strength.
     """
     # DoRA Scale (Magnitude Vector)

adv_optm-2.4.dev4/adv_optm/util/update_util.py ADDED Viewed

@@ -0,0 +1,73 @@
+import torch
+def _grams_update(mt: torch.Tensor, grad: torch.Tensor, inplace: bool=False):
+    """
+    Applies the update rule of "Gradient Descent with Adaptive Momentum Scaling"
+    (https://arxiv.org/abs/2412.17107).
+    """
+    if inplace:
+        return mt.abs_().mul_(grad.sign())
+    return grad.sign().mul_(mt.abs())
+def _cautious_update(mt: torch.Tensor, grad: torch.Tensor, inplace: bool=False):
+    """
+    Applies the update rule of "Cautious Optimizers: Improving Training with One
+    Line of Code" (https://arxiv.org/abs/2411.16085).
+    """
+    mask = (mt * grad > 0).to(grad.dtype)
+    mask.div_(mask.mean().clamp_min_(1e-3))
+    if inplace:
+        update_mt = mt.mul_(mask)
+    else:
+        update_mt = mt.mul(mask)
+    del mask
+    return update_mt
+def _scale_sim_AdEMAMix_update(beta: float, current_step: int, alpha_grad: float, lr: float, scaled_optm: bool=False):
+    if scaled_optm:
+        return lr
+    momentum_scale = (1 - beta ** current_step) / (1 - beta)
+    total_scale = 1 / (momentum_scale + alpha_grad)
+    lr = lr * total_scale
+    return lr
+def _get_l1_adaptive_lr(
+    p: torch.Tensor,
+    update: torch.Tensor,
+    state: dict,
+    group: dict,
+    kappa_p: float
+) -> torch.Tensor:
+    """
+    Calculates the L1 adaptive learning rate based on gradient heterogeneity.
+    """
+    if not group.get("l1_adaptive", False) and kappa_p != 1:
+        return None
+    momentum = group["momentum"]
+    alpha_grad = group["alpha_grad"]
+    update_view = update.view(p.shape)
+    # Calculate scale factor based on momentum/update magnitude
+    scale_factor = _scale_sim_AdEMAMix_update(
+        momentum, state["step"] + 1, alpha_grad, 1, False
+    )
+    # Determine dimension for mean calculation based on parameter type
+    if getattr(p, '_is_oft', False) or getattr(p, '_is_lora_A', False):
+        l1_dim = 1
+    elif getattr(p, '_is_lora_B', False):
+        l1_dim = 0
+    else:
+        update_abs = update_view.abs() * scale_factor
+        if update_abs.ndim >= 2:
+            orig_shape = update_abs.shape
+            update_2d = update_abs.view(orig_shape[0], -1)
+            mean_l1_norm_2d = torch.outer(update_2d.mean(dim=1), update_2d.mean(dim=0))
+            return mean_l1_norm_2d.view(orig_shape)
+        else:
+            return update_abs.mean()
+    mean_l1_norm = update_view.abs().mean(dim=l1_dim, keepdim=True) * scale_factor
+    return mean_l1_norm

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: adv_optm
-Version: 2.4.dev2
+Version: 2.4.dev4
 Summary: A family of highly efficient, lightweight yet powerful optimizers.
 Home-page: https://github.com/Koratahiu/Advanced_Optimizers
 Author: Koratahiu

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/setup.py RENAMED Viewed

@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 setup(
     name="adv_optm",
-    version="2.4.dev2",
+    version="2.4.dev4",
     author="Koratahiu",
     author_email="hiuhonor@gmail.com",
     license='Apache 2.0',

adv_optm-2.4.dev2/adv_optm/util/OrthoGrad.py DELETED Viewed

@@ -1,21 +0,0 @@
-import torch
-def _orthogonalize_gradient(p: torch.Tensor, grad: torch.Tensor) -> torch.Tensor:
-    """
-    Projects the gradient `grad` to be orthogonal to the parameter `p`.
-    Modified from:
-    https://github.com/LucasPrietoAl/grokking-at-the-edge-of-numerical-stability/blob/720d2444df12b851d6cb417ab08cf125c822b2ae/orthograd.py
-    """
-    if grad.is_sparse:
-        raise RuntimeError("OrthoGrad logic does not support sparse gradients.")
-    original_shape = grad.shape
-    original_dtype = grad.dtype
-    w = p.view(-1).float()
-    g = grad.view(-1).float()
-    w_norm_sq = torch.dot(w, w).add_(1e-30)
-    proj = torch.dot(w, g) / w_norm_sq
-    g_orth = g.sub(w * proj)
-    g_norm = g.norm(2)
-    g_orth_norm = g_orth.norm(2).add_(1e-30)
-    g_orth_scaled = g_orth * (g_norm / g_orth_norm)
-    return g_orth_scaled.view(original_shape).to(original_dtype)

adv_optm-2.4.dev2/adv_optm/util/update_util.py DELETED Viewed

@@ -1,32 +0,0 @@
-import torch
-def _grams_update(mt: torch.Tensor, grad: torch.Tensor, inplace: bool=False):
-    """
-    Applies the update rule of "Gradient Descent with Adaptive Momentum Scaling"
-    (https://arxiv.org/abs/2412.17107).
-    """
-    if inplace:
-        return mt.abs_().mul_(grad.sign())
-    return grad.sign().mul_(mt.abs())
-def _cautious_update(mt: torch.Tensor, grad: torch.Tensor, inplace: bool=False):
-    """
-    Applies the update rule of "Cautious Optimizers: Improving Training with One
-    Line of Code" (https://arxiv.org/abs/2411.16085).
-    """
-    mask = (mt * grad > 0).to(grad.dtype)
-    mask.div_(mask.mean().clamp_min_(1e-3))
-    if inplace:
-        update_mt = mt.mul_(mask)
-    else:
-        update_mt = mt.mul(mask)
-    del mask
-    return update_mt
-def _scale_sim_AdEMAMix_update(beta: float, current_step: int, alpha_grad: float, lr: float, scaled_optm: bool=False):
-    if scaled_optm:
-        return lr
-    momentum_scale = (1 - beta ** current_step) / (1 - beta)
-    total_scale = 1 / (momentum_scale + alpha_grad)
-    lr = lr * total_scale
-    return lr

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/LICENSE RENAMED Viewed

File without changes

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/README.md RENAMED Viewed

File without changes

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm/optim/Lion_Prodigy_adv.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm/optim/__init__.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm/util/Muon_util.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm/util/__init__.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm/util/factorization_util.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm/util/lion_k.py RENAMED Viewed

File without changes

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm.egg-info/SOURCES.txt RENAMED Viewed

File without changes

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm.egg-info/requires.txt RENAMED Viewed

File without changes

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/adv_optm.egg-info/top_level.txt RENAMED Viewed

File without changes

{adv_optm-2.4.dev2 → adv_optm-2.4.dev4}/setup.cfg RENAMED Viewed

File without changes

adv-optm 2.4.dev2__tar.gz → 2.4.dev4__tar.gz

adv-optm 2.4.dev2__tar.gz → 2.4.dev4__tar.gz