adv-optm 0.1.5.tar.gz → 0.1.7.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of adv-optm might be problematic.
- {adv_optm-0.1.5 → adv_optm-0.1.7}/PKG-INFO +1 -1
- {adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm/__init__.py +3 -1
- {adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm/optim/AdamW_adv.py +1 -1
- {adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm/optim/Prodigy_adv.py +44 -7
- adv_optm-0.1.7/adv_optm/optim/Simplified_AdEMAMix.py +246 -0
- {adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm/optim/__init__.py +2 -0
- {adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm.egg-info/PKG-INFO +1 -1
- {adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm.egg-info/SOURCES.txt +1 -0
- {adv_optm-0.1.5 → adv_optm-0.1.7}/setup.py +1 -1
- {adv_optm-0.1.5 → adv_optm-0.1.7}/LICENSE +0 -0
- {adv_optm-0.1.5 → adv_optm-0.1.7}/README.md +0 -0
- {adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm/optim/Adopt_adv.py +0 -0
- {adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm/optim/Lion_Prodigy_adv.py +0 -0
- {adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm/optim/Lion_adv.py +0 -0
- {adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm/util/BF16_Stochastic_Rounding.py +0 -0
- {adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm/util/Effective_Shape.py +0 -0
- {adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm/util/NNMF.py +0 -0
- {adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm/util/One_Bit_Boolean.py +0 -0
- {adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm/util/OrthoGrad.py +0 -0
- {adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm/util/__init__.py +0 -0
- {adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm.egg-info/dependency_links.txt +0 -0
- {adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm.egg-info/requires.txt +0 -0
- {adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm.egg-info/top_level.txt +0 -0
- {adv_optm-0.1.5 → adv_optm-0.1.7}/setup.cfg +0 -0
{adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm/__init__.py

@@ -2,6 +2,7 @@ from .optim import (
     AdamW_adv,
     Prodigy_adv,
     Adopt_adv,
+    Simplified_AdEMAMix,
     Lion_adv,
     Lion_Prodigy_adv,
 )
@@ -10,8 +11,9 @@ __all__ = [
     "AdamW_adv",
     "Prodigy_adv",
     "Adopt_adv",
+    "Simplified_AdEMAMix",
     "Lion_adv",
     "Lion_Prodigy_adv",
 ]
 
-__version__ = "0.1.5"
+__version__ = "0.1.7"
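With the export added at the package root, the new class is importable alongside the existing optimizers. A minimal sketch, assuming the 0.1.7 release is installed (the print only confirms the bumped version string):

import adv_optm
from adv_optm import Prodigy_adv, Simplified_AdEMAMix

print(adv_optm.__version__)  # expected: "0.1.7"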
{adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm/optim/Prodigy_adv.py

@@ -52,7 +52,17 @@ class Prodigy_adv(torch.optim.Optimizer):
             highly recommended to prevent instability at the beginning of training,
             as it gradually introduces the stabilizing slow momentum term. During
             the warmup, `alpha` ramps from 0 to its target value. If `None`,
-            the scheduler is disabled
+            the scheduler is disabled.
+        Simplified_AdEMAMix (bool): whether to use the Simplified AdEMAMix update rule.
+            This changes the EMA to accumulator and the update numerator to `alpha_grad * grad + mt`, which can be
+            more responsive, especially for small batch sizes. Enabling this will
+            automatically disable `use_AdEMAMix`, `use_cautious`, `use_grams`,
+            and `use_atan2`. (default: False)
+        alpha_grad (float): Mixing coefficient for the Simplified AdEMAMix update rule
+            (only used when `Simplified_AdEMAMix` is `True`). Controls the weight of the
+            current gradient. For small batch sizes, use high values (e.g., 10-100) to be
+            more responsive. For large batch sizes, use low values (e.g., 0-1) for
+            stability. (default: 100.0)
         factored (bool): whether to use the factorization or disable it to use
             the uncompressed optimizer. (default: True)
         d0 (float):
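In symbols, one reading of the two numerators described in this docstring (with g_t the current gradient, m_t the first moment, and d Prodigy's adaptive step-size scale, matching the code hunks further below):

\[
\text{default EMA path:}\qquad m_t = \beta_1\, m_{t-1} + d\,(1-\beta_1)\, g_t, \qquad u_t = m_t
\]
\[
\text{Simplified AdEMAMix path:}\qquad m_t = \beta_1\, m_{t-1} + d\, g_t, \qquad u_t = m_t + \alpha_{\text{grad}}\, d\, g_t
\]

In both cases u_t is then divided by the second-moment denominator before being scaled by the adaptive learning rate.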
@@ -91,6 +101,8 @@ class Prodigy_adv(torch.optim.Optimizer):
         beta3_ema: float = 0.9999,
         alpha: float = 5.0,
         t_alpha: int | None = None,
+        Simplified_AdEMAMix: bool = False,
+        alpha_grad: float = 100.0,
         factored: bool = True,
         # prodigy parameters
         beta3: float = None,
@@ -109,6 +121,17 @@ class Prodigy_adv(torch.optim.Optimizer):
             raise ValueError(f"Epsilon should be >= 0.0. Got {eps}")
         if not (weight_decay >= 0.0):
             raise ValueError(f"Weight-decay should be >= 0.0. Got {weight_decay}")
+        if betas[0] == 0.0 and Simplified_AdEMAMix:
+            raise ValueError(f"Beta 1 cannot be 0.0 when using Simplified_AdEMAMix. Got {betas[0]}")
+        if use_AdEMAMix and Simplified_AdEMAMix:
+            print("Warning: use_AdEMAMix is incompatible with Simplified_AdEMAMix, Disabling use_AdEMAMix.")
+        if use_grams and Simplified_AdEMAMix:
+            print("Warning: use_grams is incompatible with Simplified_AdEMAMix, Disabling use_grams.")
+        if use_cautious and Simplified_AdEMAMix:
+            print("Warning: use_cautious is incompatible with Simplified_AdEMAMix, Disabling use_cautious.")
+        if use_atan2 and Simplified_AdEMAMix:
+            print("Warning: use_atan2 is incompatible with Simplified_AdEMAMix. Disabling use_atan2.")
+            use_atan2 = False
 
         defaults = {
             "lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay,
@@ -118,11 +141,13 @@ class Prodigy_adv(torch.optim.Optimizer):
             "beta3": beta3, "d": d0, "d0": d0, "d_max": d0, "d_numerator": 0.0, "d_coef": d_coef,
             "growth_rate": growth_rate, "safeguard_warmup": safeguard_warmup, "k": 0, "slice_p": slice_p,
             "fsdp_in_use": fsdp_in_use,
+            "alpha_grad": alpha_grad,
         }
         self.stochastic_rounding = stochastic_rounding
-        self.use_cautious = use_cautious
-        self.use_grams = use_grams
-        self.use_AdEMAMix = use_AdEMAMix
+        self.use_cautious = use_cautious and not Simplified_AdEMAMix
+        self.use_grams = use_grams and not Simplified_AdEMAMix
+        self.use_AdEMAMix = use_AdEMAMix and not Simplified_AdEMAMix
+        self.Simplified_AdEMAMix = Simplified_AdEMAMix
         self.factored = factored
         self.fsdp_in_use = fsdp_in_use
         super().__init__(params, defaults)
@@ -229,6 +254,8 @@ class Prodigy_adv(torch.optim.Optimizer):
         alpha_t = alpha
         if t_alpha is not None and t_alpha > 0 and current_step < t_alpha:
             alpha_t = min(current_step * alpha / t_alpha, alpha)
+        if self.Simplified_AdEMAMix:
+            alpha_grad = group["alpha_grad"]
 
         if state['factored']:
             d1, d2 = state['effective_shape']
@@ -243,7 +270,10 @@ class Prodigy_adv(torch.optim.Optimizer):
             torch.where(unpacked_sign, mt, -mt, out=mt)
             del unpacked_sign
             # Update momentum in full-size
-            mt.mul_(self.beta1).add_(grad_reshaped, alpha=self.d * (1.0 - self.beta1))
+            if self.Simplified_AdEMAMix:
+                mt.mul_(self.beta1).add_(grad_reshaped, alpha=self.d)
+            else:
+                mt.mul_(self.beta1).add_(grad_reshaped, alpha=self.d * (1.0 - self.beta1))
             if self.use_grams:
                 mt.copy_(grad_reshaped.sign() * mt.abs())
             elif self.use_cautious:
@@ -264,6 +294,8 @@ class Prodigy_adv(torch.optim.Optimizer):
                 del unpacked_sign_slow
                 mt_slow.mul_(beta3_ema).add_(grad_reshaped, alpha=self.d * (1.0 - beta3_ema))
                 update = mt + (alpha_t * mt_slow) if self.beta1 > 0 else grad_reshaped + (alpha_t * mt_slow)
+            elif self.Simplified_AdEMAMix:
+                update = torch.add(mt, grad_reshaped, alpha=alpha_grad * self.d)
             else:
                 update = mt.clone() if self.beta1 > 0 else grad_reshaped.clone()
             del grad_reshaped
@@ -277,7 +309,7 @@ class Prodigy_adv(torch.optim.Optimizer):
             update.div_(denom.add_(self.d * group['eps']))
             del denom
 
-            update.view(p.shape).mul_(self.dlr)
+            update = update.view(p.shape).mul_(self.dlr)
 
             # Compress updated moments and store new factors
             if self.beta1 > 0:
@@ -297,7 +329,10 @@ class Prodigy_adv(torch.optim.Optimizer):
 
            if self.beta1 > 0:
                exp_avg = state['exp_avg']
-               if self.Simplified_AdEMAMix is removed here in spirit only
+               exp_avg placeholder
            if placeholder
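Putting the Prodigy_adv hunks together, the new behaviour is opt-in via the two keyword arguments added above. A hedged usage sketch, assuming Prodigy_adv exposes the usual torch.optim.Optimizer interface (step/zero_grad) and that all other constructor arguments keep their defaults; the model and data below are placeholders:

import torch
import torch.nn.functional as F
from adv_optm import Prodigy_adv

model = torch.nn.Linear(64, 8)
opt = Prodigy_adv(
    model.parameters(),
    Simplified_AdEMAMix=True,  # first moment becomes an accumulator instead of an EMA
    alpha_grad=100.0,          # weight on the raw gradient; high values suit small batches
)

x, y = torch.randn(4, 64), torch.randint(0, 8, (4,))
loss = F.cross_entropy(model(x), y)
loss.backward()
opt.step()
opt.zero_grad()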
adv_optm-0.1.7/adv_optm/optim/Simplified_AdEMAMix.py (new file)

@@ -0,0 +1,246 @@
+import torch
+
+import math
+
+from ..util.BF16_Stochastic_Rounding import add_stochastic_
+from ..util.Effective_Shape import _get_effective_shape
+from ..util.NNMF import _nnmf,_unnmf
+from ..util.OrthoGrad import _orthogonalize_gradient
+from ..util.One_Bit_Boolean import _pack_bools, _unpack_bools
+
+# A little helper from the original simplified_AdEMAMix
+def linear_hl_warmup_scheduler(step, beta_end, beta_start=0, warmup=1):
+
+    def f(beta, eps=1e-8):
+        return math.log(0.5)/math.log(beta+eps)-1
+
+    def f_inv(t):
+        return math.pow(0.5, 1/(t+1))
+
+    if step < warmup:
+        a = step / float(warmup)
+        return f_inv((1.0-a) * f(beta_start) + a * f(beta_end))
+    return beta_end
+
+class Simplified_AdEMAMix(torch.optim.Optimizer):
+    """
+    Implements the Simplified AdEMAMix algorithm.
+    Refactored from:
+    https://github.com/DepenM/Simplified-AdEMAMix/blob/main/simplified_AdEMAMix.py
+
+    Args:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups
+        lr (float): learning rate (default: 1e-5)
+        betas (tuple[float, float]): coefficients used for computing running
+            averages of gradient and its square (default: (0.99, 0.999))
+        eps (float): term added to the denominator to improve
+            numerical stability (default: 1e-8)
+        weight_decay (float): weight decay (L2 penalty) (default: 0).
+        alpha_grad (float): Coeficient for mixing the current gradient and EMA. for small batch
+            sizes set it to high values, up to 100. And for large batch sized set it to small
+            value, down to 0. (default: 100)
+        beta1_warmup (int, optional): number of warmup steps used to increase beta1 (default: None)
+        min_beta1 (float, optional): minimum value of beta1 to start from (default 0.9)
+        vector_reshape (bool): whether to reshape 1D vectors into 2D
+            matrices to apply low-rank compression (default: True).
+        stochastic_rounding (bool): whether to use stochastic
+            rounding for BF16 parameter updates (default: True).
+        use_orthograd (bool): whether to use OrthoGrad. (default: False)
+        factored (bool): whether to use the factorization or disable it to use
+            the uncompressed optimizer. (default: False)
+    """
+
+    def __init__(
+        self,
+        params,
+        lr: float = 1e-5,
+        betas: tuple[float, float] = (0.99, 0.999),
+        eps: float = 1e-8,
+        weight_decay: float = 0.0,
+        alpha_grad: float = 100.0,
+        beta1_warmup: int | None = None,
+        min_beta1: float | None = 0.9,
+        use_bias_correction: bool = True,
+        vector_reshape: bool = True,
+        stochastic_rounding: bool = True,
+        use_orthograd: bool = False,
+        factored: bool = False,
+    ):
+        if not (lr >= 0.0):
+            raise ValueError(f"Learning-rate should be >= 0.0. Got {lr}")
+        if not (0.0 <= betas[0] < 1.0 and 0.0 <= betas[1] < 1.0):
+            raise ValueError(f"Betas should be in [0.0, 1.0). Got {betas}")
+        if not (eps >= 0.0):
+            raise ValueError(f"Epsilon should be >= 0.0. Got {eps}")
+        if not (weight_decay >= 0.0):
+            raise ValueError(f"Weight-decay should be >= 0.0. Got {weight_decay}")
+        if not 0.0 <= alpha_grad:
+            raise ValueError("Invalid alpha value: {}".format(alpha_grad))
+
+        defaults = {
+            "lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay,
+            "alpha_grad": alpha_grad, "beta1_warmup": beta1_warmup, "min_beta1": min_beta1,
+            "vector_reshape": vector_reshape,
+            "use_orthograd": use_orthograd, "use_bias_correction": use_bias_correction,
+        }
+        self.stochastic_rounding = stochastic_rounding
+        self.factored = factored
+        super().__init__(params, defaults)
+
+    @property
+    def supports_fused_back_pass(self):
+        return True
+
+    @property
+    def supports_memory_efficient_fp16(self):
+        return True
+
+    @property
+    def supports_flat_params(self):
+        return False
+
+    @torch.no_grad()
+    def step_parameter(self, p: torch.Tensor, group: dict, i: int | None = None):
+        if p.grad is None:
+            return
+
+        grad = p.grad
+        if grad.dtype != torch.float32 and self.factored:
+            grad = grad.float()
+        if group["use_orthograd"]:
+            grad = _orthogonalize_gradient(p, grad)
+        state = self.state[p]
+
+        # State Initialization
+        if len(state) == 0:
+            state['step'] = 0
+
+            should_factor = (
+                self.factored and
+                not (len(p.shape) == 1 and not group['vector_reshape'])
+            )
+
+            state['factored'] = should_factor
+
+            dtype = torch.float32 if self.factored else p.dtype
+            device = p.device
+
+            if state['factored']:
+                state['effective_shape'] = _get_effective_shape(p.numel())
+                d1, d2 = state['effective_shape']
+
+                # First moment (m)
+                state['mu_m_nmf'] = torch.zeros(d1, device=device, dtype=dtype)
+                state['mv_m_nmf'] = torch.zeros(d2, device=device, dtype=dtype)
+                packed_d2 = (d2 + 7) // 8
+                state['sign'] = torch.zeros((d1, packed_d2), dtype=torch.uint8, device=device)
+                # Second moment (v)
+                state['mu_v_nmf'] = torch.zeros(d1, device=device, dtype=dtype)
+                state['mv_v_nmf'] = torch.zeros(d2, device=device, dtype=dtype)
+            else: # Fallback to standard optimizer for non-factored tensors
+                state['exp_avg'] = torch.zeros_like(p, device=device, dtype=dtype)
+                state['exp_avg_sq'] = torch.zeros_like(p, device=device, dtype=dtype)
+
+            if group['use_bias_correction']:
+                state['num_sum'] = 0.0
+                state['den_sum'] = 0.0
+            else:
+                state['num_sum'] = 1.0
+                state['den_sum'] = 1.0
+
+        beta1_final, beta2 = group["betas"]
+        beta1_warmup = group["beta1_warmup"]
+        alpha_grad = group["alpha_grad"]
+
+        if beta1_warmup is not None:
+            step = state['step'] + 1
+            beta1 = linear_hl_warmup_scheduler(step, beta_end=beta1_final, beta_start=group['min_beta1'], warmup=beta1_warmup)
+        else:
+            beta1 = beta1_final
+
+        if group['use_bias_correction']:
+            state['num_sum'] = beta1 * state['num_sum'] + 1.0
+            state['den_sum'] = beta2 * state['den_sum'] + (1.0 - beta2)
+
+        if state['factored']:
+            d1, d2 = state['effective_shape']
+
+            # Reconstruct momentum from previous step's factors
+            mt = _unnmf((state['mu_m_nmf'], state['mv_m_nmf']))
+            unpacked_sign = _unpack_bools(state['sign'], original_m=d2)
+            torch.where(unpacked_sign, mt, -mt, out=mt)
+            del unpacked_sign
+            # Update momentum in full-size
+            grad_reshaped = grad.view(d1, d2)
+            mt.mul_(beta1).add_(grad_reshaped, alpha=1.0)
+
+            vt = _unnmf((state['mu_v_nmf'], state['mv_v_nmf']))
+            vt.mul_(beta2).addcmul_(grad_reshaped, grad_reshaped, value=1.0 - beta2)
+
+            update = torch.add(mt, grad_reshaped, alpha=alpha_grad)
+            del grad_reshaped
+
+            denom = vt.sqrt().add_(group['eps'] * math.sqrt(state['den_sum']))
+            update.div_(denom)
+            del denom
+
+            if group['use_bias_correction']:
+                update = (update / state['num_sum']) * math.sqrt(state['den_sum'])
+
+            update = update.view(p.shape).mul_(group['lr'])
+
+            # Compress updated moments and store new factors
+            state['sign'] = _pack_bools(mt > 0)
+            _nnmf(mt.abs(), out=(state['mu_m_nmf'], state['mv_m_nmf']))
+            del mt
+            _nnmf(vt, out=(state['mu_v_nmf'], state['mv_v_nmf']))
+            del vt
+
+        else: # Standard optimizer logic for non-factored tensors
+            exp_avg_sq = state['exp_avg_sq']
+
+            exp_avg = state['exp_avg']
+            exp_avg.mul_(beta1).add_(grad, alpha=1.0)
+
+            update = torch.add(exp_avg, grad, alpha=alpha_grad)
+
+            exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
+
+            denom = exp_avg_sq.sqrt().add_(group['eps'] * math.sqrt(state['den_sum']))
+            update.div_(denom)
+            del denom
+
+            if group['use_bias_correction']:
+                update = (update / state['num_sum']) * math.sqrt(state['den_sum'])
+
+            update.mul_(group['lr'])
+
+        # Decoupled weight decay
+        if group["weight_decay"] != 0:
+            if p.dtype == torch.bfloat16 and self.stochastic_rounding:
+                add_stochastic_(p.data, p.data, alpha=-group["weight_decay"] * group["lr"])
+            else:
+                p.data.add_(p.data, alpha=-group["weight_decay"] * group["lr"])
+
+        if p.dtype == torch.bfloat16 and self.stochastic_rounding:
+            add_stochastic_(p.data, -update)
+        else:
+            p.data.add_(-update)
+        del update
+
+        state['step'] += 1
+
+    @torch.no_grad()
+    def step(self, closure=None):
+        """Performs a single optimization step."""
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+
+        for group in self.param_groups:
+            for i, p in enumerate(group['params']):
+                self.step_parameter(p, group, i)
+
+        return loss
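For readers skimming the new file, the heart of the algorithm is easiest to see in the non-factored branch. Below is a condensed, self-contained sketch of that branch only (bias correction and decoupled weight decay kept; the NNMF factorization, BF16 stochastic rounding, OrthoGrad, and beta1 warmup are omitted). The function name and driver code are illustrative, not part of the package API:

import math
import torch

def simplified_ademamix_step(p, grad, state, lr=1e-5, beta1=0.99, beta2=0.999,
                             eps=1e-8, alpha_grad=100.0, weight_decay=0.0):
    exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
    # First moment is an accumulator: no (1 - beta1) factor on the gradient term.
    exp_avg.mul_(beta1).add_(grad)
    exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
    # Running sums used for bias correction, mirroring num_sum/den_sum above.
    state["num_sum"] = beta1 * state["num_sum"] + 1.0
    state["den_sum"] = beta2 * state["den_sum"] + (1.0 - beta2)
    # Numerator mixes the accumulator with alpha_grad times the raw gradient.
    update = exp_avg + alpha_grad * grad
    update /= exp_avg_sq.sqrt() + eps * math.sqrt(state["den_sum"])
    update = update / state["num_sum"] * math.sqrt(state["den_sum"])
    if weight_decay != 0.0:
        p.add_(p, alpha=-weight_decay * lr)  # decoupled weight decay
    p.add_(update, alpha=-lr)

# Tiny driver showing the expected state layout.
p = torch.randn(10)
state = {"exp_avg": torch.zeros_like(p), "exp_avg_sq": torch.zeros_like(p),
         "num_sum": 0.0, "den_sum": 0.0}
simplified_ademamix_step(p, torch.randn(10), state)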
{adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm/optim/__init__.py

@@ -1,6 +1,7 @@
 from .AdamW_adv import AdamW_adv
 from .Prodigy_adv import Prodigy_adv
 from .Adopt_adv import Adopt_adv
+from .Simplified_AdEMAMix import Simplified_AdEMAMix
 from .Lion_adv import Lion_adv
 from .Lion_Prodigy_adv import Lion_Prodigy_adv
 
@@ -8,6 +9,7 @@ __all__ = [
     "AdamW_adv",
     "Prodigy_adv",
     "Adopt_adv",
+    "Simplified_AdEMAMix",
     "Lion_adv",
     "Lion_Prodigy_adv",
 ]
{adv_optm-0.1.5 → adv_optm-0.1.7}/adv_optm.egg-info/SOURCES.txt

@@ -12,6 +12,7 @@ adv_optm/optim/Adopt_adv.py
 adv_optm/optim/Lion_Prodigy_adv.py
 adv_optm/optim/Lion_adv.py
 adv_optm/optim/Prodigy_adv.py
+adv_optm/optim/Simplified_AdEMAMix.py
 adv_optm/optim/__init__.py
 adv_optm/util/BF16_Stochastic_Rounding.py
 adv_optm/util/Effective_Shape.py