adv-optm 1.2.dev13__py3-none-any.whl → 2.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of adv-optm might be problematic.
- adv_optm/__init__.py +1 -1
- adv_optm/optim/AdamW_adv.py +85 -64
- adv_optm/optim/Adopt_adv.py +114 -69
- adv_optm/optim/Lion_Prodigy_adv.py +79 -81
- adv_optm/optim/Lion_adv.py +37 -42
- adv_optm/optim/Prodigy_adv.py +105 -85
- adv_optm/optim/Simplified_AdEMAMix.py +92 -51
- adv_optm/optim/__init__.py +1 -1
- adv_optm/util/BF16_Stochastic_Rounding.py +1 -1
- adv_optm/util/Effective_Shape.py +1 -1
- adv_optm/util/Kourkoutas.py +11 -12
- adv_optm/util/NNMF.py +7 -2
- adv_optm/util/Newton_Schulz.py +1 -2
- adv_optm/util/One_Bit_Boolean.py +1 -1
- adv_optm/util/OrthoGrad.py +4 -3
- adv_optm/util/__init__.py +1 -1
- {adv_optm-1.2.dev13.dist-info → adv_optm-2.dev1.dist-info}/METADATA +20 -20
- adv_optm-2.dev1.dist-info/RECORD +23 -0
- adv_optm-1.2.dev13.dist-info/RECORD +0 -23
- {adv_optm-1.2.dev13.dist-info → adv_optm-2.dev1.dist-info}/WHEEL +0 -0
- {adv_optm-1.2.dev13.dist-info → adv_optm-2.dev1.dist-info}/licenses/LICENSE +0 -0
- {adv_optm-1.2.dev13.dist-info → adv_optm-2.dev1.dist-info}/top_level.txt +0 -0
adv_optm/optim/Lion_Prodigy_adv.py
CHANGED
@@ -27,17 +27,13 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
  stochastic_rounding (bool, optional): whether to use stochastic
  rounding for BF16 parameter updates (default: True).
  cautious_mask (bool): whether to use the cautious masking technique. (default: False).
- clip_threshold (float, optional): whether to clip the gradients norm
- per-parameter as proposed in the paper `Lions and Muons: Optimization via
- Stochastic Frank-Wolfe` (https://arxiv.org/abs/2506.04192) to make Lion more stable
- (default: 0.0).
  nnmf_factor (bool): whether to use the factorization or use the
  uncompressed optimizer. (default: True)
  d0 (float):
  Initial D estimate for D-adaptation (default 1e-6). Rarely needs changing.
  d_coef (float):
  Coefficient in the expression for the estimate of d (default 1.0).
- Values such as 0.5 and 2.0 typically work as well.
+ Values such as 0.5 and 2.0 typically work as well.
  Changing this parameter is the preferred way to tune the method.
  growth_rate (float):
  prevent the D estimate from growing faster than this multiplicative rate.
@@ -47,8 +43,8 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
  If you're using sharded parameters, this should be set to True. The optimizer
  will attempt to auto-detect this, but if you're using an implementation other
  than PyTorch's builtin version, the auto-detection won't work.
- slice_p (int): Reduce memory usage by calculating LR adaptation statistics on only every
- pth entry of each tensor. For values greater than 1 this an an approximation to standard
+ slice_p (int): Reduce memory usage by calculating LR adaptation statistics on only every
+ pth entry of each tensor. For values greater than 1 this an an approximation to standard
  Prodigy. Values ~11 are reasonable (default 11).
  prodigy_steps (int): If greater than zero, disable Prodigy's stepsize adjustments
  after the specified optimiser step and release all state memory required by Prodigy
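Note: `slice_p` trades accuracy of the LR adaptation statistics for memory by looking at only every p-th entry of the flattened tensors (`state['p0']` and `state['s']` in the removed Prodigy block further down). A minimal standalone illustration of the subsampling, not the package's code; scaling by `d`, `dlr`, and `beta3` is omitted:

```python
import torch

slice_p = 11
w = torch.randn(1024, 1024)          # a parameter tensor
g = torch.randn_like(w)              # its gradient

w0_sliced = w.flatten()[::slice_p].clone()   # sliced snapshot of the initial weights
s = torch.zeros_like(w0_sliced)              # sliced Prodigy accumulator

g_sliced = g.flatten()[::slice_p]
numerator_term = torch.dot(g_sliced, w0_sliced - w.flatten()[::slice_p])
s.add_(g_sliced)
denominator_term = s.abs().sum()
```

The extra Prodigy state is then roughly `numel / slice_p` elements per tensor instead of `numel`.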
@@ -64,11 +60,10 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
  lr: float = 1,
  betas: Tuple[float, float] = (0.9, 0.99),
  weight_decay: float = 0.0,
- vector_reshape: bool =
+ vector_reshape: bool = False,
  stochastic_rounding: bool = True,
  orthogonal_gradient: bool = False,
  cautious_mask: bool = False,
- clip_threshold: float = 0.0,
  nnmf_factor: bool = False,
  # prodigy parameters
  beta3: float = None,
@@ -80,6 +75,8 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
  slice_p: int = 11,
  prodigy_steps: int = 0,
  d_limiter: bool = True,
+ # Compiled
+ compiled_optimizer: bool = False,
  ):
  if not lr > 0.0:
  raise ValueError(f"Learning rate must be > 0.0, but got {lr}")
@@ -94,21 +91,28 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
  weight_decay=weight_decay,
  vector_reshape=vector_reshape,
  orthogonal_gradient=orthogonal_gradient,
- clip_threshold=clip_threshold,
  beta3=beta3, d=d0, d0=d0, d_max=d0, d_numerator=0.0, d_coef=d_coef,
- growth_rate=growth_rate, safeguard_warmup=safeguard_warmup,
+ growth_rate=growth_rate, safeguard_warmup=safeguard_warmup, slice_p=slice_p,
  fsdp_in_use=fsdp_in_use,
  prodigy_steps=prodigy_steps,
  d_limiter=d_limiter,
+ compiled_optimizer=compiled_optimizer,
  )
  self.stochastic_rounding = stochastic_rounding
  self.cautious_mask = cautious_mask
  self.factored = nnmf_factor
  self.fsdp_in_use = fsdp_in_use
  super().__init__(params, defaults)
- #
+ # Use the device of the first parameter to avoid hardcoding '.cuda()'
+ self.device = self.param_groups[0]['params'][0].device
+
+ self.global_step = 0
  self.init_step()

+ if compiled_optimizer:
+ torch._dynamo.config.cache_size_limit = 8192
+ self.compile(fullgraph=False, dynamic=False) #FIXME
+
  @property
  def supports_fused_back_pass(self) -> bool:
  return True
@@ -124,14 +128,13 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
  def init_step(self):
  """Resets accumulators and calculates dlr for the upcoming step."""
  self.d_denom = 0.0
-
+
  g_group = self.param_groups[0]
  self.beta1, self.beta2 = g_group['betas']
  self.beta3 = g_group['beta3']
  if self.beta3 is None:
  self.beta3 = math.sqrt(self.beta2)
-
- k = g_group['k']
+
  self.d = g_group['d']
  lr = g_group['lr']

@@ -139,38 +142,21 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):

  self.d_numerator = g_group.get('d_numerator', 0.0) * self.beta3

-
-
-
- if p.grad is None:
- return
-
- if hasattr(p, "_fsdp_flattened"):
- self.fsdp_in_use = True
+ for group in self.param_groups:
+ for i, p in enumerate(group['params']):
+ self.__init_state(p, group)

-
-
- grad = grad.float()
- if group["clip_threshold"] > 0.0:
- grad_norm = torch.norm(grad.detach())
- if grad_norm > group["clip_threshold"]:
- clip_coef = group["clip_threshold"] / grad_norm
- grad.mul_(clip_coef)
- if group["orthogonal_gradient"]:
- grad = _orthogonalize_gradient(p, grad)
+ @torch.no_grad()
+ def __init_state(self, p, group):
  state = self.state[p]

-
- if 'step' not in state:
- state['step'] = 0
+ if len(state) == 0:

-
+ state['factored'] = (
  self.factored and
  not (len(p.shape) == 1 and not group['vector_reshape'])
  )

- state['factored'] = should_factor
-
  dtype = torch.float32 if self.factored else p.dtype

  slice_p = group['slice_p']
@@ -185,13 +171,28 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
  if state['factored']:
  state['effective_shape'] = _get_effective_shape(p.numel())
  d1, d2 = state['effective_shape']
- state['mu_m_nmf'] = torch.zeros(d1, device=p.device, dtype=dtype)
+ state['mu_m_nmf'] = torch.zeros(d1, device=p.device, dtype=dtype)
  state['mv_m_nmf'] = torch.zeros(d2, device=p.device, dtype=dtype)
  packed_d2 = (d2 + 7) // 8
  state['sign'] = torch.zeros((d1, packed_d2), dtype=torch.uint8, device=p.device)
  else: # Fallback to standard Lion
  state['exp_avg'] = torch.zeros_like(p, device=p.device, dtype=dtype)

+ @torch.no_grad()
+ def __step_parameter(self, p: torch.Tensor, group: dict, d: torch.Tensor | float, dlr: torch.Tensor | float):
+ """Performs a single optimization step on a single parameter."""
+ if p.grad is None:
+ return
+
+
+ grad = p.grad
+ if grad.dtype != torch.float32 and self.factored:
+ grad = grad.float()
+ if group["orthogonal_gradient"]:
+ grad = _orthogonalize_gradient(p, grad)
+ state = self.state[p]
+
+
  if state['factored']:
  # Factored Path
  d1, d2 = state['effective_shape']
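The factored branch keeps the momentum's sign matrix bit-packed: `packed_d2 = (d2 + 7) // 8` bytes per row, stored as uint8 in `state['sign']`. A rough sketch of how such 1-bit packing can work (presumably related to adv_optm/util/One_Bit_Boolean.py from the file list, but this is an independent illustration, not the package's implementation):

```python
import torch

_WEIGHTS = torch.tensor([1, 2, 4, 8, 16, 32, 64, 128], dtype=torch.uint8)

def pack_signs(sign_bool: torch.Tensor) -> torch.Tensor:
    """Pack a (d1, d2) boolean matrix into (d1, ceil(d2 / 8)) uint8."""
    d1, d2 = sign_bool.shape
    packed_d2 = (d2 + 7) // 8
    padded = torch.zeros(d1, packed_d2 * 8, dtype=torch.bool, device=sign_bool.device)
    padded[:, :d2] = sign_bool
    bits = padded.view(d1, packed_d2, 8).to(torch.uint8)
    return (bits * _WEIGHTS.to(sign_bool.device)).sum(dim=-1, dtype=torch.uint8)

def unpack_signs(packed: torch.Tensor, d2: int) -> torch.Tensor:
    """Recover the (d1, d2) boolean matrix from its packed form."""
    bits = (packed.unsqueeze(-1) & _WEIGHTS.to(packed.device)) != 0
    return bits.view(packed.shape[0], -1)[:, :d2]

signs = torch.randn(128, 257) > 0
assert torch.equal(unpack_signs(pack_signs(signs), 257), signs)
```

Storing signs this way costs 1 bit per momentum entry instead of 16 or 32.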
@@ -205,7 +206,7 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
  exp_avg = exp_avg.float()

  # Compute update term c_t = β1*m_{t-1} + (1-β1)*g_t
- signed_update = exp_avg.clone().mul_(self.beta1).add_(grad_reshaped, alpha=
+ signed_update = exp_avg.clone().mul_(self.beta1).add_(grad_reshaped, alpha=d * (1-self.beta1)).sign_()

  if self.cautious_mask:
  mask = (signed_update * grad_reshaped > 0).to(grad_reshaped.dtype)
@@ -214,10 +215,10 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
  del mask

  # Parameter update: p_t = p_{t-1} - lr * sign(c_t)
- update_for_param = signed_update.view(p.shape).mul(
+ update_for_param = signed_update.view(p.shape).mul(dlr)

  # Update momentum m_t = β2*m_{t-1} + (1-β2)*lr*g_t
- exp_avg.mul_(self.beta2).add_(grad_reshaped, alpha=
+ exp_avg.mul_(self.beta2).add_(grad_reshaped, alpha=d * (1 - self.beta2))
  del grad_reshaped

  # Compress new momentum m_t and store factors
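Read together, the added lines implement a d-scaled Lion update: the gradient contribution to both the update direction and the momentum is weighted by the Prodigy estimate d, and the step itself uses the adapted step size dlr. A compact dense-tensor paraphrase (sketch only; `dlr` is assumed to be the d-adjusted learning rate computed in `init_step`, which is outside this excerpt; weight decay, cautious masking, and the NNMF compression of `exp_avg` are layered on top of this core rule):

```python
import torch

def lion_prodigy_update(p, grad, exp_avg, beta1, beta2, d, dlr):
    # c_t = beta1 * m_{t-1} + d * (1 - beta1) * g_t
    c_t = exp_avg.mul(beta1).add_(grad, alpha=d * (1 - beta1)).sign_()
    # p_t = p_{t-1} - dlr * sign(c_t)
    p.add_(c_t, alpha=-dlr)
    # m_t = beta2 * m_{t-1} + d * (1 - beta2) * g_t
    exp_avg.mul_(beta2).add_(grad, alpha=d * (1 - beta2))
```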
@@ -232,7 +233,7 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
  # Compute update term and sign for the update
  if exp_avg.dtype != torch.float32 and self.factored:
  exp_avg = exp_avg.float()
- signed_update = exp_avg.clone().mul_(self.beta1).add_(grad, alpha=
+ signed_update = exp_avg.clone().mul_(self.beta1).add_(grad, alpha=d * (1-self.beta1)).sign_()

  if self.cautious_mask:
  mask = (signed_update * grad > 0).to(grad.dtype)
@@ -240,41 +241,18 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
  signed_update.mul_(mask)
  del mask

- update_for_param = signed_update.mul(
-
- # Update momentum
- exp_avg.mul_(self.beta2).add_(grad, alpha=self.d * (1 - self.beta2))
-
- prodigy_steps = group['prodigy_steps']
- if prodigy_steps <= 0 or group['k'] < prodigy_steps:
- # --- Accumulate Prodigy stats ---
- d0, safeguard_warmup, slice_p = group['d0'], group['safeguard_warmup'], group['slice_p']
- s, p0 = state['s'], state['p0']
- grad_flat = grad.flatten().float()
- p_flat = p.data.flatten().float()
- p0 = p0.float()
-
- self.d_numerator += (self.d / d0) * self.dlr * torch.dot(grad_flat[::slice_p], p0.data - p_flat[::slice_p]).item()
-
- alpha = ((self.d / d0) * self.d) if safeguard_warmup else ((self.d / d0) * self.dlr)
- s.mul_(self.beta3).add_(grad_flat[::slice_p], alpha=alpha)
- self.d_denom += s.abs().sum().item()
+ update_for_param = signed_update.mul(dlr)

-
-
- # Free memory if prodigy_steps is reached
- if 's' in state:
- del state['s']
- if 'p0' in state:
- del state['p0']
+ # Update momentum
+ exp_avg.mul_(self.beta2).add_(grad, alpha=d * (1 - self.beta2))

  if group["weight_decay"] != 0:
  if p.dtype == torch.bfloat16 and self.stochastic_rounding:
  add_stochastic_(p.data, p.data,
- alpha=-group["weight_decay"] *
+ alpha=-group["weight_decay"] * dlr)
  else:
  p.data.add_(
- p.data, alpha=-group["weight_decay"] *
+ p.data, alpha=-group["weight_decay"] * dlr
  )

  if p.dtype == torch.bfloat16 and self.stochastic_rounding:
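For BF16 parameters both the decoupled weight decay and the final update go through `add_stochastic_` (adv_optm/util/BF16_Stochastic_Rounding.py in the file list above). For intuition, a self-contained sketch of stochastically rounded accumulation into bfloat16 (an independent illustration, not the package's helper):

```python
import torch

def add_stochastic(target_bf16: torch.Tensor, other: torch.Tensor, alpha: float = 1.0) -> None:
    """In-place target += alpha * other, rounding the fp32 result to bf16 stochastically."""
    result = target_bf16.float().add_(other.float(), alpha=alpha)
    bits = result.view(torch.int32)
    # bf16 keeps the top 16 bits of the fp32 pattern; adding a random offset to the
    # low 16 bits before truncation rounds up with probability equal to the discarded fraction.
    bits.add_(torch.randint(0, 1 << 16, bits.shape, dtype=torch.int32, device=bits.device))
    bits.bitwise_and_(-65536)  # mask 0xFFFF0000: zero out the low 16 bits
    target_bf16.copy_(bits.view(torch.float32))
```

Compared with round-to-nearest, this keeps small updates from being systematically lost when the parameter itself is stored in bfloat16.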
@@ -284,6 +262,29 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):

  del update_for_param

+ @torch.no_grad()
+ def step_parameter(self, p: torch.Tensor, group: dict, i: int | None = None):
+ if hasattr(p, "_fsdp_flattened"):
+ self.fsdp_in_use = True
+
+ if self.global_step is None and 'step' in self.state[p]:
+ # For backward compatibility
+ self.global_step = self.state[p]['step']
+
+ if isinstance(self.d_numerator, float):
+ self.d_numerator = torch.tensor(self.d_numerator, device=p.device)
+ self.d_denom = torch.tensor(self.d_denom, device=p.device)
+
+ if not group.get('compiled_optimizer', False):
+ self.__step_parameter(p, group, self.d, self.dlr)
+ else:
+ d_tensor = torch.tensor(self.d, device=p.device)
+ dlr_tensor = torch.tensor(self.dlr, device=p.device)
+ self._compiled_step_parameter(p, group, d_tensor, dlr_tensor)
+
+ def compile(self, *args, **kwargs):
+ self._compiled_step_parameter = torch.compile(self.__step_parameter, *args, **kwargs)
+
  @torch.no_grad()
  def step(self, closure: Optional[callable] = None):
  """Performs a single optimization step."""
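The new `compiled_optimizer` path raises `torch._dynamo.config.cache_size_limit`, wraps the private per-parameter step with `torch.compile` via `compile()`, and `step_parameter` then hands `d` and `dlr` to the compiled function as tensors rather than Python floats, which helps avoid re-specializing the graph each time those values change. A toy, standalone version of the same pattern (illustrative sketch only, not this optimizer):

```python
import torch

class TinyCompiledSGD(torch.optim.Optimizer):
    """Compile the per-parameter step once; pass changing scalars as tensors."""

    def __init__(self, params, lr: float = 1e-3, compiled: bool = True):
        super().__init__(params, dict(lr=lr))
        self._step_fn = (
            torch.compile(self._step_parameter, fullgraph=False, dynamic=False)
            if compiled else self._step_parameter
        )

    @torch.no_grad()
    def _step_parameter(self, p: torch.Tensor, lr: torch.Tensor) -> None:
        if p.grad is not None:
            p.sub_(lr * p.grad)  # lr stays a tensor inside the compiled graph

    @torch.no_grad()
    def step(self, closure=None):
        for group in self.param_groups:
            lr_t = torch.tensor(group["lr"])  # tensor, not a Python float
            for p in group["params"]:
                self._step_fn(p, lr_t)
```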
@@ -306,21 +307,19 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
  """Calculates the new `d` based on the accumulated stats."""
  g_group = self.param_groups[0]
  # Only perform d-adaptation if prodigy_steps has not been reached
- prodigy_active = not (g_group.get('prodigy_steps', 0) > 0 and
+ prodigy_active = not (g_group.get('prodigy_steps', 0) > 0 and self.global_step >= g_group['prodigy_steps'])

  if prodigy_active:
  d_max, d_coef, growth_rate = g_group['d_max'], g_group['d_coef'], g_group['growth_rate']
-
+
  if self.fsdp_in_use and dist.is_available() and dist.is_initialized():
-
- device = self.param_groups[0]['params'][0].device
- dist_tensor = torch.tensor([self.d_numerator, self.d_denom], device=device)
+ dist_tensor = torch.stack([self.d_numerator, self.d_denom])
  dist.all_reduce(dist_tensor, op=dist.ReduceOp.SUM)
  global_d_numerator = dist_tensor[0].item()
  global_d_denom = dist_tensor[1].item()
  else:
- global_d_numerator = self.d_numerator
- global_d_denom = self.d_denom
+ global_d_numerator = self.d_numerator.item()
+ global_d_denom = self.d_denom.item()

  d_hat = self.d
  if global_d_denom > 0:
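The accumulated statistics are now kept as tensors (`self.d_numerator`, `self.d_denom`) so the FSDP branch can `torch.stack` and all-reduce them directly. The update of `d` from these globals falls just outside the excerpt; for orientation only, the reference-style Prodigy rule is roughly as below (a hedged paraphrase, not a quote of this file, and it ignores the `d_limiter` option):

```python
def prodigy_d_update(d: float, d_numerator: float, d_denom: float,
                     d_coef: float = 1.0, growth_rate: float = float("inf")) -> float:
    """Reference-style Prodigy update of the adapted step size d."""
    if d_denom <= 0:
        return d
    d_hat = d_coef * d_numerator / d_denom
    return max(d, min(d_hat, d * growth_rate))
```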
@@ -337,5 +336,4 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
  group['d'] = self.d
  group['d_max'] = d_max
  # Increment step counter for all groups, regardless of whether d was updated
-
- group['k'] += 1
+ self.global_step += 1
adv_optm/optim/Lion_adv.py
CHANGED
@@ -27,10 +27,6 @@ class Lion_adv(torch.optim.Optimizer):
  stochastic_rounding (bool, optional): whether to use stochastic
  rounding for BF16 parameter updates (default: True).
  cautious_mask (bool): whether to use the cautious masking technique. (default: False).
- clip_threshold (float, optional): whether to clip the gradients norm
- per-parameter as proposed in the paper `Lions and Muons: Optimization via
- Stochastic Frank-Wolfe` (https://arxiv.org/abs/2506.04192) to make Lion more stable
- (default: 0.0).
  nnmf_factor (bool): whether to use the factorization or use the
  uncompressed optimizer. (default: True)
  """
@@ -41,11 +37,10 @@ class Lion_adv(torch.optim.Optimizer):
  lr: float = 1e-4,
  betas: Tuple[float, float] = (0.9, 0.99),
  weight_decay: float = 0.0,
- vector_reshape: bool =
+ vector_reshape: bool = False,
  stochastic_rounding: bool = True,
  orthogonal_gradient: bool = False,
  cautious_mask: bool = False,
- clip_threshold: float = 0.0,
  nnmf_factor: bool = True,
  ):
  if not lr > 0.0:
@@ -61,7 +56,6 @@ class Lion_adv(torch.optim.Optimizer):
  weight_decay=weight_decay,
  vector_reshape=vector_reshape,
  orthogonal_gradient=orthogonal_gradient,
- clip_threshold=clip_threshold,
  )
  self.stochastic_rounding = stochastic_rounding
  self.cautious_mask = cautious_mask
@@ -80,48 +74,49 @@ class Lion_adv(torch.optim.Optimizer):
  def supports_flat_params(self) -> bool:
  return False

-
-
-
-
- return
+ def init_step(self):
+ for group in self.param_groups:
+ for i, p in enumerate(group['params']):
+ self.__init_state(p, group)

-
-
- grad = grad.float()
- if group["clip_threshold"] > 0.0:
- grad_norm = torch.norm(grad.detach())
- if grad_norm > group["clip_threshold"]:
- clip_coef = group["clip_threshold"] / grad_norm
- grad.mul_(clip_coef)
- if group["orthogonal_gradient"]:
- grad = _orthogonalize_gradient(p, grad)
+ @torch.no_grad()
+ def __init_state(self, p, group):
  state = self.state[p]

-
- if 'step' not in state:
- state['step'] = 0
+ if len(state) == 0:

-
+ state['factored'] = (
  self.factored and
  not (len(p.shape) == 1 and not group['vector_reshape'])
  )

- state['factored'] = should_factor
-
  dtype = torch.float32 if self.factored else p.dtype

  if state['factored']:
  state['effective_shape'] = _get_effective_shape(p.numel())
  d1, d2 = state['effective_shape']
- state['mu_m_nmf'] = torch.zeros(d1, device=p.device, dtype=dtype)
+ state['mu_m_nmf'] = torch.zeros(d1, device=p.device, dtype=dtype)
  state['mv_m_nmf'] = torch.zeros(d2, device=p.device, dtype=dtype)
  packed_d2 = (d2 + 7) // 8
  state['sign'] = torch.zeros((d1, packed_d2), dtype=torch.uint8, device=p.device)
  else: # Fallback to standard Lion
  state['exp_avg'] = torch.zeros_like(p, device=p.device, dtype=dtype)

-
+ @torch.no_grad()
+ def __step_parameter(self, p: torch.Tensor, group: dict, lr: torch.Tensor | float):
+ """Performs a single optimization step on a single parameter."""
+ if p.grad is None:
+ return
+
+ grad = p.grad
+ if grad.dtype != torch.float32 and self.factored:
+ grad = grad.float()
+ if group["orthogonal_gradient"]:
+ grad = _orthogonalize_gradient(p, grad)
+
+ state = self.state[p]
+
+
  beta1, beta2 = group["betas"]
  lr = group["lr"]

@@ -138,16 +133,16 @@ class Lion_adv(torch.optim.Optimizer):
  exp_avg = exp_avg.float()

  # Compute update term c_t
-
+ update = exp_avg.clone().mul_(beta1).add_(grad_reshaped, alpha=(1-beta1)).sign_()

  if self.cautious_mask:
- mask = (
+ mask = (update * grad_reshaped > 0).to(grad_reshaped.dtype)
  mask.div_(mask.mean().clamp_(min=1e-3))
-
+ update.mul_(mask)
  del mask

  # Parameter update
-
+ update = update.view(p.shape).mul_(lr)

  # Standard Lion momentum update
  exp_avg.mul_(beta2).add_(grad_reshaped, alpha=1-beta2)
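`Lion_adv` follows the same structure without Prodigy: a plain Lion update with optional cautious masking, applied either to the reshaped factored momentum or to the dense `exp_avg`. A dense-path paraphrase of the reconstructed lines (sketch only):

```python
import torch

def lion_step(p, grad, exp_avg, lr, beta1, beta2, cautious=False):
    update = exp_avg.mul(beta1).add_(grad, alpha=1 - beta1).sign_()
    if cautious:
        mask = (update * grad > 0).to(grad.dtype)   # keep sign-agreeing coordinates
        mask.div_(mask.mean().clamp_(min=1e-3))     # rescale so the average step size is preserved
        update.mul_(mask)
    p.sub_(update, alpha=lr)
    exp_avg.mul_(beta2).add_(grad, alpha=1 - beta2)
```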
@@ -165,15 +160,15 @@ class Lion_adv(torch.optim.Optimizer):
  # Compute update term and sign for the update
  if exp_avg.dtype != torch.float32 and self.factored:
  exp_avg = exp_avg.float()
-
+ update = exp_avg.clone().mul_(beta1).add_(grad, alpha=(1-beta1)).sign_()

  if self.cautious_mask:
- mask = (
+ mask = (update * grad > 0).to(grad.dtype)
  mask.div_(mask.mean().clamp_(min=1e-3))
-
+ update.mul_(mask)
  del mask

-
+ update.mul_(lr)

  # Standard Lion momentum update
  exp_avg.mul_(beta2).add_(grad, alpha=1-beta2)
@@ -188,11 +183,11 @@ class Lion_adv(torch.optim.Optimizer):
  )

  if p.dtype == torch.bfloat16 and self.stochastic_rounding:
- add_stochastic_(p.data, -
+ add_stochastic_(p.data, -update)
  else:
- p.data.add_(-
+ p.data.add_(-update)

- del
+ del update

  @torch.no_grad()
  def step(self, closure: Optional[callable] = None):
@@ -207,4 +202,4 @@ class Lion_adv(torch.optim.Optimizer):
  if p.grad is not None:
  self.step_parameter(p, group, i)

- return loss
+ return loss