adv-optm 2.2.0__tar.gz → 2.2.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/PKG-INFO +1 -1
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm/__init__.py +1 -1
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm/optim/AdaMuon_adv.py +3 -10
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm/optim/Muon_adv.py +4 -10
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm/util/Muon_util.py +2 -7
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm/util/param_update.py +7 -20
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm.egg-info/PKG-INFO +1 -1
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/setup.py +1 -1
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/LICENSE +0 -0
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/README.md +0 -0
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm/optim/AdamW_adv.py +0 -0
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm/optim/Adopt_adv.py +0 -0
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm/optim/Lion_Prodigy_adv.py +0 -0
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm/optim/Lion_adv.py +0 -0
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm/optim/Prodigy_adv.py +0 -0
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm/optim/SignSGD_adv.py +0 -0
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm/optim/Simplified_AdEMAMix.py +0 -0
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm/optim/__init__.py +0 -0
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm/util/Kourkoutas.py +0 -0
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm/util/Muon_AuxAdam.py +0 -0
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm/util/OrthoGrad.py +0 -0
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm/util/__init__.py +0 -0
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm/util/factorization_util.py +0 -0
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm/util/lion_k.py +0 -0
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm/util/update_util.py +0 -0
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm.egg-info/SOURCES.txt +0 -0
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm.egg-info/dependency_links.txt +0 -0
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm.egg-info/requires.txt +0 -0
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/adv_optm.egg-info/top_level.txt +0 -0
- {adv_optm-2.2.0 → adv_optm-2.2.dev1}/setup.cfg +0 -0
adv_optm/optim/AdaMuon_adv.py

```diff
@@ -225,7 +225,6 @@ class AdaMuon_adv(torch.optim.Optimizer):
             "adam_k_warmup_steps": adam_k_warmup_steps, "adam_nnmf_factor": adam_nnmf_factor,
         }
         self.stochastic_rounding = stochastic_rounding
-        self._init_lr = lr

         super().__init__(params, defaults)

@@ -319,13 +318,11 @@ class AdaMuon_adv(torch.optim.Optimizer):

                 # Spectral Normalization
                 if group.get('spectral_normalization', False):
-                    gen = param_update.get_generator(device)
-
                     # Case A: Factored Muon
                     if state['factored']:
                         d1, d2 = state['effective_shape']
                         # We need a vector matching the 'inner' dimension d2
-                        state['spectral_v'] = torch.randn(d2, device=device, dtype=dtype, generator=gen)
+                        state['spectral_v'] = torch.randn(d2, device=device, dtype=dtype)

                     # Case B: Standard Muon (Linear, Conv2d, etc.)
                     elif len(p.shape) >= 2:
@@ -333,7 +330,7 @@ class AdaMuon_adv(torch.optim.Optimizer):
                         # (p.shape[0], product_of_rest).
                         d_in_flat = p.numel() // p.shape[0]

-                        state['spectral_v'] = torch.randn(d_in_flat, device=device, dtype=dtype, generator=gen)
+                        state['spectral_v'] = torch.randn(d_in_flat, device=device, dtype=dtype)

                     # Normalize initial vector for stability
                     if 'spectral_v' in state:
@@ -442,13 +439,9 @@ class AdaMuon_adv(torch.optim.Optimizer):
                     scaled_eps, adaptive_eps, spectral_target, wd_scale = get_spectral_scaling(shape_for_scaling, group['n_layers'])

                     weight_decay = group['weight_decay'] * wd_scale
-                    decoupled_wd = True
-
                     ns_eps = scaled_eps
-
                 else:
                     weight_decay = group['weight_decay']
-                    decoupled_wd = False
                     ns_eps = group['ns_eps']
                     adaptive_eps = group['eps']

@@ -594,7 +587,7 @@ class AdaMuon_adv(torch.optim.Optimizer):

                 update = update.reshape(original_shape)

-                param_update.apply_parameter_update(self, p, group, update, lr, wd=weight_decay, random_int_tensor=random_int_tensor, decoupled=decoupled_wd)
+                param_update.apply_parameter_update(self, p, group, update, lr, wd=weight_decay, random_int_tensor=random_int_tensor)

     @torch.no_grad()
     def step(self, closure=None):
```
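The `spectral_v` initializations above previously drew from a cached per-device generator (`param_update.get_generator`, removed below); they now use PyTorch's global RNG. A minimal sketch of what that implies for reproducibility (the seed value and size here are illustrative, not package defaults):

```python
import torch

# With no dedicated generator, reproducible spectral_v initialization
# depends on the global RNG state at optimizer construction time.
torch.manual_seed(42)    # seed once, before building the optimizer
v = torch.randn(1024)    # analogous to the randn init above
v.div_(v.norm())         # unit norm, per the "Normalize initial vector" step
```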
adv_optm/optim/Muon_adv.py

```diff
@@ -204,7 +204,6 @@ class Muon_adv(torch.optim.Optimizer):
         }
         self.stochastic_rounding = stochastic_rounding
         self.compiled_optimizer = compiled_optimizer
-        self._init_lr = lr

         super().__init__(params, defaults)

@@ -285,13 +284,11 @@ class Muon_adv(torch.optim.Optimizer):

                 # Spectral Normalization
                 if group.get('spectral_normalization', False):
-                    gen = param_update.get_generator(device)
-
                     # Case A: Factored Muon
                     if state['factored']:
                         d1, d2 = state['effective_shape']
                         # We need a vector matching the 'inner' dimension d2
-                        state['spectral_v'] = torch.randn(d2, device=device, dtype=dtype, generator=gen)
+                        state['spectral_v'] = torch.randn(d2, device=device, dtype=dtype)

                     # Case B: Standard Muon (Linear, Conv2d, etc.)
                     elif len(p.shape) >= 2:
@@ -299,11 +296,11 @@ class Muon_adv(torch.optim.Optimizer):
                         # (p.shape[0], product_of_rest).
                         d_in_flat = p.numel() // p.shape[0]

-                        state['spectral_v'] = torch.randn(d_in_flat, device=device, dtype=dtype, generator=gen)
+                        state['spectral_v'] = torch.randn(d_in_flat, device=device, dtype=dtype)

                     # Normalize initial vector for stability
                     if 'spectral_v' in state:
-
+                        state['spectral_v'].div_(state['spectral_v'].norm())

                 # MARS-M state initialization
                 if group.get('approx_mars', False):
@@ -407,12 +404,9 @@ class Muon_adv(torch.optim.Optimizer):
                     scaled_eps, _, spectral_target, wd_scale = get_spectral_scaling(shape_for_scaling, group['n_layers'])

                     weight_decay = group['weight_decay'] * wd_scale
-                    decoupled_wd = True
-
                     ns_eps = scaled_eps
                 else:
                     weight_decay = group['weight_decay']
-                    decoupled_wd = False
                     ns_eps = group['ns_eps']

                 # MARS-M Approximated (Variance Reduction)
@@ -527,7 +521,7 @@ class Muon_adv(torch.optim.Optimizer):

                 update = update.reshape(original_shape)

-                param_update.apply_parameter_update(self, p, group, update, lr, wd=weight_decay, random_int_tensor=random_int_tensor, decoupled=decoupled_wd)
+                param_update.apply_parameter_update(self, p, group, update, lr, wd=weight_decay, random_int_tensor=random_int_tensor)

     @torch.no_grad()
     def step(self, closure=None):
```
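For context on Case B above: the weight is treated as a 2-D matrix of shape `(p.shape[0], d_in_flat)` and the power-iteration vector lives on the flattened input dimension. A small sketch with an assumed Conv2d-shaped weight:

```python
import torch

p = torch.empty(64, 3, 3, 3)            # hypothetical Conv2d weight
d_in_flat = p.numel() // p.shape[0]     # 27 = 3*3*3, the flattened fan-in
A = p.reshape(p.shape[0], d_in_flat)    # matrix whose largest singular value is tracked
v = torch.randn(d_in_flat)              # spectral_v matches A's column dimension
v.div_(v.norm())                        # matches the normalization added above
```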
adv_optm/util/Muon_util.py

```diff
@@ -352,12 +352,7 @@ def spectral_norm_update(update: torch.Tensor, vector_state: torch.Tensor, targe
     # Normalize v_new to get next state
     v_norm = torch.linalg.vector_norm(v_new)

-
-    # vector_state.copy_(v_new.div_(v_norm.clamp_min_(1e-12))).to(vector_state.dtype))
-    candidate_v = v_new / v_norm
-    next_state = torch.where(v_norm >= 0.5, candidate_v, vector_state)
-    vector_state.copy_(next_state.to(vector_state.dtype))
-    # Else: We keep the old vector_state (which is a random unit vector at init)
+    vector_state.copy_(v_new.div_(v_norm.clamp_min_(1e-12)).to(vector_state.dtype))

     # Estimate sigma = ||A @ v|| (since v is unit norm)
     # Re-compute A @ v_new with the updated vector for better estimate
```
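The replaced block guarded against a collapsing iterate (`v_norm >= 0.5` kept the previous vector); the new single line always renormalizes, with `clamp_min_(1e-12)` guarding the division instead. For reference, a self-contained sketch of the one-step power iteration the surrounding comments describe (the function name and argument layout are assumptions, not the package's signature):

```python
import torch

@torch.no_grad()
def power_iter_sigma(A: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    """One power-iteration step on A^T A; returns an estimate of sigma_max(A)."""
    v_new = A.T @ (A @ v)                            # v_new ~ (A^T A) v
    v_norm = torch.linalg.vector_norm(v_new)
    v.copy_(v_new.div(v_norm.clamp_min(1e-12)))      # renormalize, as in the new line
    return torch.linalg.vector_norm(A @ v)           # sigma ~ ||A v|| since ||v|| = 1
```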
```diff
@@ -384,7 +379,7 @@ def get_spectral_scaling(shape: torch.Size, n_layers: int):
         wd_scale: Weight decay scale
     """
     d_out, d_in = shape[0], shape[1]
-
+
     # Handle Convolutional/Flattened tensors
     if len(shape) > 2:
         d_in = shape[1:].numel()
```
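The visible tail of `get_spectral_scaling` shows how the effective fan-in is computed for tensors with more than two dimensions. A worked sketch of just that shape handling:

```python
import torch

shape = torch.Size([64, 3, 3, 3])  # Conv2d(in_channels=3, out_channels=64, kernel_size=3)
d_out, d_in = shape[0], shape[1]   # 64, 3
if len(shape) > 2:
    d_in = shape[1:].numel()       # 27: all non-output dims flattened together
```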
adv_optm/util/param_update.py

```diff
@@ -14,7 +14,6 @@ def apply_parameter_update(
     lr: float | Tensor,
     wd: float | None = None,
     random_int_tensor: Tensor | None = None,
-    decoupled: bool = False,
 ) -> None:
     """
     Applies decoupled weight decay (standard or cautious) and the final
@@ -28,14 +27,9 @@ def apply_parameter_update(
         wd: Optional float value for weight decay, if another value other than group["weight_decay"] is needed.
         random_int_tensor: Optional pre-generated random tensor for stochastic
             rounding. Required for the `torch.compile` path.
-        decoupled: Whenever to use the true decoupled weight decay.
     """
     wd = group["weight_decay"] if wd is None else wd
     cautious = group.get('cautious_wd', False)
-    if decoupled:
-        scaled_wd = wd * (lr / self._init_lr)
-    else:
-        scaled_wd = wd * lr

     # Compute full update in float32 if using bfloat16 with stochastic rounding
     if p.dtype == torch.bfloat16 and self.stochastic_rounding:
```
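The removed branch implemented a second flavor of decay: with `decoupled=True`, the effective decay followed only the LR schedule's ratio `lr / _init_lr` rather than the LR itself. After this change, `apply_parameter_update` always uses AdamW-style `wd * lr` (see the hunks below). A sketch of the two scalings, with `init_lr` standing in for the removed `_init_lr` attribute:

```python
def scaled_wd(wd: float, lr: float, init_lr: float, fully_decoupled: bool) -> float:
    # fully_decoupled=True reproduces the removed branch: decay tracks the
    # schedule's relative decay (lr / init_lr), not the absolute LR.
    return wd * (lr / init_lr) if fully_decoupled else wd * lr
```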
```diff
@@ -47,11 +41,11 @@ def apply_parameter_update(
         if cautious:
             # Cautious Weight Decay
             mask = (update_fp32 * p_fp32 >= 0).float()
-            p_fp32.addcmul_(p_fp32, mask, value=-scaled_wd)
+            p_fp32.addcmul_(p_fp32, mask, value=-wd * lr)
             del mask
         else:
             # Standard decoupled weight decay
-            p_fp32.add_(p_fp32, alpha=-scaled_wd)
+            p_fp32.add_(p_fp32, alpha=-wd * lr)

         # Apply main update
         p_fp32.add_(-update_fp32)
```
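For context, a standalone sketch of the cautious decay in this hunk: decay is applied only where the incoming update (which is subtracted from `p`) shares a sign with the weight, i.e. where the step is already pulling the weight toward zero. Names are illustrative:

```python
import torch

def cautious_decay_(p: torch.Tensor, update: torch.Tensor, wd: float, lr: float) -> None:
    # Mask decay to coordinates where the (subtracted) update agrees in sign
    # with the weight, so decay never fights the optimizer step.
    mask = (update * p >= 0).to(p.dtype)
    p.addcmul_(p, mask, value=-wd * lr)  # p -= wd * lr * p, masked
```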
```diff
@@ -60,7 +54,6 @@ def apply_parameter_update(
         if random_int_tensor is not None:
             # Compiled path: use the pre-computed random tensor
             _copy_stochastic_core_(p, p_fp32, random_int_tensor)
-            del random_int_tensor
         else:
             # Uncompiled path: generate randoms inside
             copy_stochastic_(p, p_fp32)
```
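Dropping `del random_int_tensor` here is behavior-neutral: `del` only removes the local name binding, not the tensor itself, which the caller still references; the memory is freed in either case only when the last reference goes away.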
```diff
@@ -72,11 +65,11 @@ def apply_parameter_update(
         if cautious:
             # Cautious Weight Decay
             mask = (update * p >= 0).to(p.dtype)
-            p.addcmul_(p, mask, value=-scaled_wd)
+            p.addcmul_(p, mask, value=-wd * lr)
             del mask
         else:
             # Standard decoupled weight decay
-            p.add_(p, alpha=-scaled_wd)
+            p.add_(p, alpha=-wd * lr)

         # Apply main update
         p.add_(-update)
```
```diff
@@ -94,14 +87,6 @@ def set_seed(device: torch.device):
         _generators[device] = torch.Generator(device=device)
         _generators[device].manual_seed(42)

-def get_generator(device: torch.device) -> torch.Generator:
-    """
-    Retrieves (and initializes if necessary) the deterministic generator
-    for the specified device.
-    """
-    if device not in _generators:
-        set_seed(device)
-    return _generators[device]

 def _get_random_int_for_sr(source: Tensor) -> Tensor:
     """
```
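Only the public accessor is removed here; `set_seed` and the per-device `_generators` cache stay in place for the stochastic-rounding path. This matches the optimizer-side change above, where `torch.randn` no longer receives an explicit generator.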
```diff
@@ -133,7 +118,7 @@ def _copy_stochastic_core_(target: Tensor, source: Tensor, random_int_tensor: Te
     Core logic for stochastic rounding using a pre-computed random integer tensor.
     This version is designed to be torch.compile-friendly.
     """
-    result = random_int_tensor
+    result = random_int_tensor.clone()
     # add the random number to the lower 16 bit of the mantissa
     result.add_(source.view(dtype=torch.int32))

@@ -143,6 +128,8 @@ def _copy_stochastic_core_(target: Tensor, source: Tensor, random_int_tensor: Te
     # copy the higher 16 bit into the target tensor
     target.copy_(result.view(dtype=torch.float32))

+    del result
+

 def copy_stochastic_(target: Tensor, source: Tensor):
     """
```
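Aliasing `result = random_int_tensor` meant the in-place `result.add_` mutated the caller's pre-computed random tensor, so a tensor reused across steps would be corrupted after the first call; cloning (plus the explicit `del`) fixes that at the cost of one temporary. For reference, a self-contained sketch of the underlying bit trick (float32 to bfloat16 stochastic rounding; names are illustrative, not the package's API):

```python
import torch

def stochastic_round_bf16_(target: torch.Tensor, source: torch.Tensor) -> None:
    """Stochastically round float32 `source` into bfloat16 `target`."""
    # Random integer for the lower 16 bits of the float32 mantissa.
    result = torch.randint_like(source, 0, 1 << 16, dtype=torch.int32)
    # Adding it to the raw float32 bits lets carries propagate into the
    # upper 16 bits with probability proportional to the truncated residual.
    result.add_(source.view(dtype=torch.int32))
    result.bitwise_and_(-65536)                     # keep only the upper 16 bits
    target.copy_(result.view(dtype=torch.float32))  # bf16 cast is now exact

src = torch.randn(4, dtype=torch.float32)
dst = torch.empty(4, dtype=torch.bfloat16)
stochastic_round_bf16_(dst, src)
```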