adv-optm 1.2.dev7__py3-none-any.whl → 1.2.dev9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of adv-optm might be problematic.

adv_optm/__init__.py CHANGED
@@ -20,4 +20,4 @@ __all__ = [
     "AdaMuon_adv",
 ]
 
-__version__ = "1.2.dev7"
+__version__ = "1.2.dev9"
adv_optm/optim/AdaMuon_adv.py CHANGED
@@ -3,7 +3,6 @@ from typing import Optional, Callable
 
 from .AdamW_adv import AdamW_adv
 from ..util.MuonAdam_helper import MuonAdamHelper
-from ..util.Kourkoutas import KourkoutasHelper
 
 from ..util.BF16_Stochastic_Rounding import add_stochastic_
 from ..util.Newton_Schulz import _newton_schulz_iteration
@@ -64,22 +63,13 @@ class AdaMuon_adv(torch.optim.Optimizer):
             matrices for muon NewtonSchulz (default: False).
         vector_reshape (bool): whether to reshape 1D vectors into 2D
             matrices to apply low-rank compression (default: True).
+        low_rank_ortho (bool): If True, enables low-rank orthogonalization, which
+            projects the update to a lower rank before orthogonalization.
+            (default: False)
+        ortho_rank (int): The rank for low-rank orthogonalization.
+            (default: 128)
         nnmf_factor (bool): whether to use the factorization or disable it to use
             the uncompressed optimizer. (default: False)
-        kourkoutas_beta (bool): whether to enable the layer-wise dynamic β₂ logic.
-            If `False`, the optimizer behaves as standard AdamW. (default: False)
-        beta2_min (float): The minimum value for dynamic β₂, used during periods of
-            high gradient variance ("sunspikes"). Must be less than `betas[1]`.
-            (default: 0.88)
-        ema_alpha (float): The decay rate for the Exponential Moving Average (EMA) of
-            the pooled gradient norms. Corresponds to `α` in the paper.
-            (default: 0.93)
-        tiny_spike (float): A small constant added to the denominator of the
-            "sunspike" ratio calculation to prevent division by zero. Corresponds
-            to `ε_spike` in the paper. (default: 1e-9)
-        k_warmup_steps (int): The number of initial steps during which β₂ is held
-            at a fixed beta2 value before the
-            dynamic logic activates. (default: 0)
         MuonWithAuxAdam (bool): If True, enables the hybrid optimizer mode.
             Parameters designated by `layer_key_fn` will be optimized with
             AdamW_adv instead of Muon. (default: False)
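
For orientation, the option pair above implements a sketch-and-project variant of Muon's orthogonalization: the update matrix is multiplied by a random Gaussian sketch, a QR factorization yields an orthonormal basis Q for the sketched range, Newton-Schulz runs on the much smaller projected matrix QᵀM, and the result is lifted back with Q. A minimal self-contained sketch of the idea (standalone code; `orthogonalize` below is an SVD stand-in for the package's `_newton_schulz_iteration`):

```python
import torch

def orthogonalize(M: torch.Tensor) -> torch.Tensor:
    # Stand-in for _newton_schulz_iteration: maps M to its nearest
    # semi-orthogonal matrix via the polar factor U @ Vh.
    U, _, Vh = torch.linalg.svd(M, full_matrices=False)
    return U @ Vh

def low_rank_ortho(M: torch.Tensor, rank: int = 128) -> torch.Tensor:
    """Sketch-and-project orthogonalization, mirroring the diff's logic."""
    r = min(rank, M.shape[0], M.shape[1])
    if r <= 0:                          # fallback: full orthogonalization
        return orthogonalize(M)
    G = torch.randn(M.shape[1], r, device=M.device, dtype=M.dtype)
    Q, _ = torch.linalg.qr(M @ G)       # orthonormal basis for the sketched range
    return Q @ orthogonalize(Q.T @ M)   # orthogonalize in the r-dim subspace, lift back

M = torch.randn(512, 256)
print(low_rank_ortho(M, rank=64).shape)  # torch.Size([512, 256])
```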
@@ -110,15 +100,10 @@ class AdaMuon_adv(torch.optim.Optimizer):
         alpha_grad: float = 100.0,
         vector_reshape_muon: bool = False,
         vector_reshape: bool = False,
+        # Low-rank Muon
+        low_rank_ortho: bool = False,
+        ortho_rank: int = 128,
         nnmf_factor: bool = False,
-        # K-b parameters
-        kourkoutas_beta: bool = False,
-        beta2_min: float = 0.9,
-        ema_alpha: float = 0.95,
-        tiny_spike: float = 1e-9,
-        k_warmup_steps: int = 0,
-        k_logging: int = 0,
-        layer_key_kb_fn: Optional[Callable] = None,
         # hybrid optimizer mode
         MuonWithAuxAdam: bool = False,
         layer_key_fn: Optional[Callable] = None,
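
Given the constructor signature above, enabling the new path is a one-flag change. A hypothetical usage sketch (the linear layer and the `lr` value are placeholders; other hyperparameters keep their defaults):

```python
import torch
from adv_optm import AdaMuon_adv

model = torch.nn.Linear(1024, 1024)  # placeholder model

opt = AdaMuon_adv(
    model.parameters(),
    lr=1e-3,
    low_rank_ortho=True,  # sketch-and-project instead of full Newton-Schulz
    ortho_rank=64,        # subspace rank; clamped to the matrix dimensions
)
```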
@@ -142,14 +127,11 @@ class AdaMuon_adv(torch.optim.Optimizer):
             "vector_reshape": vector_reshape,
             "vector_reshape_muon": vector_reshape_muon,
             "nesterov":nesterov, "use_atan2":use_atan2,
-            "Simplified_AdEMAMix": Simplified_AdEMAMix, "alpha_grad": alpha_grad,
-            "_kourkoutas_beta": kourkoutas_beta, "beta2_min": beta2_min, "ema_alpha": ema_alpha,
-            "tiny_spike": tiny_spike, "k_warmup_steps": k_warmup_steps, "k_logging": k_logging,
+            "Simplified_AdEMAMix": Simplified_AdEMAMix, "alpha_grad": alpha_grad,
+            # Low-rank Ortho
+            "low_rank_ortho": low_rank_ortho, "ortho_rank": ortho_rank,
         }
         self.stochastic_rounding = stochastic_rounding
-        self._kourkoutas_beta = kourkoutas_beta
-        self._kourkoutas_helper = None
-        self.layer_key_kb_fn = layer_key_kb_fn
         self.MuonWithAuxAdam = MuonWithAuxAdam
         self.helper = None
         self.aux_adam = None
@@ -182,14 +164,9 @@ class AdaMuon_adv(torch.optim.Optimizer):
 
             for key, value in defaults_to_use.items():
                 new_group.setdefault(key, value)
-            if '_kourkoutas_beta' not in new_group:
-                if optim_type == 'adam':
-                    new_group['_kourkoutas_beta'] = False
-                else:
-                    new_group['_kourkoutas_beta'] = muon_defaults['_kourkoutas_beta']
             final_param_groups.append(new_group)
 
-        super().__init__(final_param_groups, {})
+        super().__init__(final_param_groups, muon_defaults)
 
         # Now that self is initialized, create the helper
         self.helper = MuonAdamHelper(self, layer_key_fn)
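
The switch from an empty dict to `muon_defaults` matters because `torch.optim.Optimizer` backfills every param group from the defaults dict, so later lookups like `group['low_rank_ortho']` cannot raise `KeyError`. A toy demonstration of that base-class behavior (the `Toy` class is illustrative, not package code):

```python
import torch

class Toy(torch.optim.Optimizer):
    def __init__(self, params, defaults):
        super().__init__(params, defaults)
    def step(self, closure=None):  # no-op; only the group handling matters here
        pass

p = torch.nn.Parameter(torch.zeros(2))
opt = Toy([{"params": [p]}], {"lr": 0.1, "low_rank_ortho": False})
# The base class copied the missing defaults into the group:
print(opt.param_groups[0]["low_rank_ortho"])  # False
```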
@@ -219,9 +196,6 @@ class AdaMuon_adv(torch.optim.Optimizer):
 
     @torch.no_grad()
     def step_parameter(self, p: torch.Tensor, group: dict, i: int | None = None):
-        if group['_kourkoutas_beta'] and self._kourkoutas_helper is None:
-            self._kourkoutas_helper = KourkoutasHelper(self)
-
         if self.MuonWithAuxAdam:
             optim_type = self.helper.get_optimizer_type(p)
             if optim_type == 'adam':
@@ -277,7 +251,6 @@ class AdaMuon_adv(torch.optim.Optimizer):
 
         # Retrieve hyperparameters
         beta1, beta2 = group['betas']
-        current_step = state['step']
         nesterov = group['nesterov']
         Simplified_AdEMAMix = group['Simplified_AdEMAMix']
         alpha_grad = group['alpha_grad']
@@ -303,20 +276,37 @@ class AdaMuon_adv(torch.optim.Optimizer):
             signed_m_buf = torch.sign(mt_buf)
             del grad_reshaped
 
-            update = _newton_schulz_iteration(
-                signed_m_buf,
-                steps=group['ns_steps'],
-                eps=group['ns_eps'],
-                coeffs=group['ns_coeffs'],
-            )
-
-            if group['_kourkoutas_beta']:
-                # Call prepare_step() once at the beginning of the step for all params
-                self._kourkoutas_helper.maybe_prepare_step(current_step)
-                # Accumulate current sign-stabilized orthogonal update's norm for the *next* step
-                self._kourkoutas_helper.accumulate_gradient_sq_norm(p, update.view(p.shape))
-                # Get the dynamic beta2 calculated in prepare_step()
-                beta2 = self._kourkoutas_helper.get_beta2(p, group, current_step)
+            # Orthogonalization step
+            if group['low_rank_ortho']:
+                # Low-Rank Orthogonalization on the reconstructed matrix
+                M = signed_m_buf
+                r = min(group['ortho_rank'], M.shape[0], M.shape[1])
+                if r > 0:
+                    G_sketch = torch.randn(M.shape[1], r, device=M.device, dtype=M.dtype)
+                    MG = M @ G_sketch
+                    if MG.dtype != torch.float32:
+                        MG_dtype = M.dtype
+                        Q, _ = torch.linalg.qr(MG.float())
+                        Q = Q.to(MG_dtype)
+                    else:
+                        Q, _ = torch.linalg.qr(MG)
+                    projected_M = Q.T @ M
+                    ortho_projected_M = _newton_schulz_iteration(
+                        projected_M, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs']
+                    )
+                    update = Q @ ortho_projected_M
+                else:  # Fallback for invalid rank
+                    update = _newton_schulz_iteration(
+                        signed_m_buf, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs']
+                    )
+            else:
+                # Original full Newton-Schulz
+                update = _newton_schulz_iteration(
+                    signed_m_buf,
+                    steps=group['ns_steps'],
+                    eps=group['ns_eps'],
+                    coeffs=group['ns_coeffs'],
+                )
 
             # Reconstruct second momentum from previous step's factors
             vt_buf = _unnmf((state['mu_v_nmf'], state['mv_v_nmf']))
@@ -337,7 +327,8 @@ class AdaMuon_adv(torch.optim.Optimizer):
             # RMS-aligned rescaling
             rms_target = group['rms_target']
             num_elements = update.numel()
-            scaling_factor = rms_target * (num_elements ** 0.5) / (update.norm())
+            # Add eps to prevent division by zero
+            scaling_factor = rms_target * (num_elements ** 0.5) / (update.norm() + group['eps'])
 
             update.mul_(scaling_factor)
             update = update.view(p.shape).mul_(group['lr'])
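
Since RMS(update) = ‖update‖ / √N, multiplying by `rms_target * √N / (‖update‖ + eps)` drives the update's RMS to approximately `rms_target`; the added `eps` only guards the degenerate all-zero update. A quick numeric check (arbitrary values):

```python
import torch

update = torch.randn(4096)
rms_target, eps = 0.2, 1e-8

scale = rms_target * (update.numel() ** 0.5) / (update.norm() + eps)
update = update * scale
print(update.pow(2).mean().sqrt())  # ~0.2
```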
@@ -381,25 +372,41 @@ class AdaMuon_adv(torch.optim.Optimizer):
         if len(p.shape) > 2:
             signed_m_buf = signed_m_buf.view(p.shape[0], -1)
 
-        # NewtonSchulz
-        update = _newton_schulz_iteration(
-            signed_m_buf,
-            steps=group['ns_steps'],
-            eps=group['ns_eps'],
-            coeffs=group['ns_coeffs'],
-        )
+        # Orthogonalization step
+        if group['low_rank_ortho']:
+            # Low-Rank Orthogonalization on the reconstructed matrix
+            M = signed_m_buf
+            r = min(group['ortho_rank'], M.shape[0], M.shape[1])
+            if r > 0:
+                G_sketch = torch.randn(M.shape[1], r, device=M.device, dtype=M.dtype)
+                MG = M @ G_sketch
+                if MG.dtype != torch.float32:
+                    MG_dtype = M.dtype
+                    Q, _ = torch.linalg.qr(MG.float())
+                    Q = Q.to(MG_dtype)
+                else:
+                    Q, _ = torch.linalg.qr(MG)
+                projected_M = Q.T @ M
+                ortho_projected_M = _newton_schulz_iteration(
+                    projected_M, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs']
+                )
+                update = Q @ ortho_projected_M
+            else:  # Fallback for invalid rank
+                update = _newton_schulz_iteration(
+                    signed_m_buf, steps=group['ns_steps'], eps=group['ns_eps'], coeffs=group['ns_coeffs']
+                )
+        else:
+            # Original full Newton-Schulz
+            update = _newton_schulz_iteration(
+                signed_m_buf,
+                steps=group['ns_steps'],
+                eps=group['ns_eps'],
+                coeffs=group['ns_coeffs'],
+            )
 
         if len(p.shape) > 2 or state['reshaped_1d_muon']:
             update = update.view(p.shape)
 
-        if group['_kourkoutas_beta']:
-            # Call prepare_step() once at the beginning of the step for all params
-            self._kourkoutas_helper.maybe_prepare_step(current_step)
-            # Accumulate current sign-stabilized orthogonal update's norm for the *next* step
-            self._kourkoutas_helper.accumulate_gradient_sq_norm(p, update)
-            # Get the dynamic beta2 calculated in prepare_step()
-            beta2 = self._kourkoutas_helper.get_beta2(p, group, current_step)
-
         vt_buf = state['second_momentum_buffer']
         vt_buf.mul_(beta2).addcmul_(update, update, value=1 - beta2)
@@ -416,7 +423,8 @@ class AdaMuon_adv(torch.optim.Optimizer):
         # RMS-aligned rescaling
         rms_target = group['rms_target']
         num_elements = update.numel()
-        scaling_factor = rms_target * (num_elements ** 0.5) / (update.norm())
+        # Add eps to prevent division by zero
+        scaling_factor = rms_target * (num_elements ** 0.5) / (update.norm() + group['eps'])
 
         update.mul_(scaling_factor)
         del num_elements, scaling_factor
adv_optm/optim/Muon_adv.py CHANGED
@@ -178,7 +178,7 @@ class Muon_adv(torch.optim.Optimizer):
 
         final_param_groups.append(new_group)
 
-        super().__init__(final_param_groups, {})
+        super().__init__(final_param_groups, muon_defaults)
 
         # Now that self is initialized, create the helper
         self.helper = MuonAdamHelper(self, layer_key_fn)
@@ -292,10 +292,10 @@ class Muon_adv(torch.optim.Optimizer):
         del grad_reshaped
 
         # Orthogonalization step
-        if group['low_rank_muon']:
+        if group['low_rank_ortho']:
             # Low-Rank Orthogonalization on the reconstructed matrix
             M = update
-            r = min(group['low_rank_rank'], M.shape[0], M.shape[1])
+            r = min(group['ortho_rank'], M.shape[0], M.shape[1])
             if r > 0:
                 G_sketch = torch.randn(M.shape[1], r, device=M.device, dtype=M.dtype)
                 MG = M @ G_sketch
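
Note the rename means a param group (or resumed config) that still sets the old `low_rank_muon`/`low_rank_rank` keys would silently fall back to the new defaults. A hypothetical downstream guard, not part of the package:

```python
def migrate_group_keys(group: dict) -> dict:
    # Map the pre-1.2.dev9 key names onto the renamed ones.
    if 'low_rank_muon' in group:
        group.setdefault('low_rank_ortho', group.pop('low_rank_muon'))
    if 'low_rank_rank' in group:
        group.setdefault('ortho_rank', group.pop('low_rank_rank'))
    return group
```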
adv_optm/util/Kourkoutas.py CHANGED
@@ -94,7 +94,7 @@ class KourkoutasHelper:
         for layer_key, info in self.layer_info.items():
             params, group = info['params'], info['group_ref']
 
-            if not group.get('kourkoutas_beta', False):
+            if not group.get('kourkoutas_beta', False) and not group.get('_kourkoutas_beta', False):
                 continue
 
             first_param_in_layer = info['params'][0]
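
The widened check makes the helper honor both spellings of the flag: standalone optimizers store `kourkoutas_beta` in their groups, while optimizers that managed it internally used the underscore-prefixed `_kourkoutas_beta`. The skip condition is equivalent to this predicate (toy dicts, not package code):

```python
# Process a layer when *either* spelling of the flag is set.
def uses_kourkoutas(group: dict) -> bool:
    return group.get('kourkoutas_beta', False) or group.get('_kourkoutas_beta', False)

print(uses_kourkoutas({'kourkoutas_beta': True}))   # True
print(uses_kourkoutas({'_kourkoutas_beta': True}))  # True
print(uses_kourkoutas({}))                          # False
```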
adv_optm-1.2.dev7.dist-info/METADATA → adv_optm-1.2.dev9.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: adv_optm
-Version: 1.2.dev7
+Version: 1.2.dev9
 Summary: A family of highly efficient, lightweight yet powerful optimizers.
 Home-page: https://github.com/Koratahiu/Advanced_Optimizers
 Author: Koratahiu
adv_optm-1.2.dev7.dist-info/RECORD → adv_optm-1.2.dev9.dist-info/RECORD
@@ -1,24 +1,24 @@
-adv_optm/__init__.py,sha256=93-4akpaONvZ7BCkwDSYs3i28lI-aFV1RVwsEq1UZhU,379
-adv_optm/optim/AdaMuon_adv.py,sha256=hTGSH8wzmQ-NYIcqV6EAEbqCxxfEwmmMWaIadX1qiuQ,21009
+adv_optm/__init__.py,sha256=TzvKgGTLkK0_XANeZzhURcSO9xmtUi-H9_C7tV3rXn4,379
+adv_optm/optim/AdaMuon_adv.py,sha256=yr1oJV339Zv7D8n148O1FJJAgdOsH8NZDZTKlcDOyu0,21181
 adv_optm/optim/AdamW_adv.py,sha256=7IvdD1rqYeHZwQCZU9X0H7x87MCKcHQ5M68GLuMCkvE,17702
 adv_optm/optim/Adopt_adv.py,sha256=C2FsEZGvCk9q4YNKAj0qIxdZ5AfPlda-1lIpSX0a1nE,21256
 adv_optm/optim/Lion_Prodigy_adv.py,sha256=LEA3UYJpPeFnmxeniLNv1u2LKKj4ufx3Bq_MLw-nWXk,14617
 adv_optm/optim/Lion_adv.py,sha256=aGNAplZlyXYgVllYcV_s4bK8iC4fv6EizFoWIMNLdBc,8299
-adv_optm/optim/Muon_adv.py,sha256=JBLLfU83lRwezowI6A4JQAO1-NBLvSDOB8Dsad5zuHU,22775
+adv_optm/optim/Muon_adv.py,sha256=HaF06fPKcKpVZY29_vqjWHAfivjvGntBuRyDDKj3Ozw,22784
 adv_optm/optim/Prodigy_adv.py,sha256=bmwuO8GrJHH4NaEaqE-ffcR9wHhQ57457xoN-P6hyks,25909
 adv_optm/optim/Simplified_AdEMAMix.py,sha256=sY-vThMVgADRh0ar9WHkrM2n8UcgQLQC1YV1Wx8uFz4,12983
 adv_optm/optim/__init__.py,sha256=hpUWE6CKtt_rvMdgQVb3PtjhfZAvAxTq6hp8H8rIpBo,489
 adv_optm/util/BF16_Stochastic_Rounding.py,sha256=Q5H0BcogmE4atP65dLoI21HKSf50lRdsBDfeF6v9Tbg,1548
 adv_optm/util/Effective_Shape.py,sha256=TBvIk1V8IuTbbBsxuekJA4e_v8JlR5Nujtut8RTWAm4,318
-adv_optm/util/Kourkoutas.py,sha256=WPAjxaH9pGVtLK_QJcwjkJOnN02Hfyu0F2T90hbhtqo,9662
+adv_optm/util/Kourkoutas.py,sha256=lObJGXmz3MqGSuu3DKqotSpZ0fuQFPE80R3zO_j3Z_Q,9707
 adv_optm/util/MuonAdam_helper.py,sha256=7rnNMujZVDaqo1g22QscMyPlZvIHQQSLHMED9_I8QWU,1250
 adv_optm/util/NNMF.py,sha256=yRf5IP5Sjq0Uf0DxN0Q8NxEGSdD-f1ULziLVDOjY8K4,639
 adv_optm/util/Newton_Schulz.py,sha256=wJ_sKRaGVIsOofQ737my4ng494qX_pfgOqlDDmYtnCg,1377
 adv_optm/util/One_Bit_Boolean.py,sha256=Wat49esdwohuN-OHOFMW8D0aOQgV9cP5Rl8z6yfmpos,1068
 adv_optm/util/OrthoGrad.py,sha256=NzInuBQGy_Ja__M1R9XbvqVaQ0fhGbtGgFE9YON7B3I,707
 adv_optm/util/__init__.py,sha256=jAaUfaAjFrTJ6-Q915ezAbq0efRbpYjriW2OdeCbSzo,433
-adv_optm-1.2.dev7.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
-adv_optm-1.2.dev7.dist-info/METADATA,sha256=mAEVDwu_gh6S-fN6LBfEJoYdn_5LJLOw_nHRZcE7orw,14022
-adv_optm-1.2.dev7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-adv_optm-1.2.dev7.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
-adv_optm-1.2.dev7.dist-info/RECORD,,
+adv_optm-1.2.dev9.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+adv_optm-1.2.dev9.dist-info/METADATA,sha256=GmAYWjZdfgvg9QbzyiV2PUNmzQFgJz8AjaY5F0x7Nv8,14022
+adv_optm-1.2.dev9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+adv_optm-1.2.dev9.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
+adv_optm-1.2.dev9.dist-info/RECORD,,