PyPI - adv-optm - Versions diffs - 2.4.dev22__py3-none-any.whl → 2.4.dev24__py3-none-any.whl - Mend

adv-optm 2.4.dev22__py3-none-any.whl → 2.4.dev24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

adv_optm/__init__.py +1 -1
adv_optm/optim/AdaMuon_adv.py +5 -8
adv_optm/optim/AdamW_adv.py +4 -5
adv_optm/optim/Adopt_adv.py +5 -6
adv_optm/optim/Lion_adv.py +0 -2
adv_optm/optim/Muon_adv.py +4 -8
adv_optm/optim/Prodigy_adv.py +3 -4
adv_optm/optim/SignSGD_adv.py +19 -10
adv_optm/optim/SinkSGD_adv.py +20 -11
adv_optm/util/Muon_AuxAdam.py +1 -0
adv_optm/util/OrthoGrad.py +61 -0
adv_optm/util/centered_decay.py +22 -15
adv_optm/util/param_update.py +0 -68
adv_optm/util/scaled_optm.py +46 -27
adv_optm/util/state_util.py +6 -30
adv_optm-2.4.dev24.dist-info/METADATA +109 -0
adv_optm-2.4.dev24.dist-info/RECORD +29 -0
adv_optm-2.4.dev22.dist-info/METADATA +0 -202
adv_optm-2.4.dev22.dist-info/RECORD +0 -29
{adv_optm-2.4.dev22.dist-info → adv_optm-2.4.dev24.dist-info}/WHEEL +0 -0
{adv_optm-2.4.dev22.dist-info → adv_optm-2.4.dev24.dist-info}/licenses/LICENSE +0 -0
{adv_optm-2.4.dev22.dist-info → adv_optm-2.4.dev24.dist-info}/top_level.txt +0 -0

adv_optm/__init__.py CHANGED Viewed

@@ -20,4 +20,4 @@ __all__ = [
     "SinkSGD_adv",
 ]
-__version__ = "2.4.dev22"
+__version__ = "2.4.dev24"

adv_optm/optim/AdaMuon_adv.py CHANGED Viewed

@@ -99,7 +99,7 @@ class AdaMuon_adv(torch.optim.Optimizer):
         use_muon (bool | None): whether to use Muon or AuxAdamW. MUST be provided
             either here or via `optim_type` in parameter groups. (default: None)
         state_precision (str): Precision for Muon optimizer states. Options: 'auto' (parameter dtype), 'fp32',
-            'bf16_sr' (BF16 with stochastic rounding), 'fp8_sr', 'int8_sr'.
+            'bf16_sr' (BF16 with stochastic rounding), 'int8_sr'.
             (default: 'auto')
         factored_2nd (bool): Factorize only the second moment (v_t) using SMMF
             low-rank compression while keeping the first moment (momentum_buffer)
@@ -123,7 +123,7 @@ class AdaMuon_adv(torch.optim.Optimizer):
         adam_tiny_spike (float): Tiny spike for Kourkoutas-β. (default: 1e-9)
         adam_k_warmup_steps (int): Warmup steps for Kourkoutas-β. (default: 0)
         adam_spectral_normalization (bool): Enable explicit spectral normalization for AdamW. (default: False)
-        adam_state_precision (str): Precision for AuxAdam states. Options: 'auto', 'fp32', 'bf16_sr', 'fp16', 'fp8_sr', 'int8_sr', 'factored'. (default: 'auto')
+        adam_state_precision (str): Precision for AuxAdam states. Options: 'auto', 'fp32', 'bf16_sr', 'fp16', 'int8_sr', 'factored'. (default: 'auto')
         adam_nnmf_factor (bool): 1-bit factored for AdamW.
         adam_factored_2nd (bool): Factorize only the second moment (v_t) for AuxAdam. (default: False)
     """
@@ -157,7 +157,7 @@ class AdaMuon_adv(torch.optim.Optimizer):
         # Boolean to split param
         use_muon: bool | None = None,
         # States precision (Muon path)
-        state_precision: str = "auto",  # 'fp32', 'bf16_sr', 'fp8_sr', 'int8_sr'
+        state_precision: str = "auto",  # 'fp32', 'bf16_sr', 'int8_sr'
         # Factorized second moment only
         factored_2nd: bool = False,
         # Update geometry parameters
@@ -220,7 +220,7 @@ class AdaMuon_adv(torch.optim.Optimizer):
             state_precision = "factored"
         state_precision = state_precision.lower()
-        valid_precisions = {"auto", "fp32", "factored", "bf16_sr", "fp16", "fp8_sr", "int8_sr"}
+        valid_precisions = {"auto", "fp32", "factored", "bf16_sr", "fp16", "int8_sr"}
         if state_precision not in valid_precisions:
             raise ValueError(f"state_precision must be one of {valid_precisions}. Got {state_precision}")
@@ -374,6 +374,7 @@ class AdaMuon_adv(torch.optim.Optimizer):
                     d1, d2 = state['effective_shape']
                     state['mu_vbuf_nmf'] = torch.zeros(d1, device=p.device, dtype=torch.float32)
                     state['mv_vbuf_nmf'] = torch.zeros(d2, device=p.device, dtype=torch.float32)
+                    state['shifter'] = torch.tensor([1, 2, 4, 8, 16, 32, 64, 128], device=p.device, dtype=torch.uint8)
                 elif not group['normuon_variant']:
                     init_state_tensor(state, 'second_momentum_buffer', p.shape, actual_precision, p.device, default_dtype, non_neg=True)
@@ -454,8 +455,6 @@ class AdaMuon_adv(torch.optim.Optimizer):
                     random_int_state_tensor = param_update._get_random_int_for_sr(p)
                 elif actual_precision == 'int8_sr':
                     random_int_state_tensor = param_update._get_random_int_for_8bit_sr(p)
-                elif actual_precision == 'fp8_sr':
-                    random_int_state_tensor = param_update._get_random_int_for_fp8_sr(p)
             else:
                 adam_step_param = Muon_AuxAdam._adam_step_parameter
@@ -475,8 +474,6 @@ class AdaMuon_adv(torch.optim.Optimizer):
                     random_int_state_tensor = param_update._get_random_int_for_sr(p)
                 elif actual_precision == 'int8_sr':
                     random_int_state_tensor = param_update._get_random_int_for_8bit_sr(p)
-                elif actual_precision == 'fp8_sr':
-                    random_int_state_tensor = param_update._get_random_int_for_fp8_sr(p)
                 if group['low_rank_ortho']:
                     random_G_sketch = param_update._get_random_noise_for_low_rank_ortho(p, group['ortho_rank'])
             else:

adv_optm/optim/AdamW_adv.py CHANGED Viewed

@@ -84,7 +84,7 @@ class AdamW_adv(torch.optim.Optimizer):
             while only factorizing the second moment. (default: False)
         state_precision (str): Precision method for optimizer states. Options: 'auto'
             (parameter precision), 'fp32', 'factored' (SMMF low-rank FP32), 'bf16_sr' (with
-            stochastic rounding), 'fp16' , 'fp8_sr', 'int8_sr'. (default: 'auto')
+            stochastic rounding), 'fp16' , 'int8_sr'. (default: 'auto')
     """
     def __init__(
@@ -124,7 +124,7 @@ class AdamW_adv(torch.optim.Optimizer):
         centered_wd: float = 0.0,
         centered_wd_mode: str = 'float8',
         # States precision
-        state_precision: str = "auto", # 'fp32', 'factored', 'bf16_sr', 'fp8_sr', 'int8_sr'.
+        state_precision: str = "auto", # 'fp32', 'factored', 'bf16_sr', 'int8_sr'.
         # Factorized second moment only
         factored_2nd: bool = False,
         # SMMF factorization (legacy)
@@ -145,7 +145,7 @@ class AdamW_adv(torch.optim.Optimizer):
             raise ValueError(f"For Kourkoutas-β, betas[1] (as beta2_max) must be > beta2_min. Got {betas[1]} and {beta2_min}")
         state_precision = state_precision.lower()
-        valid_precisions = {"auto", "fp32", "factored", "bf16_sr", "fp16", "fp8_sr", "int8_sr"}
+        valid_precisions = {"auto", "fp32", "factored", "bf16_sr", "fp16", "int8_sr"}
         if state_precision not in valid_precisions:
             raise ValueError(f"state_precision must be one of {valid_precisions}. Got {state_precision}")
@@ -264,6 +264,7 @@ class AdamW_adv(torch.optim.Optimizer):
                     d1, d2 = state['effective_shape']
                     state['mu_v_nmf'] = torch.zeros(d1, device=device, dtype=torch.float32)
                     state['mv_v_nmf'] = torch.zeros(d2, device=device, dtype=torch.float32)
+                    state['shifter'] = torch.tensor([1, 2, 4, 8, 16, 32, 64, 128], device=device, dtype=torch.uint8)
                 else:
                     init_state_tensor(state, 'exp_avg_sq', p.shape, actual_precision, p.device, dtype, non_neg=True)
@@ -314,8 +315,6 @@ class AdamW_adv(torch.optim.Optimizer):
                 random_int_state_tensor = param_update._get_random_int_for_sr(p)
             elif group['actual_state_precision'] == 'int8_sr':
                 random_int_state_tensor = param_update._get_random_int_for_8bit_sr(p)
-            elif group['actual_state_precision'] == 'fp8_sr':
-                random_int_state_tensor = param_update._get_random_int_for_fp8_sr(p)
             step_param_fn = self._compiled_step_parameter
         else:
             step_param_fn = self._step_parameter

adv_optm/optim/Adopt_adv.py CHANGED Viewed

@@ -88,7 +88,7 @@ class Adopt_adv(torch.optim.Optimizer):
             while only factorizing the second moment. (default: False)
         state_precision (str): Precision method for Adopt states. Options: 'auto'
             (parameter precision), 'fp32', 'factored' (SMMF low-rank FP32), 'bf16_sr' (with
-            stochastic rounding), 'fp16' , 'fp8_sr', 'int8_sr'. (default: 'auto')
+            stochastic rounding), 'fp16' , 'int8_sr'. (default: 'auto')
     """
     def __init__(
@@ -126,7 +126,7 @@ class Adopt_adv(torch.optim.Optimizer):
         centered_wd: float = 0.0,
         centered_wd_mode: str = 'float8',
         # States precision
-        state_precision: str = "auto", # 'fp32', 'factored', 'bf16_sr', 'fp8_sr', 'int8_sr'.
+        state_precision: str = "auto", # 'fp32', 'factored', 'bf16_sr', 'int8_sr'.
         # Factorized second moment only
         factored_2nd: bool = False,
         # SMMF factorization (legacy)
@@ -148,7 +148,7 @@ class Adopt_adv(torch.optim.Optimizer):
         state_precision = state_precision.lower()
-        valid_precisions = {"auto", "fp32", "factored", "bf16_sr", "fp16", "fp8_sr", "int8_sr"}
+        valid_precisions = {"auto", "fp32", "factored", "bf16_sr", "fp16", "int8_sr"}
         if state_precision not in valid_precisions:
             raise ValueError(f"state_precision must be one of {valid_precisions}. Got {state_precision}")
@@ -236,7 +236,7 @@ class Adopt_adv(torch.optim.Optimizer):
             dtype = torch.float32 if (state['factored'] or req_precision == 'factored') else p.dtype
-            vt_dtype = torch.float32 if (state['factored'] or state['factored_2nd'] or req_precision in ['factored', 'bf16_sr', 'fp8_sr', 'int8_sr']) else dtype
+            vt_dtype = torch.float32 if (state['factored'] or state['factored_2nd'] or req_precision in ['factored', 'bf16_sr', 'int8_sr']) else dtype
             vt_init = grad.pow(2).to(vt_dtype) * (1 - group['betas'][1])
             if state['factored']:
@@ -262,6 +262,7 @@ class Adopt_adv(torch.optim.Optimizer):
                     state['effective_shape'] = _get_effective_shape(p.numel())
                     d1, d2 = state['effective_shape']
                     state['mu_v_nmf'], state['mv_v_nmf'] = _nnmf(vt_init.view(d1, d2))
+                    state['shifter'] = torch.tensor([1, 2, 4, 8, 16, 32, 64, 128], device=p.device, dtype=torch.uint8)
                 else:
                     init_state_tensor(state, 'exp_avg_sq', p.shape, actual_precision, p.device, dtype)
                     set_state(state, 'exp_avg_sq', vt_init, actual_precision, None, non_neg=True)
@@ -316,8 +317,6 @@ class Adopt_adv(torch.optim.Optimizer):
                 random_int_state_tensor = param_update._get_random_int_for_sr(p)
             elif group['actual_state_precision'] == 'int8_sr':
                 random_int_state_tensor = param_update._get_random_int_for_8bit_sr(p)
-            elif group['actual_state_precision'] == 'fp8_sr':
-                random_int_state_tensor = param_update._get_random_int_for_fp8_sr(p)
             step_param_fn = self._compiled_step_parameter
         else:
             lr = group['lr']

adv_optm/optim/Lion_adv.py CHANGED Viewed

@@ -33,8 +33,6 @@ class Lion_adv(torch.optim.Optimizer):
         stochastic_rounding (bool, optional): whether to use stochastic
             rounding for BF16 parameter updates (default: True).
         orthogonal_gradient (bool): whether to orthogonalize the gradient (default: False).
-        clip_threshold (float, optional): whether to clip the gradients norm
-            per-parameter (default: 0.0).
         kappa_p (float, optional): The p-value for the Lp-norm in Lion-K (domain [1.0, 2.0]).
             - 1.0: Standard Lion (sign update).
             - 2.0: Spherical Lion (normalized L2 update).

adv_optm/optim/Muon_adv.py CHANGED Viewed

@@ -47,7 +47,7 @@ class Muon_adv(torch.optim.Optimizer):
         use_muon (bool | None): whether to use Muon or AuxAdamW. MUST be provided
             either here or via `optim_type` in parameter groups. (default: None)
         state_precision (str): Precision for Muon optimizer states. Options: 'auto' (parameter dtype), 'fp32',
-            'bf16_sr' (BF16 with stochastic rounding), 'fp8_sr', 'int8_sr'.
+            'bf16_sr' (BF16 with stochastic rounding), 'int8_sr'.
             (default: 'auto')
         low_rank_ortho (bool): If True, enables low-rank orthogonalization, which
             projects the update to a lower rank before orthogonalization.
@@ -98,7 +98,7 @@ class Muon_adv(torch.optim.Optimizer):
         adam_tiny_spike (float): Tiny spike for Kourkoutas-β. (default: 1e-9)
         adam_k_warmup_steps (int): Warmup steps for Kourkoutas-β. (default: 0)
         adam_spectral_normalization (bool): Enable explicit spectral normalization for AdamW. (default: False)
-        adam_state_precision (str): Precision for AuxAdam states. Options: 'auto', 'fp32', 'bf16_sr', 'fp16', 'fp8_sr', 'int8_sr', 'factored'. (default: 'auto')
+        adam_state_precision (str): Precision for AuxAdam states. Options: 'auto', 'fp32', 'bf16_sr', 'fp16', 'int8_sr', 'factored'. (default: 'auto')
         adam_nnmf_factor (bool): 1-bit factored for AdamW.
         adam_factored_2nd (bool): Factorize only the second moment (v_t) for AuxAdam. (default: False)
         """
@@ -130,7 +130,7 @@ class Muon_adv(torch.optim.Optimizer):
         # Boolean to split param
         use_muon: bool | None = None,
         # States precision (Muon path)
-        state_precision: str = "auto",  # 'fp32', 'bf16_sr', 'fp8_sr', 'int8_sr'
+        state_precision: str = "auto",  # 'fp32', 'bf16_sr', 'int8_sr'
         # Low-rank Muon
         low_rank_ortho: bool = False,
         ortho_rank: int = 128,
@@ -193,7 +193,7 @@ class Muon_adv(torch.optim.Optimizer):
             state_precision = "factored"
         state_precision = state_precision.lower()
-        valid_precisions = {"auto", "fp32", "factored", "bf16_sr", "fp16", "fp8_sr", "int8_sr"}
+        valid_precisions = {"auto", "fp32", "factored", "bf16_sr", "fp16", "int8_sr"}
         if state_precision not in valid_precisions:
             raise ValueError(f"state_precision must be one of {valid_precisions}. Got {state_precision}")
@@ -406,8 +406,6 @@ class Muon_adv(torch.optim.Optimizer):
                     random_int_state_tensor = param_update._get_random_int_for_sr(p)
                 elif actual_precision == 'int8_sr':
                     random_int_state_tensor = param_update._get_random_int_for_8bit_sr(p)
-                elif actual_precision == 'fp8_sr':
-                    random_int_state_tensor = param_update._get_random_int_for_fp8_sr(p)
             else:
                 adam_step_param = Muon_AuxAdam._adam_step_parameter
@@ -427,8 +425,6 @@ class Muon_adv(torch.optim.Optimizer):
                     random_int_state_tensor = param_update._get_random_int_for_sr(p)
                 elif actual_precision == 'int8_sr':
                     random_int_state_tensor = param_update._get_random_int_for_8bit_sr(p)
-                elif actual_precision == 'fp8_sr':
-                    random_int_state_tensor = param_update._get_random_int_for_fp8_sr(p)
                 if group['low_rank_ortho']:
                     random_G_sketch = param_update._get_random_noise_for_low_rank_ortho(p, group['ortho_rank'])
             else:

adv_optm/optim/Prodigy_adv.py CHANGED Viewed

@@ -124,7 +124,7 @@ class Prodigy_adv(torch.optim.Optimizer):
         nesterov: bool = False,
         nesterov_coef: float | None = None,
         # States precision
-        state_precision: str = "auto", # 'fp32', 'factored', 'bf16_sr', 'fp8_sr', 'int8_sr'.
+        state_precision: str = "auto", # 'fp32', 'factored', 'bf16_sr', 'int8_sr'.
         # Factorized second moment only
         factored_2nd: bool = False,
         # SMMF factorization (legacy)
@@ -168,7 +168,7 @@ class Prodigy_adv(torch.optim.Optimizer):
             raise ValueError(f"For Kourkoutas-β, betas[1] (as beta2_max) must be > beta2_min. Got {betas[1]} and {beta2_min}")
         state_precision = state_precision.lower()
-        valid_precisions = {"auto", "fp32", "factored", "bf16_sr", "fp16", "fp8_sr", "int8_sr"}
+        valid_precisions = {"auto", "fp32", "factored", "bf16_sr", "fp16", "int8_sr"}
         if state_precision not in valid_precisions:
             raise ValueError(f"state_precision must be one of {valid_precisions}. Got {state_precision}")
@@ -311,6 +311,7 @@ class Prodigy_adv(torch.optim.Optimizer):
                     d1, d2 = state['effective_shape']
                     state['mu_v_nmf'] = torch.zeros(d1, device=device, dtype=torch.float32)
                     state['mv_v_nmf'] = torch.zeros(d2, device=device, dtype=torch.float32)
+                    state['shifter'] = torch.tensor([1, 2, 4, 8, 16, 32, 64, 128], device=p.device, dtype=torch.uint8)
                 else:
                     init_state_tensor(state, 'exp_avg_sq', p.shape, actual_precision, p.device, dtype, non_neg=True)
@@ -358,8 +359,6 @@ class Prodigy_adv(torch.optim.Optimizer):
                 random_int_state_tensor = param_update._get_random_int_for_sr(p)
             elif group['actual_state_precision'] == 'int8_sr':
                 random_int_state_tensor = param_update._get_random_int_for_8bit_sr(p)
-            elif group['actual_state_precision'] == 'fp8_sr':
-                random_int_state_tensor = param_update._get_random_int_for_fp8_sr(p)
             step_param_fn = self._compiled_step_parameter
         else:
             d = group['d']

adv_optm/optim/SignSGD_adv.py CHANGED Viewed

@@ -44,7 +44,7 @@ class SignSGD_adv(torch.optim.Optimizer):
             'int4': Uses 4-bit block-wise quantization (block size 32).
         state_precision (str): Precision method for optimizer states. Options: 'auto'
             (parameter precision), 'fp32', 'factored' (SMMF low-rank FP32), 'bf16_sr' (with
-            stochastic rounding), 'fp16' , 'fp8_sr', 'int8_sr'. (default: 'auto')
+            stochastic rounding), 'fp16' , 'int8_sr'. (default: 'auto')
         nnmf_factor (bool): whether to use the factorization or use the
             uncompressed optimizer. (default: True)
     """
@@ -70,13 +70,13 @@ class SignSGD_adv(torch.optim.Optimizer):
         nesterov_coef: float | None = None,
         # Normalization then Momentum
         normed_momentum: bool = False,
-        # SNR Precondition
+        # SNR Precondition (requires normed_momentum)
         snr_cond: bool = False,
         # Centered WD
         centered_wd: float = 0.0,
         centered_wd_mode: str = 'float8',
         # States precision
-        state_precision: str = "auto", # 'fp32', 'factored', 'bf16_sr', 'fp8_sr', 'int8_sr'.
+        state_precision: str = "auto", # 'fp32', 'factored', 'bf16_sr', 'int8_sr'.
         # Spectral Normed Optimizer
         spectral_normalization: bool = False,
         # SMMF factorization
@@ -95,7 +95,7 @@ class SignSGD_adv(torch.optim.Optimizer):
             raise NotImplementedError(f"snr_cond is intended to be used with normed_momentum")
         state_precision = state_precision.lower()
-        valid_precisions = {"auto", "fp32", "factored", "bf16_sr", "fp16", "fp8_sr", "int8_sr"}
+        valid_precisions = {"auto", "fp32", "factored", "bf16_sr", "fp16", "int8_sr"}
         if state_precision not in valid_precisions:
             raise ValueError(f"state_precision must be one of {valid_precisions}. Got {state_precision}")
@@ -230,8 +230,6 @@ class SignSGD_adv(torch.optim.Optimizer):
                     random_int_state_tensor = param_update._get_random_int_for_sr(p)
                 elif group['actual_state_precision'] == 'int8_sr':
                     random_int_state_tensor = param_update._get_random_int_for_8bit_sr(p)
-                elif group['actual_state_precision'] == 'fp8_sr':
-                    random_int_state_tensor = param_update._get_random_int_for_fp8_sr(p)
             if group.get('stochastic_sign', False) and not is_vector:
                 random_noise_tensor = param_update._get_random_noise_for_sso(p)
@@ -254,7 +252,8 @@ class SignSGD_adv(torch.optim.Optimizer):
         nesterov = group.get('nesterov', False)
         nesterov_coef = group.get('nesterov_coef', None)
         sso = group.get('stochastic_sign', False)
-        snr_cond = group.get('snr_cond', False) and group.get('normed_momentum', False) and momentum > 0
+        normed_mt = group.get('normed_momentum', False)
+        snr_cond = group.get('snr_cond', False) and normed_mt and momentum > 0
         denom = None
         wd_target = None
@@ -263,7 +262,7 @@ class SignSGD_adv(torch.optim.Optimizer):
         if group["orthogonal_gradient"]:
             grad = _orthogonalize_gradient(p, grad)
-        if group.get('normed_momentum', False):
+        if normed_mt:
             if sso:
                 grad = apply_stochastic_sign_(grad, noise=random_noise_tensor, is_vector=is_vector)
             else:
@@ -285,7 +284,12 @@ class SignSGD_adv(torch.optim.Optimizer):
                 if nesterov:
                     nv_coef = momentum if nesterov_coef is None else nesterov_coef
-                    raw_update = grad_reshaped.lerp(exp_avg, nv_coef)
+                    if normed_mt:
+                        # Scale the normalized gradient down to match the buffer's variance
+                        ema_std = math.sqrt((1 - momentum) / (1 + momentum))
+                        raw_update = (grad_reshaped * ema_std).lerp_(exp_avg, nv_coef)
+                    else:
+                        raw_update = grad.lerp(exp_avg, nv_coef)
                 else:
                     raw_update = exp_avg.clone()
@@ -309,7 +313,12 @@ class SignSGD_adv(torch.optim.Optimizer):
                 if nesterov:
                     nv_coef = momentum if nesterov_coef is None else nesterov_coef
-                    raw_update = grad.lerp(exp_avg, nv_coef)
+                    if normed_mt:
+                        # Scale the normalized gradient down to match the buffer's variance
+                        ema_std = math.sqrt((1 - momentum) / (1 + momentum))
+                        raw_update = (grad * ema_std).lerp_(exp_avg, nv_coef)
+                    else:
+                        raw_update = grad.lerp(exp_avg, nv_coef)
                 else:
                     raw_update = exp_avg.clone()

adv_optm/optim/SinkSGD_adv.py CHANGED Viewed

@@ -42,7 +42,7 @@ class SinkSGD_adv(torch.optim.Optimizer):
         nnmf_factor (bool): whether to use factorization or disable it. (default: False)
         state_precision (str): Precision method for states. Options: 'auto'
             (parameter precision), 'fp32', 'factored' (SMMF low-rank FP32), 'bf16_sr',
-            'fp8_sr', 'int8_sr'. (default: 'auto')
+            'int8_sr'. (default: 'auto')
         compiled_optimizer (bool): Compiles the core step function using torch.compile
             for faster execution. (default: False)
     """
@@ -58,7 +58,7 @@ class SinkSGD_adv(torch.optim.Optimizer):
         orthogonal_sinkhorn: bool = False,
         # Normalization then Momentum
         normed_momentum: bool = False,
-        # SNR Precondition
+        # SNR Precondition (requires normed_momentum)
         snr_cond: bool = False,
         # Nesterov Momentum
         nesterov: bool = False,
@@ -93,7 +93,7 @@ class SinkSGD_adv(torch.optim.Optimizer):
             raise NotImplementedError(f"snr_cond is intended to be used with normed_momentum")
         state_precision = state_precision.lower()
-        valid_precisions = {"auto", "fp32", "factored", "bf16_sr", "fp16", "fp8_sr", "int8_sr"}
+        valid_precisions = {"auto", "fp32", "factored", "bf16_sr", "fp16", "int8_sr"}
         if state_precision not in valid_precisions:
             raise ValueError(f"state_precision must be one of {valid_precisions}. Got {state_precision}")
@@ -209,8 +209,6 @@ class SinkSGD_adv(torch.optim.Optimizer):
                 random_int_state_tensor = param_update._get_random_int_for_sr(p)
             elif group['actual_state_precision'] == 'int8_sr':
                 random_int_state_tensor = param_update._get_random_int_for_8bit_sr(p)
-            elif group['actual_state_precision'] == 'fp8_sr':
-                random_int_state_tensor = param_update._get_random_int_for_fp8_sr(p)
             step_param_fn = self._compiled_step_parameter
         else:
             step_param_fn = self._step_parameter
@@ -226,6 +224,7 @@ class SinkSGD_adv(torch.optim.Optimizer):
         orthogonal_sinkhorn = group['orthogonal_sinkhorn']
         momentum = group['momentum']
+        normed_mt = group.get('normed_momentum', False)
         nesterov = group['nesterov']
         nesterov_coef = group.get('nesterov_coef', None)
         snr_cond = group.get('snr_cond', False)
@@ -238,7 +237,10 @@ class SinkSGD_adv(torch.optim.Optimizer):
         wd_target = None
         cwd_target = None
-        if group.get('normed_momentum', False):
+        if group["orthogonal_gradient"]:
+            grad = _orthogonalize_gradient(p, grad)
+        if normed_mt:
             if not is_vector:
                 # Sinkhorn iterative normalization
                 grad = apply_sr_sinkhorn(grad, iters=sinkhorn_iterations, p=p, ortho_project=orthogonal_sinkhorn)
@@ -246,9 +248,6 @@ class SinkSGD_adv(torch.optim.Optimizer):
                 # For vectors, apply sign operation
                 grad = grad.sign_()
-        if group["orthogonal_gradient"]:
-            grad = _orthogonalize_gradient(p, grad)
         if state['factored']:
             d1, d2 = state['effective_shape']
             grad_reshaped = grad.view(d1, d2)
@@ -272,7 +271,12 @@ class SinkSGD_adv(torch.optim.Optimizer):
                 if nesterov:
                     nv_coef = momentum if nesterov_coef is None else nesterov_coef
-                    update = grad_reshaped.lerp(buf, nv_coef)
+                    if normed_mt:
+                        # Scale the normalized gradient down to match the buffer's variance
+                        ema_std = math.sqrt((1 - momentum) / (1 + momentum))
+                        update = (grad_reshaped * ema_std).lerp_(buf, nv_coef)
+                    else:
+                        update = grad_reshaped.lerp(buf, nv_coef)
                 else:
                     update = buf.clone()
             else:
@@ -301,7 +305,12 @@ class SinkSGD_adv(torch.optim.Optimizer):
                 if nesterov:
                     nv_coef = momentum if nesterov_coef is None else nesterov_coef
-                    update = grad.lerp(buf, nv_coef)
+                    if normed_mt:
+                        # Scale the normalized gradient down to match the buffer's variance
+                        ema_std = math.sqrt((1 - momentum) / (1 + momentum))
+                        update = (grad * ema_std).lerp_(buf, nv_coef)
+                    else:
+                        update = grad.lerp(buf, nv_coef)
                 else:
                     update = buf.clone()
             else:

adv_optm/util/Muon_AuxAdam.py CHANGED Viewed

@@ -56,6 +56,7 @@ def _init_auxadam_state(self, p, group):
             d1, d2 = state['effective_shape']
             state['mu_v_nmf'] = torch.zeros(d1, device=device, dtype=torch.float32)
             state['mv_v_nmf'] = torch.zeros(d2, device=device, dtype=torch.float32)
+            state['shifter'] = torch.tensor([1, 2, 4, 8, 16, 32, 64, 128], device=device, dtype=torch.uint8)
         else:
             init_state_tensor(state, 'exp_avg_sq', p.shape, actual_precision, p.device, dtype, non_neg=True)

adv_optm/util/OrthoGrad.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import torch
+import math
 def _orthogonalize_gradient(p: torch.Tensor, grad: torch.Tensor) -> torch.Tensor:
     """
@@ -17,3 +18,63 @@ def _orthogonalize_gradient(p: torch.Tensor, grad: torch.Tensor) -> torch.Tensor
     g_orth_norm = g_orth.norm(2).add_(1e-30)
     g_orth_scaled = g_orth * (g_norm / g_orth_norm)
     return g_orth_scaled.view(original_shape).to(original_dtype)
+def iterative_ortho_project(p: torch.Tensor, update: torch.Tensor, iters: int = 5) -> torch.Tensor:
+    """
+    Applies iterative alternating orthogonal projection to a 2D matrix.
+    Projects the update to be orthogonal to the parameter matrix along
+    rows and columns sequentially, alternating dimensions.
+    Inspired by the Sinkhorn algorithm; 2 iterations are enough to converge
+    to a cosine similarity of -1e-4 to -1e-5 (semi-orthogonal).
+    """
+    # 1D Vector Case fallback to the standard OrthoGrad
+    is_vector = p.ndim < 2 or getattr(p, '_is_dora_scale', False) or getattr(p, 'is_vector', False)
+    if is_vector:
+        return _orthogonalize_gradient(p, update)
+    original_shape = update.shape
+    # 2D+ Matrix Case
+    update_2d = update.view(update.shape[0], -1)
+    param_2d = p.view(p.shape[0], -1)
+    m, n = update_2d.shape
+    # Dynamically determine the order based on aspect ratio
+    row_first = m > n
+    dim = 0 if row_first else 1
+    p_norm_sq_dim = torch.sum(param_2d * param_2d, dim=dim, keepdim=True).add_(1e-30)
+    p_norm_sq_adim = torch.sum(param_2d * param_2d, dim=1-dim, keepdim=True).add_(1e-30)
+    for _ in range(iters):
+        # First dimension
+        update_2d = _ortho_normed_dim(param_2d, update_2d, p_norm_sq_dim, dim)
+        # Second dimension
+        update_2d = _ortho_normed_dim(param_2d, update_2d, p_norm_sq_adim, 1 - dim)
+    return update_2d.view(original_shape)
+def _ortho_normed_dim(p_2d: torch.Tensor, update_2d: torch.Tensor, p_norm_sq: torch.Tensor, dim: int) -> torch.Tensor:
+    """
+    Projects the update to be orthogonal to p along 'dim' and dynamically restores
+    the original magnitude of that dimension pre-projection.
+    """
+    # Record target magnitude before projection
+    norm_lb = 1 / math.sqrt(update_2d.shape[dim])
+    target_norm = update_2d.norm(p=2, dim=dim, keepdim=True).clamp_min_(norm_lb)
+    # Project: g_orth = g - (p * <p, g> / ||p||^2)
+    dot_prod = torch.sum(p_2d * update_2d, dim=dim, keepdim=True)
+    proj = dot_prod / p_norm_sq
+    # In-place subtraction: update_2d = update_2d - (proj * p_2d)
+    # Standard gamma is -1, but -1.01 proved to converge faster
+    update_2d.addcmul_(proj, p_2d, value=-1.01)
+    # Magnitude Preservation
+    g_orth_norm = update_2d.norm(p=2, dim=dim, keepdim=True).clamp_min_(norm_lb)
+    scale_factor = target_norm / g_orth_norm
+    return update_2d.mul_(scale_factor)

adv_optm/util/centered_decay.py CHANGED Viewed

@@ -19,9 +19,9 @@ def quantize_blockwise(p, block_size, bits=8):
     # Pad to multiple of block_size
     pad_len = (block_size - (numel % block_size)) % block_size
     if pad_len > 0:
-        val_padded = F.pad(val_flat, (0, pad_len), mode='replicate')
+        val_padded = F.pad(val_flat.unsqueeze(0), (0, pad_len), mode='replicate').squeeze(0)
     else:
-        val_padded = val_flat
+        val_padded = val_flat.clone()
     # Block Reshape
     val_blocks = val_padded.view(-1, block_size).float()
@@ -29,13 +29,14 @@ def quantize_blockwise(p, block_size, bits=8):
     # Calc Stats
     min_vals, max_vals = torch.aminmax(val_blocks, dim=1, keepdim=True)
-    # Scale calculation
-    max_int = (1 << bits) - 1
-    scales = (max_vals - min_vals).div_(float(max_int))
+    # Scale calculation for signed ints
+    q_min = -(1 << (bits - 1))
+    q_max = (1 << (bits - 1)) - 1
+    scales = (max_vals - min_vals).div_(float(q_max - q_min))
     scales.masked_fill_(scales == 0, 1.0)
-    # Quantize: (val - min) / scale
-    quantized = val_blocks.sub_(min_vals).div_(scales).round_().clamp_(0, max_int).to(torch.uint8)
+    # Quantize: (val - min) / scale + q_min
+    quantized = val_blocks.sub_(min_vals).div_(scales).add_(q_min).round_().clamp_(q_min, q_max).to(torch.int8)
     return quantized, scales.squeeze(1), min_vals.squeeze(1)
@@ -64,9 +65,10 @@ def _init_anchor(p, state, group):
         q_blocks, scales, mins = quantize_blockwise(p, block_size=32, bits=4)
         q_flat = q_blocks.view(-1)
         # Vectorized packing: High bits | Low bits
-        packed = (q_flat[0::2] << 4) | q_flat[1::2]
+        # Masking with 0x0F prevents two's complement sign extension from overwriting bits
+        packed = ((q_flat[0::2] & 0x0F) << 4) | (q_flat[1::2] & 0x0F)
-        state['anchor_data'] = packed
+        state['anchor_data'] = packed.to(torch.int8)
         state['anchor_scale'] = scales.to(p.dtype)
         state['anchor_min'] = mins.to(p.dtype)
@@ -88,24 +90,29 @@ def dequantize_anchor(p, state, group, dtype):
     orig_shape = p.shape
     orig_numel = p.numel()
-    if mode == 'int4' and anchor_data.dtype == torch.uint8:
+    if mode == 'int4' and anchor_data.dtype == torch.int8:
         block_size = 32
-        unpacked = torch.empty(anchor_data.numel() * 2, dtype=torch.uint8, device=anchor_data.device)
+        unpacked = torch.empty(anchor_data.numel() * 2, dtype=torch.int8, device=anchor_data.device)
+        # Unpack utilizing standard PyTorch arithmetic shift (sign extends natively)
         unpacked[0::2] = anchor_data >> 4
-        unpacked[1::2] = anchor_data & 0x0F
+        unpacked[1::2] = (anchor_data << 4) >> 4
         quantized_blocks = unpacked.view(-1, block_size)
+        q_min = -8
-    elif mode == 'int8' and anchor_data.dtype == torch.uint8:
+    elif mode == 'int8' and anchor_data.dtype == torch.int8:
         block_size = 128
         quantized_blocks = anchor_data
+        q_min = -128
     else:
         # Unrecognised mode/dtype combination
         return anchor_data.to(dtype)
-    # Core Dequantization: (q * scale) + min
+    # Core Dequantization: (q - q_min) * scale + min
     anchor_blocks = (
-        quantized_blocks.float() * scales.float().unsqueeze(1)
+        (quantized_blocks.float() - q_min) * scales.float().unsqueeze(1)
         + mins.float().unsqueeze(1)
     )

adv_optm/util/param_update.py CHANGED Viewed

@@ -236,74 +236,6 @@ def copy_stochastic_(target: Tensor, source: Tensor, inplace: bool = False):
     _copy_stochastic_core_(target, source, random_int_tensor, inplace)
     del random_int_tensor
-def _get_random_int_for_fp8_sr(source: torch.Tensor) -> torch.Tensor:
-    """
-    Generates a random int32 tensor for FP8 stochastic rounding.
-    This function is not torch.compile-path friendly due to its use of torch.Generator.
-    """
-    device = source.device
-    if device not in _generators:
-        set_seed(device)
-    # TODO: this is a workaround until torch compile error
-    # NotImplementedError: UserDefinedObjectVariable(generator) is fixed
-    generator = _generators[device]
-    # FP8 e4m3fn always preserves exactly 3 mantissa bits from FP32 (23 bits).
-    # We need uniform noise in [0, 2^20 - 1]
-    return torch.randint(
-        size=source.shape,
-        device=source.device,
-        dtype=torch.int32,
-        low=0,
-        high=1048576,  # 1 << 20
-        generator=generator,
-    )
-def _copy_fp8_stochastic_core_(
-    target: torch.Tensor,
-    source: torch.Tensor,
-    scale: torch.Tensor,
-    random_int_tensor: torch.Tensor
-):
-    """
-    Core logic for FP8 (float8_e4m3fn) stochastic rounding using a pre-computed
-    random integer tensor.
-    """
-    # Scale the source to FP32
-    buffer = (source * scale).to(torch.float32)
-    # Extract magnitude and sign
-    sign_x = torch.sign(buffer)
-    buffer.abs_()
-    # Create and apply the magic offset
-    offset = (buffer < 0.015625).to(torch.float32).mul_(0.015625)
-    buffer.add_(offset)
-    # Apply Stochastic Rounding
-    buffer_int = buffer.view(torch.int32)
-    buffer_int.add_(random_int_tensor)
-    buffer_int.bitwise_and_(-1048576)
-    # Remove offset and reapply sign
-    buffer = buffer_int.view(torch.float32)
-    buffer.sub_(offset)
-    buffer.mul_(sign_x)
-    target.copy_(buffer.to(torch.float8_e4m3fn))
-def copy_fp8_stochastic_(target: torch.Tensor, source: torch.Tensor, scale: torch.Tensor):
-    """
-    Stochastic rounding implementation for FP8 e4m3fn states.
-    """
-    random_int_tensor = _get_random_int_for_fp8_sr(source)
-    _copy_fp8_stochastic_core_(target, source, scale, random_int_tensor)
-    del random_int_tensor
 def _get_random_int_for_8bit_sr(source: torch.Tensor, numel: int | None = None) -> torch.Tensor:
     """

adv_optm/util/scaled_optm.py CHANGED Viewed

@@ -29,13 +29,13 @@ def scale_update(
     # DoRA Magnitude Scales (1D) or 1D Bias/Norm layers
     if p.ndim < 2 or is_dora_scale:
-        return rms_normalization(update, dim=None, lr=lr)
+        return max_abs_normalization(update, dim=None, lr=lr)
     # OFT Block Parameters: shape (k, C(b,2))
     # Normalise by max per-block row norm so that
     #   ‖ΔR_block‖_spec = max_i ‖ΔRᵢ‖_spec ≤ 2 · max_i ‖Δθᵢ‖₂ ≤ target_scale · lr
     if is_oft:
-        return max_row_norm_normalization(update, lr)
+        return max_row_norm_oft_normalization(p, update, lr)
     # LoRA Factors or Full Finetuning weights
     # Scales update to maintain consistent spectral norm across different layer sizes and ranks.
@@ -124,9 +124,7 @@ def init_spectral_norm(state: dict, p: torch.Tensor):
 @torch.no_grad()
 def l2_normalization(update: torch.Tensor, dim: int | None, lr: float) -> torch.Tensor:
     """Performs L2 normalization on the update tensor."""
-    n = update.numel() if dim is None else update.shape[dim]
-    norm_eps = 1 / math.sqrt(n)
-    norm = torch.linalg.vector_norm(update, ord=2, dim=dim, keepdim=True).clamp_min(norm_eps)
+    norm = torch.linalg.vector_norm(update, ord=2, dim=dim, keepdim=True).clamp_min(1e-12)
     return update.mul_(lr / norm)
@@ -134,16 +132,25 @@ def l2_normalization(update: torch.Tensor, dim: int | None, lr: float) -> torch.
 def rms_normalization(update: torch.Tensor, dim: int | None, lr: float) -> torch.Tensor:
     """Performs Root Mean Square normalization on the update tensor."""
     n = update.numel() if dim is None else update.shape[dim]
-    norm_eps = 1 / math.sqrt(n)
-    norm = torch.linalg.vector_norm(update, ord=2, dim=dim, keepdim=True).clamp_min(norm_eps)
+    norm = torch.linalg.vector_norm(update, ord=2, dim=dim, keepdim=True).clamp_min(1e-12)
     scale_n = math.sqrt(n)
     return update.mul_(lr * scale_n / norm)
 @torch.no_grad()
-def max_row_norm_normalization(
+def max_abs_normalization(update: torch.Tensor, dim: int | None, lr: float) -> torch.Tensor:
+    """
+    Performs L-infinity (Max Absolute) normalization.
+    Strictly bounds the maximum update of any single element to 'lr'.
+    """
+    # ord=float('inf') computes the maximum absolute value
+    norm = torch.linalg.vector_norm(update, ord=float('inf'), dim=dim, keepdim=True).clamp_min(1e-12)
+    return update.mul_(lr / norm)
+@torch.no_grad()
+def max_row_norm_oft_normalization(
+    p: torch.Tensor,
     update: torch.Tensor,
     lr: float,
-    target_scale: float = 0.5,
 ) -> torch.Tensor:
     """
     Normalizes OFT parameter updates by the maximum per-block (row) L2 norm.
@@ -154,31 +161,43 @@ def max_row_norm_normalization(
         ‖ΔR_block‖_spec = max_i ‖ΔRᵢ‖_spec ≤ 2 · max_i ‖Δθᵢ‖₂
-    Unlike spectral normalization of the full (k × C(b,2)) parameter matrix,
-    this guarantee is exact for all update distributions — including worst-case
-    concentrated updates where all energy sits in a single block.
     Result: Var[Δyⱼ] ≤ (target_scale · lr)² = O(1) for every block configuration.
+    """
+    # keeps the effective rotation step ‖ΔR_block‖_spec ≤ lr.
+    target_scale = 0.5
-    Args:
-        update: OFT parameter update, shape (k, C(b,2)).
-        lr: Learning rate.
-        target_scale: Desired bound on max_i ‖Δθᵢ‖₂ / lr. Default 0.5 keeps
-                      the effective rotation step ‖ΔR_block‖_spec ≤ lr.
+    # Row norms: shape (k,) - one per block
+    row_norms = torch.linalg.vector_norm(update, ord=2, dim=-1)
+    # Stability floor: equivalent to a single-element vector norm lower bound
+    norm_lb = 1.0 / math.sqrt(update.shape[1])
+    max_norm = row_norms.max().clamp_min(norm_lb)
-    Returns:
-        Scaled update tensor (in-place).
+    # Get the magnitude correction factor
+    cayley_correction = get_oft_magnitude_correction(p)
+    return update.mul_(lr * cayley_correction *  target_scale / max_norm)
+@torch.no_grad()
+def get_oft_magnitude_correction(p: torch.Tensor) -> torch.Tensor:
+    """
+    Approximates the magnitude correction of exact Riemannian preconditioning (M @ G @ M).
+    Neutralizes the derivative shrinkage of the Cayley transform using a scalar multiplier.
     """
-    # Row norms: shape (k,) — one per block
-    row_norms = torch.linalg.vector_norm(update, ord=2, dim=1)
-    max_norm = row_norms.max()
+    n_el = p.shape[-1]
+    b = (1 + math.sqrt(1 + 8 * n_el)) / 2
-    # Stability floor: equivalent to a single-element vector norm lower bound
-    norm_eps = 1.0 / math.sqrt(update.shape[1])
-    max_norm = max_norm.clamp_min(norm_eps)
+    # Calculate the squared L2 norm for each block independently.
+    p_norm_sq = torch.linalg.vector_norm(p, ord=2, dim=-1).square_()
+    # The expected shrinkage of the Cayley derivative is roughly (1 + lambda^2)^-1,
+    # where lambda^2 is the average eigenvalue of -Q^2.
+    # Since Tr(-Q^2) = 2 * ||p||_2^2, the average eigenvalue is 2 * ||p||_2^2 / b.
+    cayley_correction = 1.0 + (2.0 * p_norm_sq / b)
-    return update.mul_(lr * target_scale / max_norm)
+    # Reshape correction to broadcast against the update tensor (shape (k, 1))
+    cayley_correction = cayley_correction.unsqueeze(-1)
+    return cayley_correction
 @torch.no_grad()
 def spectral_normalization(

adv_optm/util/state_util.py CHANGED Viewed

@@ -3,7 +3,6 @@ import torch.nn.functional as F
 from .param_update import (
     copy_stochastic_, _copy_stochastic_core_,
-    copy_fp8_stochastic_, _copy_fp8_stochastic_core_,
     copy_int8_blockwise_stochastic_, _copy_int8_blockwise_stochastic_core_,
     copy_int8_sym_blockwise_stochastic_, _copy_int8_sym_blockwise_stochastic_core_,
 )
@@ -22,17 +21,12 @@ def init_state_tensor(state: dict, key: str, shape: tuple, state_precision: str,
         store_dtype = torch.bfloat16
     elif state_precision == 'fp16':
         store_dtype = torch.float16
-    elif state_precision in ['fp8', 'fp8_sr']:
-        store_dtype = torch.float8_e4m3fn
     elif state_precision == 'int8_sr':
         store_dtype = torch.uint8 if non_neg else torch.int8
     else:  # 'auto'
         store_dtype = default_dtype
-    if store_dtype == getattr(torch, 'float8_e4m3fn', None):
-        state[key] = torch.zeros(shape, device=device, dtype=store_dtype)
-        state[f"{key}_scale"] = torch.tensor(1.0, device=device, dtype=torch.float32)
-    elif store_dtype in (torch.uint8, torch.int8):
+    if store_dtype in (torch.uint8, torch.int8):
         numel = 1
         for s in shape:
             numel *= s
@@ -48,10 +42,7 @@ def get_state(state: dict, key: str, state_precision: str) -> torch.Tensor:
     Retrieves and dequantizes the state tensor to float32.
     """
     tensor = state[key]
-    if state_precision in ['fp8', 'fp8_sr']:
-        scale = state[f"{key}_scale"]
-        return tensor.float() / scale
-    elif state_precision == 'int8_sr':
+    if state_precision == 'int8_sr':
         scales = state[f"{key}_scale"] # (n_blocks,) fp32
         blocks, orig_shape, orig_numel = _prepare_int8_blocks(state[key], _int8_sr_BLOCK_SIZE)
@@ -123,19 +114,6 @@ def set_state(state: dict, key: str, value: torch.Tensor, state_precision: str,
         if state[key] is not value:
             state[key].copy_(value)
-    elif state_precision == 'fp8_sr':
-        amax = value.abs().max().clamp_min(1e-12)
-        # Calculate amax
-        scale = 448.0 / amax
-        state[f"{key}_scale"].copy_(scale)
-        # Quantize with bitwise Stochastic Rounding
-        if random_int_state_tensor is None:
-            copy_fp8_stochastic_(state[key], value, scale)
-        else:
-            _copy_fp8_stochastic_core_(state[key], value, scale, random_int_state_tensor)
     elif state_precision == 'bf16_sr':
         # Apply stochastic rounding for BF16 states
         if random_int_state_tensor is None:
@@ -195,7 +173,7 @@ def upcast_grad_for_precision(grad: torch.Tensor, state: dict, state_precision:
     # Low-precision storage modes benefit from FP32 accumulation to
     # maintain accuracy before quantizing back down in set_state.
-    if state_precision in ['fp32', 'bf16_sr', 'fp8_sr', 'int8_sr', 'factored']:
+    if state_precision in ['fp32', 'bf16_sr', 'int8_sr', 'factored']:
         return grad.float()
     return grad
@@ -216,8 +194,6 @@ def fix_loaded_state_dtype(state: dict, p: torch.Tensor, group: dict) -> None:
         base_dtype = torch.float32
     elif actual_precision == 'bf16_sr':
         base_dtype = torch.bfloat16
-    elif actual_precision in ['fp8', 'fp8_sr']:
-        base_dtype = torch.float8_e4m3fn
     elif actual_precision == 'int8_sr':
         base_dtype = torch.uint8
     else:
@@ -245,8 +221,8 @@ def fix_loaded_state_dtype(state: dict, p: torch.Tensor, group: dict) -> None:
                 if val.dtype != p.dtype:
                     state[key] = val.to(p.dtype)
             elif mode in ['int8', 'int4']:
-                if val.dtype != torch.uint8:
-                    state[key] = val.to(torch.uint8)
+                if val.dtype != torch.int8:
+                    state[key] = val.to(torch.int8)
             elif mode == 'float8':
                 if val.dtype != torch.float8_e4m3fn:
                     state[key] = val.to(torch.float8_e4m3fn)
@@ -263,7 +239,7 @@ def fix_loaded_state_dtype(state: dict, p: torch.Tensor, group: dict) -> None:
                 state[key] = val.to(torch.uint8)
             continue
-        # Handle Factorized Tensors, FP8 Scales, and blockwise INT8 scale
+        # Handle Factorized Tensors, and blockwise INT8 scale
         if key in fp32_keys or (key.endswith('_scale') and key != 'anchor_scale'):
             if val.dtype != torch.float32:
                 state[key] = val.to(torch.float32)

adv_optm-2.4.dev24.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,109 @@
+Metadata-Version: 2.4
+Name: adv_optm
+Version: 2.4.dev24
+Summary: A family of highly efficient, lightweight yet powerful optimizers.
+Home-page: https://github.com/Koratahiu/Advanced_Optimizers
+Author: Koratahiu
+Author-email: hiuhonor@gmail.com
+License: Apache 2.0
+Keywords: llm,fine-tuning,memory-efficient,low-rank,compression,pytorch,optimizer,adam
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: torch>=2.1
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: license-file
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+# Advanced Optimizers (AIO)
+A comprehensive, all-in-one collection of optimization algorithms for deep learning, designed for **maximum efficiency**, **minimal memory footprint**, and **superior performance** across diverse model architectures and training scenarios.
+[![PyPI](https://img.shields.io/pypi/v/adv_optm)](https://pypi.org/project/adv_optm/)
+## 🔥 What's New
+### In 2.4.x:
+This update is a complete refactor of the library, with many new features and changes:
+- New optimizer-state precision option (`state_precision`) with several settings: rank-2 factored mode (`factored`), full FP32 (`fp32`), BF16 with Stochastic Rounding (`bf16_sr`), int8/uint8 with Stochastic Rounding (`int8_sr`), and FP16 (`fp16`).
+- Added new powerful optimizer: SinkSGD_adv.
+- Added spectral scaling option to all optimizers, achieving width/rank invariant updates.
+- Added Nesterov momentum (`nesterov`) and its coefficient (`nesterov_coef`) to all optimizers.
+- Added centered weight decay (`centered_wd`), which pulls the weights toward their pre-trained state (the anchor).
+    - The anchor precision can be changed to save memory (`centered_wd_mode`): full, float8, int8, int4.
+- Added Fisher Weight Decay option for Adam variants (`fisher_wd`).
+    - Paper: [FAdam...](https://arxiv.org/abs/2405.12807)
+- Added Factored Second Moment option for Adam variants (`factored_2nd`). This works alongside any `state_precision` setting.
+- Added Geometric Weight Decay for SinkSGD_adv and SignSGD_adv.
+- Added a powerful new mode: variance-normalized momentum (`normed_momentum`), which applies the optimizer's normalization before the momentum (also known as Normalization-then-Momentum, NtM).
+    - For: AdamW_adv, SignSGD_adv, SinkSGD_adv.
+- Added Variance/Confidence Preconditioning (`snr_cond`) for SignSGD_adv, SinkSGD_adv.
+    - Only works with `normed_momentum`.
+    - Technical reports: [AASS](https://koratahiu.github.io/aass/), and [sink-v](https://koratahiu.github.io/sink-v/).
+- Added Adaptive Stochastic Sign with L_inf preconditioning (`stochastic_sign`) for SignSGD_Adv and Lion_adv.
+- Improved CANS (`accelerated_ns`) for Muon variants by integrating a dynamic lower bound.
+- Removed the Simplified_AdEMAMix optimizer and its settings in other optimizers; they are now replaced by Nesterov momentum and its coefficient, which is better and easier to tune.
+- Removed the cautious and grams modes, as they were heuristic and did not work well.
+- Removed optimizers: Lion_Prodigy_adv, and Simplified_AdEMAMix.
+### in 2.1.x
+- Added Signum (SignSGD with momentum): A new optimizer in the family (SignSGD_adv)
+- More info coming soon.
+### in 2.0.x
+* Implemented torch.compile for all advanced optimizers. Enabled via (compiled_optimizer=True) to fuse and optimize the optimizer step path.
+* Better and improved 1-bit factored mode via (nnmf_factor=True).
+* Various improvements across the optimizers.
+### in 1.2.x
+* Added **advanced variants** of [Muon optimizer](https://kellerjordan.github.io/posts/muon/) with **features** and **settings** from recent papers.
+| Optimizer | Description |
+|---|---|
+| `Muon_adv` | Advanced Muon implementation with CANS, NorMuon, Low-Rank ortho, etc. features. |
+| `AdaMuon_adv` | Advanced AdaMuon implementation, which combines Muon's geometry with Adam-like adaptive scaling and sign-based orthogonalization. |
+> *Documentation coming soon.*
+* Implemented [Cautious Weight Decay](https://arxiv.org/abs/2510.12402) for all advanced optimizers.
+* Improved parameter update and weight decay for **BF16** with **stochastic rounding**. The updates are now accumulated in **float32** and rounded once at the end.
+* Use fused and in-place operations whenever possible for all advanced optimizers.
+* **Prodigy variants** are now **50% faster** by [avoiding CUDA syncs](https://github.com/Koratahiu/Advanced_Optimizers/pull/5). Thanks to **@dxqb**!
+---
+## 📦 Installation
+```bash
+pip install adv_optm
+```
+---
+## 🧠 Core Innovations
+This library integrates multiple state-of-the-art optimization techniques validated through extensive research and practical training.
+---

adv_optm-2.4.dev24.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,29 @@
+adv_optm/__init__.py,sha256=FszXXnlL8-PPppcRuJ96wKzbflM1jf7vXLop8mK3XnI,356
+adv_optm/optim/AdaMuon_adv.py,sha256=KfpOAOHMshnF2R-Cn00VRraGmAXQrVZFo4ZXMOyMXDc,33368
+adv_optm/optim/AdamW_adv.py,sha256=Gq6FsIpA0RT35zuyk_lEBCngrftgbaeLIiCW0ExN6Qw,22688
+adv_optm/optim/Adopt_adv.py,sha256=f423WZTq120jg4TJoCqckzPTY30TyskbBLBXaPKhdWM,22919
+adv_optm/optim/Lion_adv.py,sha256=pXQRPKNstyc2u3Mq34WDpaLP-TAZJLewFLHXHMaXvbE,12475
+adv_optm/optim/Muon_adv.py,sha256=nN_AdQ7dcPSHJseEdmygj7vVK7No2CpoC6pbcUiHjoE,27553
+adv_optm/optim/Prodigy_adv.py,sha256=hYY1B167J-t-BPkLM_ZL3L-Kc4ud4feDnta3AjiMGoQ,26942
+adv_optm/optim/SignSGD_adv.py,sha256=PXSR5l2Pze0zsz9yj7CPnRt7EzibzOafT8xF4mUlU-g,16489
+adv_optm/optim/SinkSGD_adv.py,sha256=IyPoDikVHHKG6_GrGWQ-bdkqGfg08OJ6KX_FXBdWubE,16632
+adv_optm/optim/__init__.py,sha256=RkpzWpEgAOdQAppTBLc5AilfJq-wn0aabNsQDj_7-4A,452
+adv_optm/util/Kourkoutas.py,sha256=tTo2QbOWPhI29hWQ4ERosoZXRAFXXWDzIMCj1KMFaOE,13325
+adv_optm/util/Muon_AuxAdam.py,sha256=C79uwrwqRplIVdJV1-omkNkfdPwcLrokzHetAvt6fk8,8423
+adv_optm/util/Muon_util.py,sha256=w6f6aeBpuJwNhltXiVlJy8X52hlaGQACdPHFFN7-hJg,15280
+adv_optm/util/OrthoGrad.py,sha256=bn9PifXd85Xb2padHFqGKw-huEpTwmckyt8gz5Zrias,3266
+adv_optm/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+adv_optm/util/centered_decay.py,sha256=5xaxkU-Vf6CDpN6gXv6A6RUylNPXY2vsiSVZe_mCViY,4257
+adv_optm/util/factorization_util.py,sha256=I75o2Cxlp4XKMBBnuRa6nDWAT9aJqNS75zv7-OGO5K8,3974
+adv_optm/util/lion_k.py,sha256=b3oqTHK1spoD_Hi8wxKZ9lEWUkHejovT4piU-wUp_eI,1686
+adv_optm/util/param_update.py,sha256=7-iHC0oQ72zHEfjPza27wgV6vEC77by4eayKItWqNPc,16167
+adv_optm/util/scaled_optm.py,sha256=e6hY90mD_gSrDr_rDglLe6p0S2-YH3tXb71LNgm8B8Y,8925
+adv_optm/util/signed_util.py,sha256=ebQn-O8pNvld3RvsJ_MAdCL4C8pBNS15Uqzr2xHChss,1873
+adv_optm/util/sinkhorn.py,sha256=oFdWY243jBeuBKLBS47I5dE8DsK5OqJNDWlMBKXsxUM,5075
+adv_optm/util/state_util.py,sha256=mpsyjhvtmNUey5xMWPFwhDzDTg3m4XA6Lo4RS2z-WCI,10126
+adv_optm/util/update_util.py,sha256=IAaYuxrJ9kP2P4Z0v02pesRe4mRzP5SvXbLHjLrrdZw,1332
+adv_optm-2.4.dev24.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+adv_optm-2.4.dev24.dist-info/METADATA,sha256=JvPcViuM3LwAF28oknH2JAkDL3vc_7rfIzwZ-KORotQ,5240
+adv_optm-2.4.dev24.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+adv_optm-2.4.dev24.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
+adv_optm-2.4.dev24.dist-info/RECORD,,

adv_optm-2.4.dev22.dist-info/METADATA DELETED Viewed

@@ -1,202 +0,0 @@
-Metadata-Version: 2.4
-Name: adv_optm
-Version: 2.4.dev22
-Summary: A family of highly efficient, lightweight yet powerful optimizers.
-Home-page: https://github.com/Koratahiu/Advanced_Optimizers
-Author: Koratahiu
-Author-email: hiuhonor@gmail.com
-License: Apache 2.0
-Keywords: llm,fine-tuning,memory-efficient,low-rank,compression,pytorch,optimizer,adam
-Classifier: Programming Language :: Python :: 3
-Classifier: License :: OSI Approved :: Apache Software License
-Classifier: Operating System :: OS Independent
-Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
-Classifier: Topic :: Software Development :: Libraries :: Python Modules
-Requires-Python: >=3.8
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: torch>=2.1
-Dynamic: author
-Dynamic: author-email
-Dynamic: classifier
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: keywords
-Dynamic: license
-Dynamic: license-file
-Dynamic: requires-dist
-Dynamic: requires-python
-Dynamic: summary
-# Advanced Optimizers (AIO)
-A comprehensive, all-in-one collection of optimization algorithms for deep learning, designed for **maximum efficiency**, **minimal memory footprint**, and **superior performance** across diverse model architectures and training scenarios.
-[![PyPI](https://img.shields.io/pypi/v/adv_optm)](https://pypi.org/project/adv_optm/)
-## 🔥 What's New
-### in 2.1.x
-- Added Signum (SignSGD with momentum): A new optimizer in the family (SignSGD_adv)
-- More info coming soon.
-### in 2.0.x
-* Implemented torch.compile for all advanced optimizers. Enabled via (compiled_optimizer=True) to fuse and optimize the optimizer step path.
-* Better and improved 1-bit factored mode via (nnmf_factor=True).
-* Various improvements across the optimizers.
-### in 1.2.x
-* Added **advanced variants** of [Muon optimizer](https://kellerjordan.github.io/posts/muon/) with **features** and **settings** from recent papers.
-| Optimizer | Description |
-|---|---|
-| `Muon_adv` | Advanced Muon implementation with CANS, NorMuon, Low-Rank ortho, etc. features. |
-| `AdaMuon_adv` | Advanced AdaMuon implementation, which combines Muon's geometry with Adam-like adaptive scaling and sign-based orthogonalization. |
-> *Documentation coming soon.*
-* Implemented [Cautious Weight Decay](https://arxiv.org/abs/2510.12402) for all advanced optimizers.
-* Improved parameter update and weight decay for **BF16** with **stochastic rounding**. The updates are now accumulated in **float32** and rounded once at the end.
-* Use fused and in-place operations whenever possible for all advanced optimizers.
-* **Prodigy variants** are now **50% faster** by [avoiding CUDA syncs](https://github.com/Koratahiu/Advanced_Optimizers/pull/5). Thanks to **@dxqb**!
----
-## 📦 Installation
-```bash
-pip install adv_optm
-```
----
-## 🧠 Core Innovations
-This library integrates multiple state-of-the-art optimization techniques validated through extensive research and practical training, with **1-bit compression for optimizer states**:
-### **Memory-Efficient Optimization (SMMF-inspired)**
-- **Paper**: [SMMF: Square-Matricized Momentum Factorization](https://arxiv.org/abs/2412.08894)
-- **Approach**: Uses rank-1 non-negative matrix factorization with reconstruction cycle (factor → reconstruct → update → factor)
-- **Innovation**:
-  - First moment split into **1-bit sign + absolute value**
-  - Final storage: **four factored vectors + one 1-bit sign state**
-  - Preserves Adam-like update quality with drastically reduced memory
----
-## ⚡ Performance Characteristics
-### Memory Efficiency (SDXL Model – 6.5GB)
-| Optimizer | Memory Usage | Description |
-|-----------|--------------|-------------|
-| `Adopt_Factored` | 328 MB | 4 small vectors + 1-bit state |
-| `Adopt_Factored + AdEMAMix` | 625 MB | 6 small vectors + two 1-bit states |
-### Speed Comparison (SDXL, Batch Size 4)
-| Optimizer | Speed | Notes |
-|-----------|-------|-------|
-| `Adafactor` | ~8.5s/it | Baseline |
-| `Adopt_Factored` | ~10s/it | +18% overhead from compression |
-| `Adopt_Factored + AdEMAMix` | ~12s/it | +41% overhead (3 factored states) |
----
-## 🧪 Available Optimizers
-### Standard Optimizers (All support `factored=True/False`)
-| Optimizer | Description | Best For |
-|-----------|-------------|----------|
-| `Adam_Adv` | Advanced Adam implementation | General purpose |
-| `Adopt_Adv` | Adam-variant with independent beta2 | Stable training for small batch size regimes |
-| `Prodigy_Adv` | Prodigy with D-Adaptation | Adam with automatic LR tuning |
-| `Lion_Adv` | Advanced Lion implementation | Memory-constrained environments |
-| `Prodigy_Lion_Adv` | Prodigy + Lion combination | Lion with automatic LR tuning |
----
-## ⚙️ Feature Matrix
-| Feature | Adam_Adv | Adopt_Adv | Prodigy_Adv | Lion_Adv |
-|---------|----------|-----------|-------------|----------|
-| Factored | ✓ | ✓ | ✓ | ✓ |
-| OrthoGrad | ✓ | ✓ | ✓ | ✓ |
-| atan2 | ✓ | ✓ | ✓ |✗ |
-| Stochastic Rounding | ✓ | ✓ | ✓ |✓ |
-| Fused Backward Pass | ✓ | ✓ | ✓ | ✓ |
-| **Kourkoutas-β** | ✓ | ✓ | ✓ | ✗ |
----
-## 🛠️ Comprehensive Feature Guide
-### A. Universal Safe Features
-*These features work with all optimizers and are generally safe to enable.*
-| Feature | Description | Recommended Usage | Performance Impact | Theoretical Basis | Compatibility |
-|--------|-------------|-------------------|--------------------|-------------------|--------------|
-| **Fused Back Pass** | Fuses backward pass; gradients used immediately and memory freed on-the-fly | Memory-constrained environments | Reduces peak memory | Memory optimization | All optimizers |
-| **Stochastic Rounding** | Replaces nearest rounding with stochastic rounding to preserve small gradient updates in BF16 | BF16 training | Minimal overhead (<5%) | [Revisiting BFloat16 Training](https://arxiv.org/abs/2010.06192) | All optimizers |
-| **OrthoGrad** | Removes gradient component parallel to weights to reduce overfitting | Full fine-tuning without weight decay | +33% time overhead (BS=4); less at larger BS | [Grokking at Edge](https://github.com/LucasPrietoAl/grokking-at-the-edge-of-numerical-stability) | All optimizers |
-| **Factored** | Memory-efficient optimization via rank-1 1-bit factorization of optimizer states | Large models / memory-limited hardware | Adds compression overhead | [SMMF](https://arxiv.org/abs/2412.08894) | All optimizers |
-### B. Individual Features
-| Feature | Description | Recommended Usage | Performance Impact | Theoretical Basis | Compatibility |
-|--------|-------------|-------------------|--------------------|-------------------|--------------|
-| **atan2** | Robust epsilon replacement with built-in gradient clipping | Use for stable bounded updates (or for Adopt as it needs that) | No overhead | [Adam-atan2](https://github.com/lucidrains/adam-atan2-pytorch) | Adam/Adopt/Prodigy |
-| **Kourkoutas-β** | Layer-wise adaptive β₂ based on gradient “sunspike” ratio | Noisy/small/large-batch/high-LR training | No overhead | [Kourkoutas-β](https://arxiv.org/abs/2508.12996) | Adam/Adopt/Prodigy |
----
-## 🔍 Feature Deep Dives
-### atan2
-- Replaces `eps` in Adam-family optimizers with a **scale-invariant**, bounded update rule.
-- Automatically clips updates to **[-2, 2]**, preventing destabilizing jumps.
-- **Highly recommended** for `Adopt_Adv`, which is prone to instability without clipping.
-> 📚 **Reference**:
-> - Paper: https://arxiv.org/abs/2407.05872
-> - Code: https://github.com/lucidrains/adam-atan2-pytorch
----
-### **Kourkoutas-β**
-**Kourkoutas-β** introduces a **sunspike-driven, layer-wise adaptive second-moment decay (β₂)** as an optional enhancement for `Adam_Adv`, `Adopt_Adv`, `Prodigy_Adv`.
-Instead of using a fixed β₂ (e.g., 0.999 or 0.95), it **dynamically modulates β₂ per layer** based on a bounded *sunspike ratio*:
-- **During gradient bursts** → β₂ ↓ toward `Lower β₂` → faster reaction
-- **During calm phases** → β₂ ↑ toward `The Selected β₂` → stronger smoothing
-This is especially effective for **noisy training, small batch sizes, and high learning rates**, where gradient norms shift abruptly due to noise or aggressive LR schedules.
-#### Pros/Cons
-| **Category** | **Details** |
-|--------------|-------------|
-| ✅ **Pros** | • **Layer-wise adaptation** blends benefits of high β₂ (strong smoothing) and low β₂ (fast reaction).<br>• **Robust to sudden loss landscape shifts**, reacts quickly during gradient bursts, smooths during calm phases.<br>• **High tolerance to aggressive learning rates**. |
-| ⚠️ **Cons** | • **Potentially unstable at the start of training** due to unreliable early gradient norms; mitigated by using `K-β Warmup Steps`. |
-> 💡 **Best Practice**: Set `K_warmup_steps` equal to your standard LR warmup steps. During warmup, the optimizer uses the static `beta2`; adaptation begins only after warmup ends.
-> 📚 **Reference**:
-> - Paper: [Kourkoutas-β: A Sunspike-Driven Adam Optimizer with Desert Flair](https://arxiv.org/abs/2508.12996)
-> - Code: [kbeta](https://github.com/sck-at-ucy/kbeta)
----
-## 📚 References
-1. [Revisiting BFloat16 Training](https://arxiv.org/abs/2010.06192)
-2. [SMMF: Square-Matricized Momentum Factorization](https://arxiv.org/abs/2412.08894)
-3. [Kourkoutas-β: A Sunspike-Driven Adam Optimizer with Desert Flair](https://arxiv.org/abs/2508.12996)
-4. [Scaling Exponents Across Parameterizations and Optimizers](https://arxiv.org/abs/2407.05872)

adv_optm-2.4.dev22.dist-info/RECORD DELETED Viewed

@@ -1,29 +0,0 @@
-adv_optm/__init__.py,sha256=_JP348MJrOkexPp6UrvAn4CYPZoZoSXHP5UZiNrPoKA,356
-adv_optm/optim/AdaMuon_adv.py,sha256=2ATHZhO7sfNE50gIc0ydb1nwY9ZbQM83d-Q_9KaKsqY,33572
-adv_optm/optim/AdamW_adv.py,sha256=_Nb_hw01ALW3se869kYiOJEKOftlQLW04Dj7Wver6eo,22749
-adv_optm/optim/Adopt_adv.py,sha256=WpuuWuO1c8mOVKAaTAbsKUMfQ2IaZWKKRd9C-TOi4WY,22988
-adv_optm/optim/Lion_adv.py,sha256=owq4RUKjw-xfn5r_UOekTr5EGqpx8zCp369KkK9vll8,12596
-adv_optm/optim/Muon_adv.py,sha256=2EmsS74_UD4A4yidv1q5EOvzpmqNDVKUWyYXeLmipo0,27877
-adv_optm/optim/Prodigy_adv.py,sha256=292F0ckr0MYEGex34z3F2xaWAcPPGP-KdyZGEteP17M,26991
-adv_optm/optim/SignSGD_adv.py,sha256=GF3g-EuVvcGBrg00UhIstoJ27vDxgxWifGUR38Q6iKw,16012
-adv_optm/optim/SinkSGD_adv.py,sha256=BPJ5xMweqxFnf6pWePa7iBbY73oXE3I_67S-3nBProg,16118
-adv_optm/optim/__init__.py,sha256=RkpzWpEgAOdQAppTBLc5AilfJq-wn0aabNsQDj_7-4A,452
-adv_optm/util/Kourkoutas.py,sha256=tTo2QbOWPhI29hWQ4ERosoZXRAFXXWDzIMCj1KMFaOE,13325
-adv_optm/util/Muon_AuxAdam.py,sha256=le72chD-VhQoDYN5BwungO_3G_7nZHwR89X9WauISsY,8313
-adv_optm/util/Muon_util.py,sha256=w6f6aeBpuJwNhltXiVlJy8X52hlaGQACdPHFFN7-hJg,15280
-adv_optm/util/OrthoGrad.py,sha256=TXZENLTa66hvtdIgyldyRgZo25GgQQx5MCZTK1KvE5g,780
-adv_optm/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-adv_optm/util/centered_decay.py,sha256=F6IGOjrEaVS5HAvO67u1gcDcu8TLQINFHL5s9RHoIb8,3869
-adv_optm/util/factorization_util.py,sha256=I75o2Cxlp4XKMBBnuRa6nDWAT9aJqNS75zv7-OGO5K8,3974
-adv_optm/util/lion_k.py,sha256=b3oqTHK1spoD_Hi8wxKZ9lEWUkHejovT4piU-wUp_eI,1686
-adv_optm/util/param_update.py,sha256=cUubaDgurnu-j9nWlyPDmTx48fJQV727xR48VVL2vGk,18281
-adv_optm/util/scaled_optm.py,sha256=a10D2oZJjfhjx2I6FP71oZ0LLcZ0Sp-PYbqTVk1W6QE,8014
-adv_optm/util/signed_util.py,sha256=ebQn-O8pNvld3RvsJ_MAdCL4C8pBNS15Uqzr2xHChss,1873
-adv_optm/util/sinkhorn.py,sha256=oFdWY243jBeuBKLBS47I5dE8DsK5OqJNDWlMBKXsxUM,5075
-adv_optm/util/state_util.py,sha256=K3cF_HTDqU__lC4PYDSd16K0ITzcKPbsSXyZPFUyHSU,11199
-adv_optm/util/update_util.py,sha256=IAaYuxrJ9kP2P4Z0v02pesRe4mRzP5SvXbLHjLrrdZw,1332
-adv_optm-2.4.dev22.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
-adv_optm-2.4.dev22.dist-info/METADATA,sha256=S3Vvj3dsUSOF0km5FGq33CYu-2hGxdltQYMdbur1eZo,9725
-adv_optm-2.4.dev22.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
-adv_optm-2.4.dev22.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
-adv_optm-2.4.dev22.dist-info/RECORD,,

{adv_optm-2.4.dev22.dist-info → adv_optm-2.4.dev24.dist-info}/WHEEL RENAMED Viewed

File without changes

{adv_optm-2.4.dev22.dist-info → adv_optm-2.4.dev24.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{adv_optm-2.4.dev22.dist-info → adv_optm-2.4.dev24.dist-info}/top_level.txt RENAMED Viewed

File without changes

adv-optm 2.4.dev22__py3-none-any.whl → 2.4.dev24__py3-none-any.whl

adv-optm 2.4.dev22__py3-none-any.whl → 2.4.dev24__py3-none-any.whl