adv-optm 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {adv_optm-0.1.0 → adv_optm-0.1.1}/PKG-INFO +1 -1
- {adv_optm-0.1.0 → adv_optm-0.1.1}/adv_optm/__init__.py +1 -1
- {adv_optm-0.1.0 → adv_optm-0.1.1}/adv_optm/optim/AdamW_adv.py +6 -6
- {adv_optm-0.1.0 → adv_optm-0.1.1}/adv_optm/optim/Adopt_adv.py +6 -6
- {adv_optm-0.1.0 → adv_optm-0.1.1}/adv_optm/optim/Prodigy_adv.py +35 -3
- {adv_optm-0.1.0 → adv_optm-0.1.1}/adv_optm.egg-info/PKG-INFO +1 -1
- {adv_optm-0.1.0 → adv_optm-0.1.1}/setup.py +1 -1
- {adv_optm-0.1.0 → adv_optm-0.1.1}/LICENSE +0 -0
- {adv_optm-0.1.0 → adv_optm-0.1.1}/README.md +0 -0
- {adv_optm-0.1.0 → adv_optm-0.1.1}/adv_optm/optim/__init__.py +0 -0
- {adv_optm-0.1.0 → adv_optm-0.1.1}/adv_optm/util/BF16_Stochastic_Rounding.py +0 -0
- {adv_optm-0.1.0 → adv_optm-0.1.1}/adv_optm/util/Effective_Shape.py +0 -0
- {adv_optm-0.1.0 → adv_optm-0.1.1}/adv_optm/util/NNMF.py +0 -0
- {adv_optm-0.1.0 → adv_optm-0.1.1}/adv_optm/util/One_Bit_Boolean.py +0 -0
- {adv_optm-0.1.0 → adv_optm-0.1.1}/adv_optm/util/OrthoGrad.py +0 -0
- {adv_optm-0.1.0 → adv_optm-0.1.1}/adv_optm/util/Randomized_SVD.py +0 -0
- {adv_optm-0.1.0 → adv_optm-0.1.1}/adv_optm/util/__init__.py +0 -0
- {adv_optm-0.1.0 → adv_optm-0.1.1}/adv_optm.egg-info/SOURCES.txt +0 -0
- {adv_optm-0.1.0 → adv_optm-0.1.1}/adv_optm.egg-info/dependency_links.txt +0 -0
- {adv_optm-0.1.0 → adv_optm-0.1.1}/adv_optm.egg-info/requires.txt +0 -0
- {adv_optm-0.1.0 → adv_optm-0.1.1}/adv_optm.egg-info/top_level.txt +0 -0
- {adv_optm-0.1.0 → adv_optm-0.1.1}/setup.cfg +0 -0
adv_optm/optim/AdamW_adv.py

@@ -37,7 +37,7 @@ class AdamW_adv(torch.optim.Optimizer):
 combined with the primary momentum (`mt`) to stabilize updates,
 especially in noisy, small-batch settings. If `False`, the
 optimizer behaves as standard AdamW. (default: False)
-
+beta3_ema (float): The decay rate for the slow exponential moving average of
 the momentum (only used when `use_AdEMAMix` is `True`). A higher
 value (e.g., 0.9999) gives the EMA a longer memory, making it more
 stable but slower to adapt. A lower value (e.g., 0.999) is often
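For intuition on the trade-off the new docstring describes: an exponential moving average with decay rate β averages over roughly 1/(1−β) recent steps. A quick, package-independent illustration:

```python
# Effective averaging horizon of an EMA with decay beta is roughly 1 / (1 - beta).
for beta in (0.999, 0.9999):
    print(f"beta3_ema={beta}: ~{1 / (1 - beta):,.0f} recent steps of memory")
# beta3_ema=0.999:  ~1,000 recent steps of memory
# beta3_ema=0.9999: ~10,000 recent steps of memory
```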
@@ -71,7 +71,7 @@ class AdamW_adv(torch.optim.Optimizer):
 use_grams: bool = False,
 use_orthograd: bool = False,
 use_AdEMAMix: bool = False,
-
+beta3_ema: float = 0.9999,
 alpha: float = 5.0,
 t_alpha: int | None = None,
 factored: bool = True,
@@ -89,7 +89,7 @@ class AdamW_adv(torch.optim.Optimizer):
 "lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay,
 "vector_reshape": vector_reshape, "use_atan2": use_atan2,
 "use_orthograd": use_orthograd, "use_bias_correction": use_bias_correction,
-"
+"beta3_ema": beta3_ema, "alpha": alpha, "t_alpha": t_alpha,
 }
 self.stochastic_rounding = stochastic_rounding
 self.use_cautious = use_cautious
@@ -162,7 +162,7 @@ class AdamW_adv(torch.optim.Optimizer):

 beta1, beta2 = group['betas']
 if self.use_AdEMAMix:
-
+beta3_ema = group['beta3_ema']
 alpha = group['alpha']
 t_alpha = group['t_alpha']
 current_step = state['step'] + 1
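`alpha` and `t_alpha` feed the mixing coefficient `alpha_t` used further down. The exact formula is not visible in this diff; the AdEMAMix paper warms alpha in linearly over `t_alpha` steps, so a plausible sketch of such a scheduler looks like the following (a hypothetical helper, not the package's code):

```python
def alpha_scheduler(step: int, alpha: float, t_alpha: int | None) -> float:
    """Linear warmup of the slow-EMA mixing coefficient, as in the AdEMAMix paper.

    Illustrative sketch only; adv_optm's actual schedule is not shown in this diff.
    """
    if t_alpha is None:  # scheduler disabled: use alpha directly
        return alpha
    return min(step / t_alpha, 1.0) * alpha
```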
@@ -201,7 +201,7 @@ class AdamW_adv(torch.optim.Optimizer):
 torch.where(unpacked_sign_slow, mt_slow, -mt_slow, out=mt_slow)
 del unpacked_sign_slow

-mt_slow.mul_(
+mt_slow.mul_(beta3_ema).add_(grad_reshaped, alpha=1.0 - beta3_ema)
 update_m = mt + (alpha_t * mt_slow)
 else:
 update_m = mt
@@ -245,7 +245,7 @@ class AdamW_adv(torch.optim.Optimizer):

 if self.use_AdEMAMix:
 exp_avg_slow = state['exp_avg_slow']
-exp_avg_slow.mul_(
+exp_avg_slow.mul_(beta3_ema).add_(grad, alpha=1 - beta3_ema)
 update_m = exp_avg + (alpha_t * exp_avg_slow)
 else:
 update_m = exp_avg
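The two changed update lines above follow the same pattern: the slow buffer decays with `beta3_ema`, and the result is blended into the fast momentum with weight `alpha_t`. A stripped-down, standalone sketch of that pattern (buffer names are illustrative, not the optimizer's internal state):

```python
import torch

beta1, beta3_ema, alpha_t = 0.9, 0.9999, 5.0

param = torch.zeros(4)
exp_avg = torch.zeros_like(param)       # fast EMA, decays with beta1
exp_avg_slow = torch.zeros_like(param)  # slow EMA, decays with beta3_ema

grad = torch.randn(4)
exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
exp_avg_slow.mul_(beta3_ema).add_(grad, alpha=1 - beta3_ema)  # the line added in 0.1.1
update_m = exp_avg + alpha_t * exp_avg_slow                   # mixed first moment
```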
adv_optm/optim/Adopt_adv.py

@@ -48,7 +48,7 @@ class Adopt_adv(torch.optim.Optimizer):
 combined with the primary momentum (`mt`) to stabilize updates,
 especially in noisy, small-batch settings. If `False`, the
 optimizer behaves as standard ADOPT. (default: False)
-
+beta3_ema (float): The decay rate for the slow exponential moving average of
 the momentum (only used when `use_AdEMAMix` is `True`). A higher
 value (e.g., 0.9999) gives the EMA a longer memory, making it more
 stable but slower to adapt. A lower value (e.g., 0.999) is often
@@ -83,7 +83,7 @@ class Adopt_adv(torch.optim.Optimizer):
 use_grams: bool = False,
 use_orthograd: bool = False,
 use_AdEMAMix: bool = False,
-
+beta3_ema: float = 0.9999,
 alpha: float = 5.0,
 t_alpha: int | None = None,
 factored: bool = True,
@@ -99,7 +99,7 @@ class Adopt_adv(torch.optim.Optimizer):

 defaults = {
 "lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay,
-"vector_reshape": vector_reshape, "
+"vector_reshape": vector_reshape, "beta3_ema": beta3_ema, "alpha": alpha,
 "t_alpha": t_alpha,
 }
 self.clip_lambda = clip_lambda
@@ -179,7 +179,7 @@ class Adopt_adv(torch.optim.Optimizer):

 beta1, beta2 = group['betas']
 if self.use_AdEMAMix:
-
+beta3_ema = group['beta3_ema']
 alpha = group['alpha']
 t_alpha = group['t_alpha']
 # Use step+1 for 1-based step count in scheduler
@@ -236,7 +236,7 @@ class Adopt_adv(torch.optim.Optimizer):
 del mask

 if self.use_AdEMAMix:
-mt_slow = mt_slow_prev.mul_(
+mt_slow = mt_slow_prev.mul_(beta3_ema).add_(normalized_grad, alpha=1.0 - beta3_ema)
 update = mt + (alpha_t * mt_slow)
 update = update.view(p.shape)
 else:
@@ -293,7 +293,7 @@ class Adopt_adv(torch.optim.Optimizer):
 del mask

 if self.use_AdEMAMix:
-m_slow.mul_(
+m_slow.mul_(beta3_ema).add_(normalized_grad, alpha=1.0 - beta3_ema)
 update = m + (alpha_t * m_slow)
 else:
 update = m
adv_optm/optim/Prodigy_adv.py

@@ -1,5 +1,6 @@
 import torch
-
+import torch.distributed as dist
+
 import math

 from ..util.BF16_Stochastic_Rounding import add_stochastic_
@@ -54,6 +55,23 @@ class Prodigy_adv(torch.optim.Optimizer):
 the scheduler is disabled and th
 factored (bool): whether to use the factorization or disable it to use
 the uncompressed optimizer. (default: True)
+d0 (float):
+Initial D estimate for D-adaptation (default 1e-6). Rarely needs changing.
+d_coef (float):
+Coefficient in the expression for the estimate of d (default 1.0).
+Values such as 0.5 and 2.0 typically work as well.
+Changing this parameter is the preferred way to tune the method.
+growth_rate (float):
+prevent the D estimate from growing faster than this multiplicative rate.
+Default is inf, for unrestricted. Values like 1.02 give a kind of learning
+rate warmup effect.
+fsdp_in_use (bool):
+If you're using sharded parameters, this should be set to True. The optimizer
+will attempt to auto-detect this, but if you're using an implementation other
+than PyTorch's builtin version, the auto-detection won't work.
+slice_p (int): Reduce memory usage by calculating LR adaptation statistics on only every
+pth entry of each tensor. For values greater than 1 this is an approximation to standard
+Prodigy. Values ~11 are reasonable (default 1).
 """

 def __init__(
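The new docstring entries mirror upstream Prodigy's tuning knobs. A hedged usage sketch, assuming the class is importable from the package root (the changed `adv_optm/__init__.py` suggests it is) and that arguments not shown keep their defaults:

```python
import torch
from adv_optm import Prodigy_adv  # assumed export path; adjust if the package nests it differently

model = torch.nn.Linear(128, 10)
optimizer = Prodigy_adv(
    model.parameters(),
    lr=1.0,                    # Prodigy-style D-adaptation is typically run with lr=1.0
    d_coef=1.0,                # the docstring's preferred tuning knob (0.5 and 2.0 also work)
    growth_rate=float('inf'),  # e.g. 1.02 for a warmup-like ramp of the D estimate
    fsdp_in_use=False,         # set True for sharded setups the auto-detection misses
    slice_p=11,                # estimate d from every 11th tensor entry to save memory
)
```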
@@ -80,6 +98,7 @@ class Prodigy_adv(torch.optim.Optimizer):
 d_coef: float = 1,
 growth_rate: float = float('inf'),
 safeguard_warmup: bool = False,
+fsdp_in_use: bool = False,
 slice_p: int = 11,
 ):
 if not (lr >= 0.0):
@@ -98,12 +117,14 @@ class Prodigy_adv(torch.optim.Optimizer):
 "beta3_ema": beta3_ema, "alpha": alpha, "t_alpha": t_alpha,
 "beta3": beta3, "d": d0, "d0": d0, "d_max": d0, "d_numerator": 0.0, "d_coef": d_coef,
 "growth_rate": growth_rate, "safeguard_warmup": safeguard_warmup, "k": 0, "slice_p": slice_p,
+"fsdp_in_use": fsdp_in_use,
 }
 self.stochastic_rounding = stochastic_rounding
 self.use_cautious = use_cautious
 self.use_grams = use_grams
 self.use_AdEMAMix = use_AdEMAMix
 self.factored = factored
+self.fsdp_in_use = fsdp_in_use
 super().__init__(params, defaults)
 self.init_step()
@@ -142,6 +163,9 @@ class Prodigy_adv(torch.optim.Optimizer):
 if p.grad is None:
 return

+if hasattr(p, "_fsdp_flattened"):
+self.fsdp_in_use = True
+
 grad = p.grad
 if grad.dtype != torch.float32 and self.factored:
 grad = grad.float()
@@ -349,8 +373,16 @@ class Prodigy_adv(torch.optim.Optimizer):
 g_group = self.param_groups[0]
 d_max, d_coef, growth_rate = g_group['d_max'], g_group['d_coef'], g_group['growth_rate']

-
-
+if self.fsdp_in_use and dist.is_available() and dist.is_initialized():
+# Use the device of the first parameter to avoid hardcoding '.cuda()'
+device = self.param_groups[0]['params'][0].device
+dist_tensor = torch.tensor([self.d_numerator, self.d_denom], device=device)
+dist.all_reduce(dist_tensor, op=dist.ReduceOp.SUM)
+global_d_numerator = dist_tensor[0].item()
+global_d_denom = dist_tensor[1].item()
+else:
+global_d_numerator = self.d_numerator
+global_d_denom = self.d_denom

 d_hat = self.d
 if global_d_denom > 0:
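With sharded parameters each rank sees only its local slice of the gradients, so its `d_numerator` and `d_denom` are partial sums; the new branch all-reduces them before the D update so every rank computes the same step size. The same pattern in isolation, as a minimal sketch (assumes a process group is already initialized, e.g. via `torchrun`):

```python
import torch
import torch.distributed as dist

def reduce_d_stats(d_numerator: float, d_denom: float, device: torch.device):
    """Sum per-rank Prodigy statistics across the process group (sketch of the diff's pattern)."""
    if dist.is_available() and dist.is_initialized():
        stats = torch.tensor([d_numerator, d_denom], device=device)
        dist.all_reduce(stats, op=dist.ReduceOp.SUM)  # every rank receives the global sums
        return stats[0].item(), stats[1].item()
    return d_numerator, d_denom  # single-process fallback
```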