adv-optm: 1.1.0.dev3-py3-none-any.whl → 1.1.0.dev5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of adv-optm has been flagged as potentially problematic.

adv_optm/optim/Prodigy_adv.py

@@ -88,6 +88,9 @@ class Prodigy_adv(torch.optim.Optimizer):
         prodigy_steps (int): If greater than zero, disable Prodigy's stepsize adjustments
             after the specified optimiser step and release all state memory required by Prodigy
             (default: 0).
+        d_limiter (bool): whether to clamp the new step size estimate (`d_hat`)
+            to prevent sudden, volatile increases in the adaptive step size (`d`).
+            (default: False)
         kourkoutas_beta (bool): whether to enable the layer-wise dynamic β₂ logic.
             If `False`, the optimizer behaves as standard AdamW/Prodigy. (default: False)
         beta2_min (float): The minimum value for dynamic β₂, used during periods of
@@ -141,9 +144,11 @@ class Prodigy_adv(torch.optim.Optimizer):
         fsdp_in_use: bool = False,
         slice_p: int = 11,
         prodigy_steps: int = 0,
+        d_limiter: bool = False,
+        # K-b parameters
         kourkoutas_beta: bool = False,
-        beta2_min: float = 0.88,
-        ema_alpha: float = 0.93,
+        beta2_min: float = 0.9,
+        ema_alpha: float = 0.95,
         tiny_spike: float = 1e-9,
         k_warmup_steps: int = 0,
         k_logging: int = 0,
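
For orientation, the two hunks above change both the documented and the actual constructor defaults. A minimal usage sketch of the dev5 keyword arguments follows; only the keyword names and defaults shown in the diff are taken from the package, while the import path and toy model are illustrative assumptions.

# Usage sketch for the dev5 keyword arguments (assumptions: the import path and
# the toy model; the keyword names and defaults are the ones shown in the diff).
import torch
from adv_optm import Prodigy_adv  # import path assumed from the package layout

model = torch.nn.Linear(128, 10)
optimizer = Prodigy_adv(
    model.parameters(),
    d_limiter=True,        # new in dev5: clamp jumps in the step-size estimate d_hat
    kourkoutas_beta=True,  # layer-wise dynamic beta2
    beta2_min=0.9,         # default raised from 0.88 in this release
    ema_alpha=0.95,        # default raised from 0.93 in this release
)
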
@@ -175,8 +180,8 @@ class Prodigy_adv(torch.optim.Optimizer):
             use_atan2 = False
         if kourkoutas_beta and not (betas[1] > beta2_min):
             raise ValueError(f"For Kourkoutas-β, betas[1] (as beta2_max) must be > beta2_min. Got {betas[1]} and {beta2_min}")
-        if Simplified_AdEMAMix and alpha_grad > 0:
-            # scales d_coef by alpha_grad, this force prodigy to behave well with Simplified_AdEMAMix
+        if Simplified_AdEMAMix and alpha_grad > 0 and not d_limiter:
+            # scales d_coef by alpha_grad, this force prodigy to behave well with Simplified_AdEMAMix.
             d_coef = d_coef/alpha_grad
 
         defaults = {
@@ -186,7 +191,7 @@ class Prodigy_adv(torch.optim.Optimizer):
             "beta3_ema": beta3_ema, "alpha": alpha, "t_alpha": t_alpha,
             "beta3": beta3, "d": d0, "d0": d0, "d_max": d0, "d_numerator": 0.0, "d_coef": d_coef,
             "growth_rate": growth_rate, "safeguard_warmup": safeguard_warmup, "k": 0, "slice_p": slice_p,
-            "fsdp_in_use": fsdp_in_use, "prodigy_steps": prodigy_steps,
+            "fsdp_in_use": fsdp_in_use, "prodigy_steps": prodigy_steps, "d_limiter": d_limiter,
             "alpha_grad": alpha_grad,
             "kourkoutas_beta": kourkoutas_beta, "beta2_min": beta2_min, "ema_alpha": ema_alpha,
             "tiny_spike": tiny_spike, "k_warmup_steps": k_warmup_steps, "k_logging": k_logging,
@@ -251,7 +256,7 @@ class Prodigy_adv(torch.optim.Optimizer):
         state = self.state[p]
 
         # State Initialization
-        if len(state) == 0:
+        if 'step' not in state:
             state['step'] = 0
 
             should_factor = (
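
A likely motivation for loosening this guard: the release can now write auxiliary entries (such as the per-parameter `kourkoutas_r_ema` tensor, see the Kourkoutas.py hunks below) into a parameter's state before its first step, so an empty-dict check would skip initialization. A small illustrative sketch, not package code:

# Illustrative sketch (not package code) of why the guard changed.
state = {}
state['kourkoutas_r_ema'] = 0.0   # an auxiliary entry written before the first step

if len(state) == 0:               # old guard: never fires here, 'step' stays missing
    state['step'] = 0

if 'step' not in state:           # new guard: still initializes the step counter
    state['step'] = 0

assert state['step'] == 0
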
@@ -512,6 +517,8 @@ class Prodigy_adv(torch.optim.Optimizer):
         d_hat = self.d
         if global_d_denom > 0:
             d_hat = d_coef * global_d_numerator / global_d_denom
+            if g_group['d_limiter']:
+                d_hat = min(self.d * (2 ** 0.25), d_hat)
             if self.d == g_group['d0']:
                 self.d = max(self.d, d_hat)
             d_max = max(d_max, d_hat)
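
The clamp above is the whole of the new d_limiter behaviour visible in this diff: a proposed estimate may exceed the current `d` by at most a factor of 2 ** 0.25 (about 1.19) per step. A standalone sketch with made-up numbers:

# Standalone sketch of the clamp with made-up numbers.
d = 1e-6                  # current Prodigy step-size estimate (illustrative value)
d_hat_proposed = 5e-6     # hypothetical spike in the new estimate

d_hat = min(d * (2 ** 0.25), d_hat_proposed)
print(f"{d_hat:.3e}")     # ~1.189e-06: the spike is capped at ~1.19x growth

# Even if the spike persists, at least four capped steps are needed to double d.
for _ in range(4):
    d = min(d * (2 ** 0.25), d_hat_proposed)
print(f"{d:.3e}")         # ~2.000e-06
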

adv_optm/optim/Simplified_AdEMAMix.py

@@ -90,8 +90,8 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
         stochastic_rounding: bool = True,
         orthogonal_gradient: bool = False,
         kourkoutas_beta: bool = False,
-        beta2_min: float = 0.88,
-        ema_alpha: float = 0.93,
+        beta2_min: float = 0.9,
+        ema_alpha: float = 0.95,
         tiny_spike: float = 1e-9,
         k_warmup_steps: int = 0,
         k_logging: int = 0,
@@ -152,7 +152,7 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
         state = self.state[p]
 
         # State Initialization
-        if len(state) == 0:
+        if 'step' not in state:
             state['step'] = 0
 
             should_factor = (

adv_optm/util/Kourkoutas.py

@@ -11,22 +11,27 @@ class KourkoutasHelper:
         if not hasattr(optimizer, 'param_groups'):
             raise TypeError("optimizer must be a valid torch.optim.Optimizer instance.")
         self.optimizer = optimizer
-
-        # State managed by the helper
         self.layer_state = {}
+
         self.layer_info = {}
         self._layer_info_built = False
         self._current_step_prepared = -1
 
+        # Store stats for external logging (e.g., TensorBoard)
+        self.last_beta2_stats = {}
+
         # This ensures the map is complete before the first backward pass,
         # making it compatible with fused back pass mechanisms.
         self._build_layer_info_if_needed()
 
+        if self.optimizer.param_groups[0].get('k_logging', 0) > 0:
+            self.print_layer_info()
+
     def _build_layer_info_if_needed(self):
         """Builds a map of layers and the parameters they contain."""
         if self._layer_info_built:
             return
-
+
         if not hasattr(self.optimizer, 'layer_key_fn') or self.optimizer.layer_key_fn is None:
             print("Warning: KourkoutasHelper requires 'layer_key_fn' on the optimizer. Defaulting to tensor-wise (id).")
             self.optimizer.layer_key_fn = lambda p: id(p)
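
The new `last_beta2_stats` dict is described in the hunk above as being for external logging (e.g., TensorBoard). A hedged sketch of how a training loop might consume it; the `'min'`/`'max'`/`'mean'` keys match the dict populated in `prepare_step` below, while the way the helper is reached from the optimizer is an assumed attribute name, not confirmed by this diff.

# Hedged sketch: forwarding the helper's beta2 stats to TensorBoard.
# `optimizer.kourkoutas_helper` is an assumed attribute name, not confirmed here.
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter()

def log_dynamic_beta2(optimizer, step):
    helper = getattr(optimizer, 'kourkoutas_helper', None)  # assumed attribute
    if helper is None or not helper.last_beta2_stats:
        return
    for key, value in helper.last_beta2_stats.items():      # keys: 'min', 'max', 'mean'
        writer.add_scalar(f"kourkoutas/beta2_{key}", value, step)
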
@@ -45,64 +50,94 @@ class KourkoutasHelper:
 
         self._layer_info_built = True
 
+    def print_layer_info(self):
+        """Prints the contents of self.layer_info for debugging."""
+        print("\n--- BEGIN self.layer_info DUMP ---")
+        if not self.layer_info:
+            print("Layer info is empty. Make sure the optimizer has parameters.")
+            return
+
+        for layer_key, info in self.layer_info.items():
+            param_count = len(info['params'])
+            first_param_details = ""
+            if param_count > 0:
+                p = info['params'][0]
+                first_param_details = f" (Example param shape: {list(p.shape)}, dtype: {p.dtype})"
+
+            print(f"Key: {layer_key}, Params: {param_count}{first_param_details}")
+
+        print("--- END self.layer_info DUMP ---\n")
+
     def prepare_step(self, current_step: int):
         """
         Calculates dynamic beta2 for all layers using the completed scalar accumulators
         from the PREVIOUS step. Should be called once at the start of an optimizer step.
         """
-
-        # Check if logging is enabled for this step based on the interval
-        k_logging_interval = self.optimizer.param_groups[0].get('k_logging', 0)
-        is_logging_step = k_logging_interval > 0 and (current_step + 1) % k_logging_interval == 0
-
-        beta2_log = [] if is_logging_step else None
+
+        beta2_log = []
         first_layer_key = next(iter(self.layer_info), None)
+        # These are just for the sample log, initialize them
+        sun, pooled_grad_norm, prev_r_ema_val, r_ema_tensor = (torch.tensor(0.0),)*4
 
         for layer_key, info in self.layer_info.items():
             params, group = info['params'], info['group_ref']
-
+
+            first_param_in_layer = info['params'][0]
+            param_state = self.optimizer.state[first_param_in_layer]
+
             if layer_key not in self.layer_state:
                 self.layer_state[layer_key] = {
-                    'r_ema_grad_norm': torch.tensor(0.0, device=params[0].device, dtype=torch.float32),
-                    'sum_sq_accumulator': torch.tensor(0.0, device=params[0].device, dtype=torch.float32)
+                    'sum_sq_accumulator': torch.tensor(0.0, device=first_param_in_layer.device, dtype=torch.float32)
                 }
 
-            layer_state = self.layer_state[layer_key]
-
-            # Use the completed accumulator from the previous step
-            pooled_grad_norm = torch.sqrt(layer_state['sum_sq_accumulator'])
+            if 'kourkoutas_r_ema' not in param_state:
+                param_state['kourkoutas_r_ema'] = torch.tensor(0.0, device=first_param_in_layer.device, dtype=torch.float32)
+
+            r_ema_tensor = param_state['kourkoutas_r_ema']
+            accumulator = self.layer_state[layer_key]['sum_sq_accumulator']
 
-            r_ema = layer_state['r_ema_grad_norm']
-            prev_r_ema_val = r_ema.item() # for logging
+            pooled_grad_norm = torch.sqrt(accumulator)
+            prev_r_ema_val = r_ema_tensor.item() # for logging
 
-            # EMA is always updated, even during warmup
-            r_ema.mul_(group['ema_alpha']).add_(pooled_grad_norm, alpha=1.0 - group['ema_alpha'])
+            # Update the persistent EMA tensor in-place.
+            r_ema_tensor.mul_(group['ema_alpha']).add_(pooled_grad_norm, alpha=1.0 - group['ema_alpha'])
 
-            sun = torch.tensor(0.0, device=r_ema.device) # Default sun to 0 for warmup
             beta2_max = group['betas'][1]
-
-            # --- CONSOLIDATED WARMUP LOGIC ---
+            sun = torch.tensor(0.0, device=r_ema_tensor.device) # Default sun to 0 for warmup
+
             if current_step < group['k_warmup_steps']:
                 beta2 = beta2_max
             else:
-                raw = pooled_grad_norm / (r_ema + group['tiny_spike'])
+                raw = pooled_grad_norm / (r_ema_tensor + group['tiny_spike'])
                 sun = raw / (1.0 + raw)
                 beta2 = beta2_max - (beta2_max - group['beta2_min']) * sun
 
-            layer_state['dynamic_beta2'] = beta2.item() if isinstance(beta2, torch.Tensor) else beta2
-            layer_state['sum_sq_accumulator'].zero_()
+            # Store the final calculated beta2 in the helper's transient state for this step.
+            self.layer_state[layer_key]['dynamic_beta2'] = beta2.item() if isinstance(beta2, torch.Tensor) else beta2
+
+            # Reset the accumulator for the next optimizer step.
+            accumulator.zero_()
 
-            if is_logging_step:
-                beta2_log.append(layer_state['dynamic_beta2'])
-                if layer_key == first_layer_key:
-                    print(f"\n[Kourkoutas-β Debug] Step {current_step + 1} - Sample Layer '{layer_key}':")
-                    print(f" - Grad Norm: {pooled_grad_norm.item():.4e}, Prev EMA: {prev_r_ema_val:.4e}, New EMA: {r_ema.item():.4e}")
-                    print(f" - Sunspike: {sun.item():.4f}, Dynamic Beta2: {layer_state['dynamic_beta2']:.4f}")
-
-        if is_logging_step and beta2_log:
+            beta2_log.append(self.layer_state[layer_key]['dynamic_beta2'])
+
+        # Always compute stats for TensorBoard
+        if beta2_log:
             beta2_tensor = torch.tensor(beta2_log, device='cpu')
-            print(f"[Kourkoutas-β Debug] Step {current_step + 1} Overall Beta2 Stats: Min={beta2_tensor.min():.4f}, Max={beta2_tensor.max():.4f}, Mean={beta2_tensor.mean():.4f}")
+            self.last_beta2_stats = {
+                'min': beta2_tensor.min().item(),
+                'max': beta2_tensor.max().item(),
+                'mean': beta2_tensor.mean().item(),
+            }
 
+        # Handle periodic console logging
+        k_logging_interval = self.optimizer.param_groups[0].get('k_logging', 0)
+        is_logging_step = k_logging_interval > 0 and (current_step + 1) % k_logging_interval == 0
+        if is_logging_step and self.last_beta2_stats:
+            if first_layer_key:
+                print(f"\n[Kourkoutas-β Debug] Step {current_step + 1} - Sample Layer '{first_layer_key}':")
+                print(f" - Grad Norm: {pooled_grad_norm.item():.4e}, Prev EMA: {prev_r_ema_val:.4e}, New EMA: {r_ema_tensor.item():.4e}")
+                print(f" - Sunspike: {sun.item():.4f}, Dynamic Beta2: {self.layer_state[first_layer_key]['dynamic_beta2']:.4f}")
+            print(f"[Kourkoutas-β Debug] Step {current_step + 1} Overall Beta2 Stats: Min={self.last_beta2_stats['min']:.4f}, Max={self.last_beta2_stats['max']:.4f}, Mean={self.last_beta2_stats['mean']:.4f}")
 
     def maybe_prepare_step(self, current_step: int):
         """
@@ -119,9 +154,9 @@ class KourkoutasHelper:
             layer_key = self.optimizer.layer_key_fn(p)
 
             if layer_key in self.layer_info:
+                # Initialize the transient state for this layer if it's the first time in the step.
                 if layer_key not in self.layer_state:
                     self.layer_state[layer_key] = {
-                        'r_ema_grad_norm': torch.tensor(0.0, device=p.device, dtype=torch.float32),
                         'sum_sq_accumulator': torch.tensor(0.0, device=p.device, dtype=torch.float32)
                     }
                 # Accumulate for the *next* step's prepare_step call

adv_optm-1.1.0.dev5.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: adv_optm
-Version: 1.1.0.dev3
+Version: 1.1.0.dev5
 Summary: A family of highly efficient, lightweight yet powerful optimizers.
 Home-page: https://github.com/Koratahiu/Advanced_Optimizers
 Author: Koratahiu

adv_optm-1.1.0.dev5.dist-info/RECORD (new file)

@@ -0,0 +1,20 @@
+adv_optm/__init__.py,sha256=lOHXiF0KmYmUnaQGIoUYeIxdEfYE8T1hFSVq5FVujDs,311
+adv_optm/optim/AdamW_adv.py,sha256=gVVpaKIbpv8pkfvfgVGCQN6No8A4atO7eRSPDBUVqq8,17490
+adv_optm/optim/Adopt_adv.py,sha256=K7z1iiln_HxuEPLl9yGtCngBfdZHxJISQ5dKgNBV-s4,21463
+adv_optm/optim/Lion_Prodigy_adv.py,sha256=aJ9orEEw0QYbrDzn1be0SHvOBlIkLwWG9RpWFuNMskM,13163
+adv_optm/optim/Lion_adv.py,sha256=aGNAplZlyXYgVllYcV_s4bK8iC4fv6EizFoWIMNLdBc,8299
+adv_optm/optim/Prodigy_adv.py,sha256=ecdnnbRgclcG49sGzxAmPHPE_0KkaQWtaiynsBYudoM,25979
+adv_optm/optim/Simplified_AdEMAMix.py,sha256=Cm-8tdCaTahdz45EExgn2W3a5Xl44T9MW-IMrUDbJFk,12983
+adv_optm/optim/__init__.py,sha256=pcP865H2j1tut2VfTUhzQh7V8TF_tzPjqFnjMfFed2k,382
+adv_optm/util/BF16_Stochastic_Rounding.py,sha256=Q5H0BcogmE4atP65dLoI21HKSf50lRdsBDfeF6v9Tbg,1548
+adv_optm/util/Effective_Shape.py,sha256=TBvIk1V8IuTbbBsxuekJA4e_v8JlR5Nujtut8RTWAm4,318
+adv_optm/util/Kourkoutas.py,sha256=DCsIcZ1sEeSwthN8KZH7OTKoIZJ3ah4t5DNiqxsSuCk,8344
+adv_optm/util/NNMF.py,sha256=yRf5IP5Sjq0Uf0DxN0Q8NxEGSdD-f1ULziLVDOjY8K4,639
+adv_optm/util/One_Bit_Boolean.py,sha256=Wat49esdwohuN-OHOFMW8D0aOQgV9cP5Rl8z6yfmpos,1068
+adv_optm/util/OrthoGrad.py,sha256=NzInuBQGy_Ja__M1R9XbvqVaQ0fhGbtGgFE9YON7B3I,707
+adv_optm/util/__init__.py,sha256=qoyIF0jcLjs_vSEcsv36clw5LFNBEbifyXrrVxMH-G4,349
+adv_optm-1.1.0.dev5.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+adv_optm-1.1.0.dev5.dist-info/METADATA,sha256=2xyGCRbIN54aIuAWnRIpR49okoVgVJb2AGHl2-jgVx8,8427
+adv_optm-1.1.0.dev5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+adv_optm-1.1.0.dev5.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
+adv_optm-1.1.0.dev5.dist-info/RECORD,,

adv_optm-1.1.0.dev3.dist-info/RECORD (removed)

@@ -1,20 +0,0 @@
-adv_optm/__init__.py,sha256=aSPtwpl2S7i_-KYXTDDeKoQlcLjZc6whVUNOINl6TEA,311
-adv_optm/optim/AdamW_adv.py,sha256=H4XlYZELwiFvXt0A9wMlRNiw9c8rmPMspHDCvR_SZIQ,17487
-adv_optm/optim/Adopt_adv.py,sha256=PJ3ZaLgzYbvxXDS56FGjzMrVMyHDXSWdUPHnX5NpNAA,21241
-adv_optm/optim/Lion_Prodigy_adv.py,sha256=sGzhts9a6gHfCkuHTB5L9IrClo4c6UThzYYErBwqOaA,12844
-adv_optm/optim/Lion_adv.py,sha256=6G1CukJB_pC7l9HwFEuY1ydsNHZFabVmOvcHDsHHVuQ,8295
-adv_optm/optim/Prodigy_adv.py,sha256=EeSfYu8IIeZX1Dk8MlD71vGOpMadtnW2iMhHxPDL2XQ,25574
-adv_optm/optim/Simplified_AdEMAMix.py,sha256=b4GaSI-TX6wFBqGxZeoJPbf2nVRCEtB3WVb1olDgY14,12980
-adv_optm/optim/__init__.py,sha256=pcP865H2j1tut2VfTUhzQh7V8TF_tzPjqFnjMfFed2k,382
-adv_optm/util/BF16_Stochastic_Rounding.py,sha256=Q5H0BcogmE4atP65dLoI21HKSf50lRdsBDfeF6v9Tbg,1548
-adv_optm/util/Effective_Shape.py,sha256=TBvIk1V8IuTbbBsxuekJA4e_v8JlR5Nujtut8RTWAm4,318
-adv_optm/util/Kourkoutas.py,sha256=UN_EAbG-9p98Qp2c_vSUy1Gw1K55SQ_e0TmnNBb-OFQ,6748
-adv_optm/util/NNMF.py,sha256=yRf5IP5Sjq0Uf0DxN0Q8NxEGSdD-f1ULziLVDOjY8K4,639
-adv_optm/util/One_Bit_Boolean.py,sha256=Wat49esdwohuN-OHOFMW8D0aOQgV9cP5Rl8z6yfmpos,1068
-adv_optm/util/OrthoGrad.py,sha256=NzInuBQGy_Ja__M1R9XbvqVaQ0fhGbtGgFE9YON7B3I,707
-adv_optm/util/__init__.py,sha256=qoyIF0jcLjs_vSEcsv36clw5LFNBEbifyXrrVxMH-G4,349
-adv_optm-1.1.0.dev3.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
-adv_optm-1.1.0.dev3.dist-info/METADATA,sha256=03sDh1nQ1CQXxu4TbRnRblX1IZ9S-Eka7hP1LNs54WA,8427
-adv_optm-1.1.0.dev3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-adv_optm-1.1.0.dev3.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
-adv_optm-1.1.0.dev3.dist-info/RECORD,,