adv-optm 1.1.0.dev5__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

adv_optm/__init__.py CHANGED
@@ -16,4 +16,4 @@ __all__ = [
16
16
  "Lion_Prodigy_adv",
17
17
  ]
18
18
 
19
- __version__ = "1.1.0.dev5"
19
+ __version__ = "1.1.2"
@@ -10,7 +10,7 @@ from ..util.Kourkoutas import KourkoutasHelper
10
10
 
11
11
  class AdamW_adv(torch.optim.Optimizer):
12
12
  """
13
- Implements a factored AdamW algorithm.
13
+ Implements an advanced AdamW algorithm.
14
14
  This is an advanced version of AdamW with optional features like
15
15
  low-rank factorization of optimizer states (SMMF), OrthoGrad, etc.
16
16
 
@@ -67,7 +67,7 @@ class AdamW_adv(torch.optim.Optimizer):
67
67
  "sunspike" ratio calculation to prevent division by zero. Corresponds
68
68
  to `ε_spike` in the paper. (default: 1e-9)
69
69
  k_warmup_steps (int): The number of initial steps during which β₂ is held
70
- at a fixed average value (`(beta2_min + beta2_max) / 2`) before the
70
+ at a fixed beta2 value before the
71
71
  dynamic logic activates. (default: 0)
72
72
  k_logging (int): if > 0 and kourkoutas_beta=True, enables periodic console
73
73
  logging of Kourkoutas-β statistics (min, max, mean of `β₂` across layers)
@@ -10,7 +10,7 @@ from ..util.Kourkoutas import KourkoutasHelper
10
10
 
11
11
  class Adopt_adv(torch.optim.Optimizer):
12
12
  """
13
- Implements a fusion of SMMF, and the ADOPT algorithm.
13
+ Implements an advanced ADOPT algorithm.
14
14
 
15
15
  The ADOPT update rule modifies Adam by:
16
16
  1. **Initialization:** The second moment `v` is initialized as `v₀ = g₀²`.
@@ -85,7 +85,7 @@ class Adopt_adv(torch.optim.Optimizer):
85
85
  "sunspike" ratio calculation to prevent division by zero. Corresponds
86
86
  to `ε_spike` in the paper. (default: 1e-9)
87
87
  k_warmup_steps (int): The number of initial steps during which β₂ is held
88
- at a fixed average value (`(beta2_min + beta2_max) / 2`) before the
88
+ at a fixed beta2 value before the
89
89
  dynamic logic activates. (default: 0)
90
90
  k_logging (int): if > 0 and kourkoutas_beta=True, enables periodic console
91
91
  logging of Kourkoutas-β statistics (min, max, mean of `β₂` across layers)
@@ -50,6 +50,12 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
50
50
  slice_p (int): Reduce memory usage by calculating LR adaptation statistics on only every
51
51
  pth entry of each tensor. For values greater than 1 this is an approximation to standard
52
52
  Prodigy. Values ~11 are reasonable (default 11).
53
+ prodigy_steps (int): If greater than zero, disable Prodigy's stepsize adjustments
54
+ after the specified optimiser step and release all state memory required by Prodigy
55
+ (default: 0).
56
+ d_limiter (bool): whether to clamp the new step size estimate (`d_hat`)
57
+ to prevent sudden, volatile increases in the adaptive step size (`d`).
58
+ (default: True)
53
59
  """
54
60
 
55
61
  def __init__(
@@ -63,7 +69,7 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
63
69
  orthogonal_gradient: bool = False,
64
70
  cautious_mask: bool = False,
65
71
  clip_threshold: float = 0.0,
66
- nnmf_factor: bool = True,
72
+ nnmf_factor: bool = False,
67
73
  # prodigy parameters
68
74
  beta3: float = None,
69
75
  d0: float = 1e-6,
@@ -72,6 +78,8 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
72
78
  safeguard_warmup: bool = False,
73
79
  fsdp_in_use: bool = False,
74
80
  slice_p: int = 11,
81
+ prodigy_steps: int = 0,
82
+ d_limiter: bool = True,
75
83
  ):
76
84
  if not lr > 0.0:
77
85
  raise ValueError(f"Learning rate must be > 0.0, but got {lr}")
@@ -90,6 +98,8 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
90
98
  beta3=beta3, d=d0, d0=d0, d_max=d0, d_numerator=0.0, d_coef=d_coef,
91
99
  growth_rate=growth_rate, safeguard_warmup=safeguard_warmup, k=0, slice_p=slice_p,
92
100
  fsdp_in_use=fsdp_in_use,
101
+ prodigy_steps=prodigy_steps,
102
+ d_limiter=d_limiter,
93
103
  )
94
104
  self.stochastic_rounding = stochastic_rounding
95
105
  self.cautious_mask = cautious_mask
@@ -235,20 +245,28 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
235
245
  # Update momentum
236
246
  exp_avg.mul_(self.beta2).add_(grad, alpha=self.d * (1 - self.beta2))
237
247
 
238
- # --- Accumulate Prodigy stats ---
239
- d0, safeguard_warmup, slice_p = group['d0'], group['safeguard_warmup'], group['slice_p']
240
- s, p0 = state['s'], state['p0']
241
- grad_flat = grad.flatten().float()
242
- p_flat = p.data.flatten().float()
243
- p0 = p0.float()
248
+ prodigy_steps = group['prodigy_steps']
249
+ if prodigy_steps <= 0 or group['k'] < prodigy_steps:
250
+ # --- Accumulate Prodigy stats ---
251
+ d0, safeguard_warmup, slice_p = group['d0'], group['safeguard_warmup'], group['slice_p']
252
+ s, p0 = state['s'], state['p0']
253
+ grad_flat = grad.flatten().float()
254
+ p_flat = p.data.flatten().float()
255
+ p0 = p0.float()
244
256
 
245
- self.d_numerator += (self.d / d0) * self.dlr * torch.dot(grad_flat[::slice_p], p0.data - p_flat[::slice_p]).item()
257
+ self.d_numerator += (self.d / d0) * self.dlr * torch.dot(grad_flat[::slice_p], p0.data - p_flat[::slice_p]).item()
246
258
 
247
- alpha = ((self.d / d0) * self.d) if safeguard_warmup else ((self.d / d0) * self.dlr)
248
- s.mul_(self.beta3).add_(grad_flat[::slice_p], alpha=alpha)
249
- self.d_denom += s.abs().sum().item()
259
+ alpha = ((self.d / d0) * self.d) if safeguard_warmup else ((self.d / d0) * self.dlr)
260
+ s.mul_(self.beta3).add_(grad_flat[::slice_p], alpha=alpha)
261
+ self.d_denom += s.abs().sum().item()
250
262
 
251
- del s, p0, grad_flat, p_flat, alpha
263
+ del s, p0, grad_flat, p_flat, alpha
264
+ else:
265
+ # Free memory if prodigy_steps is reached
266
+ if 's' in state:
267
+ del state['s']
268
+ if 'p0' in state:
269
+ del state['p0']
252
270
 
253
271
  if group["weight_decay"] != 0:
254
272
  if p.dtype == torch.bfloat16 and self.stochastic_rounding:
@@ -287,29 +305,37 @@ class Lion_Prodigy_adv(torch.optim.Optimizer):
287
305
  def calculate_d(self):
288
306
  """Calculates the new `d` based on the accumulated stats."""
289
307
  g_group = self.param_groups[0]
290
- d_max, d_coef, growth_rate = g_group['d_max'], g_group['d_coef'], g_group['growth_rate']
291
-
292
- if self.fsdp_in_use and dist.is_available() and dist.is_initialized():
293
- # Use the device of the first parameter to avoid hardcoding '.cuda()'
294
- device = self.param_groups[0]['params'][0].device
295
- dist_tensor = torch.tensor([self.d_numerator, self.d_denom], device=device)
296
- dist.all_reduce(dist_tensor, op=dist.ReduceOp.SUM)
297
- global_d_numerator = dist_tensor[0].item()
298
- global_d_denom = dist_tensor[1].item()
299
- else:
300
- global_d_numerator = self.d_numerator
301
- global_d_denom = self.d_denom
302
-
303
- d_hat = self.d
304
- if global_d_denom > 0:
305
- d_hat = d_coef * global_d_numerator / global_d_denom
306
- if self.d == g_group['d0']:
307
- self.d = max(self.d, d_hat)
308
- d_max = max(d_max, d_hat)
309
- self.d = min(d_max, self.d * growth_rate)
310
-
308
+ # Only perform d-adaptation if prodigy_steps has not been reached
309
+ prodigy_active = not (g_group.get('prodigy_steps', 0) > 0 and g_group['k'] >= g_group['prodigy_steps'])
310
+
311
+ if prodigy_active:
312
+ d_max, d_coef, growth_rate = g_group['d_max'], g_group['d_coef'], g_group['growth_rate']
313
+
314
+ if self.fsdp_in_use and dist.is_available() and dist.is_initialized():
315
+ # Use the device of the first parameter to avoid hardcoding '.cuda()'
316
+ device = self.param_groups[0]['params'][0].device
317
+ dist_tensor = torch.tensor([self.d_numerator, self.d_denom], device=device)
318
+ dist.all_reduce(dist_tensor, op=dist.ReduceOp.SUM)
319
+ global_d_numerator = dist_tensor[0].item()
320
+ global_d_denom = dist_tensor[1].item()
321
+ else:
322
+ global_d_numerator = self.d_numerator
323
+ global_d_denom = self.d_denom
324
+
325
+ d_hat = self.d
326
+ if global_d_denom > 0:
327
+ d_hat = d_coef * global_d_numerator / global_d_denom
328
+ if g_group['d_limiter']:
329
+ d_hat = min(self.d * (2 ** 0.25), d_hat)
330
+ if self.d == g_group['d0']:
331
+ self.d = max(self.d, d_hat)
332
+ d_max = max(d_max, d_hat)
333
+ self.d = min(d_max, self.d * growth_rate)
334
+
335
+ for group in self.param_groups:
336
+ group['d_numerator'] = global_d_numerator
337
+ group['d'] = self.d
338
+ group['d_max'] = d_max
339
+ # Increment step counter for all groups, regardless of whether d was updated
311
340
  for group in self.param_groups:
312
- group['d_numerator'] = global_d_numerator
313
- group['d'] = self.d
314
- group['d_max'] = d_max
315
341
  group['k'] += 1
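
For readers skimming the hunk above: a minimal sketch of the two new behaviours, expressed with the same quantities that appear in the diff (`d`, `d_hat`, `k`, `prodigy_steps`). This is only an illustration of the diffed logic, not additional package API.

```python
def apply_d_limiter(d: float, d_hat: float) -> float:
    # With d_limiter enabled, a single calculate_d() call can grow the
    # step-size estimate by at most a factor of 2 ** 0.25 (~1.19x).
    return min(d * (2 ** 0.25), d_hat)

def prodigy_is_active(k: int, prodigy_steps: int) -> bool:
    # prodigy_steps <= 0 means "adapt forever"; otherwise the d-adaptation
    # statistics stop accumulating once k reaches prodigy_steps, and the
    # per-parameter 's' / 'p0' buffers are freed, as in the step() hunk.
    return prodigy_steps <= 0 or k < prodigy_steps

print(apply_d_limiter(d=1e-4, d_hat=5e-4))            # ~1.189e-04, not 5e-04
print(prodigy_is_active(k=1200, prodigy_steps=1000))  # False
```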
@@ -14,7 +14,7 @@ from ..util.Kourkoutas import KourkoutasHelper
14
14
 
15
15
  class Prodigy_adv(torch.optim.Optimizer):
16
16
  """
17
- Implements a factored Prodigy/AdamW algorithm.
17
+ Implements an advanced Prodigy algorithm.
18
18
  This is an advanced version of Prodigy with optional features like
19
19
  low-rank factorization of optimizer states (SMMF), OrthoGrad, etc.
20
20
 
@@ -103,7 +103,7 @@ class Prodigy_adv(torch.optim.Optimizer):
103
103
  "sunspike" ratio calculation to prevent division by zero. Corresponds
104
104
  to `ε_spike` in the paper. (default: 1e-9)
105
105
  k_warmup_steps (int): The number of initial steps during which β₂ is held
106
- at a fixed average value (`(beta2_min + beta2_max) / 2`) before the
106
+ at a fixed beta2 value before the
107
107
  dynamic logic activates. (default: 0)
108
108
  k_logging (int): if > 0 and kourkoutas_beta=True, enables periodic console
109
109
  logging of Kourkoutas-β statistics (min, max, mean of `β₂` across layers)
@@ -311,10 +311,8 @@ class Prodigy_adv(torch.optim.Optimizer):
311
311
  self.kourkoutas_helper.accumulate_gradient_sq_norm(p, grad)
312
312
  # Get the dynamic beta2 calculated in prepare_step()
313
313
  beta2 = self.kourkoutas_helper.get_beta2(p, group, current_step)
314
- beta3 = math.sqrt(beta2)
315
314
  else:
316
315
  beta2 = self.beta2_default
317
- beta3 = self.beta3
318
316
 
319
317
  if self.use_AdEMAMix:
320
318
  beta3_ema = group['beta3_ema']
@@ -451,7 +449,7 @@ class Prodigy_adv(torch.optim.Optimizer):
451
449
  self.d_numerator += (self.d / d0) * self.dlr * torch.dot(grad_flat[::slice_p], p0.data - p_flat[::slice_p]).item()
452
450
 
453
451
  alpha = ((self.d / d0) * self.d) if safeguard_warmup else ((self.d / d0) * self.dlr)
454
- s.mul_(beta3).add_(grad_flat[::slice_p], alpha=alpha)
452
+ s.mul_(self.beta3).add_(grad_flat[::slice_p], alpha=alpha)
455
453
  self.d_denom += s.abs().sum().item()
456
454
 
457
455
  del s, p0, grad_flat, p_flat, alpha
@@ -61,7 +61,7 @@ class Simplified_AdEMAMix(torch.optim.Optimizer):
61
61
  "sunspike" ratio calculation to prevent division by zero. Corresponds
62
62
  to `ε_spike` in the paper. (default: 1e-9)
63
63
  k_warmup_steps (int): The number of initial steps during which β₂ is held
64
- at a fixed average value (`(beta2_min + beta2_max) / 2`) before the
64
+ at a fixed beta2 value before the
65
65
  dynamic logic activates. (default: 0)
66
66
  k_logging (int): if > 0 and kourkoutas_beta=True, enables periodic console
67
67
  logging of Kourkoutas-β statistics (min, max, mean of `β₂` across layers)
@@ -32,9 +32,16 @@ class KourkoutasHelper:
32
32
  if self._layer_info_built:
33
33
  return
34
34
 
35
- if not hasattr(self.optimizer, 'layer_key_fn') or self.optimizer.layer_key_fn is None:
36
- print("Warning: KourkoutasHelper requires 'layer_key_fn' on the optimizer. Defaulting to tensor-wise (id).")
37
- self.optimizer.layer_key_fn = lambda p: id(p)
35
+ if hasattr(self.optimizer, 'layer_key_fn') and self.optimizer.layer_key_fn is not None:
36
+ # A custom key function was provided by the user. We will use it.
37
+ pass
38
+ else:
39
+ # No key function was provided. Default to coarse, shape-based bucketing.
40
+ self.optimizer.layer_key_fn = lambda p: \
41
+ (id(p),) if p.dim() == 2 and 1 <= p.shape[0] <= 10 and p.shape[1] in {768, 1280, 4096} \
42
+ else tuple(p.shape)
43
+ # This ensures that we won't mix embeddings with tokens (1 to 10)
44
+ # TODO find a better way to safeguard the embeddings
38
45
 
39
46
  for group in self.optimizer.param_groups:
40
47
  for p in group['params']:
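
To make the new default bucketing concrete, here is a standalone rendering of the same rule (the example shapes are made up for illustration):

```python
import torch

def default_layer_key(p: torch.Tensor):
    # Mirrors the default layer_key_fn above: small 2-D tensors whose second
    # dimension matches a common embedding width get a unique per-tensor key,
    # so learned embedding tokens are not pooled with other layers; everything
    # else shares a bucket with tensors of the same shape.
    if p.dim() == 2 and 1 <= p.shape[0] <= 10 and p.shape[1] in {768, 1280, 4096}:
        return (id(p),)
    return tuple(p.shape)

emb_tokens = torch.zeros(4, 768)      # a handful of learned embedding tokens
linear_w   = torch.zeros(1280, 1280)  # an ordinary weight matrix
bias       = torch.zeros(1280)        # a 1-D parameter

print(default_layer_key(emb_tokens))  # (id(...),)   -> its own beta2 bucket
print(default_layer_key(linear_w))    # (1280, 1280) -> shared with same-shaped layers
print(default_layer_key(bias))        # (1280,)
```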
@@ -0,0 +1,275 @@
1
+ Metadata-Version: 2.4
2
+ Name: adv_optm
3
+ Version: 1.1.2
4
+ Summary: A family of highly efficient, lightweight yet powerful optimizers.
5
+ Home-page: https://github.com/Koratahiu/Advanced_Optimizers
6
+ Author: Koratahiu
7
+ Author-email: hiuhonor@gmail.com
8
+ License: Apache 2.0
9
+ Keywords: llm,fine-tuning,memory-efficient,low-rank,compression,pytorch,optimizer,adam
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: Apache Software License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
15
+ Requires-Python: >=3.8
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Requires-Dist: torch>=2.0
19
+ Dynamic: author
20
+ Dynamic: author-email
21
+ Dynamic: classifier
22
+ Dynamic: description
23
+ Dynamic: description-content-type
24
+ Dynamic: home-page
25
+ Dynamic: keywords
26
+ Dynamic: license
27
+ Dynamic: license-file
28
+ Dynamic: requires-dist
29
+ Dynamic: requires-python
30
+ Dynamic: summary
31
+
32
+ # Advanced Optimizers (AIO)
33
+
34
+ A comprehensive, all-in-one collection of optimization algorithms for deep learning, designed for **maximum efficiency**, **minimal memory footprint**, and **superior performance** across diverse model architectures and training scenarios.
35
+
36
+ [![PyPI](https://img.shields.io/pypi/v/adv_optm)](https://pypi.org/project/adv_optm/)
37
+
38
+ ---
39
+
40
+ ## 📦 Installation
41
+
42
+ ```bash
43
+ pip install adv_optm
44
+ ```
45
+
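
A minimal usage sketch (hedged: the top-level import below assumes the re-exports in `adv_optm/__init__.py`, and only the `lr` keyword is shown; check your installed version for the full signature):

```python
import torch
from adv_optm import Prodigy_adv  # top-level export assumed

model = torch.nn.Linear(128, 10)
# Prodigy-style optimizers are usually run with lr=1.0 and left to pick the
# effective step size via d-adaptation.
opt = Prodigy_adv(model.parameters(), lr=1.0)

x, y = torch.randn(32, 128), torch.randint(0, 10, (32,))
loss = torch.nn.functional.cross_entropy(model(x), y)
loss.backward()
opt.step()
opt.zero_grad()
```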
46
+ ---
47
+
48
+ ## 🧠 Core Innovations
49
+
50
+ This library integrates multiple state-of-the-art optimization techniques validated through extensive research and practical training, with **1-bit compression for optimizer states**:
51
+
52
+ ### **Memory-Efficient Optimization (SMMF-inspired)**
53
+ - **Paper**: [SMMF: Square-Matricized Momentum Factorization](https://arxiv.org/abs/2412.08894)
54
+ - **Approach**: Uses rank-1 non-negative matrix factorization with reconstruction cycle (factor → reconstruct → update → factor)
55
+ - **Innovation**:
56
+ - First moment split into **1-bit sign + absolute value**
57
+ - Final storage: **four factored vectors + one 1-bit sign state**
58
+ - Preserves Adam-like update quality with drastically reduced memory
59
+
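
As a hedged illustration of the factor → reconstruct cycle (an Adafactor-style rank-1 non-negative factorization is used here for clarity; SMMF's exact scheme and this package's internals may differ):

```python
import torch

def factor_rank1(m: torch.Tensor):
    # Compress a non-negative (R, C) state into one row and one column vector
    # whose outer product approximates it: store R + C floats instead of R * C.
    row = m.sum(dim=1)
    col = m.sum(dim=0)
    total = m.sum().clamp_min(1e-30)
    return row, col / total

def reconstruct_rank1(row: torch.Tensor, col: torch.Tensor) -> torch.Tensor:
    return torch.outer(row, col)

m1 = torch.randn(256, 512)              # a dense first-moment state
sign = m1 > 0                            # 1-bit sign mask (torch.bool)
row, col = factor_rank1(m1.abs())        # factor only the magnitudes
m1_hat = reconstruct_rank1(row, col) * (sign.float() * 2 - 1)
```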
60
+ ---
61
+
62
+ ## ⚡ Performance Characteristics
63
+
64
+ ### Memory Efficiency (SDXL Model – 6.5GB)
65
+ | Optimizer | Memory Usage | Description |
66
+ |-----------|--------------|-------------|
67
+ | `Adopt_Factored` | 328 MB | 4 small vectors + 1-bit state |
68
+ | `Adopt_Factored + AdEMAMix` | 625 MB | 6 small vectors + two 1-bit states |
69
+ | `Simplified_AdEMAMix` | 328 MB | Same as standard factored (no extra state) |
70
+
71
+ ### Speed Comparison (SDXL, Batch Size 4)
72
+ | Optimizer | Speed | Notes |
73
+ |-----------|-------|-------|
74
+ | `Adafactor` | ~8.5s/it | Baseline |
75
+ | `Adopt_Factored` | ~10s/it | +18% overhead from compression |
76
+ | `Adopt_Factored + AdEMAMix` | ~12s/it | +41% overhead (3 factored states) |
77
+
78
+ ---
79
+
80
+ ## 🧪 Available Optimizers
81
+
82
+ ### Standard Optimizers (All support `factored=True/False`)
83
+ | Optimizer | Description | Best For |
84
+ |-----------|-------------|----------|
85
+ | `Adam_Adv` | Advanced Adam implementation | General purpose |
86
+ | `Adopt_Adv` | Adam-variant with independent beta2 | Stable training for small batch size regimes |
87
+ | `Prodigy_Adv` | Prodigy with D-Adaptation | Adam with automatic LR tuning |
88
+ | `Simplified_AdEMAMix` | Adam variant with accumulator momentum | Small/large batch training when tuned correctly |
89
+ | `Lion_Adv` | Advanced Lion implementation | Memory-constrained environments |
90
+ | `Prodigy_Lion_Adv` | Prodigy + Lion combination | Lion with automatic LR tuning |
91
+
92
+ ---
93
+
94
+ ## ⚙️ Feature Matrix
95
+
96
+ | Feature | Adam_Adv | Adopt_Adv | Prodigy_Adv | Simplified_AdEMAMix | Lion_Adv |
97
+ |---------|----------|-----------|-------------|---------------------|----------|
98
+ | Factored | ✓ | ✓ | ✓ | ✓ | ✓ |
99
+ | AdEMAMix | ✓ | ✓ | ✓ | ✗ | ✗ |
100
+ | Simplified_AdEMAMix | ✗ | ✓ | ✓ | ✓ | ✗ |
101
+ | OrthoGrad | ✓ | ✓ | ✓ | ✓ | ✓ |
102
+ | Grams | ✓ | ✓ | ✓ | ✗ | ✗ |
103
+ | Cautious | ✓ | ✓ | ✓ | ✗ | ✓ |
104
+ | atan2 | ✓ | ✓ | ✓ | ✗ | ✗ |
105
+ | Stochastic Rounding | ✓ | ✓ | ✓ | ✓ | ✓ |
106
+ | Fused Backward Pass | ✓ | ✓ | ✓ | ✓ | ✓ |
107
+ | **Kourkoutas-β** | ✓ | ✓ | ✓ | ✓ | ✗ |
108
+
109
+ ---
110
+
111
+ ## 🛠️ Comprehensive Feature Guide
112
+
113
+ ### A. Universal Safe Features
114
+ *These features work with all optimizers and are generally safe to enable.*
115
+
116
+ | Feature | Description | Recommended Usage | Performance Impact | Theoretical Basis | Compatibility |
117
+ |--------|-------------|-------------------|--------------------|-------------------|--------------|
118
+ | **Fused Back Pass** | Fuses backward pass; gradients used immediately and memory freed on-the-fly | Memory-constrained environments | Reduces peak memory | Memory optimization | All optimizers |
119
+ | **Stochastic Rounding** | Replaces nearest rounding with stochastic rounding to preserve small gradient updates in BF16 | BF16 training | Minimal overhead (<5%) | [Revisiting BFloat16 Training](https://arxiv.org/abs/2010.06192) | All optimizers |
120
+ | **OrthoGrad** | Removes gradient component parallel to weights to reduce overfitting | Full fine-tuning without weight decay | +33% time overhead (BS=4); less at larger BS | [Grokking at Edge](https://github.com/LucasPrietoAl/grokking-at-the-edge-of-numerical-stability) | All optimizers |
121
+ | **Factored** | Memory-efficient optimization via rank-1 1-bit factorization of optimizer states | Large models / memory-limited hardware | Adds compression overhead | [SMMF](https://arxiv.org/abs/2412.08894) | All optimizers |
122
+
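
A hedged sketch of the OrthoGrad idea from the table above, i.e. dropping the component of the gradient that is parallel to the current weights (the package's own version lives in `adv_optm/util/OrthoGrad.py` and may differ, e.g. by rescaling the result to preserve the gradient norm):

```python
import torch

def orthograd(p: torch.Tensor, g: torch.Tensor, eps: float = 1e-30) -> torch.Tensor:
    # Project g onto p and subtract that component, leaving only the part of
    # the gradient orthogonal to the weight vector.
    w, grad = p.flatten(), g.flatten()
    proj = torch.dot(w, grad) / (torch.dot(w, w) + eps)
    return (grad - proj * w).view_as(g)
```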
123
+ ### B. Individual Features
124
+
125
+ | Feature | Description | Recommended Usage | Performance Impact | Theoretical Basis | Compatibility |
126
+ |--------|-------------|-------------------|--------------------|-------------------|--------------|
127
+ | **Cautious** | Only applies update if gradient direction aligns with momentum direction | Accelerating convergence | No overhead | [C-Optim](https://github.com/kyleliang919/C-Optim) | Adam/Adopt/Prodigy/Lion |
128
+ | **Grams** | Update direction derived purely from current gradient | When Cautious is insufficient | No overhead | [Grams](https://github.com/Gunale0926/Grams) | Adam/Adopt/Prodigy |
129
+ | **AdEMAMix** | Dual EMA system that retains relevance of gradients over tens of thousands of steps | Long training runs, especially where model forgetting is a concern | +1 state memory | [AdEMAMix](https://arxiv.org/abs/2409.03137) | Adam/Adopt/Prodigy |
130
+ | **Simplified_AdEMAMix** | Accumulator-based momentum, single EMA variant of AdEMAMix | All scenarios when tuned correctly | No overhead | [Connections](https://arxiv.org/abs/2502.02431) | Adam/Adopt/Prodigy |
131
+ | **atan2** | Robust epsilon replacement with built-in gradient clipping | Use for stable, bounded updates (recommended for Adopt, which needs it) | No overhead | [Adam-atan2](https://github.com/lucidrains/adam-atan2-pytorch) | Adam/Adopt/Prodigy |
132
+ | **Kourkoutas-β** | Layer-wise adaptive β₂ based on gradient “sunspike” ratio | Noisy/small/large-batch/high-LR training | No overhead | [Kourkoutas-β](https://arxiv.org/abs/2508.12996) | Adam/Adopt/Prodigy/Simplified_AdEMAMix |
133
+
134
+ > **Note**: If both **Cautious** and **Grams** are enabled, **Grams takes precedence** and Cautious is disabled.
135
+
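
For the Cautious row above, a minimal sketch of the masking rule as popularised by C-Optim (an illustration, not necessarily this package's exact code):

```python
import torch

def cautious(update: torch.Tensor, grad: torch.Tensor) -> torch.Tensor:
    # Keep only the coordinates where the proposed update and the current
    # gradient agree in sign, then rescale so the mean update size is kept.
    mask = (update * grad > 0).to(update.dtype)
    mask = mask * (mask.numel() / (mask.sum() + 1))
    return update * mask
```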
136
+ ---
137
+
138
+ ## 🔍 Feature Deep Dives
139
+
140
+ ### AdEMAMix
141
+
142
+ - Adds a **slow-decaying second EMA** (`beta3`) that retains gradient memory over tens of thousands of steps.
143
+ - Particularly effective for **small batch sizes**, where Adam’s standard first moment is nearly useless.
144
+ - **Reference**: [AdaMeM: Memory Efficient Momentum for Adafactor](https://openreview.net/forum?id=fZqMVTz7K5)
145
+
146
+ #### Tunable Hyperparameters
147
+ | Parameter | Default | Tuning Guide |
148
+ |-----------|---------|--------------|
149
+ | `beta3` | 0.9999 | • Runs >120k steps: **0.9999**<br>• Runs ≤120k steps: **0.999** |
150
+ | `alpha` | 5 | • Reduce to **2–3** if diverging<br>• Increase to strengthen long-term memory |
151
+
152
+ > ✅ **Pro Tip**: Set `beta1=0` in Adam/Adopt/Prodigy to skip standard EMA entirely and rely solely on AdEMAMix’s slow EMA, ideal for small-batch regimes.
153
+
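
A rough sketch of the dual-EMA numerator AdEMAMix adds, following the cited paper (the factored implementation in this package stores these states differently):

```python
import torch

def ademamix_numerator(m_fast, m_slow, grad, beta1=0.9, beta3=0.9999, alpha=5.0):
    # The fast EMA tracks recent gradients, the slow EMA (beta3) remembers
    # gradients over tens of thousands of steps; the Adam-style numerator
    # becomes m_fast + alpha * m_slow before division by sqrt(v) + eps.
    m_fast.mul_(beta1).add_(grad, alpha=1 - beta1)
    m_slow.mul_(beta3).add_(grad, alpha=1 - beta3)
    return m_fast + alpha * m_slow
```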
154
+ ---
155
+
156
+ ### Simplified_AdEMAMix
157
+
158
+ - Introduced in [Connections between Schedule-Free Optimizers, AdEMAMix, and Accelerated SGD Variants (arXiv:2502.02431)](https://arxiv.org/abs/2502.02431).
159
+ - Replaces Adam’s first moment with a **gradient accumulator**, combining the stability of long memory with responsiveness to recent gradients.
160
+ - **Key insight**: Classical momentum **does not accelerate** in noisy (small-batch) regimes; this accumulator does.
161
+
162
+ #### Tunable Hyperparameters
163
+ | Parameter | Default | Tuning Guide |
164
+ |----------|---------|--------------|
165
+ | `beta1` | 0.99 | Controls accumulator memory length:<br>• Small BS: **0.99–0.9999**<br>• Large BS: **0.9** |
166
+ | `Grad α` | 100 | Most critical parameter:<br>• Inversely scales with batch size<br>• **100–10** for small BS (≤32)<br>• **1–0.1** for large BS (≥512) |
167
+
168
+ > ⚠️ **Critical**: Requires **~100x smaller learning rate** than AdamW (e.g., 1e-6 vs 1e-4).
169
+ > For `Prodigy_Adv`, set `initial_d` to:
170
+ > - **LoRA**: `1e-8`
171
+ > - **Full FT**: `1e-10`
172
+ > - **Embedding**: `1e-7`
173
+
174
+ > ⚠️ **Incompatible** with: **Cautious**, **Grams**, **atan2**, and standard gradient clipping.
175
+
176
+ #### Performance Validation
177
+
178
+ **Small Batch Training (SDXL, BS=2, 1.8K steps)**
179
+ ![Training Comparison](https://github.com/user-attachments/assets/7eff0671-cc59-47fc-8b63-d5205456d649)
180
+
181
+ - **🟢 Prodigy_Adv** (beta1=0.9, d0=1e-5): Final LR = 2.9e-4
182
+ - **🔵 Prodigy_Adv + Simplified_AdEMAMix** (beta1=0.99, α=100, d0=1e-7): Final LR = 5.8e-6
183
+
184
+ **Results**:
185
+ - Faster convergence and higher final performance with Simplified_AdEMAMix
186
+ - D-Adaptation automatically compensates for aggressive updates
187
+ - Generated samples show **significantly better quality**
188
+
189
+ ---
190
+
191
+ ### atan2
192
+
193
+ - Replaces `eps` in Adam-family optimizers with a **scale-invariant**, bounded update rule.
194
+ - Automatically clips updates to **[-2, 2]**, preventing destabilizing jumps.
195
+ - **Highly recommended** for `Adopt_Adv`, which is prone to instability without clipping.
196
+
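
A hedged sketch of the update rule in the referenced adam-atan2 implementation; the constants `a ≈ 1.27` and `b = 1` are that repository's defaults (assumed here), and they give the ±2 bound mentioned above since |a·atan2(x, y)| ≤ a·π/2 ≈ 2:

```python
import torch

def atan2_update(m_hat: torch.Tensor, v_hat: torch.Tensor,
                 a: float = 1.27, b: float = 1.0) -> torch.Tensor:
    # Drop-in replacement for m_hat / (v_hat.sqrt() + eps): scale-invariant,
    # needs no eps, and is bounded in roughly [-2, 2].
    return a * torch.atan2(m_hat, b * v_hat.sqrt())
```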
197
+ ---
198
+
199
+ ### **Kourkoutas-β**
200
+
201
+ **Kourkoutas-β** introduces a **sunspike-driven, layer-wise adaptive second-moment decay (β₂)** as an optional enhancement for `Adam_Adv`, `Adopt_Adv`, `Prodigy_Adv`, and `Simplified_AdEMAMix`.
202
+
203
+ Instead of using a fixed β₂ (e.g., 0.999 or 0.95), it **dynamically modulates β₂ per layer** based on a bounded *sunspike ratio*:
204
+
205
+ - **During gradient bursts** → β₂ ↓ toward the lower β₂ bound → faster reaction
206
+ - **During calm phases** → β₂ ↑ toward the selected (maximum) β₂ → stronger smoothing
207
+
208
+ This is especially effective for **noisy training, small batch sizes, and high learning rates**, where gradient norms shift abruptly due to noise or aggressive LR schedules.
209
+
210
+ #### Pros/Cons
211
+
212
+ | **Category** | **Details** |
213
+ |--------------|-------------|
214
+ | ✅ **Pros** | • **Layer-wise adaptation** blends benefits of high β₂ (strong smoothing) and low β₂ (fast reaction).<br>• **Robust to sudden loss landscape shifts**, reacts quickly during gradient bursts, smooths during calm phases.<br>• **High tolerance to aggressive learning rates**. |
215
+ | ⚠️ **Cons** | • **Potentially unstable at the start of training** due to unreliable early gradient norms; mitigated by using `K-β Warmup Steps`. |
216
+
217
+ > 💡 **Best Practice**: Set `K_warmup_steps` equal to your standard LR warmup steps. During warmup, the optimizer uses the static `beta2`; adaptation begins only after warmup ends.
218
+
219
+ > 🔍 **Debugging Aid**: Enable `K_Logging` to monitor (min, max, mean) of dynamic β₂ values across layers every *N* steps.
220
+
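
One plausible per-layer reading of the modulation described above, with the norm bookkeeping simplified (the real helper tracks pooled gradient norms across steps, so treat the formula below as an assumption that only captures the burst/calm behaviour):

```python
def kourkoutas_beta2(grad_norm: float, prev_norm_stat: float,
                     beta2_min: float, beta2_max: float,
                     eps_spike: float = 1e-9) -> float:
    # Bounded "sunspike" ratio in [0, 1): a gradient burst relative to the
    # tracked statistic pushes it toward 1.
    sunspike = grad_norm / (grad_norm + prev_norm_stat + eps_spike)
    # Burst -> beta2 drops toward beta2_min (react fast);
    # calm  -> beta2 stays near beta2_max (smooth hard).
    return beta2_max - sunspike * (beta2_max - beta2_min)
```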
221
+ #### 📊 Performance Validation
222
+
223
+ **ADAMW_ADV - full SDXL finetuning (aggressive LR: 3e-5) (BS=4, 2.5K steps)**
224
+ <img width="1460" height="382" alt="image" src="https://github.com/user-attachments/assets/007f278a-fbac-4f3d-9cc7-274c3b959cdd" />
225
+
226
+ - 🟣 Fixed `beta2=0.999`
227
+ - 🟠 Auto K-beta
228
+
229
+ **Observations:**
230
+ - Kourkoutas-β is clearly better here and markedly more robust/stable at high learning rates.
231
+
232
+ > 📚 **Reference**:
233
+ > - Paper: [Kourkoutas-β: A Sunspike-Driven Adam Optimizer with Desert Flair](https://arxiv.org/abs/2508.12996)
234
+ > - Code: [kbeta](https://github.com/sck-at-ucy/kbeta)
235
+
236
+ ---
237
+
238
+ ## Recommended Preset (Tested on LoRA/FT/Embedding)
239
+
240
+ ```yaml
241
+ Learning Rate: 1
242
+ optimizer: PRODIGY_Adv
243
+ settings:
244
+ - beta1: 0.99 # Controls momentum decay, ~100-step effective memory. Adjust to 0.999 (1000 steps) or 0.9999 (10000 steps) based on training length and stability needs.
245
+ - beta2: 0.999
246
+ - kourkoutas_beta: True # For Kourkoutas-β
247
+ - K-β Warmup Steps: 50 # Or 100, 200, depending on your run
248
+ - Simplified_AdEMAMix: True
249
+ - Grad α: 100
250
+ - OrthoGrad: True
251
+ - weight_decay: 0.0
252
+ - initial_d:
253
+ • LoRA: 1e-8
254
+ • Full fine-tune: 1e-10
255
+ • Embedding: 1e-7
256
+ - d_coef: 1
257
+ - d_limiter: True # To stabilize Prodigy with Simplified_AdEMAMix
258
+ - factored: False # Can be True or False; quality should not degrade, thanks to Simplified_AdEMAMix’s high tolerance to 1-bit factorization.
259
+ ```
260
+
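
If it helps, the preset might translate to a constructor call along the following lines. The keyword names are taken or inferred from the docstrings and signatures visible in this diff (`d0`, `d_coef`, `d_limiter`, `orthogonal_gradient`, `kourkoutas_beta`, `k_warmup_steps`, `factored`); the beta arguments and the Simplified_AdEMAMix / Grad α switches are not shown in the diff, so check every name against your installed version:

```python
import torch
from adv_optm import Prodigy_adv  # top-level export assumed

model = torch.nn.Linear(128, 10)  # stand-in module for the example

opt = Prodigy_adv(
    model.parameters(),
    lr=1.0,                    # Prodigy-style: keep lr at 1, let d adapt
    weight_decay=0.0,
    kourkoutas_beta=True,      # Kourkoutas-β, with...
    k_warmup_steps=50,         # ...a warmup matching the LR warmup
    d0=1e-8,                   # "initial_d" for LoRA in the preset above
    d_coef=1.0,
    d_limiter=True,
    orthogonal_gradient=True,  # OrthoGrad
    factored=False,
)
```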
261
+ > ✅ **Why it works**:
262
+ > - `Kourkoutas-β` adapts β₂ per layer, so β₂ needs no manual tuning
263
+ > - `Simplified_AdEMAMix` ensures responsiveness in small-batch noise
264
+ > - `OrthoGrad` prevents overfitting without weight decay
265
+
266
+ ---
267
+
268
+ ## 📚 References
269
+
270
+ 1. [Revisiting BFloat16 Training](https://arxiv.org/abs/2010.06192)
271
+ 2. [SMMF: Square-Matricized Momentum Factorization](https://arxiv.org/abs/2412.08894)
272
+ 3. [The AdEMAMix Optimizer](https://arxiv.org/abs/2409.03137)
273
+ 4. [Connections between Schedule-Free Optimizers, AdEMAMix, and Accelerated SGD](https://arxiv.org/abs/2502.02431)
274
+ 5. [AdaMeM: Memory Efficient Momentum for Adafactor](https://openreview.net/forum?id=fZqMVTz7K5)
275
+ 6. [Kourkoutas-β: A Sunspike-Driven Adam Optimizer with Desert Flair](https://arxiv.org/abs/2508.12996)
@@ -0,0 +1,20 @@
1
+ adv_optm/__init__.py,sha256=IJAqLP1mOIBeEFFeMCrFvxEwK7oz-g5SRAEfmukmy9o,306
2
+ adv_optm/optim/AdamW_adv.py,sha256=ddEUVOif1gfZPgEJNrEGZ2wnha4MPMWw5ppPd8acQ3o,17457
3
+ adv_optm/optim/Adopt_adv.py,sha256=fhH3hS9K6z5Blxc7NFfzpCrUGbl9EQnwLPmKDxBC1zg,21415
4
+ adv_optm/optim/Lion_Prodigy_adv.py,sha256=0AY-2hjdNKnxiY4MUG4Y4HCe1DvesuwaaxRzrHUkAGA,14606
5
+ adv_optm/optim/Lion_adv.py,sha256=aGNAplZlyXYgVllYcV_s4bK8iC4fv6EizFoWIMNLdBc,8299
6
+ adv_optm/optim/Prodigy_adv.py,sha256=nD59cAWOJJCjZdIiuD5hD9MWO5sTjPQSvq-3dwGTcEM,25875
7
+ adv_optm/optim/Simplified_AdEMAMix.py,sha256=gPjMhKulzmAeO42foe-d7xW0AcB50vKFYsvHgxbD3uc,12949
8
+ adv_optm/optim/__init__.py,sha256=pcP865H2j1tut2VfTUhzQh7V8TF_tzPjqFnjMfFed2k,382
9
+ adv_optm/util/BF16_Stochastic_Rounding.py,sha256=Q5H0BcogmE4atP65dLoI21HKSf50lRdsBDfeF6v9Tbg,1548
10
+ adv_optm/util/Effective_Shape.py,sha256=TBvIk1V8IuTbbBsxuekJA4e_v8JlR5Nujtut8RTWAm4,318
11
+ adv_optm/util/Kourkoutas.py,sha256=woyJfX7l4eieeg0pC5XrILBLvwECwbD3a6ou1K6qjKU,8706
12
+ adv_optm/util/NNMF.py,sha256=yRf5IP5Sjq0Uf0DxN0Q8NxEGSdD-f1ULziLVDOjY8K4,639
13
+ adv_optm/util/One_Bit_Boolean.py,sha256=Wat49esdwohuN-OHOFMW8D0aOQgV9cP5Rl8z6yfmpos,1068
14
+ adv_optm/util/OrthoGrad.py,sha256=NzInuBQGy_Ja__M1R9XbvqVaQ0fhGbtGgFE9YON7B3I,707
15
+ adv_optm/util/__init__.py,sha256=qoyIF0jcLjs_vSEcsv36clw5LFNBEbifyXrrVxMH-G4,349
16
+ adv_optm-1.1.2.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
17
+ adv_optm-1.1.2.dist-info/METADATA,sha256=mtTfygEQn52Jqwc_W7rnDhJwRAArvaEjiK0s-cyDFVQ,14019
18
+ adv_optm-1.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
19
+ adv_optm-1.1.2.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
20
+ adv_optm-1.1.2.dist-info/RECORD,,
@@ -1,174 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: adv_optm
3
- Version: 1.1.0.dev5
4
- Summary: A family of highly efficient, lightweight yet powerful optimizers.
5
- Home-page: https://github.com/Koratahiu/Advanced_Optimizers
6
- Author: Koratahiu
7
- Author-email: hiuhonor@gmail.com
8
- License: Apache 2.0
9
- Keywords: llm,fine-tuning,memory-efficient,low-rank,compression,pytorch,optimizer,adam
10
- Classifier: Programming Language :: Python :: 3
11
- Classifier: License :: OSI Approved :: Apache Software License
12
- Classifier: Operating System :: OS Independent
13
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
15
- Requires-Python: >=3.8
16
- Description-Content-Type: text/markdown
17
- License-File: LICENSE
18
- Requires-Dist: torch>=2.0
19
- Dynamic: author
20
- Dynamic: author-email
21
- Dynamic: classifier
22
- Dynamic: description
23
- Dynamic: description-content-type
24
- Dynamic: home-page
25
- Dynamic: keywords
26
- Dynamic: license
27
- Dynamic: license-file
28
- Dynamic: requires-dist
29
- Dynamic: requires-python
30
- Dynamic: summary
31
-
32
- # Advanced Optimizers (AIO)
33
-
34
- A comprehensive, all-in-one collection of optimization algorithms for deep learning, designed for maximum efficiency, minimal memory footprint, and superior performance across diverse model architectures and training scenarios.
35
-
36
- [![PyPI](https://img.shields.io/pypi/v/adv_optm)](https://pypi.org/project/adv_optm/)
37
-
38
- ---
39
-
40
- ## 📦 Installation
41
-
42
- ```bash
43
- pip install adv_optm
44
- ```
45
-
46
- ---
47
-
48
- ## 🧠 Core Innovations
49
-
50
- This library integrates multiple state-of-the-art optimization techniques validated through extensive research and practical training, with 1-bit compression for optimizer states:
51
-
52
- ### **Memory-Efficient Optimization (SMMF-inspired)**
53
- - **Paper**: [SMMF: Square-Matricized Momentum Factorization](https://arxiv.org/abs/2412.08894)
54
- - **Approach**: Uses rank-1 non-negative matrix factorization with reconstruction cycle (factor → reconstruct → update → factor)
55
- - **Innovation**:
56
- - First moment split into **1-bit sign + absolute value**
57
- - Final storage: **four factored vectors + one 1-bit sign state**
58
- - Preserves Adam-like update quality with drastically reduced memory
59
-
60
- ---
61
-
62
- ## ⚡ Performance Characteristics
63
-
64
- ### Memory Efficiency (SDXL Model - 6.5GB)
65
- | Optimizer | Memory Usage | Description |
66
- |-----------|--------------|-------------|
67
- | `Adopt_Factored` | 328 MB | 4 small vectors + 1-bit state |
68
- | `Adopt_Factored + AdEMAMix` | 625 MB | 6 small vectors + two 1-bit states |
69
- | `Simplified_AdEMAMix` | 328 MB | Same as standard factored (no extra state) |
70
-
71
- ### Speed Comparison (SDXL, Batch Size 4)
72
- | Optimizer | Speed | Notes |
73
- |-----------|-------|-------|
74
- | `Adafactor` | ~8.5s/it | Baseline |
75
- | `Adopt_Factored` | ~10s/it | +18% overhead from compression |
76
- | `Adopt_Factored + AdEMAMix` | ~12s/it | +41% overhead (3 factored states) |
77
-
78
- ---
79
-
80
- ## 🧪 Available Optimizers
81
-
82
- ### Standard Optimizers (All support `factored=True/False`)
83
- | Optimizer | Description | Best For |
84
- |-----------|-------------|----------|
85
- | `Adam_Adv` | Advanced Adam implementation | General purpose |
86
- | `Adopt_Adv` | Adam-variant with independent beta2 | Stable training for small batch size regimes |
87
- | `Prodigy_Adv` | Prodigy with D-Adaptation | Adam with automatic LR tuning |
88
- | `Simplified_AdEMAMix` | Adam variant with accumulator momentum | Small/large batch training when tuned correctly |
89
- | `Lion_Adv` | Advanced Lion implementation | Memory-constrained environments |
90
- | `Prodigy_Lion_Adv` | Prodigy + Lion combination | Lion with automatic LR tuning |
91
-
92
- ### Feature Matrix
93
- | Feature | Adam_Adv | Adopt_Adv | Prodigy_Adv | Simplified_AdEMAMix | Lion_Adv |
94
- |---------|----------|-----------|-------------|---------------------|----------|
95
- | Factored | ✓ | ✓ | ✓ | ✓ | ✓ |
96
- | AdEMAMix | ✓ | ✓ | ✓ | ✗ | ✗ |
97
- | Simplified_AdEMAMix | ✗ | ✗ | ✓ | ✓ | ✗ |
98
- | OrthoGrad | ✓ | ✓ | ✓ | ✓ | ✓ |
99
- | Grams | ✓ | ✓ | ✓ | ✗ | ✗ |
100
- | Cautious | ✓ | ✓ | ✓ | ✗ | ✓ |
101
- | atan2 | ✓ | ✓ | ✓ | ✗ | ✗ |
102
- | Stochastic Rounding | ✓ | ✓ | ✓ | ✓ | ✓ |
103
- | Fused Backward Pass | ✓ | ✓ | ✓ | ✓ | ✓ |
104
-
105
- ---
106
-
107
- ## ⚙️ Key Features & Parameters
108
-
109
- ### Comprehensive Feature Guide
110
-
111
- | Feature | Description | Recommended Usage | Performance Impact | Theoretical Basis | Compatibility |
112
- |---------|-------------|-------------------|--------------------|-------------------|--------------|
113
- | **Factored** | Memory-efficient optimization using rank-1 factorization | Enable for large models (>1B params) or limited VRAM | +12-41% time overhead, 1-bit memory usage | [SMMF](https://arxiv.org/abs/2412.08894) | All optimizers |
114
- | **AdEMAMix** | Dual EMA system for momentum | Use for long training runs (10k+ steps) | +1 state memory. | [AdEMAMix](https://arxiv.org/abs/2409.03137) | Adam/Adopt/Prodigy |
115
- | **Simplified_AdEMAMix** | Accumulator-based momentum | Small batch training (≤32) | Same memory as standard, no extra overhead | [Schedule-Free Connections](https://arxiv.org/abs/2502.02431) | Adam/Prodigy |
116
- | **OrthoGrad** | Removes gradient component parallel to weights | Full finetuning without weight decay | +33% time overhead, no memory impact | [Grokking at Edge](https://github.com/LucasPrietoAl/grokking-at-the-edge-of-numerical-stability) | All optimizers |
117
- | **Stochastic Rounding** | Improves precision for BF16 training | BF16 training | Minimal overhead (<5%) | [Revisiting BFloat16 Training](https://arxiv.org/abs/2010.06192) | All optimizers |
118
- | **atan2** | Robust eps replacement + built-in clipping | Use with Adopt or unstable training | No overhead | [Adam-atan2](https://github.com/lucidrains/adam-atan2-pytorch) | Adam/Adopt/prodigy |
119
- | **Cautious** | Update only when the direction align with the gradients | should faster the convergence | No overhead | [C-Optim](https://github.com/kyleliang919/C-Optim) | Adam/Adopt/prodigy |
120
- | **Grams** | Update direction from the gradients | should have a stronger effect than cautious | No overhead | [Grams](https://github.com/Gunale0926/Grams) | Adam/Adopt/prodigy |
121
-
122
- ---
123
-
124
- ## Simplified_AdEMAMix Parameters
125
- Simplified_AdEMAMix replaces standard momentum with an accumulator for better small-large batch performance.
126
-
127
- | Parameter | Recommended Values | Description |
128
- |-----------|---------------------|-------------|
129
- | `beta1` | 0.9 (large BS), 0.99-0.9999 (small BS) | Determines memory length of accumulator |
130
- | `alpha` | 100-10 (small BS), 1-0 (large BS) | Gradient smoothing factor |
131
-
132
- **Alpha Tuning Guide**:
133
- | Batch Size | Recommended α | Rationale |
134
- |------------|---------------|-----------|
135
- | Small (≤32) | 100, 50, 20, 10 | Emphasizes recent gradients for quick adaptation |
136
- | Medium (32-512) | 10, 5, 2, 1 | Balanced approach |
137
- | Large (≥512) | 1, 0.5, 0 | Emphasizes historical gradients for stability |
138
-
139
- ⚠️ **Important**: Use **~100x smaller learning rate** with Simplified_AdEMAMix compared to AdamW (e.g., 1e-6 instead of 1e-4)
140
-
141
- ### 📊 Performance Validation
142
- Small Batch Training (SDXL, BS=2, 1.8K steps)
143
- ![Training Comparison](https://github.com/user-attachments/assets/7eff0671-cc59-47fc-8b63-d5205456d649)
144
-
145
- - **🟢 Prodigy_adv** (beta1=0.9, d0=1e-5): Final LR=2.9e-4
146
- - **🔵 Prodigy_adv + Simplified_AdEMAMix** (beta1=0.99, α=100, d0=1e-7): Final LR=5.8e-6
147
-
148
- **Results**:
149
- - Simplified_AdEMAMix shows faster convergence and better final performance
150
- - D-Adaptation automatically handles aggressive updates (50x smaller LR)
151
- - Generated samples show significantly better quality with Simplified_AdEMAMix
152
-
153
- ---
154
-
155
- ## ⚠️ Known Limitations
156
-
157
- ### 1. Prodigy_Adv Sensitivity
158
- - Highly sensitive to gradient modifications (Adopt normalization, low-rank factorization)
159
- - May fail to increase learning rate in some LoRA scenarios
160
- - **Fix**: Disable factorization or set beta1=0
161
-
162
- ### 2. Aggressive Learning Rates
163
- - Can destabilize factored first moment
164
- - **Recommendation**: Check Prodigy learning rate as reference for safe LR threshold
165
-
166
- ---
167
-
168
- ## 📚 References
169
-
170
- 1. [SMMF: Square-Matricized Momentum Factorization](https://arxiv.org/abs/2412.08894)
171
- 2. [The AdEMAMix Optimizer: Better, Faster, Older](https://arxiv.org/abs/2409.03137)
172
- 3. [Connections between Schedule-Free Optimizers, AdEMAMix, and Accelerated SGD Variants](https://arxiv.org/abs/2502.02431)
173
-
174
- ---
@@ -1,20 +0,0 @@
1
- adv_optm/__init__.py,sha256=lOHXiF0KmYmUnaQGIoUYeIxdEfYE8T1hFSVq5FVujDs,311
2
- adv_optm/optim/AdamW_adv.py,sha256=gVVpaKIbpv8pkfvfgVGCQN6No8A4atO7eRSPDBUVqq8,17490
3
- adv_optm/optim/Adopt_adv.py,sha256=K7z1iiln_HxuEPLl9yGtCngBfdZHxJISQ5dKgNBV-s4,21463
4
- adv_optm/optim/Lion_Prodigy_adv.py,sha256=aJ9orEEw0QYbrDzn1be0SHvOBlIkLwWG9RpWFuNMskM,13163
5
- adv_optm/optim/Lion_adv.py,sha256=aGNAplZlyXYgVllYcV_s4bK8iC4fv6EizFoWIMNLdBc,8299
6
- adv_optm/optim/Prodigy_adv.py,sha256=ecdnnbRgclcG49sGzxAmPHPE_0KkaQWtaiynsBYudoM,25979
7
- adv_optm/optim/Simplified_AdEMAMix.py,sha256=Cm-8tdCaTahdz45EExgn2W3a5Xl44T9MW-IMrUDbJFk,12983
8
- adv_optm/optim/__init__.py,sha256=pcP865H2j1tut2VfTUhzQh7V8TF_tzPjqFnjMfFed2k,382
9
- adv_optm/util/BF16_Stochastic_Rounding.py,sha256=Q5H0BcogmE4atP65dLoI21HKSf50lRdsBDfeF6v9Tbg,1548
10
- adv_optm/util/Effective_Shape.py,sha256=TBvIk1V8IuTbbBsxuekJA4e_v8JlR5Nujtut8RTWAm4,318
11
- adv_optm/util/Kourkoutas.py,sha256=DCsIcZ1sEeSwthN8KZH7OTKoIZJ3ah4t5DNiqxsSuCk,8344
12
- adv_optm/util/NNMF.py,sha256=yRf5IP5Sjq0Uf0DxN0Q8NxEGSdD-f1ULziLVDOjY8K4,639
13
- adv_optm/util/One_Bit_Boolean.py,sha256=Wat49esdwohuN-OHOFMW8D0aOQgV9cP5Rl8z6yfmpos,1068
14
- adv_optm/util/OrthoGrad.py,sha256=NzInuBQGy_Ja__M1R9XbvqVaQ0fhGbtGgFE9YON7B3I,707
15
- adv_optm/util/__init__.py,sha256=qoyIF0jcLjs_vSEcsv36clw5LFNBEbifyXrrVxMH-G4,349
16
- adv_optm-1.1.0.dev5.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
17
- adv_optm-1.1.0.dev5.dist-info/METADATA,sha256=2xyGCRbIN54aIuAWnRIpR49okoVgVJb2AGHl2-jgVx8,8427
18
- adv_optm-1.1.0.dev5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
19
- adv_optm-1.1.0.dev5.dist-info/top_level.txt,sha256=iNfBIIzu-lPrQ7jyC56WBCcbkRwitM2nJ15-MRQ_6fg,9
20
- adv_optm-1.1.0.dev5.dist-info/RECORD,,