ista-daslab-optimizers 1.1.5.tar.gz → 1.1.7.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ista_daslab_optimizers-1.1.5/ista_daslab_optimizers.egg-info → ista_daslab_optimizers-1.1.7}/PKG-INFO +27 -12
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/README.md +24 -10
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/__init__.py +2 -0
- ista_daslab_optimizers-1.1.7/ista_daslab_optimizers/ista_optimizer/__init__.py +5 -0
- ista_daslab_optimizers-1.1.7/ista_daslab_optimizers/ista_optimizer/ista_optimizer.py +36 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/tools.py +4 -1
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7/ista_daslab_optimizers.egg-info}/PKG-INFO +27 -12
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers.egg-info/SOURCES.txt +2 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers.egg-info/requires.txt +1 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/pyproject.toml +3 -1
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/LICENSE +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/MANIFEST.in +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/acdc/__init__.py +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/acdc/acdc.py +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/acdc/wd_scheduler.py +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/dense_mfac/__init__.py +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/dense_mfac/dense_core_mfac.py +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/dense_mfac/dense_mfac.py +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/micro_adam/__init__.py +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/micro_adam/micro_adam.py +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/sparse_mfac/__init__.py +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/sparse_mfac/sparse_core_mfac_w_ef.py +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/sparse_mfac/sparse_mfac.py +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers.egg-info/dependency_links.txt +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers.egg-info/top_level.txt +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/dense_mfac/dense_mfac.cpp +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/dense_mfac/dense_mfac_kernel.cu +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/micro_adam/micro_adam.cpp +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/micro_adam/micro_adam_asymm_block_quant.cu +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/micro_adam/micro_adam_asymm_block_quant_inv.cu +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/micro_adam/micro_adam_update.cu +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/sparse_mfac/sparse_mfac.cpp +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/sparse_mfac/sparse_mfac_LCG_kernel.cu +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/sparse_mfac/sparse_mfac_SP_kernel.cu +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/sparse_mfac_pruner/sparse_mfac_pruner.cpp +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/sparse_mfac_pruner/sparse_mfac_pruner.cu +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/tools/tools.cpp +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/tools/tools_kernel.cu +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/utils.h +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/setup.cfg +0 -0
- {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/setup.py +0 -0
{ista_daslab_optimizers-1.1.5/ista_daslab_optimizers.egg-info → ista_daslab_optimizers-1.1.7}/PKG-INFO RENAMED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.1
 Name: ista_daslab_optimizers
-Version: 1.1.5
+Version: 1.1.7
 Summary: Deep Learning optimizers developed in the Distributed Algorithms and Systems group (DASLab) @ Institute of Science and Technology Austria (ISTA)
 Author-email: Ionut-Vlad Modoranu <ionut-vlad.modoranu@ist.ac.at>
 Maintainer-email: Ionut-Vlad Modoranu <ionut-vlad.modoranu@ist.ac.at>
@@ -222,6 +222,7 @@ Requires-Dist: gpustat
 Requires-Dist: timm
 Requires-Dist: einops
 Requires-Dist: psutil
+Requires-Dist: fast-hadamard-transform
 
 # ISTA DAS Lab Optimization Algorithms Package
 This repository contains optimization algorithms for Deep Learning developed by
@@ -240,6 +241,9 @@ The repository contains code for the following optimizers published by DASLab @
 - **MicroAdam**:
   - paper: [MicroAdam: Accurate Adaptive Optimization with Low Space Overhead and Provable Convergence](https://arxiv.org/abs/2405.15593)
   - official repository: [GitHub](https://github.com/IST-DASLab/MicroAdam)
+- **Trion / DCT-AdamW**:
+  - paper: [FFT-based Dynamic Subspace Selection for Low-Rank Adaptive Optimization of Large Language Models](https://arxiv.org/abs/2505.17967v3)
+  - code: [GitHub](https://github.com/IST-DASLab/ISTA-DASLab-Optimizers/tree/main/ista_daslab_optimizers/fft_low_rank)
 
 ### Installation
 To use the latest stable version of this repository, you can install via pip:
@@ -261,7 +265,8 @@ source install.sh
 
 ## How to use optimizers?
 
-In this repository we provide a minimal working example for CIFAR-10 for optimizers `acdc`,
+In this repository we provide a minimal working example for CIFAR-10 for optimizers `acdc`,
+`dense_mfac`, `sparse_mfac` and `micro_adam`:
 ```shell
 cd examples/cifar10
 OPTIMIZER=micro_adam # or any other optimizer listed above
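For orientation, the optimizers in this package follow the standard `torch.optim.Optimizer` interface (the new `ISTAOptimizer` base class later in this diff subclasses it directly), so they drop into an ordinary PyTorch training loop. The sketch below uses `torch.optim.SGD` as a stand-in, since the exact constructor arguments of MicroAdam and the other DASLab optimizers are not reproduced in this diff; the toy model and synthetic batch are purely illustrative.

```python
# Minimal training-loop sketch. torch.optim.SGD stands in for a DASLab optimizer
# (e.g. MicroAdam), whose constructor arguments are not shown in this diff.
import torch
import torch.nn as nn

model = nn.Sequential(nn.Flatten(), nn.Linear(32 * 32 * 3, 10))  # toy CIFAR-10-sized model
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, weight_decay=1e-4)

# One synthetic batch; a real run would iterate over the CIFAR-10 DataLoader
# from the examples/cifar10 script mentioned above.
x = torch.randn(8, 3, 32, 32)
y = torch.randint(0, 10, (8,))

optimizer.zero_grad()
loss = criterion(model(x), y)
loss.backward()
optimizer.step()
print(loss.item())
```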
@@ -291,22 +296,32 @@ optimizer = MicroAdam(
 # Versions summary:
 
 ---
+- **1.1.7** @ October 8th, 2025:
+  - added code for `Trion & DCT-AdamW`
+- **1.1.6** @ February 19th, 2025:
+  - do not update the parameters that have `None` gradient in method `update_model` from `tools.py`.
+    This is useful when using M-FAC for models with more than one classification head in the Continual Learning framework.
 - **1.1.5** @ February 19th, 2025:
-  - adapted `DenseMFAC` for a model with multiple classification heads for Continual Learning where
+  - adapted `DenseMFAC` for a model with multiple classification heads for Continual Learning where
+    we have one feature extractor block and a list of classification heads. The issue was related to
+    the model size, which included the feature extractor backbone and all classification heads, but
+    in practice only one classification head will be used for training and inference. This caused some
+    size mismatch errors at runtime in the `DenseCoreMFAC` module because the gradient at runtime had
+    fewer entries than the entire model. When using `DenseMFAC` for such settings, set `optimizer.model_size`
+    to the correct size after calling the constructor and the `DenseCoreMFAC` object will be created
+    automatically in the `step` function.
 - **1.1.3** @ September 5th, 2024:
   - allow using `SparseCoreMFACwithEF` separately by importing it in `sparse_mfac.__init__.py`
 - **1.1.2** @ August 1st, 2024:
-  - ***[1.1.0]:*** added support to densify the final update: introduced parameter alpha that controls
-    (EF) to be integrated into the update to make it dense. Finally, the
-    the expense of another call to `Qinv` and `Q` (and
-
-
-
+  - ***[1.1.0]:*** added support to densify the final update: introduced parameter alpha that controls
+    the fraction of error feedback (EF) to be integrated into the update to make it dense. Finally, the
+    fraction alpha will be discarded from the EF at the expense of another call to `Qinv` and `Q` (and
+    implicitly quantization statistics computation).
+  - ***[1.0.2]:*** added FSDP-compatible implementation by initializing the parameter states in the
+    `update_step` method instead of MicroAdam constructor
 - **1.0.1** @ June 27th, 2024:
   - removed version in dependencies to avoid conflicts with llm-foundry
-
 - **1.0.0** @ June 20th, 2024:
   - changed minimum required Python version to 3.8+ and torch to 2.3.0+
-
 - **0.0.1** @ June 13th, 2024:
   - added initial version of the package for Python 3.9+ and torch 2.3.1+
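The 1.1.5 note above describes a manual sizing step for `DenseMFAC` in the multi-head continual-learning setting. A minimal sketch of that sizing, under an assumed backbone-plus-heads layout (the `DenseMFAC` constructor call is omitted here because its signature is not part of this diff), could look like:

```python
# Sketch of the sizing described in the 1.1.5 note. The model layout is hypothetical:
# one shared feature extractor and several task-specific classification heads.
import torch.nn as nn

backbone = nn.Sequential(nn.Flatten(), nn.Linear(784, 256), nn.ReLU())
heads = nn.ModuleList([nn.Linear(256, 10) for _ in range(5)])  # one head per task
active_task = 0

# Only the backbone plus the currently active head are trained, so the optimizer
# must be sized for these parameters, not for the full module with all heads.
trainable = list(backbone.parameters()) + list(heads[active_task].parameters())
model_size = sum(p.numel() for p in trainable)

# Per the 1.1.5 note, after constructing the optimizer one would set
#     optimizer.model_size = model_size
# and DenseCoreMFAC is then created with this size on the first step() call.
print(model_size)
```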
{ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/README.md RENAMED
@@ -15,6 +15,9 @@ The repository contains code for the following optimizers published by DASLab @
 - **MicroAdam**:
   - paper: [MicroAdam: Accurate Adaptive Optimization with Low Space Overhead and Provable Convergence](https://arxiv.org/abs/2405.15593)
   - official repository: [GitHub](https://github.com/IST-DASLab/MicroAdam)
+- **Trion / DCT-AdamW**:
+  - paper: [FFT-based Dynamic Subspace Selection for Low-Rank Adaptive Optimization of Large Language Models](https://arxiv.org/abs/2505.17967v3)
+  - code: [GitHub](https://github.com/IST-DASLab/ISTA-DASLab-Optimizers/tree/main/ista_daslab_optimizers/fft_low_rank)
 
 ### Installation
 To use the latest stable version of this repository, you can install via pip:
@@ -36,7 +39,8 @@ source install.sh
 
 ## How to use optimizers?
 
-In this repository we provide a minimal working example for CIFAR-10 for optimizers `acdc`,
+In this repository we provide a minimal working example for CIFAR-10 for optimizers `acdc`,
+`dense_mfac`, `sparse_mfac` and `micro_adam`:
 ```shell
 cd examples/cifar10
 OPTIMIZER=micro_adam # or any other optimizer listed above
@@ -66,22 +70,32 @@ optimizer = MicroAdam(
 # Versions summary:
 
 ---
+- **1.1.7** @ October 8th, 2025:
+  - added code for `Trion & DCT-AdamW`
+- **1.1.6** @ February 19th, 2025:
+  - do not update the parameters that have `None` gradient in method `update_model` from `tools.py`.
+    This is useful when using M-FAC for models with more than one classification head in the Continual Learning framework.
 - **1.1.5** @ February 19th, 2025:
-  - adapted `DenseMFAC` for a model with multiple classification heads for Continual Learning where
+  - adapted `DenseMFAC` for a model with multiple classification heads for Continual Learning where
+    we have one feature extractor block and a list of classification heads. The issue was related to
+    the model size, which included the feature extractor backbone and all classification heads, but
+    in practice only one classification head will be used for training and inference. This caused some
+    size mismatch errors at runtime in the `DenseCoreMFAC` module because the gradient at runtime had
+    fewer entries than the entire model. When using `DenseMFAC` for such settings, set `optimizer.model_size`
+    to the correct size after calling the constructor and the `DenseCoreMFAC` object will be created
+    automatically in the `step` function.
 - **1.1.3** @ September 5th, 2024:
   - allow using `SparseCoreMFACwithEF` separately by importing it in `sparse_mfac.__init__.py`
 - **1.1.2** @ August 1st, 2024:
-  - ***[1.1.0]:*** added support to densify the final update: introduced parameter alpha that controls
-    (EF) to be integrated into the update to make it dense. Finally, the
-    the expense of another call to `Qinv` and `Q` (and
-
-
-
+  - ***[1.1.0]:*** added support to densify the final update: introduced parameter alpha that controls
+    the fraction of error feedback (EF) to be integrated into the update to make it dense. Finally, the
+    fraction alpha will be discarded from the EF at the expense of another call to `Qinv` and `Q` (and
+    implicitly quantization statistics computation).
+  - ***[1.0.2]:*** added FSDP-compatible implementation by initializing the parameter states in the
+    `update_step` method instead of MicroAdam constructor
 - **1.0.1** @ June 27th, 2024:
   - removed version in dependencies to avoid conflicts with llm-foundry
-
 - **1.0.0** @ June 20th, 2024:
   - changed minimum required Python version to 3.8+ and torch to 2.3.0+
-
 - **0.0.1** @ June 13th, 2024:
   - added initial version of the package for Python 3.9+ and torch 2.3.1+
ista_daslab_optimizers-1.1.7/ista_daslab_optimizers/ista_optimizer/ista_optimizer.py ADDED
@@ -0,0 +1,36 @@
+import torch
+
+class ISTAOptimizer(torch.optim.Optimizer):
+    def __init__(self, params, lr, weight_decay):
+        super().__init__(params, dict(lr=lr, weight_decay=weight_decay))
+        self.lr = lr
+        self.weight_decay = weight_decay
+        self.optim_steps = 0
+
+    def loop_params(self, check_grad=True):
+        for group in self.param_groups:
+            for p in group['params']:
+                if check_grad:
+                    if p.grad is None: continue
+                yield group, self.state[p], p
+
+    @torch.no_grad()
+    def init_optimizer_states(self):
+        raise NotImplementedError
+
+    @torch.no_grad()
+    def optimizer_step(self):
+        raise NotImplementedError
+
+    @torch.no_grad()
+    def step(self, closure=None):
+        self.optim_steps += 1
+
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+
+        self.optimizer_step()
+
+        return loss
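To illustrate how the new base class is intended to be extended, here is a minimal, hypothetical subclass (a plain SGD rule with decoupled weight decay, not an optimizer shipped by the package) that uses only the hooks shown above: `loop_params`, `optimizer_step`, and the `lr`/`weight_decay` group entries. The import uses the full module path and assumes version 1.1.7 is installed; the re-exports added in the new `__init__.py` files are not shown in this diff.

```python
import torch
# Full module path; shorter imports may exist via the new __init__.py (not shown here).
from ista_daslab_optimizers.ista_optimizer.ista_optimizer import ISTAOptimizer

class ToySGD(ISTAOptimizer):
    """Hypothetical subclass: plain SGD with decoupled weight decay."""

    @torch.no_grad()
    def init_optimizer_states(self):
        # This toy rule keeps no per-parameter state, so there is nothing to initialize.
        pass

    @torch.no_grad()
    def optimizer_step(self):
        # loop_params() skips parameters whose gradient is None (check_grad=True by default).
        for group, _state, p in self.loop_params():
            if group['weight_decay'] > 0:
                p.mul_(1.0 - group['lr'] * group['weight_decay'])  # decoupled weight decay
            p.add_(p.grad, alpha=-group['lr'])                      # gradient descent step

# Toy usage: one linear layer, one synthetic batch, one optimizer step.
model = torch.nn.Linear(4, 2)
opt = ToySGD(model.parameters(), lr=0.1, weight_decay=0.01)
loss = model(torch.randn(8, 4)).pow(2).mean()
loss.backward()
opt.step()
```

Note that, in the base class shown above, `step()` calls `optimizer_step()` but not `init_optimizer_states()`; the latter is only a hook for subclasses that need per-parameter state.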
{ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/tools.py RENAMED
@@ -134,6 +134,8 @@ def update_model(params, update, weight_decay=0, alpha=None, multiply_wd_w_lr=Fa
         lr = group['lr']
         wd = group.get('weight_decay', weight_decay)  # if the param groups do not have weight decay, then use the externally provided one
         for p in group['params']:
+            if p.grad is None:
+                continue
             u = update[count:(count + p.numel())].reshape(p.shape).to(p.device)
             if wd > 0:
                 if multiply_wd_w_lr:
@@ -212,4 +214,5 @@ class KernelVersionsManager:
         return self.LCG_BLOCKS_THREADS[self.version_LCG][self.BLOCK_INDEX]
 
     def get_LCG_threads(self):
-        return self.LCG_BLOCKS_THREADS[self.version_LCG][self.THREAD_INDEX]
+        return self.LCG_BLOCKS_THREADS[self.version_LCG][self.THREAD_INDEX]
+
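The 1.1.6 change above guards `update_model` so that parameters without a gradient are skipped when a flat update vector is applied. A standalone sketch of that pattern (a hypothetical helper, not the package's `update_model` itself) on a toy two-head model:

```python
import torch
import torch.nn as nn

def apply_flat_update(params, update, lr):
    """Hypothetical helper mirroring the 1.1.6 pattern: walk a flat update vector
    across parameters and skip those whose gradient is None."""
    count = 0
    for p in params:
        if p.grad is None:
            # Unused parameters (e.g. inactive classification heads) consume no slice
            # of the update vector, which only covers parameters that got gradients.
            continue
        u = update[count:count + p.numel()].reshape(p.shape).to(p.device)
        with torch.no_grad():
            p.add_(u, alpha=-lr)
        count += p.numel()

# Toy setup: a shared backbone and two heads; only head_a receives gradients.
backbone, head_a, head_b = nn.Linear(4, 4), nn.Linear(4, 2), nn.Linear(4, 2)
head_a(backbone(torch.randn(3, 4))).sum().backward()  # head_b grads stay None

active = list(backbone.parameters()) + list(head_a.parameters())
flat_update = torch.cat([p.grad.reshape(-1) for p in active])
all_params = active + list(head_b.parameters())
apply_flat_update(all_params, flat_update, lr=0.1)
```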
{ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7/ista_daslab_optimizers.egg-info}/PKG-INFO RENAMED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.1
 Name: ista_daslab_optimizers
-Version: 1.1.5
+Version: 1.1.7
 Summary: Deep Learning optimizers developed in the Distributed Algorithms and Systems group (DASLab) @ Institute of Science and Technology Austria (ISTA)
 Author-email: Ionut-Vlad Modoranu <ionut-vlad.modoranu@ist.ac.at>
 Maintainer-email: Ionut-Vlad Modoranu <ionut-vlad.modoranu@ist.ac.at>
@@ -222,6 +222,7 @@ Requires-Dist: gpustat
 Requires-Dist: timm
 Requires-Dist: einops
 Requires-Dist: psutil
+Requires-Dist: fast-hadamard-transform
 
 # ISTA DAS Lab Optimization Algorithms Package
 This repository contains optimization algorithms for Deep Learning developed by
@@ -240,6 +241,9 @@ The repository contains code for the following optimizers published by DASLab @
 - **MicroAdam**:
   - paper: [MicroAdam: Accurate Adaptive Optimization with Low Space Overhead and Provable Convergence](https://arxiv.org/abs/2405.15593)
   - official repository: [GitHub](https://github.com/IST-DASLab/MicroAdam)
+- **Trion / DCT-AdamW**:
+  - paper: [FFT-based Dynamic Subspace Selection for Low-Rank Adaptive Optimization of Large Language Models](https://arxiv.org/abs/2505.17967v3)
+  - code: [GitHub](https://github.com/IST-DASLab/ISTA-DASLab-Optimizers/tree/main/ista_daslab_optimizers/fft_low_rank)
 
 ### Installation
 To use the latest stable version of this repository, you can install via pip:
@@ -261,7 +265,8 @@ source install.sh
 
 ## How to use optimizers?
 
-In this repository we provide a minimal working example for CIFAR-10 for optimizers `acdc`,
+In this repository we provide a minimal working example for CIFAR-10 for optimizers `acdc`,
+`dense_mfac`, `sparse_mfac` and `micro_adam`:
 ```shell
 cd examples/cifar10
 OPTIMIZER=micro_adam # or any other optimizer listed above
@@ -291,22 +296,32 @@ optimizer = MicroAdam(
 # Versions summary:
 
 ---
+- **1.1.7** @ October 8th, 2025:
+  - added code for `Trion & DCT-AdamW`
+- **1.1.6** @ February 19th, 2025:
+  - do not update the parameters that have `None` gradient in method `update_model` from `tools.py`.
+    This is useful when using M-FAC for models with more than one classification head in the Continual Learning framework.
 - **1.1.5** @ February 19th, 2025:
-  - adapted `DenseMFAC` for a model with multiple classification heads for Continual Learning where
+  - adapted `DenseMFAC` for a model with multiple classification heads for Continual Learning where
+    we have one feature extractor block and a list of classification heads. The issue was related to
+    the model size, which included the feature extractor backbone and all classification heads, but
+    in practice only one classification head will be used for training and inference. This caused some
+    size mismatch errors at runtime in the `DenseCoreMFAC` module because the gradient at runtime had
+    fewer entries than the entire model. When using `DenseMFAC` for such settings, set `optimizer.model_size`
+    to the correct size after calling the constructor and the `DenseCoreMFAC` object will be created
+    automatically in the `step` function.
 - **1.1.3** @ September 5th, 2024:
   - allow using `SparseCoreMFACwithEF` separately by importing it in `sparse_mfac.__init__.py`
 - **1.1.2** @ August 1st, 2024:
-  - ***[1.1.0]:*** added support to densify the final update: introduced parameter alpha that controls
-    (EF) to be integrated into the update to make it dense. Finally, the
-    the expense of another call to `Qinv` and `Q` (and
-
-
-
+  - ***[1.1.0]:*** added support to densify the final update: introduced parameter alpha that controls
+    the fraction of error feedback (EF) to be integrated into the update to make it dense. Finally, the
+    fraction alpha will be discarded from the EF at the expense of another call to `Qinv` and `Q` (and
+    implicitly quantization statistics computation).
+  - ***[1.0.2]:*** added FSDP-compatible implementation by initializing the parameter states in the
+    `update_step` method instead of MicroAdam constructor
 - **1.0.1** @ June 27th, 2024:
   - removed version in dependencies to avoid conflicts with llm-foundry
-
 - **1.0.0** @ June 20th, 2024:
   - changed minimum required Python version to 3.8+ and torch to 2.3.0+
-
 - **0.0.1** @ June 13th, 2024:
   - added initial version of the package for Python 3.9+ and torch 2.3.1+
{ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers.egg-info/SOURCES.txt RENAMED
@@ -27,6 +27,8 @@ ista_daslab_optimizers/acdc/wd_scheduler.py
 ista_daslab_optimizers/dense_mfac/__init__.py
 ista_daslab_optimizers/dense_mfac/dense_core_mfac.py
 ista_daslab_optimizers/dense_mfac/dense_mfac.py
+ista_daslab_optimizers/ista_optimizer/__init__.py
+ista_daslab_optimizers/ista_optimizer/ista_optimizer.py
 ista_daslab_optimizers/micro_adam/__init__.py
 ista_daslab_optimizers/micro_adam/micro_adam.py
 ista_daslab_optimizers/sparse_mfac/__init__.py
{ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/pyproject.toml RENAMED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name='ista_daslab_optimizers'
-version='1.1.5'
+version='1.1.7'
 dependencies = [
     "torch", # >=2.3.1",
     "torchaudio", # >=2.3.1",
@@ -15,6 +15,8 @@ dependencies = [
     "timm", # >=1.0.3",
     "einops", # >=0.7.0",
     "psutil", # >=5.9.8",
+    "fast-hadamard-transform",
+    # "fast-hadamard-transform @ git+https://github.com/Dao-AILab/fast-hadamard-transform.git",
 ]
 requires-python = '>= 3.8'
 authors = [
{ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/acdc/acdc.py RENAMED: file without changes
{ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/dense_mfac/dense_mfac.cpp RENAMED: file without changes
{ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/micro_adam/micro_adam.cpp RENAMED: file without changes
{ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/sparse_mfac/sparse_mfac.cpp RENAMED: file without changes
All other renamed files listed above with +0 -0 are without changes.