ista-daslab-optimizers 1.1.5__tar.gz → 1.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. {ista_daslab_optimizers-1.1.5/ista_daslab_optimizers.egg-info → ista_daslab_optimizers-1.1.7}/PKG-INFO +27 -12
  2. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/README.md +24 -10
  3. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/__init__.py +2 -0
  4. ista_daslab_optimizers-1.1.7/ista_daslab_optimizers/ista_optimizer/__init__.py +5 -0
  5. ista_daslab_optimizers-1.1.7/ista_daslab_optimizers/ista_optimizer/ista_optimizer.py +36 -0
  6. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/tools.py +4 -1
  7. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7/ista_daslab_optimizers.egg-info}/PKG-INFO +27 -12
  8. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers.egg-info/SOURCES.txt +2 -0
  9. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers.egg-info/requires.txt +1 -0
  10. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/pyproject.toml +3 -1
  11. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/LICENSE +0 -0
  12. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/MANIFEST.in +0 -0
  13. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/acdc/__init__.py +0 -0
  14. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/acdc/acdc.py +0 -0
  15. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/acdc/wd_scheduler.py +0 -0
  16. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/dense_mfac/__init__.py +0 -0
  17. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/dense_mfac/dense_core_mfac.py +0 -0
  18. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/dense_mfac/dense_mfac.py +0 -0
  19. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/micro_adam/__init__.py +0 -0
  20. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/micro_adam/micro_adam.py +0 -0
  21. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/sparse_mfac/__init__.py +0 -0
  22. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/sparse_mfac/sparse_core_mfac_w_ef.py +0 -0
  23. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/sparse_mfac/sparse_mfac.py +0 -0
  24. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers.egg-info/dependency_links.txt +0 -0
  25. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers.egg-info/top_level.txt +0 -0
  26. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/dense_mfac/dense_mfac.cpp +0 -0
  27. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/dense_mfac/dense_mfac_kernel.cu +0 -0
  28. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/micro_adam/micro_adam.cpp +0 -0
  29. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/micro_adam/micro_adam_asymm_block_quant.cu +0 -0
  30. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/micro_adam/micro_adam_asymm_block_quant_inv.cu +0 -0
  31. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/micro_adam/micro_adam_update.cu +0 -0
  32. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/sparse_mfac/sparse_mfac.cpp +0 -0
  33. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/sparse_mfac/sparse_mfac_LCG_kernel.cu +0 -0
  34. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/sparse_mfac/sparse_mfac_SP_kernel.cu +0 -0
  35. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/sparse_mfac_pruner/sparse_mfac_pruner.cpp +0 -0
  36. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/sparse_mfac_pruner/sparse_mfac_pruner.cu +0 -0
  37. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/tools/tools.cpp +0 -0
  38. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/tools/tools_kernel.cu +0 -0
  39. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/kernels/utils.h +0 -0
  40. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/setup.cfg +0 -0
  41. {ista_daslab_optimizers-1.1.5 → ista_daslab_optimizers-1.1.7}/setup.py +0 -0
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.1
  Name: ista_daslab_optimizers
- Version: 1.1.5
+ Version: 1.1.7
  Summary: Deep Learning optimizers developed in the Distributed Algorithms and Systems group (DASLab) @ Institute of Science and Technology Austria (ISTA)
  Author-email: Ionut-Vlad Modoranu <ionut-vlad.modoranu@ist.ac.at>
  Maintainer-email: Ionut-Vlad Modoranu <ionut-vlad.modoranu@ist.ac.at>
@@ -222,6 +222,7 @@ Requires-Dist: gpustat
  Requires-Dist: timm
  Requires-Dist: einops
  Requires-Dist: psutil
+ Requires-Dist: fast-hadamard-transform
 
  # ISTA DAS Lab Optimization Algorithms Package
  This repository contains optimization algorithms for Deep Learning developed by
@@ -240,6 +241,9 @@ The repository contains code for the following optimizers published by DASLab @
  - **MicroAdam**:
  - paper: [MicroAdam: Accurate Adaptive Optimization with Low Space Overhead and Provable Convergence](https://arxiv.org/abs/2405.15593)
  - official repository: [GitHub](https://github.com/IST-DASLab/MicroAdam)
+ - **Trion / DCT-AdamW**:
+ - paper: [FFT-based Dynamic Subspace Selection for Low-Rank Adaptive Optimization of Large Language Models](https://arxiv.org/abs/2505.17967v3)
+ - code: [GitHub](https://github.com/IST-DASLab/ISTA-DASLab-Optimizers/tree/main/ista_daslab_optimizers/fft_low_rank)
 
  ### Installation
  To use the latest stable version of this repository, you can install via pip:
@@ -261,7 +265,8 @@ source install.sh
 
  ## How to use optimizers?
 
- In this repository we provide a minimal working example for CIFAR-10 for optimizers `acdc`, `dense_mfac`, `sparse_mfac` and `micro_adam`:
+ In this repository we provide a minimal working example for CIFAR-10 for optimizers `acdc`,
+ `dense_mfac`, `sparse_mfac` and `micro_adam`:
  ```shell
  cd examples/cifar10
  OPTIMIZER=micro_adam # or any other optimizer listed above
@@ -291,22 +296,32 @@ optimizer = MicroAdam(
  # Versions summary:
 
  ---
+ - **1.1.7** @ October 8th, 2025:
+ - added code for `Trion & DCT-AdamW`
+ - **1.1.6** @ February 19th, 2025:
+ - do not update the parameters that have `None` gradient in method `update_model` from `tools.py`.
+ This is useful when using M-FAC for models with more than one classification head in the Continual Learning framework.
  - **1.1.5** @ February 19th, 2025:
- - adapted `DenseMFAC` for a model with multiple classification heads for Continual Learning where we have one feature extractor block and a list of classification heads. The issue was related to the model size, which included the feature extractor backbone and all classification heads, but in practice only one classification head will be used for training and inference. This caused some size mismatch errors at runtime in the `DenseCoreMFAC` module because the gradient at runtime had fewer entries than the entire model. When using `DenseMFAC` for such settings, set `optimizer.model_size` to the correct size after calling the constructor and the `DenseCoreMFAC` object will be created automatically in the `step` function.
+ - adapted `DenseMFAC` for a model with multiple classification heads for Continual Learning where
+ we have one feature extractor block and a list of classification heads. The issue was related to
+ the model size, which included the feature extractor backbone and all classification heads, but
+ in practice only one classification head will be used for training and inference. This caused some
+ size mismatch errors at runtime in the `DenseCoreMFAC` module because the gradient at runtime had
+ fewer entries than the entire model. When using `DenseMFAC` for such settings, set `optimizer.model_size`
+ to the correct size after calling the constructor and the `DenseCoreMFAC` object will be created
+ automatically in the `step` function.
  - **1.1.3** @ September 5th, 2024:
  - allow using `SparseCoreMFACwithEF` separately by importing it in `sparse_mfac.__init__.py`
  - **1.1.2** @ August 1st, 2024:
- - ***[1.1.0]:*** added support to densify the final update: introduced parameter alpha that controls the fraction of error feedback
- (EF) to be integrated into the update to make it dense. Finally, the fraction alpha will be discarded from the EF at
- the expense of another call to `Qinv` and `Q` (and implicitly quantization statistics computation).
- - ***[1.0.2]:*** added FSDP-compatible implementation by initializing the parameter states in the `update_step` method
- instead of MicroAdam constructor
-
+ - ***[1.1.0]:*** added support to densify the final update: introduced parameter alpha that controls
+ the fraction of error feedback (EF) to be integrated into the update to make it dense. Finally, the
+ fraction alpha will be discarded from the EF at the expense of another call to `Qinv` and `Q` (and
+ implicitly quantization statistics computation).
+ - ***[1.0.2]:*** added FSDP-compatible implementation by initializing the parameter states in the
+ `update_step` method instead of MicroAdam constructor
  - **1.0.1** @ June 27th, 2024:
  - removed version in dependencies to avoid conflicts with llm-foundry
-
  - **1.0.0** @ June 20th, 2024:
  - changed minimum required Python version to 3.8+ and torch to 2.3.0+
-
  - **0.0.1** @ June 13th, 2024:
  - added initial version of the package for Python 3.9+ and torch 2.3.1+
@@ -15,6 +15,9 @@ The repository contains code for the following optimizers published by DASLab @
  - **MicroAdam**:
  - paper: [MicroAdam: Accurate Adaptive Optimization with Low Space Overhead and Provable Convergence](https://arxiv.org/abs/2405.15593)
  - official repository: [GitHub](https://github.com/IST-DASLab/MicroAdam)
+ - **Trion / DCT-AdamW**:
+ - paper: [FFT-based Dynamic Subspace Selection for Low-Rank Adaptive Optimization of Large Language Models](https://arxiv.org/abs/2505.17967v3)
+ - code: [GitHub](https://github.com/IST-DASLab/ISTA-DASLab-Optimizers/tree/main/ista_daslab_optimizers/fft_low_rank)
 
  ### Installation
  To use the latest stable version of this repository, you can install via pip:
@@ -36,7 +39,8 @@ source install.sh
 
  ## How to use optimizers?
 
- In this repository we provide a minimal working example for CIFAR-10 for optimizers `acdc`, `dense_mfac`, `sparse_mfac` and `micro_adam`:
+ In this repository we provide a minimal working example for CIFAR-10 for optimizers `acdc`,
+ `dense_mfac`, `sparse_mfac` and `micro_adam`:
  ```shell
  cd examples/cifar10
  OPTIMIZER=micro_adam # or any other optimizer listed above
@@ -66,22 +70,32 @@ optimizer = MicroAdam(
  # Versions summary:
 
  ---
+ - **1.1.7** @ October 8th, 2025:
+ - added code for `Trion & DCT-AdamW`
+ - **1.1.6** @ February 19th, 2025:
+ - do not update the parameters that have `None` gradient in method `update_model` from `tools.py`.
+ This is useful when using M-FAC for models with more than one classification head in the Continual Learning framework.
  - **1.1.5** @ February 19th, 2025:
- - adapted `DenseMFAC` for a model with multiple classification heads for Continual Learning where we have one feature extractor block and a list of classification heads. The issue was related to the model size, which included the feature extractor backbone and all classification heads, but in practice only one classification head will be used for training and inference. This caused some size mismatch errors at runtime in the `DenseCoreMFAC` module because the gradient at runtime had fewer entries than the entire model. When using `DenseMFAC` for such settings, set `optimizer.model_size` to the correct size after calling the constructor and the `DenseCoreMFAC` object will be created automatically in the `step` function.
+ - adapted `DenseMFAC` for a model with multiple classification heads for Continual Learning where
+ we have one feature extractor block and a list of classification heads. The issue was related to
+ the model size, which included the feature extractor backbone and all classification heads, but
+ in practice only one classification head will be used for training and inference. This caused some
+ size mismatch errors at runtime in the `DenseCoreMFAC` module because the gradient at runtime had
+ fewer entries than the entire model. When using `DenseMFAC` for such settings, set `optimizer.model_size`
+ to the correct size after calling the constructor and the `DenseCoreMFAC` object will be created
+ automatically in the `step` function.
  - **1.1.3** @ September 5th, 2024:
  - allow using `SparseCoreMFACwithEF` separately by importing it in `sparse_mfac.__init__.py`
  - **1.1.2** @ August 1st, 2024:
- - ***[1.1.0]:*** added support to densify the final update: introduced parameter alpha that controls the fraction of error feedback
- (EF) to be integrated into the update to make it dense. Finally, the fraction alpha will be discarded from the EF at
- the expense of another call to `Qinv` and `Q` (and implicitly quantization statistics computation).
- - ***[1.0.2]:*** added FSDP-compatible implementation by initializing the parameter states in the `update_step` method
- instead of MicroAdam constructor
-
+ - ***[1.1.0]:*** added support to densify the final update: introduced parameter alpha that controls
+ the fraction of error feedback (EF) to be integrated into the update to make it dense. Finally, the
+ fraction alpha will be discarded from the EF at the expense of another call to `Qinv` and `Q` (and
+ implicitly quantization statistics computation).
+ - ***[1.0.2]:*** added FSDP-compatible implementation by initializing the parameter states in the
+ `update_step` method instead of MicroAdam constructor
  - **1.0.1** @ June 27th, 2024:
  - removed version in dependencies to avoid conflicts with llm-foundry
-
  - **1.0.0** @ June 20th, 2024:
  - changed minimum required Python version to 3.8+ and torch to 2.3.0+
-
  - **0.0.1** @ June 13th, 2024:
  - added initial version of the package for Python 3.9+ and torch 2.3.1+
@@ -2,3 +2,5 @@ from .acdc import *
  from .micro_adam import *
  from .sparse_mfac import *
  from .dense_mfac import *
+ from .fft_low_rank.trion import Trion
+ from .fft_low_rank.dct_adamw import DCTAdamW
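The `__init__.py` change above re-exports the two new optimizers at the package root. A minimal, hedged usage sketch (the constructor hyper-parameters are assumptions and appear nowhere in this diff; the sketch also assumes the `fft_low_rank` modules referenced by these imports are included in the installed package):

```python
# The package root now re-exports the FFT low-rank optimizers added in 1.1.7.
from ista_daslab_optimizers import Trion, DCTAdamW

# Constructor arguments are not part of this diff; as with any torch.optim.Optimizer
# subclass, the usual training-loop pattern would apply (hypothetical hyper-parameters):
# optimizer = DCTAdamW(model.parameters(), lr=1e-3, weight_decay=0.1)
# loss.backward(); optimizer.step(); optimizer.zero_grad()
```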
@@ -0,0 +1,5 @@
+ from .ista_optimizer import ISTAOptimizer
+
+ __all__ = [
+     'ISTAOptimizer'
+ ]
@@ -0,0 +1,36 @@
+ import torch
+
+ class ISTAOptimizer(torch.optim.Optimizer):
+     def __init__(self, params, lr, weight_decay):
+         super().__init__(params, dict(lr=lr, weight_decay=weight_decay))
+         self.lr = lr
+         self.weight_decay = weight_decay
+         self.optim_steps = 0
+
+     def loop_params(self, check_grad=True):
+         for group in self.param_groups:
+             for p in group['params']:
+                 if check_grad:
+                     if p.grad is None: continue
+                 yield group, self.state[p], p
+
+     @torch.no_grad()
+     def init_optimizer_states(self):
+         raise NotImplementedError
+
+     @torch.no_grad()
+     def optimizer_step(self):
+         raise NotImplementedError
+
+     @torch.no_grad()
+     def step(self, closure=None):
+         self.optim_steps += 1
+
+         loss = None
+         if closure is not None:
+             with torch.enable_grad():
+                 loss = closure()
+
+         self.optimizer_step()
+
+         return loss
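The new `ISTAOptimizer` base class handles closure evaluation and step counting, and leaves `init_optimizer_states` and `optimizer_step` to subclasses; `loop_params` yields `(group, state, param)` triples and by default skips parameters without gradients. A rough sketch of that contract follows (the `PlainSGD` name, its hyper-parameters, and the lazy call to `init_optimizer_states` on the first step are assumptions for illustration, not part of the package):

```python
import torch
from ista_daslab_optimizers.ista_optimizer import ISTAOptimizer

class PlainSGD(ISTAOptimizer):
    """Hypothetical subclass, shown only to illustrate the base-class contract."""

    @torch.no_grad()
    def init_optimizer_states(self):
        # Allocate per-parameter state; here a simple momentum buffer.
        for group, state, p in self.loop_params(check_grad=False):
            state['momentum'] = torch.zeros_like(p)

    @torch.no_grad()
    def optimizer_step(self):
        # The base class never calls init_optimizer_states() itself, so this sketch
        # assumes subclasses trigger it lazily on the first step.
        if self.optim_steps == 1:
            self.init_optimizer_states()
        for group, state, p in self.loop_params():  # skips params whose grad is None
            buf = state['momentum']
            buf.mul_(0.9).add_(p.grad)
            if group['weight_decay'] > 0:
                p.mul_(1 - group['lr'] * group['weight_decay'])
            p.add_(buf, alpha=-group['lr'])

# Usage mirrors any torch optimizer:
# opt = PlainSGD(model.parameters(), lr=0.1, weight_decay=5e-4)
# loss.backward(); opt.step(); opt.zero_grad()
```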
@@ -134,6 +134,8 @@ def update_model(params, update, weight_decay=0, alpha=None, multiply_wd_w_lr=Fa
      lr = group['lr']
      wd = group.get('weight_decay', weight_decay) # if the param groups do not have weight decay, then use the externally provided one
      for p in group['params']:
+         if p.grad is None:
+             continue
          u = update[count:(count + p.numel())].reshape(p.shape).to(p.device)
          if wd > 0:
              if multiply_wd_w_lr:
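For context, the hunk above makes `update_model` skip parameters whose gradient is `None` (e.g. inactive classification heads in the Continual Learning setting described by the 1.1.6 changelog entry). Below is a standalone illustrative sketch of that pattern; `apply_flat_update` is a hypothetical helper, not the package's `update_model`, and it assumes the flat update vector only covers parameters that actually received gradients:

```python
import torch

def apply_flat_update(params, update, lr):
    """Apply a flat update vector to params, skipping those without gradients."""
    count = 0
    for p in params:
        if p.grad is None:   # e.g. an inactive classification head
            continue         # its entries are assumed absent from `update`
        u = update[count:count + p.numel()].reshape(p.shape).to(p.device)
        p.data.add_(u, alpha=-lr)
        count += p.numel()

# Two "heads": only the first one received gradients this step.
active = torch.nn.Parameter(torch.zeros(3)); active.grad = torch.ones(3)
frozen = torch.nn.Parameter(torch.zeros(2))   # grad stays None
apply_flat_update([active, frozen], update=torch.ones(3), lr=0.1)
print(active)  # updated
print(frozen)  # untouched
```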
@@ -212,4 +214,5 @@ class KernelVersionsManager:
          return self.LCG_BLOCKS_THREADS[self.version_LCG][self.BLOCK_INDEX]
 
      def get_LCG_threads(self):
-         return self.LCG_BLOCKS_THREADS[self.version_LCG][self.THREAD_INDEX]
+         return self.LCG_BLOCKS_THREADS[self.version_LCG][self.THREAD_INDEX]
+
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.1
  Name: ista_daslab_optimizers
- Version: 1.1.5
+ Version: 1.1.7
  Summary: Deep Learning optimizers developed in the Distributed Algorithms and Systems group (DASLab) @ Institute of Science and Technology Austria (ISTA)
  Author-email: Ionut-Vlad Modoranu <ionut-vlad.modoranu@ist.ac.at>
  Maintainer-email: Ionut-Vlad Modoranu <ionut-vlad.modoranu@ist.ac.at>
@@ -222,6 +222,7 @@ Requires-Dist: gpustat
  Requires-Dist: timm
  Requires-Dist: einops
  Requires-Dist: psutil
+ Requires-Dist: fast-hadamard-transform
 
  # ISTA DAS Lab Optimization Algorithms Package
  This repository contains optimization algorithms for Deep Learning developed by
@@ -240,6 +241,9 @@ The repository contains code for the following optimizers published by DASLab @
  - **MicroAdam**:
  - paper: [MicroAdam: Accurate Adaptive Optimization with Low Space Overhead and Provable Convergence](https://arxiv.org/abs/2405.15593)
  - official repository: [GitHub](https://github.com/IST-DASLab/MicroAdam)
+ - **Trion / DCT-AdamW**:
+ - paper: [FFT-based Dynamic Subspace Selection for Low-Rank Adaptive Optimization of Large Language Models](https://arxiv.org/abs/2505.17967v3)
+ - code: [GitHub](https://github.com/IST-DASLab/ISTA-DASLab-Optimizers/tree/main/ista_daslab_optimizers/fft_low_rank)
 
  ### Installation
  To use the latest stable version of this repository, you can install via pip:
@@ -261,7 +265,8 @@ source install.sh
 
  ## How to use optimizers?
 
- In this repository we provide a minimal working example for CIFAR-10 for optimizers `acdc`, `dense_mfac`, `sparse_mfac` and `micro_adam`:
+ In this repository we provide a minimal working example for CIFAR-10 for optimizers `acdc`,
+ `dense_mfac`, `sparse_mfac` and `micro_adam`:
  ```shell
  cd examples/cifar10
  OPTIMIZER=micro_adam # or any other optimizer listed above
@@ -291,22 +296,32 @@ optimizer = MicroAdam(
  # Versions summary:
 
  ---
+ - **1.1.7** @ October 8th, 2025:
+ - added code for `Trion & DCT-AdamW`
+ - **1.1.6** @ February 19th, 2025:
+ - do not update the parameters that have `None` gradient in method `update_model` from `tools.py`.
+ This is useful when using M-FAC for models with more than one classification head in the Continual Learning framework.
  - **1.1.5** @ February 19th, 2025:
- - adapted `DenseMFAC` for a model with multiple classification heads for Continual Learning where we have one feature extractor block and a list of classification heads. The issue was related to the model size, which included the feature extractor backbone and all classification heads, but in practice only one classification head will be used for training and inference. This caused some size mismatch errors at runtime in the `DenseCoreMFAC` module because the gradient at runtime had fewer entries than the entire model. When using `DenseMFAC` for such settings, set `optimizer.model_size` to the correct size after calling the constructor and the `DenseCoreMFAC` object will be created automatically in the `step` function.
+ - adapted `DenseMFAC` for a model with multiple classification heads for Continual Learning where
+ we have one feature extractor block and a list of classification heads. The issue was related to
+ the model size, which included the feature extractor backbone and all classification heads, but
+ in practice only one classification head will be used for training and inference. This caused some
+ size mismatch errors at runtime in the `DenseCoreMFAC` module because the gradient at runtime had
+ fewer entries than the entire model. When using `DenseMFAC` for such settings, set `optimizer.model_size`
+ to the correct size after calling the constructor and the `DenseCoreMFAC` object will be created
+ automatically in the `step` function.
  - **1.1.3** @ September 5th, 2024:
  - allow using `SparseCoreMFACwithEF` separately by importing it in `sparse_mfac.__init__.py`
  - **1.1.2** @ August 1st, 2024:
- - ***[1.1.0]:*** added support to densify the final update: introduced parameter alpha that controls the fraction of error feedback
- (EF) to be integrated into the update to make it dense. Finally, the fraction alpha will be discarded from the EF at
- the expense of another call to `Qinv` and `Q` (and implicitly quantization statistics computation).
- - ***[1.0.2]:*** added FSDP-compatible implementation by initializing the parameter states in the `update_step` method
- instead of MicroAdam constructor
-
+ - ***[1.1.0]:*** added support to densify the final update: introduced parameter alpha that controls
+ the fraction of error feedback (EF) to be integrated into the update to make it dense. Finally, the
+ fraction alpha will be discarded from the EF at the expense of another call to `Qinv` and `Q` (and
+ implicitly quantization statistics computation).
+ - ***[1.0.2]:*** added FSDP-compatible implementation by initializing the parameter states in the
+ `update_step` method instead of MicroAdam constructor
  - **1.0.1** @ June 27th, 2024:
  - removed version in dependencies to avoid conflicts with llm-foundry
-
  - **1.0.0** @ June 20th, 2024:
  - changed minimum required Python version to 3.8+ and torch to 2.3.0+
-
  - **0.0.1** @ June 13th, 2024:
  - added initial version of the package for Python 3.9+ and torch 2.3.1+
@@ -27,6 +27,8 @@ ista_daslab_optimizers/acdc/wd_scheduler.py
  ista_daslab_optimizers/dense_mfac/__init__.py
  ista_daslab_optimizers/dense_mfac/dense_core_mfac.py
  ista_daslab_optimizers/dense_mfac/dense_mfac.py
+ ista_daslab_optimizers/ista_optimizer/__init__.py
+ ista_daslab_optimizers/ista_optimizer/ista_optimizer.py
  ista_daslab_optimizers/micro_adam/__init__.py
  ista_daslab_optimizers/micro_adam/micro_adam.py
  ista_daslab_optimizers/sparse_mfac/__init__.py
@@ -7,3 +7,4 @@ gpustat
  timm
  einops
  psutil
+ fast-hadamard-transform
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
  [project]
  name='ista_daslab_optimizers'
- version='1.1.5'
+ version='1.1.7'
  dependencies = [
  "torch", # >=2.3.1",
  "torchaudio", # >=2.3.1",
@@ -15,6 +15,8 @@ dependencies = [
  "timm", # >=1.0.3",
  "einops", # >=0.7.0",
  "psutil", # >=5.9.8",
+ "fast-hadamard-transform",
+ # "fast-hadamard-transform @ git+https://github.com/Dao-AILab/fast-hadamard-transform.git",
  ]
  requires-python = '>= 3.8'
  authors = [