ista-daslab-optimizers 1.1.6__tar.gz → 1.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. {ista_daslab_optimizers-1.1.6/ista_daslab_optimizers.egg-info → ista_daslab_optimizers-1.1.7}/PKG-INFO +26 -10
  2. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/README.md +23 -8
  3. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/__init__.py +2 -0
  4. ista_daslab_optimizers-1.1.7/ista_daslab_optimizers/ista_optimizer/__init__.py +5 -0
  5. ista_daslab_optimizers-1.1.7/ista_daslab_optimizers/ista_optimizer/ista_optimizer.py +36 -0
  6. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/tools.py +2 -1
  7. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7/ista_daslab_optimizers.egg-info}/PKG-INFO +26 -10
  8. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers.egg-info/SOURCES.txt +2 -0
  9. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers.egg-info/requires.txt +1 -0
  10. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/pyproject.toml +3 -1
  11. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/LICENSE +0 -0
  12. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/MANIFEST.in +0 -0
  13. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/acdc/__init__.py +0 -0
  14. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/acdc/acdc.py +0 -0
  15. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/acdc/wd_scheduler.py +0 -0
  16. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/dense_mfac/__init__.py +0 -0
  17. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/dense_mfac/dense_core_mfac.py +0 -0
  18. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/dense_mfac/dense_mfac.py +0 -0
  19. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/micro_adam/__init__.py +0 -0
  20. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/micro_adam/micro_adam.py +0 -0
  21. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/sparse_mfac/__init__.py +0 -0
  22. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/sparse_mfac/sparse_core_mfac_w_ef.py +0 -0
  23. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers/sparse_mfac/sparse_mfac.py +0 -0
  24. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers.egg-info/dependency_links.txt +0 -0
  25. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/ista_daslab_optimizers.egg-info/top_level.txt +0 -0
  26. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/kernels/dense_mfac/dense_mfac.cpp +0 -0
  27. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/kernels/dense_mfac/dense_mfac_kernel.cu +0 -0
  28. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/kernels/micro_adam/micro_adam.cpp +0 -0
  29. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/kernels/micro_adam/micro_adam_asymm_block_quant.cu +0 -0
  30. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/kernels/micro_adam/micro_adam_asymm_block_quant_inv.cu +0 -0
  31. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/kernels/micro_adam/micro_adam_update.cu +0 -0
  32. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/kernels/sparse_mfac/sparse_mfac.cpp +0 -0
  33. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/kernels/sparse_mfac/sparse_mfac_LCG_kernel.cu +0 -0
  34. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/kernels/sparse_mfac/sparse_mfac_SP_kernel.cu +0 -0
  35. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/kernels/sparse_mfac_pruner/sparse_mfac_pruner.cpp +0 -0
  36. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/kernels/sparse_mfac_pruner/sparse_mfac_pruner.cu +0 -0
  37. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/kernels/tools/tools.cpp +0 -0
  38. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/kernels/tools/tools_kernel.cu +0 -0
  39. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/kernels/utils.h +0 -0
  40. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/setup.cfg +0 -0
  41. {ista_daslab_optimizers-1.1.6 → ista_daslab_optimizers-1.1.7}/setup.py +0 -0

--- ista_daslab_optimizers-1.1.6/ista_daslab_optimizers.egg-info/PKG-INFO
+++ ista_daslab_optimizers-1.1.7/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.1
 Name: ista_daslab_optimizers
-Version: 1.1.6
+Version: 1.1.7
 Summary: Deep Learning optimizers developed in the Distributed Algorithms and Systems group (DASLab) @ Institute of Science and Technology Austria (ISTA)
 Author-email: Ionut-Vlad Modoranu <ionut-vlad.modoranu@ist.ac.at>
 Maintainer-email: Ionut-Vlad Modoranu <ionut-vlad.modoranu@ist.ac.at>
@@ -222,6 +222,7 @@ Requires-Dist: gpustat
 Requires-Dist: timm
 Requires-Dist: einops
 Requires-Dist: psutil
+Requires-Dist: fast-hadamard-transform
 
 # ISTA DAS Lab Optimization Algorithms Package
 This repository contains optimization algorithms for Deep Learning developed by
@@ -240,6 +241,9 @@ The repository contains code for the following optimizers published by DASLab @
 - **MicroAdam**:
   - paper: [MicroAdam: Accurate Adaptive Optimization with Low Space Overhead and Provable Convergence](https://arxiv.org/abs/2405.15593)
   - official repository: [GitHub](https://github.com/IST-DASLab/MicroAdam)
+- **Trion / DCT-AdamW**:
+  - paper: [FFT-based Dynamic Subspace Selection for Low-Rank Adaptive Optimization of Large Language Models](https://arxiv.org/abs/2505.17967v3)
+  - code: [GitHub](https://github.com/IST-DASLab/ISTA-DASLab-Optimizers/tree/main/ista_daslab_optimizers/fft_low_rank)
 
 ### Installation
 To use the latest stable version of this repository, you can install via pip:
@@ -261,7 +265,8 @@ source install.sh
 
 ## How to use optimizers?
 
-In this repository we provide a minimal working example for CIFAR-10 for optimizers `acdc`, `dense_mfac`, `sparse_mfac` and `micro_adam`:
+In this repository we provide a minimal working example for CIFAR-10 for optimizers `acdc`,
+`dense_mfac`, `sparse_mfac` and `micro_adam`:
 ```shell
 cd examples/cifar10
 OPTIMIZER=micro_adam # or any other optimizer listed above
@@ -291,18 +296,29 @@ optimizer = MicroAdam(
 # Versions summary:
 
 ---
+- **1.1.7** @ October 8th, 2025:
+  - added code for `Trion & DCT-AdamW`
 - **1.1.6** @ February 19th, 2025:
-  - do not update the parameters that have `None` gradient in method `update_model` from `tools.py`. This is useful when using M-FAC for models with more than one classification head in the Continual Learning framework
+  - do not update the parameters that have `None` gradient in method `update_model` from `tools.py`.
+    This is useful when using M-FAC for models with more than one classification head in the Continual Learning framework.
 - **1.1.5** @ February 19th, 2025:
-  - adapted `DenseMFAC` for a model with multiple classification heads for Continual Learning where we have one feature extractor block and a list of classification heads. The issue was related to the model size, which included the feature extractor backbone and all classification heads, but in practice only one classification head will be used for training and inference. This caused some size mismatch errors at runtime in the `DenseCoreMFAC` module because the gradient at runtime had fewer entries than the entire model. When using `DenseMFAC` for such settings, set `optimizer.model_size` to the correct size after calling the constructor and the `DenseCoreMFAC` object will be created automatically in the `step` function.
+  - adapted `DenseMFAC` for a model with multiple classification heads for Continual Learning where
+    we have one feature extractor block and a list of classification heads. The issue was related to
+    the model size, which included the feature extractor backbone and all classification heads, but
+    in practice only one classification head will be used for training and inference. This caused some
+    size mismatch errors at runtime in the `DenseCoreMFAC` module because the gradient at runtime had
+    fewer entries than the entire model. When using `DenseMFAC` for such settings, set `optimizer.model_size`
+    to the correct size after calling the constructor and the `DenseCoreMFAC` object will be created
+    automatically in the `step` function.
 - **1.1.3** @ September 5th, 2024:
   - allow using `SparseCoreMFACwithEF` separately by importing it in `sparse_mfac.__init__.py`
 - **1.1.2** @ August 1st, 2024:
-  - ***[1.1.0]:*** added support to densify the final update: introduced parameter alpha that controls the fraction of error feedback
-    (EF) to be integrated into the update to make it dense. Finally, the fraction alpha will be discarded from the EF at
-    the expense of another call to `Qinv` and `Q` (and implicitly quantization statistics computation).
-  - ***[1.0.2]:*** added FSDP-compatible implementation by initializing the parameter states in the `update_step` method
-    instead of MicroAdam constructor
+  - ***[1.1.0]:*** added support to densify the final update: introduced parameter alpha that controls
+    the fraction of error feedback (EF) to be integrated into the update to make it dense. Finally, the
+    fraction alpha will be discarded from the EF at the expense of another call to `Qinv` and `Q` (and
+    implicitly quantization statistics computation).
+  - ***[1.0.2]:*** added FSDP-compatible implementation by initializing the parameter states in the
+    `update_step` method instead of MicroAdam constructor
 - **1.0.1** @ June 27th, 2024:
   - removed version in dependencies to avoid conflicts with llm-foundry
 - **1.0.0** @ June 20th, 2024:
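
The 1.1.5 note above says to override `optimizer.model_size` when only a subset of a Continual Learning model (one backbone plus the currently active head) actually receives gradients. Below is a minimal, hedged sketch of that setup; the `DenseMFAC` constructor call is left commented out because its signature is not shown in this diff, and only the `model_size` assignment reflects what the changelog documents.

```python
import torch.nn as nn

# One shared feature extractor plus one classification head per task.
backbone = nn.Sequential(nn.Linear(32, 64), nn.ReLU())
heads = nn.ModuleList([nn.Linear(64, 10) for _ in range(3)])
active_head = heads[0]  # only this head is trained for the current task

trainable = list(backbone.parameters()) + list(active_head.parameters())
model_size = sum(p.numel() for p in trainable)  # entries the runtime gradient will actually have

# Hypothetical construction (constructor arguments are not part of this diff):
# optimizer = DenseMFAC(trainable, ...)
# optimizer.model_size = model_size  # per the 1.1.5 note; DenseCoreMFAC is then built inside step()
```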

--- ista_daslab_optimizers-1.1.6/README.md
+++ ista_daslab_optimizers-1.1.7/README.md
@@ -15,6 +15,9 @@ The repository contains code for the following optimizers published by DASLab @
 - **MicroAdam**:
   - paper: [MicroAdam: Accurate Adaptive Optimization with Low Space Overhead and Provable Convergence](https://arxiv.org/abs/2405.15593)
   - official repository: [GitHub](https://github.com/IST-DASLab/MicroAdam)
+- **Trion / DCT-AdamW**:
+  - paper: [FFT-based Dynamic Subspace Selection for Low-Rank Adaptive Optimization of Large Language Models](https://arxiv.org/abs/2505.17967v3)
+  - code: [GitHub](https://github.com/IST-DASLab/ISTA-DASLab-Optimizers/tree/main/ista_daslab_optimizers/fft_low_rank)
 
 ### Installation
 To use the latest stable version of this repository, you can install via pip:
@@ -36,7 +39,8 @@ source install.sh
 
 ## How to use optimizers?
 
-In this repository we provide a minimal working example for CIFAR-10 for optimizers `acdc`, `dense_mfac`, `sparse_mfac` and `micro_adam`:
+In this repository we provide a minimal working example for CIFAR-10 for optimizers `acdc`,
+`dense_mfac`, `sparse_mfac` and `micro_adam`:
 ```shell
 cd examples/cifar10
 OPTIMIZER=micro_adam # or any other optimizer listed above
@@ -66,18 +70,29 @@ optimizer = MicroAdam(
 # Versions summary:
 
 ---
+- **1.1.7** @ October 8th, 2025:
+  - added code for `Trion & DCT-AdamW`
 - **1.1.6** @ February 19th, 2025:
-  - do not update the parameters that have `None` gradient in method `update_model` from `tools.py`. This is useful when using M-FAC for models with more than one classification head in the Continual Learning framework
+  - do not update the parameters that have `None` gradient in method `update_model` from `tools.py`.
+    This is useful when using M-FAC for models with more than one classification head in the Continual Learning framework.
 - **1.1.5** @ February 19th, 2025:
-  - adapted `DenseMFAC` for a model with multiple classification heads for Continual Learning where we have one feature extractor block and a list of classification heads. The issue was related to the model size, which included the feature extractor backbone and all classification heads, but in practice only one classification head will be used for training and inference. This caused some size mismatch errors at runtime in the `DenseCoreMFAC` module because the gradient at runtime had fewer entries than the entire model. When using `DenseMFAC` for such settings, set `optimizer.model_size` to the correct size after calling the constructor and the `DenseCoreMFAC` object will be created automatically in the `step` function.
+  - adapted `DenseMFAC` for a model with multiple classification heads for Continual Learning where
+    we have one feature extractor block and a list of classification heads. The issue was related to
+    the model size, which included the feature extractor backbone and all classification heads, but
+    in practice only one classification head will be used for training and inference. This caused some
+    size mismatch errors at runtime in the `DenseCoreMFAC` module because the gradient at runtime had
+    fewer entries than the entire model. When using `DenseMFAC` for such settings, set `optimizer.model_size`
+    to the correct size after calling the constructor and the `DenseCoreMFAC` object will be created
+    automatically in the `step` function.
 - **1.1.3** @ September 5th, 2024:
   - allow using `SparseCoreMFACwithEF` separately by importing it in `sparse_mfac.__init__.py`
 - **1.1.2** @ August 1st, 2024:
-  - ***[1.1.0]:*** added support to densify the final update: introduced parameter alpha that controls the fraction of error feedback
-    (EF) to be integrated into the update to make it dense. Finally, the fraction alpha will be discarded from the EF at
-    the expense of another call to `Qinv` and `Q` (and implicitly quantization statistics computation).
-  - ***[1.0.2]:*** added FSDP-compatible implementation by initializing the parameter states in the `update_step` method
-    instead of MicroAdam constructor
+  - ***[1.1.0]:*** added support to densify the final update: introduced parameter alpha that controls
+    the fraction of error feedback (EF) to be integrated into the update to make it dense. Finally, the
+    fraction alpha will be discarded from the EF at the expense of another call to `Qinv` and `Q` (and
+    implicitly quantization statistics computation).
+  - ***[1.0.2]:*** added FSDP-compatible implementation by initializing the parameter states in the
+    `update_step` method instead of MicroAdam constructor
 - **1.0.1** @ June 27th, 2024:
   - removed version in dependencies to avoid conflicts with llm-foundry
 - **1.0.0** @ June 20th, 2024:

--- ista_daslab_optimizers-1.1.6/ista_daslab_optimizers/__init__.py
+++ ista_daslab_optimizers-1.1.7/ista_daslab_optimizers/__init__.py
@@ -2,3 +2,5 @@ from .acdc import *
 from .micro_adam import *
 from .sparse_mfac import *
 from .dense_mfac import *
+from .fft_low_rank.trion import Trion
+from .fft_low_rank.dct_adamw import DCTAdamW
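
With the two imports above, `Trion` and `DCTAdamW` become importable from the package root. A hedged sketch follows; the constructor calls are left commented out because their signatures live in `ista_daslab_optimizers/fft_low_rank` and are not part of this diff.

```python
import torch
from ista_daslab_optimizers import Trion, DCTAdamW  # exported by the 1.1.7 __init__.py above

model = torch.nn.Linear(128, 10)

# Hypothetical construction -- check fft_low_rank/trion.py and fft_low_rank/dct_adamw.py
# for the actual arguments (rank/subspace options are likely required):
# optimizer = DCTAdamW(model.parameters(), lr=1e-4)
# optimizer = Trion(model.parameters(), lr=1e-4)
```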

--- /dev/null
+++ ista_daslab_optimizers-1.1.7/ista_daslab_optimizers/ista_optimizer/__init__.py
@@ -0,0 +1,5 @@
+from .ista_optimizer import ISTAOptimizer
+
+__all__ = [
+    'ISTAOptimizer'
+]

--- /dev/null
+++ ista_daslab_optimizers-1.1.7/ista_daslab_optimizers/ista_optimizer/ista_optimizer.py
@@ -0,0 +1,36 @@
+import torch
+
+class ISTAOptimizer(torch.optim.Optimizer):
+    def __init__(self, params, lr, weight_decay):
+        super().__init__(params, dict(lr=lr, weight_decay=weight_decay))
+        self.lr = lr
+        self.weight_decay = weight_decay
+        self.optim_steps = 0
+
+    def loop_params(self, check_grad=True):
+        for group in self.param_groups:
+            for p in group['params']:
+                if check_grad:
+                    if p.grad is None: continue
+                yield group, self.state[p], p
+
+    @torch.no_grad()
+    def init_optimizer_states(self):
+        raise NotImplementedError
+
+    @torch.no_grad()
+    def optimizer_step(self):
+        raise NotImplementedError
+
+    @torch.no_grad()
+    def step(self, closure=None):
+        self.optim_steps += 1
+
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+
+        self.optimizer_step()
+
+        return loss
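
The new `ISTAOptimizer` base class handles the closure, the step counter, and the skip-`None`-gradient iteration, and leaves `init_optimizer_states` and `optimizer_step` abstract. Below is a minimal, hedged sketch of a subclass (plain SGD with decoupled weight decay); the subclass and its update rule are illustrative only, not an optimizer shipped by the package, and calling `init_optimizer_states` from `__init__` is an assumption about the intended usage.

```python
import torch
from ista_daslab_optimizers.ista_optimizer import ISTAOptimizer  # path per the new files above

class PlainSGD(ISTAOptimizer):
    """Illustrative subclass: SGD with decoupled weight decay."""
    def __init__(self, params, lr, weight_decay=0.0):
        super().__init__(params, lr, weight_decay)
        self.init_optimizer_states()  # assumption: states are created eagerly

    @torch.no_grad()
    def init_optimizer_states(self):
        # Plain SGD keeps no per-parameter state; a real subclass would fill
        # self.state[p] here (momentum buffers, EMA statistics, ...).
        pass

    @torch.no_grad()
    def optimizer_step(self):
        # loop_params() yields (group, state, param) and skips params whose grad is None.
        for group, state, p in self.loop_params():
            if group['weight_decay'] > 0:
                p.mul_(1 - group['lr'] * group['weight_decay'])
            p.add_(p.grad, alpha=-group['lr'])

# Usage: the base class step() bumps optim_steps, evaluates the closure (if any)
# under enable_grad, and dispatches to optimizer_step().
model = torch.nn.Linear(8, 2)
opt = PlainSGD(model.parameters(), lr=0.1, weight_decay=0.01)
model(torch.randn(4, 8)).pow(2).mean().backward()
opt.step()
```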

--- ista_daslab_optimizers-1.1.6/ista_daslab_optimizers/tools.py
+++ ista_daslab_optimizers-1.1.7/ista_daslab_optimizers/tools.py
@@ -214,4 +214,5 @@ class KernelVersionsManager:
         return self.LCG_BLOCKS_THREADS[self.version_LCG][self.BLOCK_INDEX]
 
     def get_LCG_threads(self):
-        return self.LCG_BLOCKS_THREADS[self.version_LCG][self.THREAD_INDEX]
+        return self.LCG_BLOCKS_THREADS[self.version_LCG][self.THREAD_INDEX]
+

--- ista_daslab_optimizers-1.1.6/PKG-INFO
+++ ista_daslab_optimizers-1.1.7/ista_daslab_optimizers.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.1
 Name: ista_daslab_optimizers
-Version: 1.1.6
+Version: 1.1.7
 Summary: Deep Learning optimizers developed in the Distributed Algorithms and Systems group (DASLab) @ Institute of Science and Technology Austria (ISTA)
 Author-email: Ionut-Vlad Modoranu <ionut-vlad.modoranu@ist.ac.at>
 Maintainer-email: Ionut-Vlad Modoranu <ionut-vlad.modoranu@ist.ac.at>
@@ -222,6 +222,7 @@ Requires-Dist: gpustat
 Requires-Dist: timm
 Requires-Dist: einops
 Requires-Dist: psutil
+Requires-Dist: fast-hadamard-transform
 
 # ISTA DAS Lab Optimization Algorithms Package
 This repository contains optimization algorithms for Deep Learning developed by
@@ -240,6 +241,9 @@ The repository contains code for the following optimizers published by DASLab @
 - **MicroAdam**:
   - paper: [MicroAdam: Accurate Adaptive Optimization with Low Space Overhead and Provable Convergence](https://arxiv.org/abs/2405.15593)
   - official repository: [GitHub](https://github.com/IST-DASLab/MicroAdam)
+- **Trion / DCT-AdamW**:
+  - paper: [FFT-based Dynamic Subspace Selection for Low-Rank Adaptive Optimization of Large Language Models](https://arxiv.org/abs/2505.17967v3)
+  - code: [GitHub](https://github.com/IST-DASLab/ISTA-DASLab-Optimizers/tree/main/ista_daslab_optimizers/fft_low_rank)
 
 ### Installation
 To use the latest stable version of this repository, you can install via pip:
@@ -261,7 +265,8 @@ source install.sh
 
 ## How to use optimizers?
 
-In this repository we provide a minimal working example for CIFAR-10 for optimizers `acdc`, `dense_mfac`, `sparse_mfac` and `micro_adam`:
+In this repository we provide a minimal working example for CIFAR-10 for optimizers `acdc`,
+`dense_mfac`, `sparse_mfac` and `micro_adam`:
 ```shell
 cd examples/cifar10
 OPTIMIZER=micro_adam # or any other optimizer listed above
@@ -291,18 +296,29 @@ optimizer = MicroAdam(
 # Versions summary:
 
 ---
+- **1.1.7** @ October 8th, 2025:
+  - added code for `Trion & DCT-AdamW`
 - **1.1.6** @ February 19th, 2025:
-  - do not update the parameters that have `None` gradient in method `update_model` from `tools.py`. This is useful when using M-FAC for models with more than one classification head in the Continual Learning framework
+  - do not update the parameters that have `None` gradient in method `update_model` from `tools.py`.
+    This is useful when using M-FAC for models with more than one classification head in the Continual Learning framework.
 - **1.1.5** @ February 19th, 2025:
-  - adapted `DenseMFAC` for a model with multiple classification heads for Continual Learning where we have one feature extractor block and a list of classification heads. The issue was related to the model size, which included the feature extractor backbone and all classification heads, but in practice only one classification head will be used for training and inference. This caused some size mismatch errors at runtime in the `DenseCoreMFAC` module because the gradient at runtime had fewer entries than the entire model. When using `DenseMFAC` for such settings, set `optimizer.model_size` to the correct size after calling the constructor and the `DenseCoreMFAC` object will be created automatically in the `step` function.
+  - adapted `DenseMFAC` for a model with multiple classification heads for Continual Learning where
+    we have one feature extractor block and a list of classification heads. The issue was related to
+    the model size, which included the feature extractor backbone and all classification heads, but
+    in practice only one classification head will be used for training and inference. This caused some
+    size mismatch errors at runtime in the `DenseCoreMFAC` module because the gradient at runtime had
+    fewer entries than the entire model. When using `DenseMFAC` for such settings, set `optimizer.model_size`
+    to the correct size after calling the constructor and the `DenseCoreMFAC` object will be created
+    automatically in the `step` function.
 - **1.1.3** @ September 5th, 2024:
   - allow using `SparseCoreMFACwithEF` separately by importing it in `sparse_mfac.__init__.py`
 - **1.1.2** @ August 1st, 2024:
-  - ***[1.1.0]:*** added support to densify the final update: introduced parameter alpha that controls the fraction of error feedback
-    (EF) to be integrated into the update to make it dense. Finally, the fraction alpha will be discarded from the EF at
-    the expense of another call to `Qinv` and `Q` (and implicitly quantization statistics computation).
-  - ***[1.0.2]:*** added FSDP-compatible implementation by initializing the parameter states in the `update_step` method
-    instead of MicroAdam constructor
+  - ***[1.1.0]:*** added support to densify the final update: introduced parameter alpha that controls
+    the fraction of error feedback (EF) to be integrated into the update to make it dense. Finally, the
+    fraction alpha will be discarded from the EF at the expense of another call to `Qinv` and `Q` (and
+    implicitly quantization statistics computation).
+  - ***[1.0.2]:*** added FSDP-compatible implementation by initializing the parameter states in the
+    `update_step` method instead of MicroAdam constructor
 - **1.0.1** @ June 27th, 2024:
   - removed version in dependencies to avoid conflicts with llm-foundry
 - **1.0.0** @ June 20th, 2024:

--- ista_daslab_optimizers-1.1.6/ista_daslab_optimizers.egg-info/SOURCES.txt
+++ ista_daslab_optimizers-1.1.7/ista_daslab_optimizers.egg-info/SOURCES.txt
@@ -27,6 +27,8 @@ ista_daslab_optimizers/acdc/wd_scheduler.py
 ista_daslab_optimizers/dense_mfac/__init__.py
 ista_daslab_optimizers/dense_mfac/dense_core_mfac.py
 ista_daslab_optimizers/dense_mfac/dense_mfac.py
+ista_daslab_optimizers/ista_optimizer/__init__.py
+ista_daslab_optimizers/ista_optimizer/ista_optimizer.py
 ista_daslab_optimizers/micro_adam/__init__.py
 ista_daslab_optimizers/micro_adam/micro_adam.py
 ista_daslab_optimizers/sparse_mfac/__init__.py

--- ista_daslab_optimizers-1.1.6/ista_daslab_optimizers.egg-info/requires.txt
+++ ista_daslab_optimizers-1.1.7/ista_daslab_optimizers.egg-info/requires.txt
@@ -7,3 +7,4 @@ gpustat
 timm
 einops
 psutil
+fast-hadamard-transform
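
Both the package metadata and `requires.txt` now pull in `fast-hadamard-transform`, presumably for the `fft_low_rank` code. For orientation, here is a hedged sketch of the upstream Dao-AILab package's documented entry point; how the optimizers actually call it is not shown in this diff, so treat the snippet as an assumption about the dependency, not about this package's code.

```python
import torch
from fast_hadamard_transform import hadamard_transform  # upstream API, assumed

# The CUDA kernel operates along the last dimension, which should be a power of two.
x = torch.randn(8, 1024, device='cuda', dtype=torch.float16)
y = hadamard_transform(x, scale=1.0 / 1024 ** 0.5)  # 1/sqrt(d) keeps the transform orthonormal
```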

--- ista_daslab_optimizers-1.1.6/pyproject.toml
+++ ista_daslab_optimizers-1.1.7/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name='ista_daslab_optimizers'
-version='1.1.6'
+version='1.1.7'
 dependencies = [
     "torch", # >=2.3.1",
     "torchaudio", # >=2.3.1",
@@ -15,6 +15,8 @@ dependencies = [
     "timm", # >=1.0.3",
     "einops", # >=0.7.0",
     "psutil", # >=5.9.8",
+    "fast-hadamard-transform",
+    # "fast-hadamard-transform @ git+https://github.com/Dao-AILab/fast-hadamard-transform.git",
 ]
 requires-python = '>= 3.8'
 authors = [