ista-daslab-optimizers 1.1.2__tar.gz → 1.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {ista_daslab_optimizers-1.1.2/ista_daslab_optimizers.egg-info → ista_daslab_optimizers-1.1.3}/PKG-INFO +3 -1
  2. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/README.md +2 -0
  3. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/ista_daslab_optimizers/micro_adam/micro_adam.py +87 -23
  4. ista_daslab_optimizers-1.1.3/ista_daslab_optimizers/sparse_mfac/__init__.py +7 -0
  5. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3/ista_daslab_optimizers.egg-info}/PKG-INFO +3 -1
  6. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/ista_daslab_optimizers.egg-info/SOURCES.txt +13 -2
  7. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/pyproject.toml +1 -1
  8. ista_daslab_optimizers-1.1.2/ista_daslab_optimizers/sparse_mfac/__init__.py +0 -5
  9. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/LICENSE +0 -0
  10. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/MANIFEST.in +0 -0
  11. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/ista_daslab_optimizers/__init__.py +0 -0
  12. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/ista_daslab_optimizers/acdc/__init__.py +0 -0
  13. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/ista_daslab_optimizers/acdc/acdc.py +0 -0
  14. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/ista_daslab_optimizers/acdc/wd_scheduler.py +0 -0
  15. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/ista_daslab_optimizers/dense_mfac/__init__.py +0 -0
  16. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/ista_daslab_optimizers/dense_mfac/dense_core_mfac.py +0 -0
  17. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/ista_daslab_optimizers/dense_mfac/dense_mfac.py +0 -0
  18. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/ista_daslab_optimizers/micro_adam/__init__.py +0 -0
  19. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/ista_daslab_optimizers/sparse_mfac/sparse_core_mfac_w_ef.py +0 -0
  20. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/ista_daslab_optimizers/sparse_mfac/sparse_mfac.py +0 -0
  21. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/ista_daslab_optimizers/tools.py +0 -0
  22. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/ista_daslab_optimizers.egg-info/dependency_links.txt +0 -0
  23. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/ista_daslab_optimizers.egg-info/requires.txt +0 -0
  24. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/ista_daslab_optimizers.egg-info/top_level.txt +0 -0
  25. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/kernels/dense_mfac/dense_mfac.cpp +0 -0
  26. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/kernels/dense_mfac/dense_mfac_kernel.cu +0 -0
  27. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/kernels/micro_adam/micro_adam.cpp +0 -0
  28. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/kernels/micro_adam/micro_adam_asymm_block_quant.cu +0 -0
  29. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/kernels/micro_adam/micro_adam_asymm_block_quant_inv.cu +0 -0
  30. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/kernels/micro_adam/micro_adam_update.cu +0 -0
  31. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/kernels/sparse_mfac/sparse_mfac.cpp +0 -0
  32. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/kernels/sparse_mfac/sparse_mfac_LCG_kernel.cu +0 -0
  33. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/kernels/sparse_mfac/sparse_mfac_SP_kernel.cu +0 -0
  34. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/kernels/tools/tools.cpp +0 -0
  35. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/kernels/tools/tools_kernel.cu +0 -0
  36. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/kernels/utils.h +0 -0
  37. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/setup.cfg +0 -0
  38. {ista_daslab_optimizers-1.1.2 → ista_daslab_optimizers-1.1.3}/setup.py +0 -0
ista_daslab_optimizers-1.1.3/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ista_daslab_optimizers
-Version: 1.1.2
+Version: 1.1.3
 Summary: Deep Learning optimizers developed in the Distributed Algorithms and Systems group (DASLab) @ Institute of Science and Technology Austria (ISTA)
 Author-email: Ionut-Vlad Modoranu <ionut-vlad.modoranu@ist.ac.at>
 Maintainer-email: Ionut-Vlad Modoranu <ionut-vlad.modoranu@ist.ac.at>
@@ -289,6 +289,8 @@ optimizer = MicroAdam(
 # Versions summary:
 
 ---
+- **1.1.3** @ September 5th, 2024:
+  - allow using `SparseCoreMFACwithEF` separately by importing it in `sparse_mfac.__init__.py`
 - **1.1.2** @ August 1st, 2024:
   - ***[1.1.0]:*** added support to densify the final update: introduced parameter alpha that controls the fraction of error feedback
     (EF) to be integrated into the update to make it dense. Finally, the fraction alpha will be discarded from the EF at
ista_daslab_optimizers-1.1.3/README.md
@@ -64,6 +64,8 @@ optimizer = MicroAdam(
 # Versions summary:
 
 ---
+- **1.1.3** @ September 5th, 2024:
+  - allow using `SparseCoreMFACwithEF` separately by importing it in `sparse_mfac.__init__.py`
 - **1.1.2** @ August 1st, 2024:
   - ***[1.1.0]:*** added support to densify the final update: introduced parameter alpha that controls the fraction of error feedback
     (EF) to be integrated into the update to make it dense. Finally, the fraction alpha will be discarded from the EF at
ista_daslab_optimizers-1.1.3/ista_daslab_optimizers/micro_adam/micro_adam.py
@@ -15,7 +15,7 @@ class MicroAdam(torch.optim.Optimizer):
         defaults = dict(lr=lr, weight_decay=weight_decay, eps=eps, alpha=alpha)
         super(MicroAdam, self).__init__(params, defaults)
 
-        assert 0 <= alpha < 1, 'Alpha must be in the [0, 1) interval'
+        assert (0 <= alpha < 1) or alpha == -2, 'Alpha must be in the [0, 1) interval or -2'
 
         self.m = m
         self.lr = lr
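For orientation: the relaxed assertion above admits `-2` as a sentinel value for `alpha`, and the hunks below map each value onto a densification mode. A minimal sketch of that mapping, reusing the flag names from this diff (the helper function itself is illustrative and not part of the package):

```python
def select_densify_mode(alpha: float) -> str:
    """Illustrative helper (not in the package): mirrors the alpha handling in this diff."""
    assert (0 <= alpha < 1) or alpha == -2, 'Alpha must be in the [0, 1) interval or -2'
    if alpha == -2:
        return 'densify_update_using_quant_error'  # new path: add the quantization error back densely
    if alpha > 0:
        return 'densify_update_using_ef'           # 1.1.0+ path: blend alpha * error feedback into the update
    return 'sparse_update'                         # alpha == 0: keep the plain sparse MicroAdam update

print(select_densify_mode(-2))   # densify_update_using_quant_error
print(select_densify_mode(0.1))  # densify_update_using_ef
```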
@@ -27,7 +27,9 @@ class MicroAdam(torch.optim.Optimizer):
         self.beta2 = betas[1]
         self.eps = eps
 
-        self.densify_update = (self.alpha > 0)
+        self.densify_update_using_ef = (self.alpha > 0)
+        self.densify_update_using_quant_error = (self.alpha == -2)
+
         self.model_size = sum([p.numel() for group in self.param_groups for p in group['params']])
 
         self.steps = 0  # how many optimization steps were performed so far
@@ -41,8 +43,12 @@ class MicroAdam(torch.optim.Optimizer):
         self.max_floats = ista_daslab_tools.get_max_floats_for_shared_memory_per_thread_block()
         self.d_block_size = self.max_floats // 2 // int(100 / self.shared_memory_carveout)
 
-        self.fsdp_dict_size_count = [{} for _ in range(
-            torch.distributed.get_world_size())]  # key = layer size, value = how many layers of that size the model has (per worker)
+        if torch.distributed.is_initialized():
+            self.fsdp_dict_size_count = [{} for _ in range(
+                torch.distributed.get_world_size())]  # key = layer size, value = how many layers of that size the model has (per worker)
+        else:
+            self.fsdp_dict_size_count = [{}]
+
         self.dict_size_count = {}  # key = layer size, value = how many layers of that size the model has
         for param in self.param_groups:
             for p in param['params']:
@@ -56,7 +62,10 @@ class MicroAdam(torch.optim.Optimizer):
                 layer_size = p.numel()
                 st = self.state[p]
 
-                rank = torch.distributed.get_rank()
+                rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0
+
+                if self.densify_update_using_quant_error:
+                    st['quant_err'] = torch.zeros_like(p)
 
                 st['blocks'] = max(1, int(math.floor(self.blocks * layer_size * self.fsdp_dict_size_count[rank][layer_size] / self.model_size)))
 
@@ -96,7 +105,7 @@ class MicroAdam(torch.optim.Optimizer):
             loss = closure()
 
         if self.steps == 1:
-            rank = torch.distributed.get_rank()
+            rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0
             for param in self.param_groups:
                 for p in param['params']:
                     if p is not None:
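The recurring guard in these hunks, falling back to a single-worker view when no process group has been initialized, can be read in isolation as below. This is a sketch of the pattern, not code lifted from the package:

```python
import torch.distributed as dist

def current_rank() -> int:
    # Same guard the diff applies around torch.distributed.get_rank():
    # without an initialized process group (plain single-GPU runs), act as rank 0.
    return dist.get_rank() if dist.is_initialized() else 0

def current_world_size() -> int:
    # Analogous guard for get_world_size(), used when sizing fsdp_dict_size_count.
    return dist.get_world_size() if dist.is_initialized() else 1
```

The same `is_initialized()` check also gates the `all_reduce` in `_log` further down.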
@@ -106,7 +115,7 @@ class MicroAdam(torch.optim.Optimizer):
 
         time_start = time.time()
 
-        norm_g, norm_u, norm_e, sparsity_u = 0, 0, 0, 0
+        norm_qe, norm_g, norm_u, norm_e, sparsity_u, sparsity_qe = 0, 0, 0, 0, 0, 0
 
         for group in self.param_groups:
             lr = group['lr']
@@ -119,23 +128,28 @@ class MicroAdam(torch.optim.Optimizer):
                 if p is None:
                     continue
 
-                ng, nu, ne, sp_u = self.update_step(p, lr, wd)
+                nqe, ng, nu, ne, sp_u, sp_qe = self.update_step(p, lr, wd)
+                norm_qe += nqe
                 norm_g += ng
                 norm_u += nu
                 norm_e += ne
                 sparsity_u += sp_u
+                sparsity_qe += sp_qe
 
         # torch.cuda.synchronize()
         time_end = time.time()
         elapsed_step = time_end - time_start
-        self._log(norm_g, norm_u, norm_e, sparsity_u, elapsed_step)
+        self._log(norm_qe, norm_g, norm_u, norm_e, sparsity_u, sparsity_qe, elapsed_step)
 
         return loss
 
     @torch.no_grad()
     def update_step(self, p, lr, wd):
-        norm_g, norm_u, norm_e, sp_u = 0, 0, 0, 0
+        norm_qe, norm_g, norm_u, norm_e, sp_u, sp_qe = 0, 0, 0, 0, 0, 0
 
+        # if p.grad.dtype != torch.bfloat16:
+        #     grad = p.grad.to(dtype=torch.bfloat16).reshape(-1)
+        # else:
         grad = p.grad.view(-1)
 
         if self.steps % self.log_interval == 0:
@@ -216,6 +230,48 @@ class MicroAdam(torch.optim.Optimizer):
         ##### STEP 8
         ista_daslab_micro_adam.asymm_block_quant(d, self.quant_block_size, error, min_vals, max_vals, grad)  # error = Q(a, min, max)
 
+        # weight decay step
+        if wd > 0:
+            p.mul_(1 - lr * wd)
+
+        ##### NEW: densify using quant error
+        if self.densify_update_using_quant_error:
+            # When entering this if-statement, we have:
+            #     - p is theta_t
+            #     - p.grad is a_t (from step 6 in algorithm 1)
+            #     - error is e_t+1 (from step 8 in algorithm 1)
+            #
+            # Below we have the formula to update the model parameters:
+            # [a = -1] with lr
+            #     theta_t+1 = theta_t - lr * (a_t - Qinv(e_t+1)) - lr * u_t
+            #               = theta_t - lr * a_t + lr * Qinv(e_t+1) - lr * u_t
+            #               = theta_t - lr * a_t          # STEP A below, in this if statement
+            #                         + lr * Qinv(e_t+1)  # STEP B below, in this if statement
+            #                         - lr * u_t          # this is steps 10-11
+            #
+            # [a = -2] without lr
+            #     theta_t+1 = theta_t - (a_t - Qinv(e_t+1)) - lr * u_t
+            #               = theta_t - a_t + Qinv(e_t+1) - lr * u_t
+            #               = theta_t - a_t               # STEP A below, in this if statement
+            #                         + Qinv(e_t+1)       # STEP B below, in this if statement
+            #                         - lr * u_t          # this is steps 10-11
+            quant_err = st['quant_err']
+            quant_err.zero_()
+            quant_err.add_(p.grad)
+
+            ##### STEP A
+            p.add_(p.grad, alpha=-1)
+
+            ##### STEP B
+            p.grad.zero_()  # zerorize to prepare the accumulator for Qinv
+            ista_daslab_micro_adam.asymm_block_quant_inv(d, self.quant_block_size, error, min_vals, max_vals, grad, 1)
+            p.add_(p.grad)
+
+            quant_err.sub_(p.grad)
+
+            norm_qe = quant_err.norm(p=2) ** 2
+            sp_qe = (quant_err == 0).sum()
+
         ##### STEPS 10-11
         grad.zero_()
         ista_daslab_micro_adam.compute_microadam_update(blocks,  # blocks
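The comment block above can be sanity-checked with a toy dequantize-and-subtract example. The sketch below is a stand-in under stated assumptions: `toy_quant`/`toy_quant_inv` replace the packed 4-bit `asymm_block_quant`/`asymm_block_quant_inv` CUDA kernels and ignore blocking, but the quantity accumulated in `st['quant_err']` is the same `a_t - Qinv(Q(a_t))`:

```python
import torch

def toy_quant(x: torch.Tensor, levels: int = 16):
    # Asymmetric min/max quantization over the whole tensor (the real kernels do this per block).
    lo, hi = x.min(), x.max()
    scale = (hi - lo) / (levels - 1)
    q = torch.round((x - lo) / scale)
    return q, lo, scale

def toy_quant_inv(q: torch.Tensor, lo: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    return q * scale + lo

a_t = torch.randn(8)                        # stands in for p.grad after step 6
e_next, lo, scale = toy_quant(a_t)          # step 8: error = Q(a_t, min, max)
dequant = toy_quant_inv(e_next, lo, scale)  # Qinv(e_t+1), what STEP B adds back into p
quant_err = a_t - dequant                   # what the new code keeps in st['quant_err']
print(quant_err.norm(p=2) ** 2)             # logged (after sqrt) as step/norm_quant_err
```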
@@ -237,16 +293,22 @@ class MicroAdam(torch.optim.Optimizer):
         ##### STEP 12: # side idea: only decay the weights that are updated
 
         ##### if PRETRAINING #1
-        if self.densify_update:  # we add alpha * EF to update that is stored in grad buffer
+        if self.densify_update_using_ef:  # we add alpha * EF to update that is stored in grad buffer
             # p.grad += alpha * Qinv(error), alpha=0.1
             ista_daslab_micro_adam.asymm_block_quant_inv(d, self.quant_block_size, error, min_vals, max_vals, grad, self.alpha)
         ##### END IF PRETRAINING #1
 
         # if alpha > 0, then the update u=p.grad is dense now
-        p.mul_(1 - lr * wd).add_(p.grad, alpha=-lr)
+
+        # update model using MicroAdam update stored in p.grad
+        p.add_(p.grad, alpha=-lr)
+
+        if self.steps % self.log_interval == 0:
+            norm_u = grad.norm(p=2) ** 2
+            sp_u = (grad == 0).sum()  # check sparsity before zerorizing
 
         ##### if PRETRAINING #2
-        if self.densify_update:
+        if self.densify_update_using_ef:
             grad.zero_()
             ista_daslab_micro_adam.asymm_block_quant_inv(d, self.quant_block_size, error, min_vals, max_vals, grad, 1-self.alpha)
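For contrast with the new `alpha == -2` path, the two PRETRAINING blocks above implement the `alpha` in `(0, 1)` mode: a fraction `alpha` of the dequantized error feedback is folded into the update, and roughly the remaining `1 - alpha` fraction is kept in the EF buffer. A scalar sketch of that bookkeeping, assuming plain floats instead of the packed 4-bit buffers the kernels operate on:

```python
alpha = 0.1    # fraction of error feedback folded into the dense update
ef = 0.8       # dequantized error feedback Qinv(error) for one coordinate
update = 0.05  # sparse MicroAdam update u_t for the same coordinate

update += alpha * ef  # PRETRAINING #1: densify the update with alpha * EF
ef *= 1 - alpha       # PRETRAINING #2: discard the used fraction from the stored EF
print(update, ef)     # approximately 0.13 and 0.72
```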
@@ -256,27 +318,29 @@ class MicroAdam(torch.optim.Optimizer):
 
         # compute error norm
         if self.steps % self.log_interval == 0:
-            norm_u = grad.norm(p=2) ** 2
-            sp_u = (grad == 0).sum()  # check sparsity before zerorizing
-
             grad.zero_()
             ista_daslab_micro_adam.asymm_block_quant_inv(d, self.quant_block_size, error, min_vals, max_vals, grad, 1.0)
 
             norm_e = grad.norm(p=2) ** 2
 
-        return norm_g, norm_u, norm_e, sp_u
+        # p.grad = p.grad.to(dtype=original_grad_type)
+
+        return norm_qe, norm_g, norm_u, norm_e, sp_u, sp_qe
 
-    def _log(self, norm_g, norm_u, norm_e, sparsity_u, elapsed_step):
+    def _log(self, norm_qe, norm_g, norm_u, norm_e, sparsity_u, sparsity_qe, elapsed_step):
         if self.steps % self.log_interval == 0:
-            sync_data = torch.tensor([norm_g, norm_u, norm_e, sparsity_u, elapsed_step], dtype=torch.float,
-                                     requires_grad=False).cuda()  # correct, loss, size
-            all_reduce(sync_data, op=ReduceOp.SUM)
-            norm_g, norm_u, norm_e, sparsity_u, elapsed_step = sync_data
+            if is_initialized():
+                sync_data = torch.tensor([norm_qe, norm_g, norm_u, norm_e, sparsity_u, sparsity_qe, elapsed_step], dtype=torch.float,
+                                         requires_grad=False).cuda()  # correct, loss, size
+                all_reduce(sync_data, op=ReduceOp.SUM)
+                norm_qe, norm_g, norm_u, norm_e, sparsity_u, sparsity_qe, elapsed_step = sync_data
 
             if not is_initialized() or get_rank() == 0:
                 wandb_data = {
                     'step/optimizer_steps': self.steps,
                     'step/gpu_mem_usage': get_gpu_mem_usage(),
+                    'step/norm_quant_err': math.sqrt(norm_qe),
+                    'step/sparsity_quant_err': sparsity_qe / self.model_size * 100.,
                     'step/norm_g': math.sqrt(norm_g),
                     'step/norm_u': math.sqrt(norm_u),
                     'step/norm_error': math.sqrt(norm_e),
@@ -335,4 +399,4 @@ class MicroAdam(torch.optim.Optimizer):
         # st['quant_full_blocks_count'], st['d_index_quant'] = block_split(st['d'], self.quant_block_size)
         # st['error'] = torch.zeros(int(math.ceil(st['d'] / 2)), dtype=torch.uint8, device=self.device)  # ceil(d/2) bytes
         # st['min_vals'] = torch.zeros(st['quant_full_blocks_count'] + 1, dtype=torch.bfloat16, device=self.device)  # ceil(d/q_bsz)*2 bytes
-        # st['max_vals'] = torch.zeros(st['quant_full_blocks_count'] + 1, dtype=torch.bfloat16, device=self.device)  # ceil(d/q_bsz)*2 bytes
+        # st['max_vals'] = torch.zeros(st['quant_full_blocks_count'] + 1, dtype=torch.bfloat16, device=self.device)  # ceil(d/q_bsz)*2 bytes
ista_daslab_optimizers-1.1.3/ista_daslab_optimizers/sparse_mfac/__init__.py (new file)
@@ -0,0 +1,7 @@
+from .sparse_mfac import SparseMFAC
+from .sparse_core_mfac_w_ef import SparseCoreMFACwithEF
+
+__all__ = [
+    'SparseMFAC',
+    'SparseCoreMFACwithEF'
+]
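With the new `__init__.py` above, both classes are importable straight from the subpackage. A minimal usage sketch (only the imports are shown, since constructor arguments are not part of this diff):

```python
from ista_daslab_optimizers.sparse_mfac import SparseMFAC, SparseCoreMFACwithEF

# Before 1.1.3 only SparseMFAC was re-exported here; SparseCoreMFACwithEF had to be
# imported from ista_daslab_optimizers.sparse_mfac.sparse_core_mfac_w_ef directly.
print(SparseMFAC, SparseCoreMFACwithEF)
```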
ista_daslab_optimizers-1.1.3/ista_daslab_optimizers.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ista_daslab_optimizers
-Version: 1.1.2
+Version: 1.1.3
 Summary: Deep Learning optimizers developed in the Distributed Algorithms and Systems group (DASLab) @ Institute of Science and Technology Austria (ISTA)
 Author-email: Ionut-Vlad Modoranu <ionut-vlad.modoranu@ist.ac.at>
 Maintainer-email: Ionut-Vlad Modoranu <ionut-vlad.modoranu@ist.ac.at>
@@ -289,6 +289,8 @@ optimizer = MicroAdam(
 # Versions summary:
 
 ---
+- **1.1.3** @ September 5th, 2024:
+  - allow using `SparseCoreMFACwithEF` separately by importing it in `sparse_mfac.__init__.py`
 - **1.1.2** @ August 1st, 2024:
   - ***[1.1.0]:*** added support to densify the final update: introduced parameter alpha that controls the fraction of error feedback
     (EF) to be integrated into the update to make it dense. Finally, the fraction alpha will be discarded from the EF at
ista_daslab_optimizers-1.1.3/ista_daslab_optimizers.egg-info/SOURCES.txt
@@ -3,7 +3,6 @@ MANIFEST.in
 README.md
 pyproject.toml
 setup.py
-./kernels/utils.h
 ./kernels/dense_mfac/dense_mfac.cpp
 ./kernels/dense_mfac/dense_mfac_kernel.cu
 ./kernels/micro_adam/micro_adam.cpp
@@ -32,4 +31,16 @@ ista_daslab_optimizers/micro_adam/__init__.py
 ista_daslab_optimizers/micro_adam/micro_adam.py
 ista_daslab_optimizers/sparse_mfac/__init__.py
 ista_daslab_optimizers/sparse_mfac/sparse_core_mfac_w_ef.py
-ista_daslab_optimizers/sparse_mfac/sparse_mfac.py
+ista_daslab_optimizers/sparse_mfac/sparse_mfac.py
+kernels/utils.h
+kernels/dense_mfac/dense_mfac.cpp
+kernels/dense_mfac/dense_mfac_kernel.cu
+kernels/micro_adam/micro_adam.cpp
+kernels/micro_adam/micro_adam_asymm_block_quant.cu
+kernels/micro_adam/micro_adam_asymm_block_quant_inv.cu
+kernels/micro_adam/micro_adam_update.cu
+kernels/sparse_mfac/sparse_mfac.cpp
+kernels/sparse_mfac/sparse_mfac_LCG_kernel.cu
+kernels/sparse_mfac/sparse_mfac_SP_kernel.cu
+kernels/tools/tools.cpp
+kernels/tools/tools_kernel.cu
ista_daslab_optimizers-1.1.3/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name='ista_daslab_optimizers'
-version='1.1.2'
+version='1.1.3'
 dependencies = [
     "torch", # >=2.3.1",
     "torchaudio", # >=2.3.1",
ista_daslab_optimizers-1.1.2/ista_daslab_optimizers/sparse_mfac/__init__.py (deleted)
@@ -1,5 +0,0 @@
-from .sparse_mfac import SparseMFAC
-
-__all__ = [
-    'SparseMFAC',
-]