mmgp 3.1.4.post15__py3-none-any.whl → 3.1.4.post151__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mmgp might be problematic.
- mmgp/offload.py +417 -187
- mmgp/safetensors2.py +10 -4
- {mmgp-3.1.4.post15.dist-info → mmgp-3.1.4.post151.dist-info}/METADATA +19 -12
- mmgp-3.1.4.post151.dist-info/RECORD +9 -0
- mmgp-3.1.4.post15.dist-info/RECORD +0 -9
- {mmgp-3.1.4.post15.dist-info → mmgp-3.1.4.post151.dist-info}/LICENSE.md +0 -0
- {mmgp-3.1.4.post15.dist-info → mmgp-3.1.4.post151.dist-info}/WHEEL +0 -0
- {mmgp-3.1.4.post15.dist-info → mmgp-3.1.4.post151.dist-info}/top_level.txt +0 -0
mmgp/offload.py
CHANGED
@@ -1,4 +1,4 @@
- # ------------------ Memory Management 3.1.4 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+ # ------------------ Memory Management 3.1.4-1591 for the GPU Poor by DeepBeepMeep (mmgp)------------------
  #
  # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
  # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -76,7 +76,18 @@ except:
  from mmgp import safetensors2
  from mmgp import profile_type

- from optimum.quanto import freeze, qfloat8, qint4 , qint8, quantize, QModuleMixin, QTensor, quantize_module
+ from optimum.quanto import freeze, qfloat8, qint4 , qint8, quantize, QModuleMixin, QTensor, quantize_module, register_qmodule
+
+ # support for Embedding module quantization that is not supported by default by quanto
+ @register_qmodule(torch.nn.Embedding)
+ class QEmbedding(QModuleMixin, torch.nn.Embedding):
+ @classmethod
+ def qcreate(cls, module, weights, activations = None, optimizer = None, device = None):
+ module.bias = None
+ return cls( module.num_embeddings, module.embedding_dim, module.padding_idx , module.max_norm, module.norm_type, module.scale_grad_by_freq, module.sparse, dtype=module.weight.dtype, device=device, weights=weights,
+ activations=activations, optimizer=optimizer, quantize_input=True)
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
+ return torch.nn.functional.embedding( input, self.qweight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse )


  shared_state = {}
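Note: the QEmbedding shim above plugs `torch.nn.Embedding` into optimum-quanto's module registry. A minimal sketch of the effect, assuming the registration has run (for example because `mmgp.offload` was imported); the toy model and sizes below are illustrative only, not taken from the package:

    import torch
    from optimum.quanto import quantize, freeze, qint8
    import mmgp.offload  # importing runs the @register_qmodule(torch.nn.Embedding) registration shown above

    # Toy model: once QEmbedding is registered, quantize() can also cover the Embedding layer,
    # not only the Linear one.
    model = torch.nn.Sequential(torch.nn.Embedding(1000, 64), torch.nn.Linear(64, 64))
    quantize(model, weights=qint8)
    freeze(model)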
@@ -96,11 +107,6 @@ ENDC = '\033[0m'
  BOLD ='\033[1m'
  UNBOLD ='\033[0m'

- cotenants_map = {
- "text_encoder": ["vae", "text_encoder_2"],
- "text_encoder_2": ["vae", "text_encoder"],
- }
-
  class clock:
  def __init__(self):
  self.start_time = 0
@@ -216,15 +222,17 @@ def _get_model(model_path):
  if len(_path)<=1:
  raise("file not found")
  else:
-
-
-
- if len(_path) > 2:
- _subfolder = os.path.join(*_path[2:] )
- model_path = hf_hub_download(repo_id=repoId, filename=_filename, subfolder=_subfolder)
- else:
- model_path = hf_hub_download(repo_id=repoId, filename=_filename)
+ try:
+ from huggingface_hub import hf_hub_download #snapshot_download,
+ repoId= os.path.join(*_path[0:2] ).replace("\\", "/")

+ if len(_path) > 2:
+ _subfolder = os.path.join(*_path[2:] )
+ model_path = hf_hub_download(repo_id=repoId, filename=_filename, subfolder=_subfolder)
+ else:
+ model_path = hf_hub_download(repo_id=repoId, filename=_filename)
+ except:
+ model_path = None
  return model_path


@@ -278,9 +286,17 @@ def _force_load_parameter(p):
  torch.utils.swap_tensors(p, q)
  del q

- def
+ def _get_tensor_ref(p):
+ if isinstance(p, QTensor):
+ if p._qtype == qint4:
+ return p._data._data.data_ptr()
+ else:
+ return p._data.data_ptr()
+ else:
+ return p.data_ptr()


+ def _pin_to_memory(model, model_id, partialPinning = False, verboseLevel = 1):
  if partialPinning:
  towers_names, _ = _detect_main_towers(model)

@@ -292,56 +308,63 @@ def _pin_to_memory(model, model_id, partialPinning = False, verboseLevel = 1):
  tensor_map_indexes = []
  total_tensor_bytes = 0

-
+ params_dict = {} # OrderedDict
  for k, sub_module in model.named_modules():
  include = True
  if partialPinning:
  include = any(k.startswith(pre) for pre in towers_names) if partialPinning else True
  if include:
-
-
+ params_dict.update( { k + '.' + n : (p, False) for n, p in sub_module.named_parameters(recurse=False) } )
+ params_dict.update( { k + '.' + n : (b, True) for n, b in sub_module.named_buffers(recurse=False) } )

  if verboseLevel>=1 :
  if partialPinning:
- if len(
+ if len(params_dict) == 0:
  print(f"Unable to apply Partial of '{model_id}' as no isolated main structures were found")
  else:
  print(f"Partial pinning of data of '{model_id}' to reserved RAM")
  else:
  print(f"Pinning data of '{model_id}' to reserved RAM")

- if partialPinning and len(
+ if partialPinning and len(params_dict) == 0:
  return

-
-
- for n, p, _ in
-
-
-
-
+ ref_cache = {}
+ tied_weights = {}
+ for n, (p, _) in params_dict.items():
+ ref = _get_tensor_ref(p)
+ match = ref_cache.get(ref, None)
+ if match != None:
+ match_name, match_size = match
+ if verboseLevel >=1:
+ print(f"Tied weights of {match_size/ONE_MB:0.2f} MB detected: {match_name} <-> {n}")
+ tied_weights[n] = match_name
+ else:
+ if isinstance(p, QTensor):
+ if p._qtype == qint4:
+ if hasattr(p,"_scale_shift"):
+ length = torch.numel(p._data._data) * p._data._data.element_size() + torch.numel(p._scale_shift) * p._scale_shift.element_size()
+ else:
+ length = torch.numel(p._data._data) * p._data._data.element_size() + torch.numel(p._scale) * p._scale.element_size() + torch.numel(p._shift) * p._shift.element_size()
  else:
- length = torch.numel(p._data
+ length = torch.numel(p._data) * p._data.element_size() + torch.numel(p._scale) * p._scale.element_size()
  else:
- length = torch.numel(p.
- else:
- length = torch.numel(p.data) * p.data.element_size()
-
-
- if current_big_tensor_size + length > BIG_TENSOR_MAX_SIZE:
- big_tensors_sizes.append(current_big_tensor_size)
- current_big_tensor_size = 0
- big_tensor_no += 1
+ length = torch.numel(p.data) * p.data.element_size()

+ ref_cache[ref] = (n, length)
+ if current_big_tensor_size + length > BIG_TENSOR_MAX_SIZE:
+ big_tensors_sizes.append(current_big_tensor_size)
+ current_big_tensor_size = 0
+ big_tensor_no += 1

- itemsize = p.data.dtype.itemsize
- if current_big_tensor_size % itemsize:
- current_big_tensor_size += itemsize - current_big_tensor_size % itemsize
- tensor_map_indexes.append((big_tensor_no, current_big_tensor_size, length ))
- current_big_tensor_size += length

-
+ itemsize = p.data.dtype.itemsize
+ if current_big_tensor_size % itemsize:
+ current_big_tensor_size += itemsize - current_big_tensor_size % itemsize
+ tensor_map_indexes.append((big_tensor_no, current_big_tensor_size, length ))
+ current_big_tensor_size += length

+ total_tensor_bytes += length

  big_tensors_sizes.append(current_big_tensor_size)

@@ -368,39 +391,53 @@ def _pin_to_memory(model, model_id, partialPinning = False, verboseLevel = 1):

  tensor_no = 0
  # prev_big_tensor = 0
- for n, p,
-
- # if big_tensor_no != prev_big_tensor:
- # gc.collect()
- # prev_big_tensor = big_tensor_no
- if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
- current_big_tensor = big_tensors[big_tensor_no]
- if is_buffer :
- _force_load_buffer(p) # otherwise potential memory leak
+ for n, (p, is_buffer) in params_dict.items():
+ if n in tied_weights:
  if isinstance(p, QTensor):
- if p._qtype == qint4:
-
-
-
-
-
+ if p._qtype == qint4:
+ assert p._data._data.data.is_pinned()
+ else:
+ assert p._data.is_pinned()
+ else:
+ assert p.data.is_pinned()
+ else:
+ big_tensor_no, offset, length = tensor_map_indexes[tensor_no]
+ # if big_tensor_no != prev_big_tensor:
+ # gc.collect()
+ # prev_big_tensor = big_tensor_no
+ # match_param, match_isbuffer = tied_weights.get(n, (None, False))
+ # if match_param != None:
+
+ if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
+ current_big_tensor = big_tensors[big_tensor_no]
+ if is_buffer :
+ _force_load_buffer(p) # otherwise potential memory leak
+ if isinstance(p, QTensor):
+ if p._qtype == qint4:
+ length1 = torch.numel(p._data._data) * p._data._data.element_size()
+ p._data._data = _move_to_pinned_tensor(p._data._data, current_big_tensor, offset, length1)
+ if hasattr(p,"_scale_shift"):
+ length2 = torch.numel(p._scale_shift) * p._scale_shift.element_size()
+ p._scale_shift = _move_to_pinned_tensor(p._scale_shift, current_big_tensor, offset + length1, length2)
+ else:
+ length2 = torch.numel(p._scale) * p._scale.element_size()
+ p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
+ length3 = torch.numel(p._shift) * p._shift.element_size()
+ p._shift = _move_to_pinned_tensor(p._shift, current_big_tensor, offset + length1 + length2, length3)
  else:
+ length1 = torch.numel(p._data) * p._data.element_size()
+ p._data = _move_to_pinned_tensor(p._data, current_big_tensor, offset, length1)
  length2 = torch.numel(p._scale) * p._scale.element_size()
  p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
- length3 = torch.numel(p._shift) * p._shift.element_size()
- p._shift = _move_to_pinned_tensor(p._shift, current_big_tensor, offset + length1 + length2, length3)
  else:
-
- p.
- length2 = torch.numel(p._scale) * p._scale.element_size()
- p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
- else:
- length = torch.numel(p.data) * p.data.element_size()
- p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
+ length = torch.numel(p.data) * p.data.element_size()
+ p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)

-
+ tensor_no += 1
+ del p
  global total_pinned_bytes
  total_pinned_bytes += total
+ del params_dict
  gc.collect()

  if verboseLevel >=1:
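Note: the pinning path above now detects tied weights by comparing the storage pointer returned by `_get_tensor_ref`, so a shared tensor is pinned only once and the second name reuses the pinned copy. A small illustration of the idea, outside the diff (the toy modules below are hypothetical):

    import torch

    # Two modules that share one Parameter (tied weights) report the same data_ptr(),
    # which is exactly what the ref_cache above keys on.
    emb = torch.nn.Embedding(10, 4)
    head = torch.nn.Linear(4, 10, bias=False)
    head.weight = emb.weight            # tie the weights
    assert emb.weight.data_ptr() == head.weight.data_ptr()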
@@ -420,7 +457,7 @@ def _welcome():
  if welcome_displayed:
  return
  welcome_displayed = True
- print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.1.4-
+ print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.1.4-151) by DeepBeepMeep ************{ENDC}{UNBOLD}")

  def _extract_num_from_str(num_in_str):
  size = len(num_in_str)
@@ -518,16 +555,6 @@ def _requantize(model: torch.nn.Module, state_dict: dict, quantization_map: dict

  def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 1000000000, model_id = 'Unknown'):

- def compute_submodule_size(submodule):
- size = 0
- for p in submodule.parameters(recurse=False):
- size += torch.numel(p.data) * sizeofbfloat16
-
- for p in submodule.buffers(recurse=False):
- size += torch.numel(p.data) * sizeofbfloat16
-
- return size
-
  total_size =0
  total_excluded = 0
  exclude_list = []
@@ -549,16 +576,31 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
  tower_names ,_ = _detect_main_towers(model_to_quantize)
  tower_names = [ n[:-1] for n in tower_names]

+
+ cache_ref = {}
+ tied_weights= {}
+
  for submodule_name, submodule in model_to_quantize.named_modules():
  if isinstance(submodule, QModuleMixin):
  if verboseLevel>=1:
  print("No quantization to do as model is already quantized")
  return False

-
-
+ size = 0
+ for n, p in submodule.named_parameters(recurse = False):
+ ref = _get_tensor_ref(p)
+ match = cache_ref.get(ref, None)
+ if match != None:
+ tied_weights[submodule_name]= (n, ) + match
+ else:
+ cache_ref[ref] = (submodule_name, n)
+ size += torch.numel(p.data) * sizeofbfloat16
+
+ for p in submodule.buffers(recurse=False):
+ size += torch.numel(p.data) * sizeofbfloat16
+
+

- size = compute_submodule_size(submodule)
  if not any(submodule_name.startswith(pre) for pre in tower_names):
  flush = False
  if isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
@@ -590,12 +632,13 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
  submodule_names.append(submodule_name)
  total_size += size

- if submodule_size >
+ if submodule_size >0 and submodule_size <= threshold :
  exclude_list += submodule_names
  if verboseLevel >=2:
  print(f"Excluded size {submodule_size/ONE_MB:.1f} MB: {prev_blocks_prefix} : {submodule_names}")
  total_excluded += submodule_size

+
  perc_excluded =total_excluded/ total_size if total_size >0 else 1
  if verboseLevel >=2:
  if total_excluded == 0:
@@ -608,7 +651,10 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
  exclude_list = None


-
+ exclude_list += list(tied_weights)
+ quantize(model_to_quantize, weights= weights, exclude= exclude_list)
+
+
  # quantize(model_to_quantize,weights, include= [ "*1.block.attn.to_out*"]) #"

  # for name, m in model_to_quantize.named_modules():
@@ -618,24 +664,40 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10

  # force to read non quantized parameters so that their lazy tensors and corresponding mmap are released
  # otherwise we may end up keeping in memory both the quantized and the non quantize model
- for n,m in model_to_quantize.named_modules()
+ named_modules = {n:m for n,m in model_to_quantize.named_modules()}
+ for module_name, module in named_modules.items():
  # do not read quantized weights (detected them directly or behind an adapter)
- if isinstance(
- if hasattr(
- _force_load_parameter(
+ if isinstance(module, QModuleMixin) or hasattr(module, "base_layer") and isinstance(module.base_layer, QModuleMixin):
+ if hasattr(module, "bias") and module.bias is not None:
+ _force_load_parameter(module.bias)
  else:
-
-
-
-
+ tied_w = tied_weights.get(module_name, None)
+ for n, p in module.named_parameters(recurse = False):
+ if tied_w != None and n == tied_w[0]:
+ if isinstance( named_modules[tied_w[1]], QModuleMixin) :
+ setattr(module, n, None) # release refs of tied weights if source is going to be quantized
+ # otherwise don't force load as it will be loaded in the source anyway
+ else:
+ _force_load_parameter(p)
+ del p # del p if not it will still contain a ref to a tensor when leaving the loop
+ for b in module.buffers(recurse = False):
  _force_load_buffer(b)
-
+ del b


  freeze(model_to_quantize)
  torch.cuda.empty_cache()
- gc.collect()
+ gc.collect()
+
+ for tied_module, (tied_weight, src_module, src_weight) in tied_weights.items():
+ p = getattr(named_modules[src_module], src_weight)
+ if isinstance(p, QTensor):
+ setattr(named_modules[tied_module], tied_weight, p ) # copy refs to quantized sources
+
+ del named_modules
+
  quantization_map = _quantization_map(model_to_quantize)
+
  model_to_quantize._quanto_map = quantization_map

  if hasattr(model_to_quantize, "_already_pinned"):
@@ -647,12 +709,81 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10

  return True

+ def _lora_linear_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+ self._check_forward_args(x, *args, **kwargs)
+ adapter_names = kwargs.pop("adapter_names", None)
+ if self.disable_adapters:
+ if self.merged:
+ self.unmerge()
+ result = self.base_layer(x, *args, **kwargs)
+ elif adapter_names is not None:
+ result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs)
+ elif self.merged:
+ result = self.base_layer(x, *args, **kwargs)
+ else:
+ base_weight = self.base_layer.weight
+ if base_weight.shape[-1] < x.shape[-2]: # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
+ for active_adapter in self.active_adapters:
+ lora_A = self.lora_A[active_adapter]
+ lora_B = self.lora_B[active_adapter]
+ scaling = self.scaling[active_adapter]
+ lora_A_weight = lora_A.weight
+ lora_B_weight = lora_B.weight
+ lora_BA = lora_B_weight @ lora_A_weight
+ base_weight += scaling * lora_BA
+
+ result = torch.nn.functional.linear(x, base_weight, bias=self.base_layer.bias)
+ torch_result_dtype = result.dtype
+
+ else:
+ result = self.base_layer(x, *args, **kwargs)
+ torch_result_dtype = result.dtype
+ x = x.to(torch.bfloat16)
+
+ for active_adapter in self.active_adapters:
+ if active_adapter not in self.lora_A.keys():
+ continue
+ lora_A = self.lora_A[active_adapter]
+ lora_B = self.lora_B[active_adapter]
+ dropout = self.lora_dropout[active_adapter]
+ scaling = self.scaling[active_adapter]
+ x = x.to(lora_A.weight.dtype)
+
+ if not self.use_dora[active_adapter]:
+ y = lora_A(x)
+ y = lora_B(y)
+ y*= scaling
+ result+= y
+ del lora_A, lora_B, y
+ # result = result + lora_B(lora_A(dropout(x))) * scaling
+ else:
+ if isinstance(dropout, nn.Identity) or not self.training:
+ base_result = result
+ else:
+ x = dropout(x)
+ base_result = None
+
+ result = result + self.lora_magnitude_vector[active_adapter](
+ x,
+ lora_A=lora_A,
+ lora_B=lora_B,
+ scaling=scaling,
+ base_layer=self.get_base_layer(),
+ base_result=base_result,
+ )
+
+ result = result.to(torch_result_dtype)
+ return result
+
  def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, verboseLevel = -1,):
  verboseLevel = _compute_verbose_level(verboseLevel)

  if inject_adapter_in_model == None or set_weights_and_activate_adapters == None or get_peft_kwargs == None:
  raise Exception("Unable to load Lora, missing 'peft' and / or 'diffusers' modules")
-
+
+ from peft.tuners.lora import Linear
+ Linear.forward = _lora_linear_forward
+
  if not isinstance(lora_path, list):
  lora_path = [lora_path]

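Note: the patched `Linear.forward` above folds the LoRA update into the base weight, computing `W + scaling * B @ A` once, whenever the input's token dimension exceeds the layer's input feature dimension, instead of pushing the whole input through `lora_A` then `lora_B`. Both paths are algebraically equivalent; a quick numerical check with arbitrary (made-up) sizes:

    import torch

    d_in, d_out, r, tokens = 64, 32, 4, 128
    x = torch.randn(tokens, d_in)
    W = torch.randn(d_out, d_in)
    A = torch.randn(r, d_in)        # lora_A weight
    B = torch.randn(d_out, r)       # lora_B weight
    scaling = 0.5

    # One linear with the merged weight vs. base linear plus the two LoRA linears.
    merged = torch.nn.functional.linear(x, W + scaling * (B @ A))
    split = torch.nn.functional.linear(x, W) + scaling * torch.nn.functional.linear(
        torch.nn.functional.linear(x, A), B)
    assert torch.allclose(merged, split, atol=1e-3)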
@@ -662,6 +793,9 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
  for i, path in enumerate(lora_path):
  adapter_name = str(i)

+
+
+
  state_dict = safetensors2.torch_load_file(path)

  keys = list(state_dict.keys())
@@ -843,7 +977,6 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
  verboseLevel = _compute_verbose_level(verboseLevel)

  model = _remove_model_wrapper(model)
-
  if not (".safetensors" in file_path or ".sft" in file_path):
  if pinToMemory:
  raise Exception("Pinning to memory while loading only supported for safe tensors files")
@@ -855,12 +988,20 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType

  if metadata is None:
  quantization_map = None
+ tied_weights_map = None
  else:
  quantization_map = metadata.get("quantization_map", None)
  config = metadata.get("config", None)
  if config is not None:
  model._config = config

+ tied_weights_map = metadata.get("tied_weights_map", None)
+ if tied_weights_map != None:
+ for name, tied_weights_list in tied_weights_map.items():
+ mapped_weight = state_dict[name]
+ for tied_weights in tied_weights_list:
+ state_dict[tied_weights] = mapped_weight
+


  if quantization_map is None:
@@ -915,6 +1056,7 @@ def save_model(model, file_path, do_quantize = False, quantizationType = qint8,
  """

  config = None
+ extra_meta = None
  verboseLevel = _compute_verbose_level(verboseLevel)
  if config_file_path !=None:
  with open(config_file_path, "r", encoding="utf-8") as reader:
@@ -928,8 +1070,10 @@ def save_model(model, file_path, do_quantize = False, quantizationType = qint8,
  config_path = getattr(config_obj,"_name_or_path", None)
  if config_path != None:
  config_fullpath = os.path.join(config_path, "config.json")
-
-
+ config_fullpath = _get_model(config_fullpath)
+
+ # if not os.path.isfile(config_fullpath):
+ # config_fullpath = None
  if config_fullpath is None:
  config_fullpath = os.path.join(os.path.dirname(file_path), "config.json")
  if os.path.isfile(config_fullpath):
@@ -942,15 +1086,50 @@ def save_model(model, file_path, do_quantize = False, quantizationType = qint8,

  quantization_map = getattr(model, "_quanto_map", None)

+ from collections import OrderedDict
+
+ cache_ref = {}
+ tied_weights_map = {}
+ sd = model.state_dict()
+ out_sd = OrderedDict()
+
+
+ for name, weight in sd.items():
+ ref = _get_tensor_ref(weight)
+ match = cache_ref.get(ref, None)
+ if match != None:
+ tied_list = tied_weights_map.get(match, [])
+ tied_list.append(name)
+ tied_weights_map[match] = tied_list
+ else:
+ out_sd[name] = weight
+ cache_ref[ref] = name
+
+ if len(tied_weights_map) > 0:
+ extra_meta = { "tied_weights_map" : tied_weights_map }
+
  if verboseLevel >=1:
  print(f"Saving file '{file_path}")
-
+
+ safetensors2.torch_write_file(out_sd, file_path , quantization_map = quantization_map, config = config, extra_meta= extra_meta)
  if verboseLevel >=1:
  print(f"File '{file_path}' saved")


- def extract_models(
+ def extract_models(obj = None, prefix = None):
+ if isinstance(obj, str): # for compatibility as the two args were switched
+ bkp = prefix
+ prefix = obj
+ obj = bkp
+
  pipe = {}
+ if obj == None:
+ raise Exception("an object to analyze must be provided")
+ if prefix==None or len(prefix)==0:
+ prefix = ""
+ elif prefix[ -1:] != "/":
+ prefix + "/"
+
  for name in dir(obj):
  element = getattr(obj,name)
  if name in ("pipeline", "pipe"):
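Note: with the block above, `save_model` writes each shared tensor only once and records the aliases in a `tied_weights_map`, which `load_model_data` (earlier hunk) uses to rebuild the missing state-dict entries on load. An illustrative shape of that metadata; the weight names are made up for the example:

    # Hypothetical example of the extra metadata produced when tied weights are found:
    extra_meta = {
        "tied_weights_map": {
            "text_model.embed_tokens.weight": ["lm_head.weight"],
        }
    }
    # safetensors2.torch_write_file() serializes non-string entries like this one
    # as JSON, base64-encoded under the key "tied_weights_map_base64".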
@@ -958,16 +1137,16 @@ def extract_models(prefix, obj):
  if hasattr(pipeline , "components") and isinstance(pipeline.components, dict):
  for k, model in pipeline.components.items():
  if model != None:
- pipe[prefix +
- elif isinstance(element, torch.nn.Module):
- if prefix
- pipe[prefix
+ pipe[prefix + k ] = model
+ elif isinstance(element, torch.nn.Module) and name!="base_model":
+ if prefix + name in pipe:
+ pipe[prefix + "_" + name ] = element
  else:
- pipe[prefix
+ pipe[prefix + name ] = element
  elif isinstance(element, dict):
  for k, element in element.items():
  if hasattr(element , "pipeline"):
- pipe.update( extract_models(prefix +
+ pipe.update( extract_models(prefix + k,element ))


  return pipe
@@ -989,6 +1168,10 @@ class offload:
  self.active_models_ids = []
  self.active_subcaches = {}
  self.models = {}
+ self.cotenants_map = {
+ "text_encoder": ["vae", "text_encoder_2"],
+ "text_encoder_2": ["vae", "text_encoder"],
+ }
  self.verboseLevel = 0
  self.blocks_of_modules = {}
  self.blocks_of_modules_sizes = {}
@@ -1002,14 +1185,13 @@ class offload:
  self.default_stream = torch.cuda.default_stream(torch.device("cuda")) # torch.cuda.current_stream()
  self.transfer_stream = torch.cuda.Stream()
  self.async_transfers = False
+ self.parameters_ref = {}
  global last_offload_obj
  last_offload_obj = self


- def add_module_to_blocks(self, model_id, blocks_name, submodule, prev_block_name):
+ def add_module_to_blocks(self, model_id, blocks_name, submodule, prev_block_name, submodule_name):

- if blocks_name is None:
- pass
  entry_name = model_id if blocks_name is None else model_id + "/" + blocks_name
  if entry_name in self.blocks_of_modules:
  blocks_params = self.blocks_of_modules[entry_name]
@@ -1023,39 +1205,54 @@ class offload:
  self.prev_blocks_names[entry_name] = prev_entry_name
  if not prev_block_name == None:
  self.next_blocks_names[prev_entry_name] = entry_name
-
+ bef = blocks_params_size
  for k,p in submodule.named_parameters(recurse=False):
+ param_size = 0
+ ref = _get_tensor_ref(p)
+ tied_param = self.parameters_ref.get(ref, None)

  if isinstance(p, QTensor):
- blocks_params.append( (submodule, k, p, False ) )
+ blocks_params.append( (submodule, k, p, False, tied_param ) )

  if p._qtype == qint4:
  if hasattr(p,"_scale_shift"):
-
-
+ param_size += torch.numel(p._scale_shift) * p._scale_shift.element_size()
+ param_size += torch.numel(p._data._data) * p._data._data.element_size()
  else:
-
-
-
+ param_size += torch.numel(p._scale) * p._scale.element_size()
+ param_size += torch.numel(p._shift) * p._shift.element_size()
+ param_size += torch.numel(p._data._data) * p._data._data.element_size()
  else:
-
-
+ param_size += torch.numel(p._scale) * p._scale.element_size()
+ param_size += torch.numel(p._data) * p._data.element_size()
  else:
- blocks_params.append( (submodule, k, p, False) )
-
+ blocks_params.append( (submodule, k, p, False, tied_param) )
+ param_size += torch.numel(p.data) * p.data.element_size()
+
+
+ if tied_param == None:
+ blocks_params_size += param_size
+ self.parameters_ref[ref] = (submodule, k)

  for k, p in submodule.named_buffers(recurse=False):
- blocks_params.append( (submodule, k, p, True) )
+ blocks_params.append( (submodule, k, p, True, None) )
  blocks_params_size += p.data.nbytes

+ aft = blocks_params_size
+
+ # if blocks_name is None:
+ # print(f"Default: {model_id}/{submodule_name} : {(aft-bef)/ONE_MB:0.2f} MB")
+ # pass
+

  self.blocks_of_modules_sizes[entry_name] = blocks_params_size

+
  return blocks_params_size


  def can_model_be_cotenant(self, model_id):
- potential_cotenants= cotenants_map.get(model_id, None)
+ potential_cotenants= self.cotenants_map.get(model_id, None)
  if potential_cotenants is None:
  return False
  for existing_cotenant in self.active_models_ids:
@@ -1073,20 +1270,23 @@ class offload:
  def cpu_to_gpu(stream_to_use, blocks_params): #, record_for_stream = None
  with torch.cuda.stream(stream_to_use):
  for param in blocks_params:
- parent_module, n, p, is_buffer = param
+ parent_module, n, p, is_buffer, tied_param = param
+ if tied_param != None:
+ tied_p = getattr( tied_param[0], tied_param[1])
+ if tied_p.is_cuda:
+ setattr(parent_module, n , tied_p)
+ continue
+
  q = p.to("cuda", non_blocking=True)
  if is_buffer:
  q = torch.nn.Buffer(q)
  else:
  q = torch.nn.Parameter(q , requires_grad=False)
  setattr(parent_module, n , q)
- # if record_for_stream != None:
- # if isinstance(p, QTensor):
- # q._data.record_stream(record_for_stream)
- # q._scale.record_stream(record_for_stream)
- # else:
- # p.data.record_stream(record_for_stream)

+ if tied_param != None:
+ setattr( tied_param[0], tied_param[1], q)
+ del p, q
  any_past_block = False

  loaded_block = self.loaded_blocks[model_id]
@@ -1108,24 +1308,24 @@ class offload:
  first = self.prev_blocks_names[entry_name] == None or not any_past_block
  next_blocks_entry = self.next_blocks_names[entry_name] if entry_name in self.next_blocks_names else None
  if first:
- cpu_to_gpu(torch.cuda.current_stream(), self.blocks_of_modules[entry_name])
  if self.verboseLevel >=2:
  if preload:
  print(f"Preloading model {entry_name} ({model_name}) in GPU")
  else:
  print(f"Loading model {entry_name} ({model_name}) in GPU")
+ cpu_to_gpu(torch.cuda.current_stream(), self.blocks_of_modules[entry_name])

  torch.cuda.synchronize()

  if next_blocks_entry != None:
- cpu_to_gpu(self.transfer_stream, self.blocks_of_modules[next_blocks_entry]) #, self.default_stream
  if self.verboseLevel >=2:
  print(f"Prefetching model {next_blocks_entry} ({model_name}) in GPU")
+ cpu_to_gpu(self.transfer_stream, self.blocks_of_modules[next_blocks_entry]) #, self.default_stream

  else:
- cpu_to_gpu(self.default_stream, self.blocks_of_modules[entry_name])
  if self.verboseLevel >=2:
  print(f"Loading model {entry_name} ({model_name}) in GPU")
+ cpu_to_gpu(self.default_stream, self.blocks_of_modules[entry_name])
  torch.cuda.synchronize()

  if not preload:
@@ -1149,12 +1349,13 @@ class offload:

  blocks_params = self.blocks_of_modules[blocks_name]
  for param in blocks_params:
- parent_module, n, p, is_buffer = param
+ parent_module, n, p, is_buffer, _ = param
  if is_buffer:
  q = torch.nn.Buffer(p)
  else:
  q = torch.nn.Parameter(p , requires_grad=False)
  setattr(parent_module, n , q)
+ del p, q
  # cl.stop()
  # print(f"unload time: {cl.format_time_gap()}")

@@ -1168,9 +1369,6 @@ class offload:
  for block_name in self.preloaded_blocks_per_model[model_id]:
  self.gpu_load_blocks(model_id, block_name, True)

-
- # torch.cuda.current_stream().synchronize()
-
  def unload_all(self):
  for model_id in self.active_models_ids:
  self.gpu_unload_blocks(model_id, None)
@@ -1246,6 +1444,16 @@ class offload:

  return False

+ def ensure_model_loaded(self, model_id):
+ if model_id in self.active_models_ids:
+ return
+ # new_model_id = getattr(module, "_mm_id")
+ # do not always unload existing models if it is more efficient to keep in them in the GPU
+ # (e.g: small modules whose calls are text encoders)
+ if not self.can_model_be_cotenant(model_id) :
+ self.unload_all()
+ self.gpu_load(model_id)
+
  def hook_preload_blocks_for_compilation(self, target_module, model_id,blocks_name, context):

  # @torch.compiler.disable()
@@ -1259,16 +1467,27 @@ class offload:
  target_module.register_forward_pre_hook(preload_blocks_for_compile)


- def hook_check_empty_cache_needed(self, target_module, model_id,blocks_name, previous_method, context):
+ def hook_check_empty_cache_needed(self, target_module, model_id, blocks_name, previous_method, context):

  qint4quantization = isinstance(target_module, QModuleMixin) and target_module.weight!= None and target_module.weight.qtype == qint4
  if qint4quantization:
  pass

-
- #
-
+ if hasattr(target_module, "_mm_id"):
+ # no hook for a shared module with no weights (otherwise this will cause models loading / unloading for nothing)
+ orig_model_id = getattr(target_module, "_mm_id")
+ if self.verboseLevel >=2:
+ print(f"Model '{model_id}' shares module '{target_module._get_name()}' with module(s) '{orig_model_id}' ")
+ assert not self.any_param_or_buffer(target_module)
+ if not isinstance(orig_model_id, list):
+ orig_model_id = [orig_model_id]
+ orig_model_id.append(model_id)
+ setattr(target_module, "_mm_id", orig_model_id)
+ target_module.forward = target_module._mm_forward
+ return

+ def check_empty_cuda_cache(module, *args, **kwargs):
+ self.ensure_model_loaded(model_id)
  if blocks_name == None:
  if self.ready_to_check_mem():
  self.empty_cache_if_needed()
@@ -1279,34 +1498,18 @@ class offload:

  return previous_method(*args, **kwargs)

-
- if hasattr(target_module, "_mm_id"):
- orig_model_id = getattr(target_module, "_mm_id")
- if self.verboseLevel >=2:
- print(f"Model '{model_id}' shares module '{target_module._get_name()}' with module '{orig_model_id}' ")
- assert not self.any_param_or_buffer(target_module)
-
- return
  setattr(target_module, "_mm_id", model_id)
+ setattr(target_module, "_mm_forward", previous_method)
+
  setattr(target_module, "forward", functools.update_wrapper(functools.partial(check_empty_cuda_cache, target_module), previous_method) )


  def hook_change_module(self, target_module, model, model_id, module_id, previous_method):
-
-
-
- new_model_id = getattr(module, "_mm_id")
- # do not always unload existing models if it is more efficient to keep in them in the GPU
- # (e.g: small modules whose calls are text encoders)
- if not self.can_model_be_cotenant(new_model_id) :
- self.unload_all()
- performEmptyCacheTest = False
- self.gpu_load(new_model_id)
+
+ def check_change_module(module, *args, **kwargs):
+ self.ensure_model_loaded(model_id)
  # transfer leftovers inputs that were incorrectly created in the RAM (mostly due to some .device tests that returned incorrectly "cpu")
  args, kwargs = self.move_args_to_gpu(*args, **kwargs)
- if performEmptyCacheTest:
- self.empty_cache_if_needed()
-
  return previous_method(*args, **kwargs)

  if hasattr(target_module, "_mm_id"):
@@ -1337,6 +1540,8 @@ class offload:
  base_size = self.blocks_of_modules_sizes[model_id]
  current_budget -= base_size
  if current_budget <= 0:
+ if self.verboseLevel >=1:
+ print(f"Async loading plan for model '{model_id}' : due to limited budget, beside the async shuttle only only base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
  return

  towers = []
@@ -1357,6 +1562,8 @@ class offload:
  total_size += tower_size
  current_budget -= 2 * max_floor_size
  if current_budget <= 0:
+ if self.verboseLevel >=1:
+ print(f"Async loading plan for model '{model_id}' : due to limited budget, beside the async shuttle only the base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
  return


@@ -1366,6 +1573,8 @@ class offload:
  preload_total += preload_blocks_count * max_floor_size
  max_blocks_fetch = max(max_floor_size, max_blocks_fetch)
  if preload_blocks_count <= 0:
+ if self.verboseLevel >=1:
+ print(f"Async loading plan for model '{model_id}' : due to limited budget, beside the async shuttle only the base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
  return

  nb_blocks= len(floors)
@@ -1396,11 +1605,11 @@ class offload:

  self.preloaded_blocks_per_model[model_id] = preloaded_blocks

- if self.verboseLevel >=
- print(f"Async loading plan for model '{model_id}' : {preload_total/ONE_MB:0.2f} MB will be preloaded ({preload_total/total_size*100:0.1f}% of recurrent layers data) with a {max_blocks_fetch/ONE_MB:0.2f} MB async shuttle")
+ if self.verboseLevel >=1:
+ print(f"Async loading plan for model '{model_id}' : {(preload_total+base_size)/ONE_MB:0.2f} MB will be preloaded (base size of {base_size/ONE_MB:0.2f} MB + {preload_total/total_size*100:0.1f}% of recurrent layers data) with a {max_blocks_fetch/ONE_MB:0.2f} MB async" + (" circular" if len(towers) == 1 else "") + " shuttle")


- def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, verboseLevel = -1):
+ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
  """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
  pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
  quantizeTransformer: set True by default will quantize on the fly the video / image model
@@ -1417,9 +1626,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
  model_budgets = {}

  windows_os = os.name == 'nt'
- global total_pinned_bytes

-
  budget = 0
  if not budgets is None:
  if isinstance(budgets , dict):
@@ -1448,6 +1655,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
  verboseLevel = _compute_verbose_level(verboseLevel)

  _welcome()
+ if coTenantsMap != None:
+ self.cotenants_map = coTenantsMap

  self.models = models

@@ -1528,9 +1737,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
  current_model_size += torch.numel(p.data) * p.data.element_size()

  for b in current_model.buffers():
-
- # convert any left overs float32 weight to bloat16 to divide by 2 the model memory footprint
- b.data = b.data.to(torch.bfloat16)
+ # do not convert 32 bits float to 16 bits since buffers are few (and potential gain low) and usually they are needed for precision calculation (for instance Rope)
  current_model_size += torch.numel(b.data) * b.data.element_size()

  if modelPinned:
@@ -1538,17 +1745,39 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru


  model_budget = model_budgets[model_id] * ONE_MB if model_id in model_budgets else budget
-
+ if workingVRAM != None:
+ model_minimumVRAM = -1
+ if isinstance(workingVRAM, dict):
+ if model_id in workingVRAM:
+ model_minimumVRAM = workingVRAM[model_id]
+ elif "*" in model_id in workingVRAM:
+ model_minimumVRAM = workingVRAM["*"]
+ else:
+ model_minimumVRAM = workingVRAM
+ if model_minimumVRAM > 0:
+ new_budget = self.device_mem_capacity - model_minimumVRAM * ONE_MB
+ new_budget = 1 if new_budget < 0 else new_budget
+ model_budget = new_budget if model_budget == 0 or new_budget < model_budget else model_budget
  if model_budget > 0 and model_budget > current_model_size:
  model_budget = 0
+ coef =0.8
+ if current_model_size > coef * self.device_mem_capacity and model_budget == 0 or model_budget > coef * self.device_mem_capacity:
+ if verboseLevel >= 1:
+ if model_budget == 0:
+ print(f"Model '{model_id}' is too large ({current_model_size/ONE_MB:0.1f} MB) to fit entirely in {coef * 100}% of the VRAM (max capacity is {coef * self.device_mem_capacity/ONE_MB}) MB)")
+ else:
+ print(f"Budget ({budget/ONE_MB:0.1f} MB) for Model '{model_id}' is too important so that this model can fit in the VRAM (max capacity is {self.device_mem_capacity/ONE_MB}) MB)")
+ print(f"Budget allocation for this model has been consequently reduced to the 80% of max GPU Memory ({coef * self.device_mem_capacity/ONE_MB:0.1f} MB). This may not leave enough working VRAM and you will probably need to define manually a lower budget for this model.")
+ model_budget = coef * self.device_mem_capacity
+

- model_budgets[model_id] = model_budget
+ model_budgets[model_id] = model_budget

  partialPinning = False

  if estimatesBytesToPin > 0 and estimatesBytesToPin >= (max_reservable_memory - total_pinned_bytes):
  if self.verboseLevel >=1:
- print(f"Switching to partial pinning since full requirements for pinned models is {estimatesBytesToPin/ONE_MB:0.1f} MB while estimated reservable RAM is {max_reservable_memory/ONE_MB:0.1f} MB. You may increase the value of parameter 'perc_reserved_mem_max' to a value higher than {perc_reserved_mem_max:0.2f} to force full pinnning." )
+ print(f"Switching to partial pinning since full requirements for pinned models is {estimatesBytesToPin/ONE_MB:0.1f} MB while estimated available reservable RAM is {(max_reservable_memory-total_pinned_bytes)/ONE_MB:0.1f} MB. You may increase the value of parameter 'perc_reserved_mem_max' to a value higher than {perc_reserved_mem_max:0.2f} to force full pinnning." )
  partialPinning = True

  # Hook forward methods of modules
@@ -1577,15 +1806,14 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
  _pin_to_memory(current_model, model_id, partialPinning= partialPinning, verboseLevel=verboseLevel)

  current_budget = model_budgets[model_id]
- cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq = None, None, None, -1
+ cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False
  self.loaded_blocks[model_id] = None

  for submodule_name, submodule in current_model.named_modules():
  # create a fake 'accelerate' parameter so that the _execution_device property returns always "cuda"
  # (it is queried in many pipelines even if offloading is not properly implemented)
- if
+ if not hasattr(submodule, "_hf_hook"):
  setattr(submodule, "_hf_hook", HfHook())
-
  if current_budget > 0 and len(submodule_name) > 0:
  if cur_blocks_prefix != None:
  if submodule_name.startswith(cur_blocks_prefix):
@@ -1593,20 +1821,20 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
  depth_name = submodule_name.split(".")
  level = depth_name[len(depth_prefix)-1]
  pre , num = _extract_num_from_str(level)
- if num != cur_blocks_seq
+ if num != cur_blocks_seq and not (is_mod_seq and cur_blocks_seq>=0):
  prev_blocks_name = cur_blocks_name
  cur_blocks_name = cur_blocks_prefix + str(num)
  # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
  cur_blocks_seq = num
  else:
- cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq = None, None, None, -1
+ cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False

  if cur_blocks_prefix == None:
  pre , num = _extract_num_from_str(submodule_name)
  if isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
- cur_blocks_prefix, prev_blocks_name, cur_blocks_seq = pre + ".", None, -1
+ cur_blocks_prefix, prev_blocks_name, cur_blocks_seq, is_mod_seq = pre + ".", None, -1, isinstance(submodule, torch.nn.Sequential)
  elif num >=0:
- cur_blocks_prefix, prev_blocks_name, cur_blocks_seq = pre, None, num
+ cur_blocks_prefix, prev_blocks_name, cur_blocks_seq, is_mod_seq = pre, None, num, False
  cur_blocks_name = submodule_name
  # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")

@@ -1621,7 +1849,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
  else:
  self.hook_check_empty_cache_needed(submodule, model_id, cur_blocks_name, submodule_method, context = submodule_name )

- self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name)
+ self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name, submodule_name)

  self.tune_preloading(model_id, current_budget, towers_names)

@@ -1635,9 +1863,10 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
  elif prev_num - start_num <=1:
  print(f"Size of submodel '{n+ str(start_num)}': {prev_size/ONE_MB:.1f} MB")
  else:
- print(f"Size of submodel '{n+ str(start_num) +'-'+ str(prev_num)}': {prev_size/ONE_MB:.1f} MB")
+ print(f"Size of submodel '{n+ str(start_num) +'-'+ str(prev_num)}': {(prev_num-start_num+1)*prev_size/ONE_MB:.1f} MB ({prev_size/ONE_MB:.1f} MB x {prev_num-start_num+1})")

  for n, size in self.blocks_of_modules_sizes.items():
+ size = int(size / 10000)* 10000
  pre, num = _extract_num_from_str(n) if "/" in n else (n, -1)
  if prev_pre == None :
  start_num = num
@@ -1709,21 +1938,21 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
  if profile_no == profile_type.HighRAM_HighVRAM:
  pinnedMemory= True
  budgets = None
- info = "You have chosen a profile that
+ info = "You have chosen a profile that may require 48 GB of RAM and up to 24 GB of VRAM on some applications."
  elif profile_no == profile_type.HighRAM_LowVRAM:
  pinnedMemory= True
  budgets["*"] = 3000
- info = "You have chosen a profile that
+ info = "You have chosen a profile that may require 48 GB of RAM and up to 12 GB of VRAM on some applications."
  elif profile_no == profile_type.LowRAM_HighVRAM:
  pinnedMemory= "transformer"
  extraModelsToQuantize = default_extraModelsToQuantize
  budgets = None
- info = "You have chosen a Medium speed profile that
+ info = "You have chosen a Medium speed profile that may require 32 GB of RAM and up to 24 GB of VRAM on some applications."
  elif profile_no == profile_type.LowRAM_LowVRAM:
  pinnedMemory= "transformer"
  extraModelsToQuantize = default_extraModelsToQuantize
  budgets["*"] = 3000
- info = "You have chosen a profile that
+ info = "You have chosen a profile that usually may require 32 GB of RAM and up to 12 GB of VRAM on some applications."
  elif profile_no == profile_type.VerylowRAM_LowVRAM:
  pinnedMemory= False
  extraModelsToQuantize = default_extraModelsToQuantize
@@ -1731,9 +1960,10 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
  if "transformer" in modules:
  budgets["transformer"] = 400
  #asyncTransfers = False
- info = "You have chosen the slowest profile that
+ info = "You have chosen the slowest profile that may require 24 GB of RAM and up to 10 GB of VRAM on some applications."
  else:
  raise Exception("Unknown profile")
+ info += " Actual requirements may varry depending on the application or on the tuning done to the profile."

  if budgets != None and len(budgets) == 0:
  budgets = None
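Note: beyond the internal changes, this release extends the `offload.all()` signature with `workingVRAM` (an amount of VRAM, in MB, to keep free for a model's computations) and `coTenantsMap` (which models may stay co-resident in VRAM). A hedged usage sketch; `pipe` is assumed to be a pipeline already built by the caller, and the model ids and values are illustrative:

    from mmgp import offload

    offload.all(
        pipe,
        pinnedMemory=True,
        workingVRAM={"transformer": 3000},        # keep roughly 3 GB of VRAM free while the transformer runs
        coTenantsMap={"text_encoder": ["vae"]},   # allow these models to share the GPU at the same time
    )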
mmgp/safetensors2.py
CHANGED
@@ -146,7 +146,7 @@ def _read_safetensors_header(path, file):
  return catalog, metadata, length_of_header + 8


- def torch_write_file(sd, file_path, quantization_map = None, config = None):
+ def torch_write_file(sd, file_path, quantization_map = None, config = None, extra_meta = None):
  from collections import OrderedDict
  sf_sd = OrderedDict()

@@ -189,6 +189,14 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):
  if not config is None:
  metadata["config_base64"] = base64.b64encode(json.dumps(config, ensure_ascii=False).encode('utf8')).decode('utf8')

+ if not extra_meta is None:
+ for n , m in extra_meta.items():
+ if isinstance(m, str):
+ metadata[n] = m
+ else:
+ metadata[n + "_base64"] = base64.b64encode(json.dumps(m, ensure_ascii=False).encode('utf8')).decode('utf8')
+
+
  if len(metadata) > 0:
  sf_sd["__metadata__"] = metadata

@@ -443,6 +451,4 @@ try:
     transformers.modeling_utils.safe_open = safe_open
     transformers.modeling_utils.safe_load_file = torch_load_file
 except:
-    pass
-
-
+    pass
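With the change above, torch_write_file stores string extra_meta values verbatim in the safetensors header metadata and wraps any other value as base64-encoded JSON under the key name suffixed with "_base64". A minimal sketch of reading such an entry back; the file name and the "note" / "myinfo" keys are hypothetical.

```python
# Sketch only: file name and metadata keys are hypothetical examples.
import base64
import json
from safetensors import safe_open

with safe_open("model.safetensors", framework="pt") as f:
    meta = f.metadata() or {}

note = meta.get("note")  # plain string values are stored as-is
if "myinfo_base64" in meta:
    # non-string values come back as base64-encoded JSON
    myinfo = json.loads(base64.b64decode(meta["myinfo_base64"]).decode("utf8"))
```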
{mmgp-3.1.4.post15.dist-info → mmgp-3.1.4.post151.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: mmgp
-Version: 3.1.4.post15
+Version: 3.1.4.post151
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -17,7 +17,7 @@ Requires-Dist: peft
 
 
 <p align="center">
-<H2>Memory Management 3.1.4 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.1.4-151 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
@@ -44,21 +44,23 @@ Each profile may use a combination of the following:
 
 ## Sample applications that use mmgp
 It is recommended to have a look at these applications to see how mmgp was implemented in each of them:
-- Hunyuan3D-2GP: https://github.com/deepbeepmeep/Hunyuan3D-2GP
+- Hunyuan3D-2GP: https://github.com/deepbeepmeep/Hunyuan3D-2GP :\
 A great image to 3D and text to 3D tool by the Tencent team. Thanks to mmgp it can run with less than 6 GB of VRAM.
 
-- HuanyuanVideoGP: https://github.com/deepbeepmeep/HunyuanVideoGP
+- HunyuanVideoGP: https://github.com/deepbeepmeep/HunyuanVideoGP :\
 One of the best open source Text to Video generators.
 
-- FluxFillGP: https://github.com/deepbeepmeep/FluxFillGP
+- FluxFillGP: https://github.com/deepbeepmeep/FluxFillGP :\
 One of the best inpainting / outpainting tools based on Flux that can run with less than 12 GB of VRAM.
 
-- Cosmos1GP: https://github.com/deepbeepmeep/Cosmos1GP
+- Cosmos1GP: https://github.com/deepbeepmeep/Cosmos1GP :\
 This application includes two models: a text to world generator and an image / video to world generator (probably the best open source image to video generator).
 
-- OminiControlGP: https://github.com/deepbeepmeep/OminiControlGP
+- OminiControlGP: https://github.com/deepbeepmeep/OminiControlGP :\
 A very powerful Flux-derived application that can be used to transfer an object of your choice into a prompted scene. With mmgp you can run it with only 6 GB of VRAM.
 
+- YuE GP: https://github.com/deepbeepmeep/YuEGP :\
+A great song generator (instruments + singer's voice) based on prompted lyrics and a genre description. Thanks to mmgp you can run it with less than 10 GB of VRAM without waiting forever.
 
 ## Installation
 First you need to install the module in your current project with:
@@ -88,7 +90,7 @@ You can choose between 5 profiles depending on your hardware:
 - LowRAM_LowVRAM (4): at least 32 GB of RAM and 12 GB of VRAM : if you have little VRAM or want to generate longer videos / more images
 - VerylowRAM_LowVRAM (5): at least 24 GB of RAM and 10 GB of VRAM : if you don't have much it won't be fast but maybe it will work
 
-Profile 2 (High RAM) and 4 (Low RAM)are the most recommended profiles since they are versatile (support for long videos for a slight performance cost).\
+Profiles 2 (High RAM) and 4 (Low RAM) are the most recommended profiles since they are versatile (support for long videos for a slight performance cost).\
 If you use a Flux-derived application, profiles 1 and 3 will offer much faster generation times.
 In any case, a safe approach is to start from profile 5 (the default profile) and then go down progressively to profile 4 and then to profile 2 as long as the app remains responsive and doesn't trigger any out of memory error.
 
@@ -114,11 +116,13 @@ For example:
 - pinnedMemory: Boolean (for all models) or list of model ids to pin to RAM. Every model pinned to RAM will load much faster (up to 2 times) but this requires more RAM.
 - quantizeTransformer: boolean, True by default. The 'transformer' model in the pipe usually contains the video or image generator and is quantized on the fly to 8 bits by default. If you want to save disk space and reduce the loading time, you may want to load a prequantized model directly. If you don't want to quantize the image generator, you need to set the option *quantizeTransformer* to *False* to turn off on the fly quantization.
 - extraModelsToQuantize: list of additional model ids to quantize on the fly. If the corresponding model is already quantized, this option will be ignored.
-- budgets: either a number in mega bytes (for all models, if 0 unlimited budget) or a dictionary that maps model ids to mega bytes : define the budget in
+- budgets: either a number in megabytes (for all models, 0 meaning an unlimited budget) or a dictionary that maps model ids to megabytes: defines the approximate budget in megabytes that is allocated in VRAM for a model. Try not to allocate all the available VRAM so that the rest can be used to process the data. To define the default value in the dictionary, you may add an entry named "*".
 The smaller this number, the more VRAM is left for image data / longer videos, but also the slower the generation, because there will be lots of loading / unloading between the RAM and the VRAM. If a model is too big to fit in its budget, it will be broken down into multiple parts that will be loaded / unloaded in sequence. The speed of a low budget can be increased (up to 2 times) by turning on the options pinnedMemory and asyncTransfers.
+- workingVRAM: either a number in megabytes or a dictionary that maps model ids to a number in megabytes corresponding to the minimum amount of VRAM that should be left for the data processed by the model. This number prevails if it conflicts with a budget set too high for the same model.
 - asyncTransfers: boolean, load the next model part to the GPU while the current part is being processed. This requires twice the budget if any is defined. This may increase speed by 20% (mostly visible on fast modern GPUs).
 - verboseLevel: number between 0 and 2 (1 by default), provides various levels of feedback on the different processes.
 - compile: list of model ids to compile, may accelerate generation by up to x2 depending on the type of GPU. It makes sense to compile only the model that is frequently used, such as the "transformer" model in the case of video or image generation. Compilation requires Triton to be installed. Triton is available out of the box on Linux or WSL but needs to be installed separately on Windows: https://github.com/woct0rdho/triton-windows
+- coTenantsMap: a dictionary that maps a model id to a list of other models with which it accepts to share the VRAM at the same time. This is useful to avoid inefficient loading / unloading when the processes of two models are interleaved. For instance with *coTenantsMap = { "text_encoder_2": ["text_encoder"] }*, loading *text_encoder_2* won't unload *text_encoder*. Please note that the reverse is not true, as these maps are by design not symmetrical, to allow tailored workflows. If you also need *text_encoder* not to unload *text_encoder_2* when the latter is already loaded, use *coTenantsMap = { "text_encoder_2": ["text_encoder"], "text_encoder": ["text_encoder_2"] }*.
 
 If you are short on RAM and plan to work with quantized models, it is recommended to load pre-quantized models directly rather than using on the fly quantization; it will be faster and consume slightly less RAM.
 
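To make the offloading options documented in the hunk above concrete, here is a hedged sketch that combines several of them in one call. It assumes offload.all accepts these keywords as described, and it uses toy modules in place of a real pipeline; the model ids mirror the usual diffusers names.

```python
# Sketch under assumptions: toy modules stand in for a real pipe or for
# the dictionary returned by offload.extract_models().
import torch
from mmgp import offload

pipe = {
    "transformer": torch.nn.Linear(64, 64),
    "text_encoder": torch.nn.Linear(64, 64),
    "text_encoder_2": torch.nn.Linear(64, 64),
}

offload.all(
    pipe,
    pinnedMemory=["transformer"],                       # pin only the big model to RAM
    budgets={"*": 3000, "transformer": 8000},           # VRAM budgets in MB, "*" is the default
    workingVRAM={"transformer": 4000},                  # keep VRAM free for the processed data
    asyncTransfers=True,                                # preload the next model part
    coTenantsMap={"text_encoder_2": ["text_encoder"]},  # text_encoder_2 won't evict text_encoder
    verboseLevel=1,
)
```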
@@ -126,11 +130,14 @@ If you are short on RAM and plan to work with quantized models, it is recommende
 
 The module includes several tools to package a light version of your favorite video / image generator:
 - *extract_models(string prefix, obj to explore)*\
-This tool will try to detect for you models that are embedded in a pipeline or in some custom class. It will save you time by building a pipe dictionary required
+This tool will try to detect for you the models that are embedded in a pipeline or in some custom class. It will save you time by building the pipe dictionary required by *offload.all* or *offload.profile*. The prefix corresponds to the text that will appear before the name of each model in the dictionary.
 
-- *load_loras_into_model(model, lora_path, lora_multi)*\
+- *load_loras_into_model(model, lora_path, lora_multi, activate_all_loras = True)*\
 Load into a model a list of Loras described by a list of paths *lora_path* and a list of *weight coefficients*.
-The Lora file must be in the *diffusers* format. This function works also on non diffusers models. However if there is already an official Lora support for a model it is recommended to use the official diffusers functions.
+The Lora files must be in the *diffusers* format. This function also works on non diffusers models. However, if there is already official Lora support for a model, it is recommended to use the official diffusers functions. By default all the loaded Loras are activated; alternatively they can be activated later using *activate_loras*.
+
+- *activate_loras(model, lora_nos, lora_multi = None )*\
+Activate the Loras whose numbers are in the list *lora_nos*. Every Lora that is not in this list and that was previously activated will be deactivated.
 
 - *save_model(model, file_path, do_quantize = False, quantizationType = qint8 )*\
 Save the tensors of a model already loaded in memory in safetensors format (much faster to reload). You can save it in a quantized format (the default qint8 quantization is recommended).
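A short sketch of the Lora helpers and save_model described in the hunk above; the model variable and the Lora file paths are placeholders, and the signatures follow the README text rather than a verified API reference.

```python
# Sketch only: `model` and the .safetensors paths are placeholders.
from mmgp import offload
from optimum.quanto import qint8

model = ...  # an nn.Module loaded earlier, e.g. the pipe's transformer

# Load two diffusers-format Loras without activating them immediately.
offload.load_loras_into_model(
    model,
    ["loras/style.safetensors", "loras/detail.safetensors"],
    lora_multi=[1.0, 0.8],
    activate_all_loras=False,
)

# Activate only Lora no 0; any previously active Lora not in the list is deactivated.
offload.activate_loras(model, [0], lora_multi=[1.0])

# Save the patched model in quantized safetensors form for much faster reloads.
offload.save_model(model, "model_qint8.safetensors", do_quantize=True, quantizationType=qint8)
```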
mmgp-3.1.4.post151.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+mmgp/offload.py,sha256=4gGj7ibuilYEk7N4_x9M_fx6tYO2OUU8jeSq5YsF_0E,85992
+mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
+mmgp-3.1.4.post151.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+mmgp-3.1.4.post151.dist-info/METADATA,sha256=qBS_HYUidog3kLKr25x0YJ7EyCCCcHbghXrFJcYoUZE,15946
+mmgp-3.1.4.post151.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+mmgp-3.1.4.post151.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+mmgp-3.1.4.post151.dist-info/RECORD,,
mmgp-3.1.4.post15.dist-info/RECORD DELETED
@@ -1,9 +0,0 @@
-__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
-mmgp/offload.py,sha256=DEGTt5RPoLx9JK-d7Ld_B_rIuQrmhblQJw3V5CL9Lo8,74519
-mmgp/safetensors2.py,sha256=OkJAvENfWeb-PL0FcxS1-eYeHLbemTaNXYvNxURrzIs,16154
-mmgp-3.1.4.post15.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
-mmgp-3.1.4.post15.dist-info/METADATA,sha256=IMmhK6xAu0A96mLlpby9V2H-K8RYIqRpORaBngvtC0U,14278
-mmgp-3.1.4.post15.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-mmgp-3.1.4.post15.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
-mmgp-3.1.4.post15.dist-info/RECORD,,
{mmgp-3.1.4.post15.dist-info → mmgp-3.1.4.post151.dist-info}/LICENSE.md: File without changes
{mmgp-3.1.4.post15.dist-info → mmgp-3.1.4.post151.dist-info}/WHEEL: File without changes
{mmgp-3.1.4.post15.dist-info → mmgp-3.1.4.post151.dist-info}/top_level.txt: File without changes