mmgp 3.1.4.post15__py3-none-any.whl → 3.1.4.post1519__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mmgp has been flagged as possibly problematic by the registry.
- mmgp/offload.py +421 -187
- mmgp/safetensors2.py +10 -4
- {mmgp-3.1.4.post15.dist-info → mmgp-3.1.4.post1519.dist-info}/METADATA +19 -12
- mmgp-3.1.4.post1519.dist-info/RECORD +9 -0
- mmgp-3.1.4.post15.dist-info/RECORD +0 -9
- {mmgp-3.1.4.post15.dist-info → mmgp-3.1.4.post1519.dist-info}/LICENSE.md +0 -0
- {mmgp-3.1.4.post15.dist-info → mmgp-3.1.4.post1519.dist-info}/WHEEL +0 -0
- {mmgp-3.1.4.post15.dist-info → mmgp-3.1.4.post1519.dist-info}/top_level.txt +0 -0
mmgp/offload.py
CHANGED
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.1.4 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.1.4-1519 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -76,7 +76,18 @@ except:
 from mmgp import safetensors2
 from mmgp import profile_type

-from optimum.quanto import freeze, qfloat8, qint4 , qint8, quantize, QModuleMixin, QTensor, quantize_module
+from optimum.quanto import freeze, qfloat8, qint4 , qint8, quantize, QModuleMixin, QTensor, quantize_module, register_qmodule
+
+# support for Embedding module quantization that is not supported by default by quanto
+@register_qmodule(torch.nn.Embedding)
+class QEmbedding(QModuleMixin, torch.nn.Embedding):
+    @classmethod
+    def qcreate(cls, module, weights, activations = None, optimizer = None, device = None):
+        module.bias = None
+        return cls( module.num_embeddings, module.embedding_dim, module.padding_idx , module.max_norm, module.norm_type, module.scale_grad_by_freq, module.sparse, dtype=module.weight.dtype, device=device, weights=weights,
+                    activations=activations, optimizer=optimizer, quantize_input=True)
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return torch.nn.functional.embedding( input, self.qweight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse )


 shared_state = {}
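Note on the hunk above: registering QEmbedding through optimum.quanto's register_qmodule decorator is what lets a plain quantize()/freeze() pass also reach torch.nn.Embedding layers, which quanto does not handle by default (as the in-diff comment says). The snippet below is only a usage sketch under that assumption; the TinyModel class and its shapes are illustrative and not part of the package.

    import torch
    from mmgp import offload              # importing the module runs the QEmbedding registration above
    from optimum.quanto import freeze, qint8, quantize

    class TinyModel(torch.nn.Module):     # hypothetical toy model, not from mmgp
        def __init__(self):
            super().__init__()
            self.emb = torch.nn.Embedding(1000, 64)
            self.proj = torch.nn.Linear(64, 64)
        def forward(self, ids):
            return self.proj(self.emb(ids))

    model = TinyModel()
    quantize(model, weights=qint8)        # with QEmbedding registered, the Embedding is expected to be quantized too
    freeze(model)
    print(model(torch.randint(0, 1000, (2, 8))).shape)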
@@ -96,11 +107,6 @@ ENDC = '\033[0m'
 BOLD ='\033[1m'
 UNBOLD ='\033[0m'

-cotenants_map = {
-    "text_encoder": ["vae", "text_encoder_2"],
-    "text_encoder_2": ["vae", "text_encoder"],
-}
-
 class clock:
     def __init__(self):
         self.start_time = 0
@@ -216,15 +222,17 @@ def _get_model(model_path):
     if len(_path)<=1:
         raise("file not found")
     else:
-
-
-
-        if len(_path) > 2:
-            _subfolder = os.path.join(*_path[2:] )
-            model_path = hf_hub_download(repo_id=repoId, filename=_filename, subfolder=_subfolder)
-        else:
-            model_path = hf_hub_download(repo_id=repoId, filename=_filename)
+        try:
+            from huggingface_hub import hf_hub_download #snapshot_download,
+            repoId= os.path.join(*_path[0:2] ).replace("\\", "/")

+            if len(_path) > 2:
+                _subfolder = os.path.join(*_path[2:] )
+                model_path = hf_hub_download(repo_id=repoId, filename=_filename, subfolder=_subfolder)
+            else:
+                model_path = hf_hub_download(repo_id=repoId, filename=_filename)
+        except:
+            model_path = None
     return model_path


@@ -278,9 +286,17 @@ def _force_load_parameter(p):
     torch.utils.swap_tensors(p, q)
     del q

-def
+def _get_tensor_ref(p):
+    if isinstance(p, QTensor):
+        if p._qtype == qint4:
+            return p._data._data.data_ptr()
+        else:
+            return p._data.data_ptr()
+    else:
+        return p.data_ptr()


+def _pin_to_memory(model, model_id, partialPinning = False, verboseLevel = 1):
     if partialPinning:
         towers_names, _ = _detect_main_towers(model)
@@ -292,56 +308,63 @@ def _pin_to_memory(model, model_id, partialPinning = False, verboseLevel = 1):
     tensor_map_indexes = []
     total_tensor_bytes = 0

-
+    params_dict = {} # OrderedDict
     for k, sub_module in model.named_modules():
         include = True
         if partialPinning:
             include = any(k.startswith(pre) for pre in towers_names) if partialPinning else True
         if include:
-
-
+            params_dict.update( { k + '.' + n : (p, False) for n, p in sub_module.named_parameters(recurse=False) } )
+            params_dict.update( { k + '.' + n : (b, True) for n, b in sub_module.named_buffers(recurse=False) } )

     if verboseLevel>=1 :
         if partialPinning:
-            if len(
+            if len(params_dict) == 0:
                 print(f"Unable to apply Partial of '{model_id}' as no isolated main structures were found")
             else:
                 print(f"Partial pinning of data of '{model_id}' to reserved RAM")
         else:
             print(f"Pinning data of '{model_id}' to reserved RAM")

-    if partialPinning and len(
+    if partialPinning and len(params_dict) == 0:
         return

-
-
-    for n, p, _ in
-
-
-
-
+    ref_cache = {}
+    tied_weights = {}
+    for n, (p, _) in params_dict.items():
+        ref = _get_tensor_ref(p)
+        match = ref_cache.get(ref, None)
+        if match != None:
+            match_name, match_size = match
+            if verboseLevel >=1:
+                print(f"Tied weights of {match_size/ONE_MB:0.2f} MB detected: {match_name} <-> {n}")
+            tied_weights[n] = match_name
+        else:
+            if isinstance(p, QTensor):
+                if p._qtype == qint4:
+                    if hasattr(p,"_scale_shift"):
+                        length = torch.numel(p._data._data) * p._data._data.element_size() + torch.numel(p._scale_shift) * p._scale_shift.element_size()
+                    else:
+                        length = torch.numel(p._data._data) * p._data._data.element_size() + torch.numel(p._scale) * p._scale.element_size() + torch.numel(p._shift) * p._shift.element_size()
                 else:
-                    length = torch.numel(p._data
+                    length = torch.numel(p._data) * p._data.element_size() + torch.numel(p._scale) * p._scale.element_size()
             else:
-                length = torch.numel(p.
-            else:
-                length = torch.numel(p.data) * p.data.element_size()
-
-
-        if current_big_tensor_size + length > BIG_TENSOR_MAX_SIZE:
-            big_tensors_sizes.append(current_big_tensor_size)
-            current_big_tensor_size = 0
-            big_tensor_no += 1
+                length = torch.numel(p.data) * p.data.element_size()

+            ref_cache[ref] = (n, length)
+            if current_big_tensor_size + length > BIG_TENSOR_MAX_SIZE:
+                big_tensors_sizes.append(current_big_tensor_size)
+                current_big_tensor_size = 0
+                big_tensor_no += 1

-        itemsize = p.data.dtype.itemsize
-        if current_big_tensor_size % itemsize:
-            current_big_tensor_size += itemsize - current_big_tensor_size % itemsize
-        tensor_map_indexes.append((big_tensor_no, current_big_tensor_size, length ))
-        current_big_tensor_size += length

-
+            itemsize = p.data.dtype.itemsize
+            if current_big_tensor_size % itemsize:
+                current_big_tensor_size += itemsize - current_big_tensor_size % itemsize
+            tensor_map_indexes.append((big_tensor_no, current_big_tensor_size, length ))
+            current_big_tensor_size += length

+        total_tensor_bytes += length

     big_tensors_sizes.append(current_big_tensor_size)

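The tied-weights bookkeeping introduced in this hunk compares the underlying storage pointers of parameters so that shared tensors are pinned (and later saved) only once. Below is a minimal self-contained illustration of that idea, a deliberate simplification of the diff (plain data_ptr() comparison, no quantized tensors), not code from the package:

    import torch

    emb = torch.nn.Embedding(10, 4)
    head = torch.nn.Linear(4, 10, bias=False)
    head.weight = emb.weight                      # classic weight tying: same storage

    seen = {}                                     # storage pointer -> first parameter name
    for name, p in [("emb.weight", emb.weight), ("head.weight", head.weight)]:
        ref = p.data_ptr()
        if ref in seen:
            print(f"tied weights detected: {seen[ref]} <-> {name}")   # handled only once
        else:
            seen[ref] = name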
@@ -368,39 +391,53 @@ def _pin_to_memory(model, model_id, partialPinning = False, verboseLevel = 1)

     tensor_no = 0
     # prev_big_tensor = 0
-    for n, p,
-
-        # if big_tensor_no != prev_big_tensor:
-        #     gc.collect()
-        #     prev_big_tensor = big_tensor_no
-        if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
-            current_big_tensor = big_tensors[big_tensor_no]
-            if is_buffer :
-                _force_load_buffer(p) # otherwise potential memory leak
+    for n, (p, is_buffer) in params_dict.items():
+        if n in tied_weights:
             if isinstance(p, QTensor):
-                if p._qtype == qint4:
-
-
-
-
-
+                if p._qtype == qint4:
+                    assert p._data._data.data.is_pinned()
+                else:
+                    assert p._data.is_pinned()
+            else:
+                assert p.data.is_pinned()
+        else:
+            big_tensor_no, offset, length = tensor_map_indexes[tensor_no]
+            # if big_tensor_no != prev_big_tensor:
+            #     gc.collect()
+            #     prev_big_tensor = big_tensor_no
+            # match_param, match_isbuffer = tied_weights.get(n, (None, False))
+            # if match_param != None:
+
+            if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
+                current_big_tensor = big_tensors[big_tensor_no]
+                if is_buffer :
+                    _force_load_buffer(p) # otherwise potential memory leak
+                if isinstance(p, QTensor):
+                    if p._qtype == qint4:
+                        length1 = torch.numel(p._data._data) * p._data._data.element_size()
+                        p._data._data = _move_to_pinned_tensor(p._data._data, current_big_tensor, offset, length1)
+                        if hasattr(p,"_scale_shift"):
+                            length2 = torch.numel(p._scale_shift) * p._scale_shift.element_size()
+                            p._scale_shift = _move_to_pinned_tensor(p._scale_shift, current_big_tensor, offset + length1, length2)
+                        else:
+                            length2 = torch.numel(p._scale) * p._scale.element_size()
+                            p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
+                            length3 = torch.numel(p._shift) * p._shift.element_size()
+                            p._shift = _move_to_pinned_tensor(p._shift, current_big_tensor, offset + length1 + length2, length3)
                     else:
+                        length1 = torch.numel(p._data) * p._data.element_size()
+                        p._data = _move_to_pinned_tensor(p._data, current_big_tensor, offset, length1)
                         length2 = torch.numel(p._scale) * p._scale.element_size()
                         p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
-                        length3 = torch.numel(p._shift) * p._shift.element_size()
-                        p._shift = _move_to_pinned_tensor(p._shift, current_big_tensor, offset + length1 + length2, length3)
                 else:
-
-                    p.
-                    length2 = torch.numel(p._scale) * p._scale.element_size()
-                    p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
-            else:
-                length = torch.numel(p.data) * p.data.element_size()
-                p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
+                    length = torch.numel(p.data) * p.data.element_size()
+                    p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)

-
+            tensor_no += 1
+            del p
     global total_pinned_bytes
     total_pinned_bytes += total
+    del params_dict
     gc.collect()

     if verboseLevel >=1:
@@ -420,7 +457,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.1.4-
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.1.4-151) by DeepBeepMeep ************{ENDC}{UNBOLD}")

 def _extract_num_from_str(num_in_str):
     size = len(num_in_str)
@@ -518,16 +555,6 @@ def _requantize(model: torch.nn.Module, state_dict: dict, quantization_map: dict

 def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 1000000000, model_id = 'Unknown'):

-    def compute_submodule_size(submodule):
-        size = 0
-        for p in submodule.parameters(recurse=False):
-            size += torch.numel(p.data) * sizeofbfloat16
-
-        for p in submodule.buffers(recurse=False):
-            size += torch.numel(p.data) * sizeofbfloat16
-
-        return size
-
     total_size =0
     total_excluded = 0
     exclude_list = []
@@ -549,16 +576,31 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
     tower_names ,_ = _detect_main_towers(model_to_quantize)
     tower_names = [ n[:-1] for n in tower_names]

+
+    cache_ref = {}
+    tied_weights= {}
+
     for submodule_name, submodule in model_to_quantize.named_modules():
         if isinstance(submodule, QModuleMixin):
             if verboseLevel>=1:
                 print("No quantization to do as model is already quantized")
             return False

-
-
+        size = 0
+        for n, p in submodule.named_parameters(recurse = False):
+            ref = _get_tensor_ref(p)
+            match = cache_ref.get(ref, None)
+            if match != None:
+                tied_weights[submodule_name]= (n, ) + match
+            else:
+                cache_ref[ref] = (submodule_name, n)
+                size += torch.numel(p.data) * sizeofbfloat16
+
+        for p in submodule.buffers(recurse=False):
+            size += torch.numel(p.data) * sizeofbfloat16
+
+

-        size = compute_submodule_size(submodule)
         if not any(submodule_name.startswith(pre) for pre in tower_names):
             flush = False
             if isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
@@ -590,12 +632,13 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
                 submodule_names.append(submodule_name)
                 total_size += size

-        if submodule_size >
+        if submodule_size >0 and submodule_size <= threshold :
             exclude_list += submodule_names
             if verboseLevel >=2:
                 print(f"Excluded size {submodule_size/ONE_MB:.1f} MB: {prev_blocks_prefix} : {submodule_names}")
             total_excluded += submodule_size

+
     perc_excluded =total_excluded/ total_size if total_size >0 else 1
     if verboseLevel >=2:
         if total_excluded == 0:
@@ -608,7 +651,10 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
         exclude_list = None


-
+    exclude_list += list(tied_weights)
+    quantize(model_to_quantize, weights= weights, exclude= exclude_list)
+
+
     # quantize(model_to_quantize,weights, include= [ "*1.block.attn.to_out*"]) #"

     # for name, m in model_to_quantize.named_modules():
@@ -618,24 +664,40 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10

     # force to read non quantized parameters so that their lazy tensors and corresponding mmap are released
     # otherwise we may end up keeping in memory both the quantized and the non quantize model
-    for n,m in model_to_quantize.named_modules()
+    named_modules = {n:m for n,m in model_to_quantize.named_modules()}
+    for module_name, module in named_modules.items():
         # do not read quantized weights (detected them directly or behind an adapter)
-        if isinstance(
-            if hasattr(
-                _force_load_parameter(
+        if isinstance(module, QModuleMixin) or hasattr(module, "base_layer") and isinstance(module.base_layer, QModuleMixin):
+            if hasattr(module, "bias") and module.bias is not None:
+                _force_load_parameter(module.bias)
         else:
-
-
-
-
+            tied_w = tied_weights.get(module_name, None)
+            for n, p in module.named_parameters(recurse = False):
+                if tied_w != None and n == tied_w[0]:
+                    if isinstance( named_modules[tied_w[1]], QModuleMixin) :
+                        setattr(module, n, None) # release refs of tied weights if source is going to be quantized
+                    # otherwise don't force load as it will be loaded in the source anyway
+                else:
+                    _force_load_parameter(p)
+                del p # del p if not it will still contain a ref to a tensor when leaving the loop
+        for b in module.buffers(recurse = False):
             _force_load_buffer(b)
-
+            del b


     freeze(model_to_quantize)
     torch.cuda.empty_cache()
-    gc.collect()
+    gc.collect()
+
+    for tied_module, (tied_weight, src_module, src_weight) in tied_weights.items():
+        p = getattr(named_modules[src_module], src_weight)
+        if isinstance(p, QTensor):
+            setattr(named_modules[tied_module], tied_weight, p ) # copy refs to quantized sources
+
+    del named_modules
+
     quantization_map = _quantization_map(model_to_quantize)
+
     model_to_quantize._quanto_map = quantization_map

     if hasattr(model_to_quantize, "_already_pinned"):
@@ -647,12 +709,85 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10

     return True

+def _lora_linear_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+    self._check_forward_args(x, *args, **kwargs)
+    adapter_names = kwargs.pop("adapter_names", None)
+    if self.disable_adapters:
+        if self.merged:
+            self.unmerge()
+        result = self.base_layer(x, *args, **kwargs)
+    elif adapter_names is not None:
+        result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs)
+    elif self.merged:
+        result = self.base_layer(x, *args, **kwargs)
+    else:
+        base_weight = self.base_layer.weight
+        if base_weight.shape[-1] < x.shape[-2]: # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
+            for active_adapter in self.active_adapters:
+                if active_adapter not in self.lora_A.keys():
+                    continue
+                if self.use_dora[active_adapter]:
+                    raise Exception("Dora not yet supported by mmgp")
+                lora_A = self.lora_A[active_adapter]
+                lora_B = self.lora_B[active_adapter]
+                scaling = self.scaling[active_adapter]
+                lora_A_weight = lora_A.weight
+                lora_B_weight = lora_B.weight
+                lora_BA = lora_B_weight @ lora_A_weight
+                base_weight += scaling * lora_BA
+
+            result = torch.nn.functional.linear(x, base_weight, bias=self.base_layer.bias)
+            torch_result_dtype = result.dtype
+
+        else:
+            result = self.base_layer(x, *args, **kwargs)
+            torch_result_dtype = result.dtype
+            x = x.to(torch.bfloat16)
+
+            for active_adapter in self.active_adapters:
+                if active_adapter not in self.lora_A.keys():
+                    continue
+                lora_A = self.lora_A[active_adapter]
+                lora_B = self.lora_B[active_adapter]
+                dropout = self.lora_dropout[active_adapter]
+                scaling = self.scaling[active_adapter]
+                x = x.to(lora_A.weight.dtype)
+
+                if not self.use_dora[active_adapter]:
+                    y = lora_A(x)
+                    y = lora_B(y)
+                    y*= scaling
+                    result+= y
+                    del lora_A, lora_B, y
+                    # result = result + lora_B(lora_A(dropout(x))) * scaling
+                else:
+                    if isinstance(dropout, nn.Identity) or not self.training:
+                        base_result = result
+                    else:
+                        x = dropout(x)
+                        base_result = None
+
+                    result = result + self.lora_magnitude_vector[active_adapter](
+                        x,
+                        lora_A=lora_A,
+                        lora_B=lora_B,
+                        scaling=scaling,
+                        base_layer=self.get_base_layer(),
+                        base_result=base_result,
+                    )
+
+    result = result.to(torch_result_dtype)
+    return result
+
 def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, verboseLevel = -1,):
     verboseLevel = _compute_verbose_level(verboseLevel)

     if inject_adapter_in_model == None or set_weights_and_activate_adapters == None or get_peft_kwargs == None:
         raise Exception("Unable to load Lora, missing 'peft' and / or 'diffusers' modules")
-
+
+    from peft.tuners.lora import Linear
+    Linear.forward = _lora_linear_forward
+
     if not isinstance(lora_path, list):
         lora_path = [lora_path]

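The branch of _lora_linear_forward above that folds the LoRA matrices into the base weight relies on a simple identity: x @ W.T + s * (x @ A.T) @ B.T equals x @ (W + s * (B @ A)).T, so one merged matmul replaces two per-adapter matmuls when the input is large. A small self-contained check of that equivalence (toy shapes and values, not package code):

    import torch

    x = torch.randn(8, 16)           # (tokens, in_features)
    W = torch.randn(32, 16)          # base weight
    A = torch.randn(4, 16)           # lora_A weight (rank 4)
    B = torch.randn(32, 4)           # lora_B weight
    s = 0.5                          # scaling

    y1 = x @ W.T + s * (x @ A.T) @ B.T        # apply adapters separately
    y2 = x @ (W + s * (B @ A)).T              # fold adapters into the base weight
    assert torch.allclose(y1, y2, atol=1e-4)  # same result, fewer large matmuls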
@@ -662,6 +797,9 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
     for i, path in enumerate(lora_path):
         adapter_name = str(i)

+
+
+
         state_dict = safetensors2.torch_load_file(path)

         keys = list(state_dict.keys())
@@ -843,7 +981,6 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
     verboseLevel = _compute_verbose_level(verboseLevel)

     model = _remove_model_wrapper(model)
-
     if not (".safetensors" in file_path or ".sft" in file_path):
         if pinToMemory:
             raise Exception("Pinning to memory while loading only supported for safe tensors files")
@@ -855,12 +992,20 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType

     if metadata is None:
         quantization_map = None
+        tied_weights_map = None
     else:
         quantization_map = metadata.get("quantization_map", None)
         config = metadata.get("config", None)
         if config is not None:
             model._config = config

+        tied_weights_map = metadata.get("tied_weights_map", None)
+        if tied_weights_map != None:
+            for name, tied_weights_list in tied_weights_map.items():
+                mapped_weight = state_dict[name]
+                for tied_weights in tied_weights_list:
+                    state_dict[tied_weights] = mapped_weight
+


     if quantization_map is None:
@@ -915,6 +1060,7 @@ def save_model(model, file_path, do_quantize = False, quantizationType = qint8,
     """

     config = None
+    extra_meta = None
     verboseLevel = _compute_verbose_level(verboseLevel)
     if config_file_path !=None:
         with open(config_file_path, "r", encoding="utf-8") as reader:
@@ -928,8 +1074,10 @@ def save_model(model, file_path, do_quantize = False, quantizationType = qint8,
         config_path = getattr(config_obj,"_name_or_path", None)
         if config_path != None:
             config_fullpath = os.path.join(config_path, "config.json")
-
-
+            config_fullpath = _get_model(config_fullpath)
+
+            # if not os.path.isfile(config_fullpath):
+            #     config_fullpath = None
         if config_fullpath is None:
             config_fullpath = os.path.join(os.path.dirname(file_path), "config.json")
             if os.path.isfile(config_fullpath):
@@ -942,15 +1090,50 @@ def save_model(model, file_path, do_quantize = False, quantizationType = qint8,

     quantization_map = getattr(model, "_quanto_map", None)

+    from collections import OrderedDict
+
+    cache_ref = {}
+    tied_weights_map = {}
+    sd = model.state_dict()
+    out_sd = OrderedDict()
+
+
+    for name, weight in sd.items():
+        ref = _get_tensor_ref(weight)
+        match = cache_ref.get(ref, None)
+        if match != None:
+            tied_list = tied_weights_map.get(match, [])
+            tied_list.append(name)
+            tied_weights_map[match] = tied_list
+        else:
+            out_sd[name] = weight
+            cache_ref[ref] = name
+
+    if len(tied_weights_map) > 0:
+        extra_meta = { "tied_weights_map" : tied_weights_map }
+
     if verboseLevel >=1:
         print(f"Saving file '{file_path}")
-
+
+    safetensors2.torch_write_file(out_sd, file_path , quantization_map = quantization_map, config = config, extra_meta= extra_meta)
     if verboseLevel >=1:
         print(f"File '{file_path}' saved")


-def extract_models(
+def extract_models(obj = None, prefix = None):
+    if isinstance(obj, str): # for compatibility as the two args were switched
+        bkp = prefix
+        prefix = obj
+        obj = bkp
+
     pipe = {}
+    if obj == None:
+        raise Exception("an object to analyze must be provided")
+    if prefix==None or len(prefix)==0:
+        prefix = ""
+    elif prefix[ -1:] != "/":
+        prefix + "/"
+
     for name in dir(obj):
         element = getattr(obj,name)
         if name in ("pipeline", "pipe"):
@@ -958,16 +1141,16 @@ def extract_models(prefix, obj):
             if hasattr(pipeline , "components") and isinstance(pipeline.components, dict):
                 for k, model in pipeline.components.items():
                     if model != None:
-                        pipe[prefix +
-        elif isinstance(element, torch.nn.Module):
-            if prefix
-                pipe[prefix
+                        pipe[prefix + k ] = model
+        elif isinstance(element, torch.nn.Module) and name!="base_model":
+            if prefix + name in pipe:
+                pipe[prefix + "_" + name ] = element
             else:
-                pipe[prefix
+                pipe[prefix + name ] = element
         elif isinstance(element, dict):
             for k, element in element.items():
                 if hasattr(element , "pipeline"):
-                    pipe.update( extract_models(prefix +
+                    pipe.update( extract_models(prefix + k,element ))

     return pipe
@@ -989,6 +1172,10 @@ class offload:
         self.active_models_ids = []
         self.active_subcaches = {}
         self.models = {}
+        self.cotenants_map = {
+            "text_encoder": ["vae", "text_encoder_2"],
+            "text_encoder_2": ["vae", "text_encoder"],
+        }
         self.verboseLevel = 0
         self.blocks_of_modules = {}
         self.blocks_of_modules_sizes = {}
@@ -1002,14 +1189,13 @@ class offload:
         self.default_stream = torch.cuda.default_stream(torch.device("cuda")) # torch.cuda.current_stream()
        self.transfer_stream = torch.cuda.Stream()
         self.async_transfers = False
+        self.parameters_ref = {}
         global last_offload_obj
         last_offload_obj = self


-    def add_module_to_blocks(self, model_id, blocks_name, submodule, prev_block_name):
+    def add_module_to_blocks(self, model_id, blocks_name, submodule, prev_block_name, submodule_name):

-        if blocks_name is None:
-            pass
         entry_name = model_id if blocks_name is None else model_id + "/" + blocks_name
         if entry_name in self.blocks_of_modules:
             blocks_params = self.blocks_of_modules[entry_name]
@@ -1023,39 +1209,54 @@ class offload:
             self.prev_blocks_names[entry_name] = prev_entry_name
             if not prev_block_name == None:
                 self.next_blocks_names[prev_entry_name] = entry_name
-
+        bef = blocks_params_size
        for k,p in submodule.named_parameters(recurse=False):
+            param_size = 0
+            ref = _get_tensor_ref(p)
+            tied_param = self.parameters_ref.get(ref, None)

             if isinstance(p, QTensor):
-                blocks_params.append( (submodule, k, p, False ) )
+                blocks_params.append( (submodule, k, p, False, tied_param ) )

                 if p._qtype == qint4:
                     if hasattr(p,"_scale_shift"):
-
-
+                        param_size += torch.numel(p._scale_shift) * p._scale_shift.element_size()
+                        param_size += torch.numel(p._data._data) * p._data._data.element_size()
                     else:
-
-
-
+                        param_size += torch.numel(p._scale) * p._scale.element_size()
+                        param_size += torch.numel(p._shift) * p._shift.element_size()
+                        param_size += torch.numel(p._data._data) * p._data._data.element_size()
                 else:
-
-
+                    param_size += torch.numel(p._scale) * p._scale.element_size()
+                    param_size += torch.numel(p._data) * p._data.element_size()
             else:
-                blocks_params.append( (submodule, k, p, False) )
-
+                blocks_params.append( (submodule, k, p, False, tied_param) )
+                param_size += torch.numel(p.data) * p.data.element_size()
+
+
+            if tied_param == None:
+                blocks_params_size += param_size
+                self.parameters_ref[ref] = (submodule, k)

         for k, p in submodule.named_buffers(recurse=False):
-            blocks_params.append( (submodule, k, p, True) )
+            blocks_params.append( (submodule, k, p, True, None) )
             blocks_params_size += p.data.nbytes

+        aft = blocks_params_size
+
+        # if blocks_name is None:
+        #     print(f"Default: {model_id}/{submodule_name} : {(aft-bef)/ONE_MB:0.2f} MB")
+        #     pass
+

         self.blocks_of_modules_sizes[entry_name] = blocks_params_size

+
        return blocks_params_size


     def can_model_be_cotenant(self, model_id):
-        potential_cotenants= cotenants_map.get(model_id, None)
+        potential_cotenants= self.cotenants_map.get(model_id, None)
         if potential_cotenants is None:
             return False
         for existing_cotenant in self.active_models_ids:
@@ -1073,20 +1274,23 @@ class offload:
        def cpu_to_gpu(stream_to_use, blocks_params): #, record_for_stream = None
             with torch.cuda.stream(stream_to_use):
                 for param in blocks_params:
-                    parent_module, n, p, is_buffer = param
+                    parent_module, n, p, is_buffer, tied_param = param
+                    if tied_param != None:
+                        tied_p = getattr( tied_param[0], tied_param[1])
+                        if tied_p.is_cuda:
+                            setattr(parent_module, n , tied_p)
+                            continue
+
                     q = p.to("cuda", non_blocking=True)
                     if is_buffer:
                         q = torch.nn.Buffer(q)
                     else:
                         q = torch.nn.Parameter(q , requires_grad=False)
                     setattr(parent_module, n , q)
-                    # if record_for_stream != None:
-                    #     if isinstance(p, QTensor):
-                    #         q._data.record_stream(record_for_stream)
-                    #         q._scale.record_stream(record_for_stream)
-                    #     else:
-                    #         p.data.record_stream(record_for_stream)

+                    if tied_param != None:
+                        setattr( tied_param[0], tied_param[1], q)
+                    del p, q
         any_past_block = False

         loaded_block = self.loaded_blocks[model_id]
@@ -1108,24 +1312,24 @@ class offload:
             first = self.prev_blocks_names[entry_name] == None or not any_past_block
             next_blocks_entry = self.next_blocks_names[entry_name] if entry_name in self.next_blocks_names else None
             if first:
-                cpu_to_gpu(torch.cuda.current_stream(), self.blocks_of_modules[entry_name])
                 if self.verboseLevel >=2:
                     if preload:
                         print(f"Preloading model {entry_name} ({model_name}) in GPU")
                     else:
                         print(f"Loading model {entry_name} ({model_name}) in GPU")
+                cpu_to_gpu(torch.cuda.current_stream(), self.blocks_of_modules[entry_name])

             torch.cuda.synchronize()

             if next_blocks_entry != None:
-                cpu_to_gpu(self.transfer_stream, self.blocks_of_modules[next_blocks_entry]) #, self.default_stream
                 if self.verboseLevel >=2:
                     print(f"Prefetching model {next_blocks_entry} ({model_name}) in GPU")
+                cpu_to_gpu(self.transfer_stream, self.blocks_of_modules[next_blocks_entry]) #, self.default_stream

         else:
-            cpu_to_gpu(self.default_stream, self.blocks_of_modules[entry_name])
             if self.verboseLevel >=2:
                 print(f"Loading model {entry_name} ({model_name}) in GPU")
+            cpu_to_gpu(self.default_stream, self.blocks_of_modules[entry_name])
             torch.cuda.synchronize()

         if not preload:
@@ -1149,12 +1353,13 @@ class offload:

         blocks_params = self.blocks_of_modules[blocks_name]
         for param in blocks_params:
-            parent_module, n, p, is_buffer = param
+            parent_module, n, p, is_buffer, _ = param
             if is_buffer:
                 q = torch.nn.Buffer(p)
             else:
                 q = torch.nn.Parameter(p , requires_grad=False)
             setattr(parent_module, n , q)
+            del p, q
         # cl.stop()
         # print(f"unload time: {cl.format_time_gap()}")

@@ -1168,9 +1373,6 @@ class offload:
         for block_name in self.preloaded_blocks_per_model[model_id]:
             self.gpu_load_blocks(model_id, block_name, True)

-
-        # torch.cuda.current_stream().synchronize()
-
     def unload_all(self):
         for model_id in self.active_models_ids:
             self.gpu_unload_blocks(model_id, None)
@@ -1246,6 +1448,16 @@ class offload:

         return False

+    def ensure_model_loaded(self, model_id):
+        if model_id in self.active_models_ids:
+            return
+        # new_model_id = getattr(module, "_mm_id")
+        # do not always unload existing models if it is more efficient to keep in them in the GPU
+        # (e.g: small modules whose calls are text encoders)
+        if not self.can_model_be_cotenant(model_id) :
+            self.unload_all()
+        self.gpu_load(model_id)
+
     def hook_preload_blocks_for_compilation(self, target_module, model_id,blocks_name, context):

         # @torch.compiler.disable()
@@ -1259,16 +1471,27 @@ class offload:
         target_module.register_forward_pre_hook(preload_blocks_for_compile)


-    def hook_check_empty_cache_needed(self, target_module, model_id,blocks_name, previous_method, context):
+    def hook_check_empty_cache_needed(self, target_module, model_id, blocks_name, previous_method, context):

         qint4quantization = isinstance(target_module, QModuleMixin) and target_module.weight!= None and target_module.weight.qtype == qint4
         if qint4quantization:
             pass

-
-        #
-
+        if hasattr(target_module, "_mm_id"):
+            # no hook for a shared module with no weights (otherwise this will cause models loading / unloading for nothing)
+            orig_model_id = getattr(target_module, "_mm_id")
+            if self.verboseLevel >=2:
+                print(f"Model '{model_id}' shares module '{target_module._get_name()}' with module(s) '{orig_model_id}' ")
+            assert not self.any_param_or_buffer(target_module)
+            if not isinstance(orig_model_id, list):
+                orig_model_id = [orig_model_id]
+            orig_model_id.append(model_id)
+            setattr(target_module, "_mm_id", orig_model_id)
+            target_module.forward = target_module._mm_forward
+            return

+        def check_empty_cuda_cache(module, *args, **kwargs):
+            self.ensure_model_loaded(model_id)
            if blocks_name == None:
                 if self.ready_to_check_mem():
                     self.empty_cache_if_needed()
@@ -1279,34 +1502,18 @@ class offload:

            return previous_method(*args, **kwargs)

-
-        if hasattr(target_module, "_mm_id"):
-            orig_model_id = getattr(target_module, "_mm_id")
-            if self.verboseLevel >=2:
-                print(f"Model '{model_id}' shares module '{target_module._get_name()}' with module '{orig_model_id}' ")
-            assert not self.any_param_or_buffer(target_module)
-
-            return
         setattr(target_module, "_mm_id", model_id)
+        setattr(target_module, "_mm_forward", previous_method)
+
         setattr(target_module, "forward", functools.update_wrapper(functools.partial(check_empty_cuda_cache, target_module), previous_method) )


     def hook_change_module(self, target_module, model, model_id, module_id, previous_method):
-
-
-
-            new_model_id = getattr(module, "_mm_id")
-            # do not always unload existing models if it is more efficient to keep in them in the GPU
-            # (e.g: small modules whose calls are text encoders)
-            if not self.can_model_be_cotenant(new_model_id) :
-                self.unload_all()
-                performEmptyCacheTest = False
-            self.gpu_load(new_model_id)
+
+        def check_change_module(module, *args, **kwargs):
+            self.ensure_model_loaded(model_id)
             # transfer leftovers inputs that were incorrectly created in the RAM (mostly due to some .device tests that returned incorrectly "cpu")
             args, kwargs = self.move_args_to_gpu(*args, **kwargs)
-            if performEmptyCacheTest:
-                self.empty_cache_if_needed()
-
             return previous_method(*args, **kwargs)

         if hasattr(target_module, "_mm_id"):
@@ -1337,6 +1544,8 @@ class offload:
             base_size = self.blocks_of_modules_sizes[model_id]
             current_budget -= base_size
             if current_budget <= 0:
+                if self.verboseLevel >=1:
+                    print(f"Async loading plan for model '{model_id}' : due to limited budget, beside the async shuttle only only base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
                 return

             towers = []
@@ -1357,6 +1566,8 @@ class offload:
                 total_size += tower_size
             current_budget -= 2 * max_floor_size
             if current_budget <= 0:
+                if self.verboseLevel >=1:
+                    print(f"Async loading plan for model '{model_id}' : due to limited budget, beside the async shuttle only the base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
                 return

@@ -1366,6 +1577,8 @@ class offload:
             preload_total += preload_blocks_count * max_floor_size
             max_blocks_fetch = max(max_floor_size, max_blocks_fetch)
             if preload_blocks_count <= 0:
+                if self.verboseLevel >=1:
+                    print(f"Async loading plan for model '{model_id}' : due to limited budget, beside the async shuttle only the base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
                 return

             nb_blocks= len(floors)
@@ -1396,11 +1609,11 @@ class offload:

         self.preloaded_blocks_per_model[model_id] = preloaded_blocks

-        if self.verboseLevel >=
-            print(f"Async loading plan for model '{model_id}' : {preload_total/ONE_MB:0.2f} MB will be preloaded ({preload_total/total_size*100:0.1f}% of recurrent layers data) with a {max_blocks_fetch/ONE_MB:0.2f} MB async shuttle")
+        if self.verboseLevel >=1:
+            print(f"Async loading plan for model '{model_id}' : {(preload_total+base_size)/ONE_MB:0.2f} MB will be preloaded (base size of {base_size/ONE_MB:0.2f} MB + {preload_total/total_size*100:0.1f}% of recurrent layers data) with a {max_blocks_fetch/ONE_MB:0.2f} MB async" + (" circular" if len(towers) == 1 else "") + " shuttle")


-    def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, verboseLevel = -1):
+    def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
         """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
         pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
         quantizeTransformer: set True by default will quantize on the fly the video / image model
@@ -1417,9 +1630,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
         model_budgets = {}

         windows_os = os.name == 'nt'
-        global total_pinned_bytes

-
         budget = 0
         if not budgets is None:
             if isinstance(budgets , dict):
@@ -1448,6 +1659,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
         verboseLevel = _compute_verbose_level(verboseLevel)

         _welcome()
+        if coTenantsMap != None:
+            self.cotenants_map = coTenantsMap

         self.models = models

@@ -1528,9 +1741,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
                 current_model_size += torch.numel(p.data) * p.data.element_size()

             for b in current_model.buffers():
-
-                # convert any left overs float32 weight to bloat16 to divide by 2 the model memory footprint
-                b.data = b.data.to(torch.bfloat16)
+                # do not convert 32 bits float to 16 bits since buffers are few (and potential gain low) and usually they are needed for precision calculation (for instance Rope)
                 current_model_size += torch.numel(b.data) * b.data.element_size()

             if modelPinned:
@@ -1538,17 +1749,39 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru


             model_budget = model_budgets[model_id] * ONE_MB if model_id in model_budgets else budget
-
+            if workingVRAM != None:
+                model_minimumVRAM = -1
+                if isinstance(workingVRAM, dict):
+                    if model_id in workingVRAM:
+                        model_minimumVRAM = workingVRAM[model_id]
+                    elif "*" in model_id in workingVRAM:
+                        model_minimumVRAM = workingVRAM["*"]
+                else:
+                    model_minimumVRAM = workingVRAM
+                if model_minimumVRAM > 0:
+                    new_budget = self.device_mem_capacity - model_minimumVRAM * ONE_MB
+                    new_budget = 1 if new_budget < 0 else new_budget
+                    model_budget = new_budget if model_budget == 0 or new_budget < model_budget else model_budget
             if model_budget > 0 and model_budget > current_model_size:
                 model_budget = 0
+            coef =0.8
+            if current_model_size > coef * self.device_mem_capacity and model_budget == 0 or model_budget > coef * self.device_mem_capacity:
+                if verboseLevel >= 1:
+                    if model_budget == 0:
+                        print(f"Model '{model_id}' is too large ({current_model_size/ONE_MB:0.1f} MB) to fit entirely in {coef * 100}% of the VRAM (max capacity is {coef * self.device_mem_capacity/ONE_MB}) MB)")
+                    else:
+                        print(f"Budget ({budget/ONE_MB:0.1f} MB) for Model '{model_id}' is too important so that this model can fit in the VRAM (max capacity is {self.device_mem_capacity/ONE_MB}) MB)")
+                    print(f"Budget allocation for this model has been consequently reduced to the 80% of max GPU Memory ({coef * self.device_mem_capacity/ONE_MB:0.1f} MB). This may not leave enough working VRAM and you will probably need to define manually a lower budget for this model.")
+                model_budget = coef * self.device_mem_capacity
+

-            model_budgets[model_id] = model_budget
+            model_budgets[model_id] = model_budget

         partialPinning = False

         if estimatesBytesToPin > 0 and estimatesBytesToPin >= (max_reservable_memory - total_pinned_bytes):
             if self.verboseLevel >=1:
-                print(f"Switching to partial pinning since full requirements for pinned models is {estimatesBytesToPin/ONE_MB:0.1f} MB while estimated reservable RAM is {max_reservable_memory/ONE_MB:0.1f} MB. You may increase the value of parameter 'perc_reserved_mem_max' to a value higher than {perc_reserved_mem_max:0.2f} to force full pinnning." )
+                print(f"Switching to partial pinning since full requirements for pinned models is {estimatesBytesToPin/ONE_MB:0.1f} MB while estimated available reservable RAM is {(max_reservable_memory-total_pinned_bytes)/ONE_MB:0.1f} MB. You may increase the value of parameter 'perc_reserved_mem_max' to a value higher than {perc_reserved_mem_max:0.2f} to force full pinnning." )
             partialPinning = True

         # Hook forward methods of modules
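The workingVRAM budget rule added in the hunk above can be illustrated with assumed numbers (a 24 GB card and an 8 GB working-VRAM request). Units follow what the diff shows: device capacity in bytes, workingVRAM and budgets in MB. This is only an arithmetic sketch, not package code:

    ONE_MB = 1024 * 1024
    device_mem_capacity = 24 * 1024 * ONE_MB      # assumed 24 GB GPU, in bytes
    model_minimumVRAM = 8 * 1024                  # ask to keep 8 GB free for activations (MB)
    model_budget = 0                              # no explicit per-model budget set

    new_budget = device_mem_capacity - model_minimumVRAM * ONE_MB
    new_budget = 1 if new_budget < 0 else new_budget
    model_budget = new_budget if model_budget == 0 or new_budget < model_budget else model_budget
    print(model_budget / ONE_MB)                  # 16384.0 MB left for preloaded weights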
@@ -1577,15 +1810,14 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
                 _pin_to_memory(current_model, model_id, partialPinning= partialPinning, verboseLevel=verboseLevel)

             current_budget = model_budgets[model_id]
-            cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq = None, None, None, -1
+            cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False
             self.loaded_blocks[model_id] = None

             for submodule_name, submodule in current_model.named_modules():
                 # create a fake 'accelerate' parameter so that the _execution_device property returns always "cuda"
                 # (it is queried in many pipelines even if offloading is not properly implemented)
-                if
+                if not hasattr(submodule, "_hf_hook"):
                     setattr(submodule, "_hf_hook", HfHook())
-
                 if current_budget > 0 and len(submodule_name) > 0:
                     if cur_blocks_prefix != None:
                         if submodule_name.startswith(cur_blocks_prefix):
@@ -1593,20 +1825,20 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
                             depth_name = submodule_name.split(".")
                             level = depth_name[len(depth_prefix)-1]
                             pre , num = _extract_num_from_str(level)
-                            if num != cur_blocks_seq
+                            if num != cur_blocks_seq and not (is_mod_seq and cur_blocks_seq>=0):
                                 prev_blocks_name = cur_blocks_name
                                 cur_blocks_name = cur_blocks_prefix + str(num)
                                 # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
                             cur_blocks_seq = num
                         else:
-                            cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq = None, None, None, -1
+                            cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False

                     if cur_blocks_prefix == None:
                         pre , num = _extract_num_from_str(submodule_name)
                         if isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
-                            cur_blocks_prefix, prev_blocks_name, cur_blocks_seq = pre + ".", None, -1
+                            cur_blocks_prefix, prev_blocks_name, cur_blocks_seq, is_mod_seq = pre + ".", None, -1, isinstance(submodule, torch.nn.Sequential)
                         elif num >=0:
-                            cur_blocks_prefix, prev_blocks_name, cur_blocks_seq = pre, None, num
+                            cur_blocks_prefix, prev_blocks_name, cur_blocks_seq, is_mod_seq = pre, None, num, False
                             cur_blocks_name = submodule_name
                             # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")

@@ -1621,7 +1853,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
                 else:
                     self.hook_check_empty_cache_needed(submodule, model_id, cur_blocks_name, submodule_method, context = submodule_name )

-                self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name)
+                self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name, submodule_name)

             self.tune_preloading(model_id, current_budget, towers_names)

@@ -1635,9 +1867,10 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
                 elif prev_num - start_num <=1:
                     print(f"Size of submodel '{n+ str(start_num)}': {prev_size/ONE_MB:.1f} MB")
                 else:
-                    print(f"Size of submodel '{n+ str(start_num) +'-'+ str(prev_num)}': {prev_size/ONE_MB:.1f} MB")
+                    print(f"Size of submodel '{n+ str(start_num) +'-'+ str(prev_num)}': {(prev_num-start_num+1)*prev_size/ONE_MB:.1f} MB ({prev_size/ONE_MB:.1f} MB x {prev_num-start_num+1})")

             for n, size in self.blocks_of_modules_sizes.items():
+                size = int(size / 10000)* 10000
                 pre, num = _extract_num_from_str(n) if "/" in n else (n, -1)
                 if prev_pre == None :
                     start_num = num
@@ -1709,21 +1942,21 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
     if profile_no == profile_type.HighRAM_HighVRAM:
         pinnedMemory= True
         budgets = None
-        info = "You have chosen a profile that
+        info = "You have chosen a profile that may require 48 GB of RAM and up to 24 GB of VRAM on some applications."
     elif profile_no == profile_type.HighRAM_LowVRAM:
         pinnedMemory= True
         budgets["*"] = 3000
-        info = "You have chosen a profile that
+        info = "You have chosen a profile that may require 48 GB of RAM and up to 12 GB of VRAM on some applications."
     elif profile_no == profile_type.LowRAM_HighVRAM:
         pinnedMemory= "transformer"
         extraModelsToQuantize = default_extraModelsToQuantize
         budgets = None
-        info = "You have chosen a Medium speed profile that
+        info = "You have chosen a Medium speed profile that may require 32 GB of RAM and up to 24 GB of VRAM on some applications."
     elif profile_no == profile_type.LowRAM_LowVRAM:
         pinnedMemory= "transformer"
         extraModelsToQuantize = default_extraModelsToQuantize
         budgets["*"] = 3000
-        info = "You have chosen a profile that
+        info = "You have chosen a profile that usually may require 32 GB of RAM and up to 12 GB of VRAM on some applications."
     elif profile_no == profile_type.VerylowRAM_LowVRAM:
         pinnedMemory= False
         extraModelsToQuantize = default_extraModelsToQuantize
@@ -1731,9 +1964,10 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
         if "transformer" in modules:
             budgets["transformer"] = 400
         #asyncTransfers = False
-        info = "You have chosen the slowest profile that
+        info = "You have chosen the slowest profile that may require 24 GB of RAM and up to 10 GB of VRAM on some applications."
     else:
         raise Exception("Unknown profile")
+    info += " Actual requirements may varry depending on the application or on the tuning done to the profile."

     if budgets != None and len(budgets) == 0:
         budgets = None
mmgp/safetensors2.py CHANGED
@@ -146,7 +146,7 @@ def _read_safetensors_header(path, file):
     return catalog, metadata, length_of_header + 8


-def torch_write_file(sd, file_path, quantization_map = None, config = None):
+def torch_write_file(sd, file_path, quantization_map = None, config = None, extra_meta = None):
     from collections import OrderedDict
     sf_sd = OrderedDict()

@@ -189,6 +189,14 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):
     if not config is None:
         metadata["config_base64"] = base64.b64encode(json.dumps(config, ensure_ascii=False).encode('utf8')).decode('utf8')

+    if not extra_meta is None:
+        for n , m in extra_meta.items():
+            if isinstance(m, str):
+                metadata[n] = m
+            else:
+                metadata[n + "_base64"] = base64.b64encode(json.dumps(m, ensure_ascii=False).encode('utf8')).decode('utf8')
+
+
     if len(metadata) > 0:
         sf_sd["__metadata__"] = metadata
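The new extra_meta parameter shown above stores string values verbatim in the safetensors header and JSON + base64-encodes everything else under a `<name>_base64` key. A minimal usage sketch with hypothetical tensor and metadata values (only the function name and its parameters come from the diff):

```python
# Sketch only (hypothetical values): saving a state dict with extra metadata
# through the extra_meta parameter added in this diff. Plain strings end up
# verbatim in the safetensors metadata; other objects are JSON-serialized and
# base64-encoded under "<name>_base64".
import torch
from mmgp import safetensors2

sd = {"linear.weight": torch.zeros(4, 4)}          # hypothetical state dict
extra_meta = {
    "author": "deepbeepmeep",                      # stored as metadata["author"]
    "training_args": {"lr": 1e-4, "steps": 1000},  # stored as metadata["training_args_base64"]
}
safetensors2.torch_write_file(sd, "model.safetensors", extra_meta=extra_meta)
```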
@@ -443,6 +451,4 @@ try:
     transformers.modeling_utils.safe_open = safe_open
     transformers.modeling_utils.safe_load_file = torch_load_file
 except:
-    pass
-
-
+    pass
mmgp-3.1.4.post1519.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: mmgp
-Version: 3.1.4.post15
+Version: 3.1.4.post1519
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -17,7 +17,7 @@ Requires-Dist: peft


 <p align="center">
-<H2>Memory Management 3.1.4 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.1.4-151 for the GPU Poor by DeepBeepMeep</H2>
 </p>

@@ -44,21 +44,23 @@ Each profile may use a combination of the following:

 ## Sample applications that use mmgp
 It is recommended to have a look at these applications to see how mmgp was implemented in each of them:
-- Hunyuan3D-2GP: https://github.com/deepbeepmeep/Hunyuan3D-2GP
+- Hunyuan3D-2GP: https://github.com/deepbeepmeep/Hunyuan3D-2GP :\
 A great image to 3D and text to 3D tool by the Tencent team. Thanks to mmgp it can run with less than 6 GB of VRAM

-- HunyuanVideoGP: https://github.com/deepbeepmeep/HunyuanVideoGP
+- HunyuanVideoGP: https://github.com/deepbeepmeep/HunyuanVideoGP :\
 One of the best open source Text to Video generators

-- FluxFillGP: https://github.com/deepbeepmeep/FluxFillGP
+- FluxFillGP: https://github.com/deepbeepmeep/FluxFillGP :\
 One of the best inpainting / outpainting tools based on Flux that can run with less than 12 GB of VRAM.

-- Cosmos1GP: https://github.com/deepbeepmeep/Cosmos1GP
+- Cosmos1GP: https://github.com/deepbeepmeep/Cosmos1GP :\
 This application includes two models: a text to world generator and an image / video to world generator (probably the best open source image to video generator).

-- OminiControlGP: https://github.com/deepbeepmeep/OminiControlGP
+- OminiControlGP: https://github.com/deepbeepmeep/OminiControlGP :\
 A very powerful Flux derived application that can be used to transfer an object of your choice into a prompted scene. With mmgp you can run it with only 6 GB of VRAM.

+- YuE GP: https://github.com/deepbeepmeep/YuEGP :\
+A great song generator (instruments + singer's voice) based on prompted lyrics and a genre description. Thanks to mmgp you can run it with less than 10 GB of VRAM without waiting forever.

 ## Installation
 First you need to install the module in your current project with:
@@ -88,7 +90,7 @@ You can choose between 5 profiles depending on your hardware:
 - LowRAM_LowVRAM (4): at least 32 GB of RAM and 12 GB of VRAM : if you have little VRAM or want to generate longer videos / more images
 - VerylowRAM_LowVRAM (5): at least 24 GB of RAM and 10 GB of VRAM : if you don't have much it won't be fast but maybe it will work

-Profile 2 (High RAM) and 4 (Low RAM)are the most recommended profiles since they are versatile (support for long videos for a slight performance cost).\
+Profiles 2 (High RAM) and 4 (Low RAM) are the most recommended profiles since they are versatile (support for long videos for a slight performance cost).\
 If you use a Flux derived application, profiles 1 and 3 will offer much faster generation times.
 In any case, a safe approach is to start from profile 5 (default profile) and then go down progressively to profile 4 and then to profile 2 as long as the app remains responsive or doesn't trigger any out of memory error.

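A minimal sketch of the progressive approach described above, assuming a diffusers-style pipeline object named `pipe` is already loaded and that the module is imported as in the rest of this README:

```python
# Sketch only: start from the safest preset and move up if the machine copes.
from mmgp import offload, profile_type

offload.profile(pipe, profile_type.VerylowRAM_LowVRAM)  # profile 5: safe default
# offload.profile(pipe, profile_type.LowRAM_LowVRAM)    # profile 4: if RAM allows
# offload.profile(pipe, profile_type.HighRAM_LowVRAM)   # profile 2: plenty of RAM
```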
@@ -114,11 +116,13 @@ For example:
 - pinnedMemory: Boolean (for all models) or List of model ids to pin to RAM. Every model pinned to RAM will load much faster (up to 2 times) but this requires more RAM
 - quantizeTransformer: boolean, True by default. The 'transformer' model in the pipe usually contains the video or image generator and is quantized on the fly to 8 bits by default. If you want to save time on disk and reduce the loading time, you may want to load directly a prequantized model. If you don't want to quantize the image generator, you need to set the option *quantizeTransformer* to *False* to turn off on the fly quantization.
 - extraModelsToQuantize: list of additional model ids of models to quantize on the fly. If the corresponding model is already quantized, this option will be ignored.
-- budgets: either a number in mega bytes (for all models, if 0 unlimited budget) or a dictionary that maps model ids to mega bytes : define the budget in
+- budgets: either a number in mega bytes (for all models, 0 meaning unlimited budget) or a dictionary that maps model ids to mega bytes : defines the approximate budget in mega bytes that is allocated in VRAM for a model. Try not to allocate all the available VRAM so that the rest can be used to process the data. To define the default value in the dictionary, you may add an entry named "*".
 The smaller this number, the more VRAM is left for image data / longer videos, but also the slower the generation, because there will be lots of loading / unloading between the RAM and the VRAM. If a model is too big to fit in its budget, it will be broken down into multiple parts that will be loaded / unloaded consecutively. The speed of a low budget can be increased (up to 2 times) by turning on the options pinnedMemory and asyncTransfers.
+- workingVRAM: either a number in mega bytes or a dictionary that maps model ids to a number in mega bytes that corresponds to a minimum amount of VRAM that should be left for the data processed by the model. This number will prevail if it is in conflict with a too high budget defined for the same model.
 - asyncTransfers: boolean, load to the GPU the next model part while the current part is being processed. This requires twice the budget if any is defined. This may increase speed by 20% (mostly visible on fast modern GPUs).
 - verboseLevel: number between 0 and 2 (1 by default), provides various levels of feedback on the different processes
 - compile: list of model ids to compile, may accelerate generation up to x2 depending on the type of GPU. It makes sense to compile only the model that is frequently used, such as the "transformer" model in the case of video or image generation. Compilation requires Triton to be installed. Triton is available out of the box on Linux or WSL but needs to be installed separately on Windows: https://github.com/woct0rdho/triton-windows
+- coTenantsMap: a dictionary that maps a model id to a list of other models with which it accepts to share the VRAM at the same time. This is useful to avoid inefficient loading / unloading when two model processes are interleaved. For instance with *coTenantsMap = { "text_encoder_2": ["text_encoder"] }*, when *text_encoder_2* is loaded it won't unload *text_encoder*. Please note that the reverse is not true, as these maps are by design not symmetrical to allow tailored workflows. If you also need *text_encoder* not to unload *text_encoder_2* when it is already loaded, use *coTenantsMap = { "text_encoder_2": ["text_encoder"], "text_encoder": ["text_encoder_2"] }* (a usage sketch follows this list).

 If you are short on RAM and plan to work with quantized models, it is recommended to load pre-quantized models directly rather than using on the fly quantization; it will be faster and consume slightly less RAM.

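A hedged sketch combining the options described above. The entry point (*offload.all*), its call pattern and the model ids ("transformer", "text_encoder", "text_encoder_2") are assumptions made for illustration; adjust them to your own pipeline and hardware.

```python
# Hedged sketch: passing the options documented above. The exact signature of
# offload.all and the model ids are assumptions; the option names come from this README.
from mmgp import offload

offload.all(
    pipe,
    pinnedMemory=["transformer"],                       # pin only the big model to RAM
    budgets={"*": 3000, "transformer": 8000},           # MB of VRAM per model, "*" is the default
    workingVRAM={"transformer": 4000},                  # keep at least ~4 GB free for the processed data
    asyncTransfers=True,                                # prefetch the next model part
    coTenantsMap={"text_encoder_2": ["text_encoder"]},  # text_encoder_2 may share VRAM with text_encoder
)
```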
@@ -126,11 +130,14 @@ If you are short on RAM and plan to work with quantized models, it is recommende

 The module includes several tools to package a light version of your favorite video / image generator:
 - *extract_models(string prefix, obj to explore)*\
-This tool will try to detect for you models that are embedded in a pipeline or in some custom class. It will save you time by building a pipe dictionary required
+This tool will try to detect for you models that are embedded in a pipeline or in some custom class. It will save you time by building the pipe dictionary required by *offload.all* or *offload.profile*. The prefix corresponds to the text that will appear before the name of each model in the dictionary.

-- *load_loras_into_model(model, lora_path, lora_multi)*\
+- *load_loras_into_model(model, lora_path, lora_multi, activate_all_loras = True)*\
 Load into a model a list of Loras described by a list of paths *lora_path* and a list of *weights coefficients*.
-The Lora file must be in the *diffusers* format. This function works also on non diffusers models. However if there is already an official Lora support for a model it is recommended to use the official diffusers functions.
+The Lora files must be in the *diffusers* format. This function also works on non diffusers models. However, if there is already official Lora support for a model, it is recommended to use the official diffusers functions. By default all the loaded loras will be activated, or they can be activated later using *activate_loras*.
+
+- *activate_loras(model, lora_nos, lora_multi = None )*\
+Activate the loras whose numbers are listed in *lora_nos*. Every lora that is not in this list and that was activated previously will be deactivated (a usage sketch follows this list).

 - *save_model(model, file_path, do_quantize = False, quantizationType = qint8 )*\
 Save tensors of a model already loaded in memory in a safetensors format (much faster to reload). You can save it in a quantized format (default qint8 quantization recommended).
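A hedged sketch of the Lora helpers listed above. The file paths, the *transformer* model object and the multipliers are hypothetical; only the function names and parameters come from this README, and the sketch assumes *lora_nos* refers to the zero-based order in which the loras were loaded.

```python
# Hedged sketch of the tools documented above; values are hypothetical.
from mmgp import offload

lora_paths = ["loras/style_a.safetensors", "loras/style_b.safetensors"]  # hypothetical files
lora_multi = [1.0, 0.8]                                                  # hypothetical multipliers

# load both loras; with activate_all_loras=True (the default) they are active immediately
offload.load_loras_into_model(transformer, lora_paths, lora_multi)

# later, keep only the first lora active (assuming zero-based numbering);
# the second one gets deactivated
offload.activate_loras(transformer, [0], [1.0])

# optionally save the model in a quantized safetensors file (qint8 by default)
offload.save_model(transformer, "transformer_quanto_int8.safetensors", do_quantize=True)
```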
mmgp-3.1.4.post1519.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+mmgp/offload.py,sha256=dfmplgTm19DPJ8AKqOf8McaY2f63cz3Dqim_-Hvpcqo,86202
+mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
+mmgp-3.1.4.post1519.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+mmgp-3.1.4.post1519.dist-info/METADATA,sha256=x0gpYN-KkoW7aNwLNK-3IOV1B7pljr3eW9y5_8w7W6c,15947
+mmgp-3.1.4.post1519.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+mmgp-3.1.4.post1519.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+mmgp-3.1.4.post1519.dist-info/RECORD,,

mmgp-3.1.4.post15.dist-info/RECORD DELETED
@@ -1,9 +0,0 @@
-__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
-mmgp/offload.py,sha256=DEGTt5RPoLx9JK-d7Ld_B_rIuQrmhblQJw3V5CL9Lo8,74519
-mmgp/safetensors2.py,sha256=OkJAvENfWeb-PL0FcxS1-eYeHLbemTaNXYvNxURrzIs,16154
-mmgp-3.1.4.post15.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
-mmgp-3.1.4.post15.dist-info/METADATA,sha256=IMmhK6xAu0A96mLlpby9V2H-K8RYIqRpORaBngvtC0U,14278
-mmgp-3.1.4.post15.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-mmgp-3.1.4.post15.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
-mmgp-3.1.4.post15.dist-info/RECORD,,

LICENSE.md, WHEEL and top_level.txt: files without changes.