mmgp 3.0.3__py3-none-any.whl → 3.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mmgp/offload.py +222 -169
- mmgp/safetensors2.py +65 -21
- {mmgp-3.0.3.dist-info → mmgp-3.0.9.dist-info}/METADATA +13 -9
- mmgp-3.0.9.dist-info/RECORD +9 -0
- {mmgp-3.0.3.dist-info → mmgp-3.0.9.dist-info}/WHEEL +1 -1
- mmgp-3.0.3.dist-info/RECORD +0 -9
- {mmgp-3.0.3.dist-info → mmgp-3.0.9.dist-info}/LICENSE.md +0 -0
- {mmgp-3.0.3.dist-info → mmgp-3.0.9.dist-info}/top_level.txt +0 -0
mmgp/offload.py
CHANGED
@@ -61,10 +61,22 @@ import sys
 import os
 import json
 import psutil
+try:
+    from diffusers.utils.peft_utils import set_weights_and_activate_adapters, get_peft_kwargs
+except:
+    set_weights_and_activate_adapters = None
+    get_peft_kwargs = None
+    pass
+try:
+    from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict
+except:
+    inject_adapter_in_model = None
+    pass
+
 from mmgp import safetensors2
 from mmgp import profile_type

-from optimum.quanto import freeze,
+from optimum.quanto import freeze, qfloat8, qint4 , qint8, quantize, QModuleMixin, QTensor, quantize_module



@@ -127,6 +139,9 @@ def move_tensors(obj, device):
         return _list
     else:
         raise TypeError("Tensor or list / dict of tensors expected")
+def _get_module_name(v):
+    return v.__module__.lower()
+

 def _compute_verbose_level(level):
     if level <0:
@@ -263,7 +278,13 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma
     # print(f"num params to pin {model_id}: {len(params_list)}")
     for p in params_list:
         if isinstance(p, QTensor):
-
+            if p._qtype == qint4:
+                if hasattr(p,"_scale_shift"):
+                    length = torch.numel(p._data._data) * p._data._data.element_size() + torch.numel(p._scale_shift) * p._scale_shift.element_size()
+                else:
+                    length = torch.numel(p._data._data) * p._data._data.element_size() + torch.numel(p._scale) * p._scale.element_size() + torch.numel(p._shift) * p._shift.element_size()
+            else:
+                length = torch.numel(p._data) * p._data.element_size() + torch.numel(p._scale) * p._scale.element_size()
         else:
             length = torch.numel(p.data) * p.data.element_size()

@@ -306,10 +327,22 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma
         if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
             current_big_tensor = big_tensors[big_tensor_no]
             if isinstance(p, QTensor):
-
-
-
-
+                if p._qtype == qint4:
+                    length1 = torch.numel(p._data._data) * p._data._data.element_size()
+                    p._data._data = _move_to_pinned_tensor(p._data._data, current_big_tensor, offset, length1)
+                    if hasattr(p,"_scale_shift"):
+                        length2 = torch.numel(p._scale_shift) * p._scale_shift.element_size()
+                        p._scale_shift = _move_to_pinned_tensor(p._scale_shift, current_big_tensor, offset + length1, length2)
+                    else:
+                        length2 = torch.numel(p._scale) * p._scale.element_size()
+                        p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
+                        length3 = torch.numel(p._shift) * p._shift.element_size()
+                        p._shift = _move_to_pinned_tensor(p._shift, current_big_tensor, offset + length1 + length2, length3)
+                else:
+                    length1 = torch.numel(p._data) * p._data.element_size()
+                    p._data = _move_to_pinned_tensor(p._data, current_big_tensor, offset, length1)
+                    length2 = torch.numel(p._scale) * p._scale.element_size()
+                    p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
             else:
                 length = torch.numel(p.data) * p.data.element_size()
                 p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
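The two hunks above extend `_pin_to_memory` to quanto's qint4 tensors. A hedged sketch of the byte-count rule they implement, restated as a standalone helper; it relies on optimum-quanto's private attributes (`_qtype`, `_data`, `_scale`, `_shift`, `_scale_shift`) exactly as the diff does, so treat it as illustrative only:

```python
import torch
from optimum.quanto import QTensor, qint4

def qtensor_nbytes(p) -> int:
    # Size of the buffers that actually need pinning for a parameter.
    if isinstance(p, QTensor) and p._qtype == qint4:
        # int4 weights are bit-packed in p._data._data (a uint8 tensor), plus either
        # a fused _scale_shift tensor or separate _scale / _shift tensors.
        n = torch.numel(p._data._data) * p._data._data.element_size()
        if hasattr(p, "_scale_shift"):
            n += torch.numel(p._scale_shift) * p._scale_shift.element_size()
        else:
            n += torch.numel(p._scale) * p._scale.element_size()
            n += torch.numel(p._shift) * p._shift.element_size()
        return n
    if isinstance(p, QTensor):
        # qint8 / qfloat8: raw data plus the per-axis scales
        return torch.numel(p._data) * p._data.element_size() + torch.numel(p._scale) * p._scale.element_size()
    # plain (non quantized) tensor or parameter
    return torch.numel(p.data) * p.data.element_size()
```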
@@ -339,98 +372,6 @@ def _welcome():
     print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.0) by DeepBeepMeep ************{ENDC}{UNBOLD}")


-# def _pin_to_memory_sd(model, sd, model_id, partialPinning = False, perc_reserved_mem_max = 0, verboseLevel = 1):
-#     if verboseLevel>=1 :
-#         if partialPinning:
-#             print(f"Partial pinning to reserved RAM of data of file '{model_id}' while loading it")
-#         else:
-#             print(f"Pinning data to reserved RAM of file '{model_id}' while loading it")
-
-#     max_reservable_memory = _get_max_reservable_memory(perc_reserved_mem_max)
-#     if partialPinning:
-#         towers_names, _ = _detect_main_towers(model)
-#         towers_names = [n +"." for n in towers_names]
-
-#     BIG_TENSOR_MAX_SIZE = 2**28 # 256 MB
-#     current_big_tensor_size = 0
-#     big_tensor_no = 0
-#     big_tensors_sizes = []
-#     tensor_map_indexes = []
-#     total_tensor_bytes = 0
-
-#     for k,t in sd.items():
-#         include = True
-#         # if isinstance(p, QTensor):
-#         #     length = torch.numel(p._data) * p._data.element_size() + torch.numel(p._scale) * p._scale.element_size()
-#         # else:
-#         #     length = torch.numel(p.data) * p.data.element_size()
-#         length = torch.numel(t) * t.data.element_size()
-
-#         if partialPinning:
-#             include = any(k.startswith(pre) for pre in towers_names) if partialPinning else True
-
-#         if include:
-#             if current_big_tensor_size + length > BIG_TENSOR_MAX_SIZE:
-#                 big_tensors_sizes.append(current_big_tensor_size)
-#                 current_big_tensor_size = 0
-#                 big_tensor_no += 1
-#             tensor_map_indexes.append((big_tensor_no, current_big_tensor_size, length ))
-#             current_big_tensor_size += length
-#         else:
-#             tensor_map_indexes.append((-1, 0, 0 ))
-#         total_tensor_bytes += length
-
-#     big_tensors_sizes.append(current_big_tensor_size)
-
-#     big_tensors = []
-#     last_big_tensor = 0
-#     total = 0
-
-
-#     for size in big_tensors_sizes:
-#         try:
-#             currrent_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True)
-#             big_tensors.append(currrent_big_tensor)
-#         except:
-#             print(f"Unable to pin more tensors for this model as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
-#             break
-
-#         last_big_tensor += 1
-#         total += size
-
-
-#     tensor_no = 0
-#     for k,t in sd.items():
-#         big_tensor_no, offset, length = tensor_map_indexes[tensor_no]
-#         if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
-#             current_big_tensor = big_tensors[big_tensor_no]
-#             # if isinstance(p, QTensor):
-#             #     length1 = torch.numel(p._data) * p._data.element_size()
-#             #     p._data = _move_to_pinned_tensor(p._data, current_big_tensor, offset, length1)
-#             #     length2 = torch.numel(p._scale) * p._scale.element_size()
-#             #     p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
-#             # else:
-#             #     length = torch.numel(p.data) * p.data.element_size()
-#             #     p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
-#             length = torch.numel(t) * t.data.element_size()
-#             t = _move_to_pinned_tensor(t, current_big_tensor, offset, length)
-#             sd[k] = t
-#         tensor_no += 1
-
-#     global total_pinned_bytes
-#     total_pinned_bytes += total
-
-#     if verboseLevel >=1:
-#         if total_tensor_bytes == total:
-#             print(f"The whole model was pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
-#         else:
-#             print(f"{total/ONE_MB:.2f} MB were pinned to reserved RAM out of {total_tensor_bytes/ONE_MB:.2f} MB")
-
-#     model._already_pinned = True
-
-
-#     return
-
 def _quantize_dirty_hack(model):
     # dirty hack: add a hook on state_dict() to return a fake non quantized state_dict if called by Lora Diffusers initialization functions
     setattr( model, "_real_state_dict", model.state_dict)
@@ -536,10 +477,14 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
     prev_blocks_prefix = None

     if hasattr(model_to_quantize, "_quanto_map"):
+        for k, entry in model_to_quantize._quanto_map.items():
+            weights = entry["weights"]
+            print(f"Model '{model_id}' is already quantized to format '{weights}'")
+            return False
         print(f"Model '{model_id}' is already quantized")
         return False
-
-    print(f"Quantization of model '{model_id}' started")
+
+    print(f"Quantization of model '{model_id}' started to format '{weights}'")

     for submodule_name, submodule in model_to_quantize.named_modules():
         if isinstance(submodule, QModuleMixin):
@@ -594,18 +539,18 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
         if verboseLevel >=2:
             print(f"Total Excluded {total_excluded/ONE_MB:.1f} MB oF {total_size/ONE_MB:.1f} that is {perc_excluded*100:.2f}%")
         if perc_excluded >= 0.10:
-            print(f"Too many
+            print(f"Too many modules are excluded, there is something wrong with the selection, switch back to full quantization.")
            exclude_list = None


     #quantize(model_to_quantize,weights, exclude= exclude_list)
-
+
     for name, m in model_to_quantize.named_modules():
         if exclude_list is None or not any( name == module_name for module_name in exclude_list):
             _quantize_submodule(model_to_quantize, name, m, weights=weights, activations=None, optimizer=None)

-    # force read non quantized parameters so that their lazy tensors and corresponding mmap are released
-    # otherwise we may end up
+    # force to read non quantized parameters so that their lazy tensors and corresponding mmap are released
+    # otherwise we may end up keeping in memory both the quantized and the non quantize model
     for m in model_to_quantize.modules():
         # do not read quantized weights (detected them directly or behind an adapter)
         if isinstance(m, QModuleMixin) or hasattr(m, "base_layer") and isinstance(m.base_layer, QModuleMixin):
@@ -620,12 +565,16 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
                 b.data = b.data + 0


+
     freeze(model_to_quantize)
     torch.cuda.empty_cache()
     gc.collect()
     quantization_map = _quantization_map(model_to_quantize)
     model_to_quantize._quanto_map = quantization_map

+    if hasattr(model_to_quantize, "_already_pinned"):
+        delattr(model_to_quantize, "_already_pinned")
+
     _quantize_dirty_hack(model_to_quantize)

     print(f"Quantization of model '{model_id}' done")
@@ -684,15 +633,25 @@ class offload:

         for k,p in submodule.named_parameters(recurse=False):
             if isinstance(p, QTensor):
-                blocks_params.append( (submodule, k, p
-
-
+                blocks_params.append( (submodule, k, p ) )
+
+                if p._qtype == qint4:
+                    if hasattr(p,"_scale_shift"):
+                        blocks_params_size += torch.numel(p._scale_shift) * p._scale_shift.element_size()
+                        blocks_params_size += torch.numel(p._data._data) * p._data._data.element_size()
+                    else:
+                        blocks_params_size += torch.numel(p._scale) * p._scale.element_size()
+                        blocks_params_size += torch.numel(p._shift) * p._shift.element_size()
+                        blocks_params_size += torch.numel(p._data._data) * p._data._data.element_size()
+                else:
+                    blocks_params_size += torch.numel(p._scale) * p._scale.element_size()
+                    blocks_params_size += torch.numel(p._data) * p._data.element_size()
             else:
-                blocks_params.append( (submodule, k, p
-                blocks_params_size += p.data.
+                blocks_params.append( (submodule, k, p ) )
+                blocks_params_size += torch.numel(p.data) * p.data.element_size()

         for k, p in submodule.named_buffers(recurse=False):
-            blocks_params.append( (submodule, k, p
+            blocks_params.append( (submodule, k, p) )
             blocks_params_size += p.data.nbytes


@@ -710,34 +669,28 @@ class offload:
             return False
         return True

-
+
+    def gpu_load_blocks(self, model_id, blocks_name):
         # cl = clock.start()

         if blocks_name != None:
             self.loaded_blocks[model_id] = blocks_name

         entry_name = model_id if blocks_name is None else model_id + "/" + blocks_name
-
-        def cpu_to_gpu(stream_to_use, blocks_params
+
+        def cpu_to_gpu(stream_to_use, blocks_params): #, record_for_stream = None
            with torch.cuda.stream(stream_to_use):
                 for param in blocks_params:
-                    parent_module, n,
-
-
-
-
-
-
-
-                    else:
-
-
-                    if record_for_stream != None:
-                        if isinstance(p, QTensor):
-                            q._data.record_stream(record_for_stream)
-                            q._scale.record_stream(record_for_stream)
-                        else:
-                            p.data.record_stream(record_for_stream)
+                    parent_module, n, p = param
+                    q = p.to("cuda", non_blocking=True)
+                    q = torch.nn.Parameter(q , requires_grad=False)
+                    setattr(parent_module, n , q)
+                    # if record_for_stream != None:
+                    #     if isinstance(p, QTensor):
+                    #         q._data.record_stream(record_for_stream)
+                    #         q._scale.record_stream(record_for_stream)
+                    #     else:
+                    #         p.data.record_stream(record_for_stream)


         if self.verboseLevel >=2:
@@ -776,19 +729,10 @@ class offload:
             print(f"Unloading model {blocks_name} ({model_name}) from GPU")

         blocks_params = self.blocks_of_modules[blocks_name]
-
         for param in blocks_params:
-            parent_module, n,
-
-
-                # need to change the parameter directly from the module as it can't be swapped in place due to a memory leak in the pytorch compiler
-                q = WeightQBytesTensor.create(p.qtype, p.axis, p.size(), p.stride(), data, scale, activation_qtype=p.activation_qtype, requires_grad=p.requires_grad )
-                q = torch.nn.Parameter(q , requires_grad=False)
-                setattr(parent_module, n , q)
-                del p
-            else:
-                p.data = data
-
+            parent_module, n, p = param
+            q = torch.nn.Parameter(p , requires_grad=False)
+            setattr(parent_module, n , q)
         # cl.stop()
         # print(f"unload time: {cl.format_time_gap()}")

@@ -824,8 +768,8 @@ class offload:
             if torch.is_tensor(arg):
                 if arg.dtype == torch.float32:
                     arg = arg.to(torch.bfloat16).cuda(non_blocking=True)
-
-                    arg = arg.cuda(non_blocking=True)
+                elif not arg.is_cuda:
+                    arg = arg.cuda(non_blocking=True)
             new_args.append(arg)

         for k in kwargs:
@@ -833,7 +777,7 @@ class offload:
             if torch.is_tensor(arg):
                 if arg.dtype == torch.float32:
                     arg = arg.to(torch.bfloat16).cuda(non_blocking=True)
-
+                elif not arg.is_cuda:
                     arg = arg.cuda(non_blocking=True)
             new_kwargs[k]= arg

@@ -897,6 +841,10 @@ class offload:

     def hook_check_empty_cache_needed(self, target_module, model_id,blocks_name, previous_method, context):

+        qint4quantization = isinstance(target_module, QModuleMixin) and target_module.weight!= None and target_module.weight.qtype == qint4
+        if qint4quantization:
+            pass
+
         def check_empty_cuda_cache(module, *args, **kwargs):
             # if self.ready_to_check_mem():
             #     self.empty_cache_if_needed()
@@ -912,6 +860,8 @@ class offload:
                 self.empty_cache_if_needed()
                 self.loaded_blocks[model_id] = blocks_name
                 self.gpu_load_blocks(model_id, blocks_name)
+            if qint4quantization:
+                args, kwargs = self.move_args_to_gpu(*args, **kwargs)

             return previous_method(*args, **kwargs)

@@ -970,7 +920,105 @@ class offload:
         # for module in parent_module.components.items():
         #    self.unhook_module(module)

-
+import torch
+
+
+
+
+def load_loras_into_model(model, lora_path, lora_multi = None, verboseLevel = -1):
+    verboseLevel = _compute_verbose_level(verboseLevel)
+
+    if inject_adapter_in_model == None or set_weights_and_activate_adapters == None or get_peft_kwargs == None:
+        raise Exception("Unable to load Lora, missing 'peft' and / or 'diffusers' modules")
+
+    if not isinstance(lora_path, list):
+        lora_path = [lora_path]
+
+    if lora_multi is None:
+        lora_multi = [1. for _ in lora_path]
+
+    for i, path in enumerate(lora_path):
+        adapter_name = str(i)
+
+        state_dict = safetensors2.torch_load_file(path)
+
+        keys = list(state_dict.keys())
+        if len(keys) == 0:
+            raise Exception(f"Empty Lora '{path}'")
+
+
+        network_alphas = {}
+        for k in keys:
+            if "alpha" in k:
+                alpha_value = state_dict.pop(k)
+                if not ( (torch.is_tensor(alpha_value) and torch.is_floating_point(alpha_value)) or isinstance(
+                    alpha_value, float
+                )):
+                    network_alphas[k] = torch.tensor( float(alpha_value.item() ) )
+
+        pos = keys[0].find(".")
+        prefix = keys[0][0:pos]
+        if not any( prefix.startswith(some_prefix) for some_prefix in ["diffusion_model", "transformer"]):
+            msg = f"No compatible weight was found in Lora file '{path}'. Please check that it is compatible with the Diffusers format."
+            raise Exception(msg)
+
+        transformer = model
+
+        transformer_keys = [k for k in keys if k.startswith(prefix)]
+        state_dict = {
+            k.replace(f"{prefix}.", ""): v for k, v in state_dict.items() if k in transformer_keys
+        }
+
+        sd_keys = state_dict.keys()
+        if len(sd_keys) == 0:
+            print(f"No compatible weight was found in Lora file '{path}'. Please check that it is compatible with the Diffusers format.")
+            return
+
+        # is_correct_format = all("lora" in key for key in state_dict.keys())
+
+
+
+
+        # check with first key if is not in peft format
+        # first_key = next(iter(state_dict.keys()))
+        # if "lora_A" not in first_key:
+        #     state_dict = convert_unet_state_dict_to_peft(state_dict)
+
+        if adapter_name in getattr(transformer, "peft_config", {}):
+            raise ValueError(
+                f"Adapter name {adapter_name} already in use in the transformer - please select a new adapter name."
+            )
+
+        rank = {}
+        for key, val in state_dict.items():
+            if "lora_B" in key:
+                rank[key] = val.shape[1]
+
+        if network_alphas is not None and len(network_alphas) >= 1:
+            alpha_keys = [k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix]
+            network_alphas = {k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys}
+
+        lora_config_kwargs = get_peft_kwargs(rank, network_alpha_dict=network_alphas, peft_state_dict=state_dict)
+
+        lora_config = LoraConfig(**lora_config_kwargs)
+        peft_kwargs = {}
+        peft_kwargs["low_cpu_mem_usage"] = True
+        inject_adapter_in_model(lora_config, model, adapter_name=adapter_name, **peft_kwargs)
+
+        incompatible_keys = set_peft_model_state_dict(transformer, state_dict, adapter_name, **peft_kwargs)
+
+        warn_msg = ""
+        if incompatible_keys is not None:
+            # Check only for unexpected keys.
+            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+            if unexpected_keys:
+                pass
+        if verboseLevel >=1:
+            print(f"Lora '{path}' was loaded in model '{_get_module_name(model)}'")
+    set_weights_and_activate_adapters(model,[ str(i) for i in range(len(lora_multi))], lora_multi)
+
+
+def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, verboseLevel = -1):
     """
     quick version of .LoadfromPretrained of the transformers library
     used to build a model and load the corresponding weights (quantized or not)
@@ -1040,13 +1088,13 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizat

     model._config = transformer_config

-    load_model_data(model,model_path, do_quantize = do_quantize,
+    load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, verboseLevel=verboseLevel )

     return model



-def load_model_data(model, file_path: str, do_quantize = False,
+def load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, verboseLevel = -1):
     """
     Load a model, detect if it has been previously quantized using quanto and do the extra setup if necessary
     """
@@ -1067,14 +1115,6 @@ def load_model_data(model, file_path: str, do_quantize = False, quantization_typ
             state_dict = state_dict["module"]
     else:
         state_dict, metadata = _safetensors_load_file(file_path)
-
-
-        # if pinToMemory:
-        #     _pin_to_memory_sd(model,state_dict, file_path, partialPinning = partialPinning, perc_reserved_mem_max = perc_reserved_mem_max, verboseLevel = verboseLevel)
-
-        # with safetensors2.safe_open(file_path) as f:
-        #     metadata = f.metadata()
-

     if metadata is None:
         quantization_map = None
@@ -1109,7 +1149,7 @@ def load_model_data(model, file_path: str, do_quantize = False, quantization_typ

     if do_quantize:
         if quantization_map is None:
-            if _quantize(model,
+            if _quantize(model, quantizationType, verboseLevel=verboseLevel, model_id=file_path):
                 quantization_map = model._quanto_map
         else:
             if verboseLevel >=1:
@@ -1120,7 +1160,7 @@ def load_model_data(model, file_path: str, do_quantize = False, quantization_typ

     return

-def save_model(model, file_path, do_quantize = False,
+def save_model(model, file_path, do_quantize = False, quantizationType = qint8, verboseLevel = -1 ):
     """save the weights of a model and quantize them if requested
     These weights can be loaded again using 'load_model_data'
     """
@@ -1147,7 +1187,7 @@ def save_model(model, file_path, do_quantize = False, quantization_type = qint8,
     config= json.loads(text)

     if do_quantize:
-        _quantize(model, weights=
+        _quantize(model, weights=quantizationType, model_id=file_path)

     quantization_map = getattr(model, "_quanto_map", None)

@@ -1160,7 +1200,7 @@ def save_model(model, file_path, do_quantize = False, quantization_type = qint8,



-def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True, extraModelsToQuantize = None, budgets= 0, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, verboseLevel = -1):
+def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, verboseLevel = -1):
     """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
     pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
     quantizeTransformer: set True by default will quantize on the fly the video / image model
@@ -1238,6 +1278,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
         self.anyCompiledModule = compileAllModels or len(modelsToCompile)>0
         if self.anyCompiledModule:
             torch._dynamo.config.cache_size_limit = 10000
+            torch.compiler.reset()
+
         # torch._logging.set_logs(recompiles=True)
         # torch._inductor.config.realize_opcount_threshold = 100 # workaround bug "AssertionError: increase TRITON_MAX_BLOCK['X'] to 4096."

@@ -1252,19 +1294,31 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru

         # if the model has just been quantized so there is no need to quantize it again
         if model_id in models_to_quantize:
-            _quantize(current_model, weights=
+            _quantize(current_model, weights=quantizationType, verboseLevel = self.verboseLevel, model_id=model_id)

         modelPinned = (pinAllModels or model_id in modelsToPin) and not hasattr(current_model,"_already_pinned")

-        current_model_size = 0
-
-        for p in current_model.
+        current_model_size = 0
+
+        for n, p in current_model.named_parameters():
+            p.requires_grad = False
+            p = p.detach()
             if isinstance(p, QTensor):
                 # # fix quanto bug (seems to have been fixed)
                 # if not modelPinned and p._scale.dtype == torch.float32:
                 #     p._scale = p._scale.to(torch.bfloat16)
-
-
+                if p._qtype == qint4:
+                    if hasattr(p,"_scale_shift"):
+                        current_model_size += torch.numel(p._scale_shift) * p._scale_shift.element_size()
+                    else:
+                        current_model_size += torch.numel(p._scale) * p._shift.element_size() + torch.numel(p._scale) * p._shift.element_size()
+
+                    current_model_size += torch.numel(p._data._data) * p._data._data.element_size()
+
+                else:
+                    current_model_size += torch.numel(p._scale) * p._scale.element_size()
+                    current_model_size += torch.numel(p._data) * p._data.element_size()
+
             else:
                 if p.data.dtype == torch.float32:
                     # convert any left overs float32 weight to bloat16 to divide by 2 the model memory footprint
@@ -1272,7 +1326,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
                 current_model_size += torch.numel(p.data) * p.data.element_size()

         for b in current_model.buffers():
-           if b.data.dtype == torch.float32:
+            if b.data.dtype == torch.float32:
                 # convert any left overs float32 weight to bloat16 to divide by 2 the model memory footprint
                 b.data = b.data.to(torch.bfloat16)
             current_model_size += torch.numel(b.data) * b.data.element_size()
@@ -1308,7 +1362,6 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
                 print(f"Potential iterative blocks found in model '{model_id}':{towers_names}")
             # compile main iterative modules stacks ("towers")
             if compileAllModels or model_id in modelsToCompile :
-                #torch.compiler.reset()
                 if self.verboseLevel>=1:
                     print(f"Pytorch compilation of model '{model_id}' is scheduled.")
                 for tower in towers_modules:
@@ -1406,7 +1459,7 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
         modules= modules.components

     modules = {k: _remove_model_wrapper(v) for k, v in modules.items() if isinstance(v, torch.nn.Module)}
-    module_names = {k: v
+    module_names = {k: _get_module_name(v) for k, v in modules.items() }

     default_extraModelsToQuantize = []
     quantizeTransformer = True
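For reference, a hedged sketch of how the new `load_loras_into_model` helper added to offload.py in this release might be called. The model variable and the Lora file names are made up for illustration; only the function signature comes from the diff:

```python
from mmgp import offload

def apply_loras(transformer_model):
    # transformer_model: an already-built torch.nn.Module (e.g. the video / image transformer)
    offload.load_loras_into_model(
        transformer_model,
        lora_path=["lora_style.safetensors", "lora_detail.safetensors"],  # hypothetical diffusers-format Lora files
        lora_multi=[1.0, 0.8],                                            # one multiplier per Lora
        verboseLevel=1,
    )
```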
mmgp/safetensors2.py
CHANGED
@@ -155,7 +155,7 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):
         torch.bool : 'BOOL' , torch.float64 : 'F64' , torch.float32 : 'F32' , torch.float16 : 'F16', torch.float8_e5m2 : "F8_E5M2", torch.float8_e4m3fn: "F8_E4M3" }
     pos = 0
     i = 0
-    mx =
+    mx = 100000
     for k , t in sd.items():
         entry = {}
         dtypestr= map[t.dtype]
@@ -186,8 +186,6 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):

     length_of_header_bytes = struct.pack('<Q', size_header)

-    empty_tensor = b'\x80\x3f'
-
     with open(file_path, "wb") as writer:
         bytes_written = writer.write(length_of_header_bytes)
         bytes_written = writer.write(header_bytes)
@@ -195,12 +193,20 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):
         i = 0
         for k , t in sd.items():
             size = torch.numel(t) * t.element_size()
-            if
-
-
-
-
-
+            if size != 0:
+                if len(t.shape) == 0:
+                    dtype = t.dtype
+                    # convert in a friendly format, scalars types not supported by numpy
+                    if dtype == torch.bfloat16:
+                        t = t.view(torch.uint16)
+                    elif dtype == torch.float8_e5m2 or dtype == torch.float8_e4m3fn:
+                        t = t.view(torch.uint8)
+                    buffer = t.numpy().tobytes()
+                else:
+                    buffer = t.view(torch.uint8).numpy().tobytes()
+                bytes_written = writer.write(buffer)
+                assert bytes_written == size
+
             i+=1
             if i==mx:
                 break
@@ -208,7 +214,7 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):
 class SafeTensorFile:
     """Main class for accessing safetensors files that provides memory-efficient access"""

-    def __init__(self, file_path, metadata, catalog, skip_bytes):
+    def __init__(self, file_path, metadata, catalog, skip_bytes, lazy_loading = True):
         self._file_path = file_path
         self._metadata = metadata
         self._catalog = catalog
@@ -216,20 +222,30 @@ class SafeTensorFile:
         self._keys = None
         self.sd = None
         self.mtracker = None
+        self.lazy_loading = lazy_loading

     @classmethod
-    def load_metadata(cls, file_path):
+    def load_metadata(cls, file_path, lazy_loading = True):
         with open(file_path, 'rb') as f:
             catalog, metadata, skip_bytes = _read_safetensors_header(file_path, f)

-        return cls(file_path, metadata, catalog, skip_bytes)
+        return cls(file_path, metadata, catalog, skip_bytes, lazy_loading)

-    def init_tensors(self):
+    def init_tensors(self, lazyTensors = True):
         if self.sd is None:
-            self.
+            self.lazy_loading = lazyTensors
+            if lazyTensors:
+                self.sd = self.create_tensors_with_mmap()
+            else:
+                self.sd = self.create_tensors_without_mmap()
+        # else:
+        #     if not self.lazy_loading and lazyTensors:
+        #         raise Exception("Every tensor should be either lazy loaded or not lazy loaded")
+
         return self.sd

-
+
+    def create_tensors_with_mmap(self):

         self.mtracker = MmapTracker(self._file_path)
         import mmap
@@ -282,7 +298,12 @@ class SafeTensorFile:
             map_idx = next(iter_tensor_no)
             offset = current_pos - maps[map_idx][1]
             if len(shape) == 0:
-
+                if length == 0:
+                    t = torch.empty(0, dtype=dtype)
+                else:
+                    # don't waste a memory view for a scalar
+                    t = torch.frombuffer(bytearray(maps[map_idx][0][offset:offset + length]), dtype=torch.uint8)
+                    t = t.view(dtype)
             else:
                 mv = memoryview(maps[map_idx][0])[offset:offset + length]
                 t = torch.frombuffer(mv, dtype=dtype)
@@ -293,8 +314,33 @@ class SafeTensorFile:

         return sd

+    def create_tensors_without_mmap(self):
+        sd = OrderedDict()
+
+        with open(self._file_path, 'rb') as f:
+            f.seek(self._skip_bytes, 0)
+            for k,v in self._catalog.items():
+                dtypestr = v["dtype"]
+                dtype= _map_to_dtype[dtypestr]
+                shape = v["shape"]
+                data_offsets = v["data_offsets"]
+                length = data_offsets[1]-data_offsets[0]
+                buffer = f.read(length)
+                if len(shape) == 0:
+                    if length == 0:
+                        t = torch.empty(0, dtype=dtype)
+                    else:
+                        t = torch.frombuffer(bytearray(buffer), dtype=torch.uint8)
+                        t = t.view(dtype)
+                else:
+                    t = torch.frombuffer(bytearray(buffer), dtype=dtype)
+                    t = torch.reshape(t, shape)
+                sd[k] = t
+        return sd
+
     def get_tensor(self, name: str) -> torch.tensor:
         """Get a tensor by name"""
+        # To do : switch to a JIT tensor creation per tensor
         self.init_tensors()
         return self.sd[name]

@@ -310,7 +356,7 @@ class SafeTensorFile:

     def tensors(self) -> Dict[str, torch.tensor]:
         """Get dictionary of all tensors"""
-        self.init_tensors()
+        self.init_tensors(self.lazy_loading)
         return self.sd

     def metadata(self) -> Optional[Dict[str, str]]:
@@ -319,7 +365,7 @@ class SafeTensorFile:

     def __len__(self) -> int:
         """Get number of tensors"""
-        self.init_tensors()
+        self.init_tensors(self.lazy_loading)
         return len(self.keys())

     def __contains__(self, key: str) -> bool:
@@ -337,10 +383,9 @@ class SafeTensorFile:
 class _SafeTensorLoader:
     """Context manager for loading SafeTensorFile"""

-    def __init__(self, filename: str):
+    def __init__(self, filename: str ):
         self.filename = Path(filename)
         self.sft = None
-
         if not self.filename.exists():
             raise FileNotFoundError(f"File not found: {filename}")

@@ -367,7 +412,6 @@
 def safe_open(filename: str, framework: str = "pt",device = "cpu") -> _SafeTensorLoader:
     if device != "cpu" or framework !="pt":
-        pass
         return _old_safe_open(filename =filename, framework=framework, device=device)
     return _SafeTensorLoader(filename)

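The safetensors2.py changes above add an eager, non-mmap loading path next to the existing lazy (mmap-backed) one. A hedged sketch of how the two paths might be selected, assuming the SafeTensorFile API exactly as shown in the diff; the file name is made up:

```python
from mmgp import safetensors2

# Lazy path (default): tensors are built over a memory map of the file,
# via create_tensors_with_mmap(), so data is only paged in when touched.
sft_lazy = safetensors2.SafeTensorFile.load_metadata("model.safetensors")
lazy_sd = sft_lazy.init_tensors(lazyTensors=True)

# Eager path: create_tensors_without_mmap() reads every tensor into RAM,
# so no mmap stays attached to the file afterwards.
sft_eager = safetensors2.SafeTensorFile.load_metadata("model.safetensors", lazy_loading=False)
eager_sd = sft_eager.init_tensors(lazyTensors=False)
```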
{mmgp-3.0.3.dist-info → mmgp-3.0.9.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.2
 Name: mmgp
-Version: 3.0.3
+Version: 3.0.9
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -13,10 +13,11 @@ Requires-Dist: optimum-quanto
 Requires-Dist: accelerate
 Requires-Dist: safetensors
 Requires-Dist: psutil
+Requires-Dist: peft


 <p align="center">
-<H2>Memory Management 3.0 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.0.9 for the GPU Poor by DeepBeepMeep</H2>
 </p>


@@ -38,8 +39,9 @@ Each profile may use a combination of the following:
 - Ability to pin models to reserved RAM to accelerate transfers to VRAM
 - Async transfers to VRAM to avoid a pause when loading a new slice of a model
 - Automated on the fly quantization or ability to load pre quantized models
--
-
+- Pretrained Lora support with low RAM requirements
+- Support for pytorch compilation on Linux and WSL (supported on pure Windows but requires a complex Triton Installation).
+-
 ## Installation
 First you need to install the module in your current project with:
 ```shell
@@ -105,20 +107,22 @@ If you are short on RAM and plan to work with quantized models, it is recommende
 ## Going further

 The module includes several tools to package a light version of your favorite video / image generator:
-- *save_model(model, file_path, do_quantize = False,
+- *save_model(model, file_path, do_quantize = False, quantizationType = qint8 )*\
 Save the tensors of a model already loaded in memory in a safetensors format (much faster to reload). You can save it in a quantized format (the default qint8 quantization is recommended).
 The resulting safetensors file will contain extra fields in its metadata, such as the quantization map and its configuration, so you will be able to move the file around without files such as *config.json* or *file_map.json*.
 You will need *load_model_data* or *fast_load_transformers_model* to read the file again. You may also load it using the default *safetensors* library, however you will then need to provide in the same directory any complementary files that are usually requested (for instance *config.json*).

-- *load_model_data(model, file_path: str, do_quantize = False,
+- *load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToRAM = False, partialPin = False)*\
 Load into RAM the tensor data of a model already initialized with no data. Detects and handles quantized models saved previously with *save_model*. A model can also be quantized on the fly while being loaded. The model can be pinned to RAM while it is loaded, which is more RAM efficient than pinning tensors later using *offload.all* or *offload.profile*.

-- *fast_load_transformers_model(model_path: str, do_quantize = False,
+- *fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToRAM = False, partialPin = False)*\
 Initialize (build the model hierarchy in memory) and fast load the corresponding tensors of a 'transformers' or 'diffusers' library model.
 The advantage over the original *from_pretrained* method is that a full model can fit into a single file with a filename of your choosing (therefore you can have multiple 'transformers' versions of the same model in the same directory) and prequantized models are processed in a transparent way.
 Last but not least, you can also on the fly pin to RAM the whole model or the most important part of it (partialPin = True) in a more efficient way (faster and requires less RAM) than if you did it through *offload.all* or *offload.profile*.

-
+- *load_loras_into_model(model, lora_path, lora_multi)*\
+Load into a model a list of Loras described by a list of paths *lora_path* and a list of weight coefficients *lora_multi*.
+The Lora files must be in the *diffusers* format. This function also works on non diffusers models. However, if there is already official Lora support for a model, it is recommended to use the official diffusers functions.

 The typical workflow will be:
 1) temporarily insert the *save_model* function just after a model has been fully loaded to save a copy of the model / quantized model.
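A hedged sketch of the save-then-reload workflow the README describes, using the keyword names that appear in the offload.py diff (quantizationType, pinToMemory, partialPinning); the file name is made up for illustration:

```python
from mmgp import offload
from optimum.quanto import qint8

def package_and_reload(model):
    # 1) one-off step: save the already-loaded model as a quantized safetensors file
    offload.save_model(model, "transformer_quanto_int8.safetensors",
                       do_quantize=True, quantizationType=qint8)

    # 2) on later runs: rebuild the model and fast-load the prequantized weights,
    #    optionally pinning (part of) them to reserved RAM while loading
    return offload.fast_load_transformers_model(
        "transformer_quanto_int8.safetensors",
        pinToMemory=True,
        partialPinning=True,
    )
```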
mmgp-3.0.9.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+mmgp/offload.py,sha256=bYjpbAHbVX2Vf3nBJXYEc1u9B5JIYvJxv4eMS8L5Tco,64209
+mmgp/safetensors2.py,sha256=G6uzvpGauJLPEvN74MX1ib4YK0E4wzNMyrZO5wOX2k0,15812
+mmgp-3.0.9.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+mmgp-3.0.9.dist-info/METADATA,sha256=0vNt8lNKfMkyBrFUN8pOfkDRf8i_jmndgH2ePIekmdg,12570
+mmgp-3.0.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+mmgp-3.0.9.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+mmgp-3.0.9.dist-info/RECORD,,
mmgp-3.0.3.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
-mmgp/offload.py,sha256=N_n12QJmZlPRbZiYl6BQVfmJaqxxIbiCKkT6w-2CVo4,61781
-mmgp/safetensors2.py,sha256=CSv8HdrjURUzBazpaBDU1WNwUL1lhzpCyzG0GWygbGE,13602
-mmgp-3.0.3.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
-mmgp-3.0.3.dist-info/METADATA,sha256=0dw13_XUzNPCV6VL-e5FAjvMIUDDT1ffFf7rLG_34zc,12079
-mmgp-3.0.3.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
-mmgp-3.0.3.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
-mmgp-3.0.3.dist-info/RECORD,,
{mmgp-3.0.3.dist-info → mmgp-3.0.9.dist-info}/LICENSE.md: File without changes
{mmgp-3.0.3.dist-info → mmgp-3.0.9.dist-info}/top_level.txt: File without changes