mmgp 3.0.1__py3-none-any.whl → 3.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mmgp/offload.py +242 -185
- mmgp/safetensors2.py +65 -21
- {mmgp-3.0.1.dist-info → mmgp-3.0.9.dist-info}/METADATA +18 -10
- mmgp-3.0.9.dist-info/RECORD +9 -0
- {mmgp-3.0.1.dist-info → mmgp-3.0.9.dist-info}/WHEEL +1 -1
- mmgp-3.0.1.dist-info/RECORD +0 -9
- {mmgp-3.0.1.dist-info → mmgp-3.0.9.dist-info}/LICENSE.md +0 -0
- {mmgp-3.0.1.dist-info → mmgp-3.0.9.dist-info}/top_level.txt +0 -0
mmgp/offload.py
CHANGED
@@ -61,10 +61,22 @@ import sys
 import os
 import json
 import psutil
+try:
+    from diffusers.utils.peft_utils import set_weights_and_activate_adapters, get_peft_kwargs
+except:
+    set_weights_and_activate_adapters = None
+    get_peft_kwargs = None
+    pass
+try:
+    from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict
+except:
+    inject_adapter_in_model = None
+    pass
+
 from mmgp import safetensors2
 from mmgp import profile_type
 
-from optimum.quanto import freeze,
+from optimum.quanto import freeze, qfloat8, qint4 , qint8, quantize, QModuleMixin, QTensor, quantize_module
 
 
 
@@ -127,6 +139,9 @@ def move_tensors(obj, device):
         return _list
     else:
         raise TypeError("Tensor or list / dict of tensors expected")
+def _get_module_name(v):
+    return v.__module__.lower()
+
 
 def _compute_verbose_level(level):
     if level <0:
@@ -260,9 +275,16 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma
         if include:
             params_list = params_list + list(sub_module.buffers(recurse=False)) + list(sub_module.parameters(recurse=False))
 
+    # print(f"num params to pin {model_id}: {len(params_list)}")
    for p in params_list:
        if isinstance(p, QTensor):
-
+            if p._qtype == qint4:
+                if hasattr(p,"_scale_shift"):
+                    length = torch.numel(p._data._data) * p._data._data.element_size() + torch.numel(p._scale_shift) * p._scale_shift.element_size()
+                else:
+                    length = torch.numel(p._data._data) * p._data._data.element_size() + torch.numel(p._scale) * p._scale.element_size() + torch.numel(p._shift) * p._shift.element_size()
+            else:
+                length = torch.numel(p._data) * p._data.element_size() + torch.numel(p._scale) * p._scale.element_size()
        else:
            length = torch.numel(p.data) * p.data.element_size()
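The byte accounting added above can be summed up in a small helper. This is only a sketch built from the attribute names visible in this diff (`_data`, `_scale`, `_shift`, `_scale_shift` and `_qtype` are quanto internals used by mmgp, not a public API):

```python
import torch
from optimum.quanto import QTensor, qint4

def tensor_nbytes(p):
    # Mirrors the size computation in the hunk above (illustrative, not mmgp code).
    if isinstance(p, QTensor):
        if p._qtype == qint4:
            packed = torch.numel(p._data._data) * p._data._data.element_size()
            if hasattr(p, "_scale_shift"):  # packed scale+shift variant
                return packed + torch.numel(p._scale_shift) * p._scale_shift.element_size()
            return (packed
                    + torch.numel(p._scale) * p._scale.element_size()
                    + torch.numel(p._shift) * p._shift.element_size())
        # other quantized formats: one data tensor plus one scale tensor
        return (torch.numel(p._data) * p._data.element_size()
                + torch.numel(p._scale) * p._scale.element_size())
    return torch.numel(p.data) * p.data.element_size()
```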
@@ -305,10 +327,22 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma
         if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
             current_big_tensor = big_tensors[big_tensor_no]
             if isinstance(p, QTensor):
-
-
-
-
+                if p._qtype == qint4:
+                    length1 = torch.numel(p._data._data) * p._data._data.element_size()
+                    p._data._data = _move_to_pinned_tensor(p._data._data, current_big_tensor, offset, length1)
+                    if hasattr(p,"_scale_shift"):
+                        length2 = torch.numel(p._scale_shift) * p._scale_shift.element_size()
+                        p._scale_shift = _move_to_pinned_tensor(p._scale_shift, current_big_tensor, offset + length1, length2)
+                    else:
+                        length2 = torch.numel(p._scale) * p._scale.element_size()
+                        p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
+                        length3 = torch.numel(p._shift) * p._shift.element_size()
+                        p._shift = _move_to_pinned_tensor(p._shift, current_big_tensor, offset + length1 + length2, length3)
+                else:
+                    length1 = torch.numel(p._data) * p._data.element_size()
+                    p._data = _move_to_pinned_tensor(p._data, current_big_tensor, offset, length1)
+                    length2 = torch.numel(p._scale) * p._scale.element_size()
+                    p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
             else:
                 length = torch.numel(p.data) * p.data.element_size()
                 p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
@@ -338,98 +372,6 @@ def _welcome():
     print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.0) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 
-# def _pin_to_memory_sd(model, sd, model_id, partialPinning = False, perc_reserved_mem_max = 0, verboseLevel = 1):
-#     if verboseLevel>=1 :
-#         if partialPinning:
-#             print(f"Partial pinning to reserved RAM of data of file '{model_id}' while loading it")
-#         else:
-#             print(f"Pinning data to reserved RAM of file '{model_id}' while loading it")
-
-#     max_reservable_memory = _get_max_reservable_memory(perc_reserved_mem_max)
-#     if partialPinning:
-#         towers_names, _ = _detect_main_towers(model)
-#         towers_names = [n +"." for n in towers_names]
-
-#     BIG_TENSOR_MAX_SIZE = 2**28 # 256 MB
-#     current_big_tensor_size = 0
-#     big_tensor_no = 0
-#     big_tensors_sizes = []
-#     tensor_map_indexes = []
-#     total_tensor_bytes = 0
-
-#     for k,t in sd.items():
-#         include = True
-#         # if isinstance(p, QTensor):
-#         #     length = torch.numel(p._data) * p._data.element_size() + torch.numel(p._scale) * p._scale.element_size()
-#         # else:
-#         #     length = torch.numel(p.data) * p.data.element_size()
-#         length = torch.numel(t) * t.data.element_size()
-
-#         if partialPinning:
-#             include = any(k.startswith(pre) for pre in towers_names) if partialPinning else True
-
-#         if include:
-#             if current_big_tensor_size + length > BIG_TENSOR_MAX_SIZE:
-#                 big_tensors_sizes.append(current_big_tensor_size)
-#                 current_big_tensor_size = 0
-#                 big_tensor_no += 1
-#             tensor_map_indexes.append((big_tensor_no, current_big_tensor_size, length ))
-#             current_big_tensor_size += length
-#         else:
-#             tensor_map_indexes.append((-1, 0, 0 ))
-#         total_tensor_bytes += length
-
-#     big_tensors_sizes.append(current_big_tensor_size)
-
-#     big_tensors = []
-#     last_big_tensor = 0
-#     total = 0
-
-
-#     for size in big_tensors_sizes:
-#         try:
-#             currrent_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True)
-#             big_tensors.append(currrent_big_tensor)
-#         except:
-#             print(f"Unable to pin more tensors for this model as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
-#             break
-
-#         last_big_tensor += 1
-#         total += size
-
-
-#     tensor_no = 0
-#     for k,t in sd.items():
-#         big_tensor_no, offset, length = tensor_map_indexes[tensor_no]
-#         if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
-#             current_big_tensor = big_tensors[big_tensor_no]
-#             # if isinstance(p, QTensor):
-#             #     length1 = torch.numel(p._data) * p._data.element_size()
-#             #     p._data = _move_to_pinned_tensor(p._data, current_big_tensor, offset, length1)
-#             #     length2 = torch.numel(p._scale) * p._scale.element_size()
-#             #     p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
-#             # else:
-#             #     length = torch.numel(p.data) * p.data.element_size()
-#             #     p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
-#             length = torch.numel(t) * t.data.element_size()
-#             t = _move_to_pinned_tensor(t, current_big_tensor, offset, length)
-#             sd[k] = t
-#         tensor_no += 1
-
-#     global total_pinned_bytes
-#     total_pinned_bytes += total
-
-#     if verboseLevel >=1:
-#         if total_tensor_bytes == total:
-#             print(f"The whole model was pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
-#         else:
-#             print(f"{total/ONE_MB:.2f} MB were pinned to reserved RAM out of {total_tensor_bytes/ONE_MB:.2f} MB")
-
-#     model._already_pinned = True
-
-
-#     return
-
 def _quantize_dirty_hack(model):
     # dirty hack: add a hook on state_dict() to return a fake non quantized state_dict if called by Lora Diffusers initialization functions
     setattr( model, "_real_state_dict", model.state_dict)
@@ -535,10 +477,14 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
     prev_blocks_prefix = None
 
     if hasattr(model_to_quantize, "_quanto_map"):
+        for k, entry in model_to_quantize._quanto_map.items():
+            weights = entry["weights"]
+            print(f"Model '{model_id}' is already quantized to format '{weights}'")
+            return False
         print(f"Model '{model_id}' is already quantized")
         return False
-
-    print(f"Quantization of model '{model_id}' started")
+
+    print(f"Quantization of model '{model_id}' started to format '{weights}'")
 
     for submodule_name, submodule in model_to_quantize.named_modules():
         if isinstance(submodule, QModuleMixin):
@@ -593,38 +539,42 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
     if verboseLevel >=2:
         print(f"Total Excluded {total_excluded/ONE_MB:.1f} MB oF {total_size/ONE_MB:.1f} that is {perc_excluded*100:.2f}%")
     if perc_excluded >= 0.10:
-        print(f"Too many
+        print(f"Too many modules are excluded, there is something wrong with the selection, switch back to full quantization.")
         exclude_list = None
 
 
     #quantize(model_to_quantize,weights, exclude= exclude_list)
-
+
     for name, m in model_to_quantize.named_modules():
         if exclude_list is None or not any( name == module_name for module_name in exclude_list):
             _quantize_submodule(model_to_quantize, name, m, weights=weights, activations=None, optimizer=None)
 
-    # force read non quantized parameters so that their lazy tensors and corresponding mmap are released
-    # otherwise we may end up
-
-
-    for name, m in model_to_quantize.named_modules():
+    # force to read non quantized parameters so that their lazy tensors and corresponding mmap are released
+    # otherwise we may end up keeping in memory both the quantized and the non quantize model
+    for m in model_to_quantize.modules():
         # do not read quantized weights (detected them directly or behind an adapter)
-        if isinstance(m, QModuleMixin) or hasattr(m, "base_layer") and isinstance(m.base_layer, QModuleMixin):
-
+        if isinstance(m, QModuleMixin) or hasattr(m, "base_layer") and isinstance(m.base_layer, QModuleMixin):
+            if hasattr(m, "bias") and m.bias is not None:
+                m.bias.data = m.bias.data + 0
         else:
-
-
+            for n, p in m.named_parameters(recurse = False):
+                data = getattr(m, n)
+                setattr(m,n, torch.nn.Parameter(data + 0 ) )
 
-
-
+            for b in m.buffers(recurse = False):
+                b.data = b.data + 0
 
 
+
     freeze(model_to_quantize)
     torch.cuda.empty_cache()
     gc.collect()
     quantization_map = _quantization_map(model_to_quantize)
     model_to_quantize._quanto_map = quantization_map
 
+    if hasattr(model_to_quantize, "_already_pinned"):
+        delattr(model_to_quantize, "_already_pinned")
+
     _quantize_dirty_hack(model_to_quantize)
 
     print(f"Quantization of model '{model_id}' done")
@@ -683,15 +633,25 @@ class offload:
 
         for k,p in submodule.named_parameters(recurse=False):
             if isinstance(p, QTensor):
-                blocks_params.append( (submodule, k, p
-
-
+                blocks_params.append( (submodule, k, p ) )
+
+                if p._qtype == qint4:
+                    if hasattr(p,"_scale_shift"):
+                        blocks_params_size += torch.numel(p._scale_shift) * p._scale_shift.element_size()
+                        blocks_params_size += torch.numel(p._data._data) * p._data._data.element_size()
+                    else:
+                        blocks_params_size += torch.numel(p._scale) * p._scale.element_size()
+                        blocks_params_size += torch.numel(p._shift) * p._shift.element_size()
+                        blocks_params_size += torch.numel(p._data._data) * p._data._data.element_size()
+                else:
+                    blocks_params_size += torch.numel(p._scale) * p._scale.element_size()
+                    blocks_params_size += torch.numel(p._data) * p._data.element_size()
             else:
-                blocks_params.append( (submodule, k, p
-                blocks_params_size += p.data.
+                blocks_params.append( (submodule, k, p ) )
+                blocks_params_size += torch.numel(p.data) * p.data.element_size()
 
         for k, p in submodule.named_buffers(recurse=False):
-            blocks_params.append( (submodule, k, p
+            blocks_params.append( (submodule, k, p) )
            blocks_params_size += p.data.nbytes
 
 
@@ -709,34 +669,28 @@ class offload:
             return False
         return True
 
-
+
+    def gpu_load_blocks(self, model_id, blocks_name):
         # cl = clock.start()
 
         if blocks_name != None:
             self.loaded_blocks[model_id] = blocks_name
 
         entry_name = model_id if blocks_name is None else model_id + "/" + blocks_name
-
-        def cpu_to_gpu(stream_to_use, blocks_params
+
+        def cpu_to_gpu(stream_to_use, blocks_params): #, record_for_stream = None
             with torch.cuda.stream(stream_to_use):
                 for param in blocks_params:
-                    parent_module, n,
-
-
-
-
-
-
-
-                    else:
-
-
-                    if record_for_stream != None:
-                        if isinstance(p, QTensor):
-                            q._data.record_stream(record_for_stream)
-                            q._scale.record_stream(record_for_stream)
-                        else:
-                            p.data.record_stream(record_for_stream)
+                    parent_module, n, p = param
+                    q = p.to("cuda", non_blocking=True)
+                    q = torch.nn.Parameter(q , requires_grad=False)
+                    setattr(parent_module, n , q)
+                    # if record_for_stream != None:
+                    #     if isinstance(p, QTensor):
+                    #         q._data.record_stream(record_for_stream)
+                    #         q._scale.record_stream(record_for_stream)
+                    #     else:
+                    #         p.data.record_stream(record_for_stream)
 
 
         if self.verboseLevel >=2:
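The rewritten `cpu_to_gpu` above boils down to a standard CUDA-stream copy pattern: move the tensor with `non_blocking=True` on a side stream, then rebind it on the parent module. A minimal, self-contained illustration (the `layer` stand-in is hypothetical; a CUDA device and pinned host memory are needed for the copy to actually overlap with compute):

```python
import torch

if torch.cuda.is_available():
    copy_stream = torch.cuda.Stream()
    layer = torch.nn.Linear(64, 64)                       # stand-in for one offloaded block
    layer.weight.data = layer.weight.data.pin_memory()    # pinned RAM enables true async copies
    with torch.cuda.stream(copy_stream):
        q = layer.weight.to("cuda", non_blocking=True)    # same call as in cpu_to_gpu above
        layer.weight = torch.nn.Parameter(q, requires_grad=False)
    torch.cuda.current_stream().wait_stream(copy_stream)  # sync before the weight is used
```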
@@ -775,19 +729,10 @@ class offload:
             print(f"Unloading model {blocks_name} ({model_name}) from GPU")
 
         blocks_params = self.blocks_of_modules[blocks_name]
-
         for param in blocks_params:
-            parent_module, n,
-
-
-                # need to change the parameter directly from the module as it can't be swapped in place due to a memory leak in the pytorch compiler
-                q = WeightQBytesTensor.create(p.qtype, p.axis, p.size(), p.stride(), data, scale, activation_qtype=p.activation_qtype, requires_grad=p.requires_grad )
-                q = torch.nn.Parameter(q , requires_grad=False)
-                setattr(parent_module, n , q)
-                del p
-            else:
-                p.data = data
-
+            parent_module, n, p = param
+            q = torch.nn.Parameter(p , requires_grad=False)
+            setattr(parent_module, n , q)
         # cl.stop()
         # print(f"unload time: {cl.format_time_gap()}")
 
@@ -823,8 +768,8 @@ class offload:
             if torch.is_tensor(arg):
                 if arg.dtype == torch.float32:
                     arg = arg.to(torch.bfloat16).cuda(non_blocking=True)
-
-                    arg = arg.cuda(non_blocking=True)
+                elif not arg.is_cuda:
+                    arg = arg.cuda(non_blocking=True)
             new_args.append(arg)
 
         for k in kwargs:
@@ -832,7 +777,7 @@ class offload:
             if torch.is_tensor(arg):
                 if arg.dtype == torch.float32:
                     arg = arg.to(torch.bfloat16).cuda(non_blocking=True)
-
+                elif not arg.is_cuda:
                     arg = arg.cuda(non_blocking=True)
             new_kwargs[k]= arg
 
@@ -896,6 +841,10 @@ class offload:
 
     def hook_check_empty_cache_needed(self, target_module, model_id,blocks_name, previous_method, context):
 
+        qint4quantization = isinstance(target_module, QModuleMixin) and target_module.weight!= None and target_module.weight.qtype == qint4
+        if qint4quantization:
+            pass
+
         def check_empty_cuda_cache(module, *args, **kwargs):
             # if self.ready_to_check_mem():
             #     self.empty_cache_if_needed()
@@ -911,6 +860,8 @@ class offload:
                     self.empty_cache_if_needed()
                 self.loaded_blocks[model_id] = blocks_name
                 self.gpu_load_blocks(model_id, blocks_name)
+            if qint4quantization:
+                args, kwargs = self.move_args_to_gpu(*args, **kwargs)
 
             return previous_method(*args, **kwargs)
 
@@ -969,11 +920,111 @@ class offload:
         # for module in parent_module.components.items():
         #     self.unhook_module(module)
 
-
+import torch
+
+
+
+
+def load_loras_into_model(model, lora_path, lora_multi = None, verboseLevel = -1):
+    verboseLevel = _compute_verbose_level(verboseLevel)
+
+    if inject_adapter_in_model == None or set_weights_and_activate_adapters == None or get_peft_kwargs == None:
+        raise Exception("Unable to load Lora, missing 'peft' and / or 'diffusers' modules")
+
+    if not isinstance(lora_path, list):
+        lora_path = [lora_path]
+
+    if lora_multi is None:
+        lora_multi = [1. for _ in lora_path]
+
+    for i, path in enumerate(lora_path):
+        adapter_name = str(i)
+
+        state_dict = safetensors2.torch_load_file(path)
+
+        keys = list(state_dict.keys())
+        if len(keys) == 0:
+            raise Exception(f"Empty Lora '{path}'")
+
+
+        network_alphas = {}
+        for k in keys:
+            if "alpha" in k:
+                alpha_value = state_dict.pop(k)
+                if not ( (torch.is_tensor(alpha_value) and torch.is_floating_point(alpha_value)) or isinstance(
+                    alpha_value, float
+                )):
+                    network_alphas[k] = torch.tensor( float(alpha_value.item() ) )
+
+        pos = keys[0].find(".")
+        prefix = keys[0][0:pos]
+        if not any( prefix.startswith(some_prefix) for some_prefix in ["diffusion_model", "transformer"]):
+            msg = f"No compatible weight was found in Lora file '{path}'. Please check that it is compatible with the Diffusers format."
+            raise Exception(msg)
+
+        transformer = model
+
+        transformer_keys = [k for k in keys if k.startswith(prefix)]
+        state_dict = {
+            k.replace(f"{prefix}.", ""): v for k, v in state_dict.items() if k in transformer_keys
+        }
+
+        sd_keys = state_dict.keys()
+        if len(sd_keys) == 0:
+            print(f"No compatible weight was found in Lora file '{path}'. Please check that it is compatible with the Diffusers format.")
+            return
+
+        # is_correct_format = all("lora" in key for key in state_dict.keys())
+
+
+
+
+        # check with first key if is not in peft format
+        # first_key = next(iter(state_dict.keys()))
+        # if "lora_A" not in first_key:
+        #     state_dict = convert_unet_state_dict_to_peft(state_dict)
+
+        if adapter_name in getattr(transformer, "peft_config", {}):
+            raise ValueError(
+                f"Adapter name {adapter_name} already in use in the transformer - please select a new adapter name."
+            )
+
+        rank = {}
+        for key, val in state_dict.items():
+            if "lora_B" in key:
+                rank[key] = val.shape[1]
+
+        if network_alphas is not None and len(network_alphas) >= 1:
+            alpha_keys = [k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix]
+            network_alphas = {k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys}
+
+        lora_config_kwargs = get_peft_kwargs(rank, network_alpha_dict=network_alphas, peft_state_dict=state_dict)
+
+        lora_config = LoraConfig(**lora_config_kwargs)
+        peft_kwargs = {}
+        peft_kwargs["low_cpu_mem_usage"] = True
+        inject_adapter_in_model(lora_config, model, adapter_name=adapter_name, **peft_kwargs)
+
+        incompatible_keys = set_peft_model_state_dict(transformer, state_dict, adapter_name, **peft_kwargs)
+
+        warn_msg = ""
+        if incompatible_keys is not None:
+            # Check only for unexpected keys.
+            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+            if unexpected_keys:
+                pass
+        if verboseLevel >=1:
+            print(f"Lora '{path}' was loaded in model '{_get_module_name(model)}'")
+    set_weights_and_activate_adapters(model,[ str(i) for i in range(len(lora_multi))], lora_multi)
+
+
+def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, verboseLevel = -1):
     """
     quick version of .LoadfromPretrained of the transformers library
     used to build a model and load the corresponding weights (quantized or not)
     """
+
+
     import os.path
     from accelerate import init_empty_weights
 
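For context, here is how the new Lora loader above would typically be called. This is a hedged sketch: the file names are placeholders, `pipe` stands for a previously built pipeline, and the `offload.load_loras_into_model` import path is assumed from the README excerpt further down; `peft` and `diffusers` must be installed or the function raises.

```python
from mmgp import offload

# Two Loras in diffusers format, blended with different strengths (placeholder paths).
offload.load_loras_into_model(
    pipe.transformer,                                        # model the adapters are injected into
    ["style_lora.safetensors", "detail_lora.safetensors"],
    lora_multi=[1.0, 0.7],
    verboseLevel=1,
)
```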
@@ -1037,13 +1088,13 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizat
 
     model._config = transformer_config
 
-    load_model_data(model,model_path, do_quantize = do_quantize,
+    load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, verboseLevel=verboseLevel )
 
     return model
 
 
 
-def load_model_data(model, file_path: str, do_quantize = False,
+def load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, verboseLevel = -1):
     """
     Load a model, detect if it has been previously quantized using quanto and do the extra setup if necessary
     """
@@ -1064,14 +1115,6 @@ def load_model_data(model, file_path: str, do_quantize = False, quantization_typ
         state_dict = state_dict["module"]
     else:
         state_dict, metadata = _safetensors_load_file(file_path)
-
-
-        # if pinToMemory:
-        #     _pin_to_memory_sd(model,state_dict, file_path, partialPinning = partialPinning, perc_reserved_mem_max = perc_reserved_mem_max, verboseLevel = verboseLevel)
-
-        # with safetensors2.safe_open(file_path) as f:
-        #     metadata = f.metadata()
-
 
     if metadata is None:
         quantization_map = None
@@ -1106,7 +1149,7 @@ def load_model_data(model, file_path: str, do_quantize = False, quantization_typ
 
     if do_quantize:
         if quantization_map is None:
-            if _quantize(model,
+            if _quantize(model, quantizationType, verboseLevel=verboseLevel, model_id=file_path):
                 quantization_map = model._quanto_map
             else:
                 if verboseLevel >=1:
@@ -1117,7 +1160,7 @@ def load_model_data(model, file_path: str, do_quantize = False, quantization_typ
 
     return
 
-def save_model(model, file_path, do_quantize = False,
+def save_model(model, file_path, do_quantize = False, quantizationType = qint8, verboseLevel = -1 ):
     """save the weights of a model and quantize them if requested
     These weights can be loaded again using 'load_model_data'
     """
@@ -1144,7 +1187,7 @@ def save_model(model, file_path, do_quantize = False, quantization_type = qint8,
     config= json.loads(text)
 
     if do_quantize:
-        _quantize(model, weights=
+        _quantize(model, weights=quantizationType, model_id=file_path)
 
     quantization_map = getattr(model, "_quanto_map", None)
 
@@ -1157,7 +1200,7 @@ def save_model(model, file_path, do_quantize = False, quantization_type = qint8,
 
 
 
-def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True, extraModelsToQuantize = None, budgets= 0, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, verboseLevel = -1):
+def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, verboseLevel = -1):
     """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
     pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
     quantizeTransformer: set True by default will quantize on the fly the video / image model
@@ -1235,6 +1278,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
         self.anyCompiledModule = compileAllModels or len(modelsToCompile)>0
         if self.anyCompiledModule:
             torch._dynamo.config.cache_size_limit = 10000
+            torch.compiler.reset()
+
         # torch._logging.set_logs(recompiles=True)
         # torch._inductor.config.realize_opcount_threshold = 100 # workaround bug "AssertionError: increase TRITON_MAX_BLOCK['X'] to 4096."
 
@@ -1249,19 +1294,31 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
 
             # if the model has just been quantized so there is no need to quantize it again
             if model_id in models_to_quantize:
-                _quantize(current_model, weights=
+                _quantize(current_model, weights=quantizationType, verboseLevel = self.verboseLevel, model_id=model_id)
 
             modelPinned = (pinAllModels or model_id in modelsToPin) and not hasattr(current_model,"_already_pinned")
 
-            current_model_size = 0
-
-            for p in current_model.
+            current_model_size = 0
+
+            for n, p in current_model.named_parameters():
+                p.requires_grad = False
+                p = p.detach()
                 if isinstance(p, QTensor):
                     # # fix quanto bug (seems to have been fixed)
                     # if not modelPinned and p._scale.dtype == torch.float32:
                     #     p._scale = p._scale.to(torch.bfloat16)
-
-
+                    if p._qtype == qint4:
+                        if hasattr(p,"_scale_shift"):
+                            current_model_size += torch.numel(p._scale_shift) * p._scale_shift.element_size()
+                        else:
+                            current_model_size += torch.numel(p._scale) * p._shift.element_size() + torch.numel(p._scale) * p._shift.element_size()
+
+                        current_model_size += torch.numel(p._data._data) * p._data._data.element_size()
+
+                    else:
+                        current_model_size += torch.numel(p._scale) * p._scale.element_size()
+                        current_model_size += torch.numel(p._data) * p._data.element_size()
+
                 else:
                     if p.data.dtype == torch.float32:
                         # convert any left overs float32 weight to bloat16 to divide by 2 the model memory footprint
@@ -1269,7 +1326,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
                     current_model_size += torch.numel(p.data) * p.data.element_size()
 
             for b in current_model.buffers():
-                if b.data.dtype == torch.float32:
+                if b.data.dtype == torch.float32:
                     # convert any left overs float32 weight to bloat16 to divide by 2 the model memory footprint
                     b.data = b.data.to(torch.bfloat16)
                 current_model_size += torch.numel(b.data) * b.data.element_size()
@@ -1305,7 +1362,6 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
                 print(f"Potential iterative blocks found in model '{model_id}':{towers_names}")
             # compile main iterative modules stacks ("towers")
             if compileAllModels or model_id in modelsToCompile :
-                #torch.compiler.reset()
                 if self.verboseLevel>=1:
                     print(f"Pytorch compilation of model '{model_id}' is scheduled.")
                 for tower in towers_modules:
@@ -1313,6 +1369,13 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
                         submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode="default" ) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
                         #dynamic=True,
 
+            if pinAllModels or model_id in modelsToPin:
+                if hasattr(current_model,"_already_pinned"):
+                    if self.verboseLevel >=1:
+                        print(f"Model '{model_id}' already pinned to reserved memory")
+                else:
+                    _pin_to_memory(current_model, model_id, partialPinning= partialPinning, perc_reserved_mem_max=perc_reserved_mem_max, verboseLevel=verboseLevel)
+
             for submodule_name, submodule in current_model.named_modules():
                 # create a fake 'accelerate' parameter so that the _execution_device property returns always "cuda"
                 # (it is queried in many pipelines even if offloading is not properly implemented)
@@ -1358,12 +1421,6 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
 
                 current_size = self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name)
 
-                if pinAllModels or model_id in modelsToPin:
-                    if hasattr(current_model,"_already_pinned"):
-                        if self.verboseLevel >=1:
-                            print(f"Model '{model_id}' already pinned to reserved memory")
-                    else:
-                        _pin_to_memory(current_model, model_id, partialPinning= partialPinning, perc_reserved_mem_max=perc_reserved_mem_max, verboseLevel=verboseLevel)
 
 
 
@@ -1402,7 +1459,7 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
         modules= modules.components
 
     modules = {k: _remove_model_wrapper(v) for k, v in modules.items() if isinstance(v, torch.nn.Module)}
-    module_names = {k: v
+    module_names = {k: _get_module_name(v) for k, v in modules.items() }
 
     default_extraModelsToQuantize = []
     quantizeTransformer = True
@@ -1422,6 +1479,7 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
 
     default_budgets = { "transformer" : 600 , "text_encoder": 3000, "text_encoder_2": 3000 }
     extraModelsToQuantize = None
+    asyncTransfers = True
 
     if profile_no == profile_type.HighRAM_HighVRAM:
         pinnedMemory= True
@@ -1439,7 +1497,6 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
         pinnedMemory= "transformer"
         extraModelsToQuantize = default_extraModelsToQuantize
         budgets=default_budgets
-        asyncTransfers = True
         info = "You have chosen a profile that requires at least 32 GB of RAM and 12 GB of VRAM. Some RAM is consumed to reduce VRAM consumption. "
     elif profile_no == profile_type.VerylowRAM_LowVRAM:
         pinnedMemory= False
mmgp/safetensors2.py
CHANGED
@@ -155,7 +155,7 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):
             torch.bool : 'BOOL' , torch.float64 : 'F64' , torch.float32 : 'F32' , torch.float16 : 'F16', torch.float8_e5m2 : "F8_E5M2", torch.float8_e4m3fn: "F8_E4M3" }
     pos = 0
     i = 0
-    mx =
+    mx = 100000
     for k , t in sd.items():
         entry = {}
         dtypestr= map[t.dtype]
@@ -186,8 +186,6 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):
 
     length_of_header_bytes = struct.pack('<Q', size_header)
 
-    empty_tensor = b'\x80\x3f'
-
     with open(file_path, "wb") as writer:
         bytes_written = writer.write(length_of_header_bytes)
         bytes_written = writer.write(header_bytes)
@@ -195,12 +193,20 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):
         i = 0
         for k , t in sd.items():
             size = torch.numel(t) * t.element_size()
-            if
-
-
-
-
-
+            if size != 0:
+                if len(t.shape) == 0:
+                    dtype = t.dtype
+                    # convert in a friendly format, scalars types not supported by numpy
+                    if dtype == torch.bfloat16:
+                        t = t.view(torch.uint16)
+                    elif dtype == torch.float8_e5m2 or dtype == torch.float8_e4m3fn:
+                        t = t.view(torch.uint8)
+                    buffer = t.numpy().tobytes()
+                else:
+                    buffer = t.view(torch.uint8).numpy().tobytes()
+                bytes_written = writer.write(buffer)
+                assert bytes_written == size
+
             i+=1
             if i==mx:
                 break
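The scalar branch added above works around the fact that numpy has no bfloat16 or float8 dtypes: the tensor is reinterpreted as an integer type of the same width before `.numpy()` is called. A tiny standalone check of that trick (illustrative only):

```python
import torch

t = torch.tensor(1.5, dtype=torch.bfloat16)    # 0-dim scalar, a dtype numpy cannot represent
raw = t.view(torch.uint16).numpy().tobytes()   # reinterpret as a 2-byte integer first
assert len(raw) == torch.numel(t) * t.element_size()
```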
@@ -208,7 +214,7 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):
 class SafeTensorFile:
     """Main class for accessing safetensors files that provides memory-efficient access"""
 
-    def __init__(self, file_path, metadata, catalog, skip_bytes):
+    def __init__(self, file_path, metadata, catalog, skip_bytes, lazy_loading = True):
         self._file_path = file_path
         self._metadata = metadata
         self._catalog = catalog
@@ -216,20 +222,30 @@ class SafeTensorFile:
         self._keys = None
         self.sd = None
         self.mtracker = None
+        self.lazy_loading = lazy_loading
 
     @classmethod
-    def load_metadata(cls, file_path):
+    def load_metadata(cls, file_path, lazy_loading = True):
         with open(file_path, 'rb') as f:
             catalog, metadata, skip_bytes = _read_safetensors_header(file_path, f)
 
-        return cls(file_path, metadata, catalog, skip_bytes)
+        return cls(file_path, metadata, catalog, skip_bytes, lazy_loading)
 
-    def init_tensors(self):
+    def init_tensors(self, lazyTensors = True):
         if self.sd is None:
-            self.
+            self.lazy_loading = lazyTensors
+            if lazyTensors:
+                self.sd = self.create_tensors_with_mmap()
+            else:
+                self.sd = self.create_tensors_without_mmap()
+        # else:
+        #     if not self.lazy_loading and lazyTensors:
+        #         raise Exception("Every tensor should be either lazy loaded or not lazy loaded")
+
         return self.sd
 
-
+
+    def create_tensors_with_mmap(self):
 
         self.mtracker = MmapTracker(self._file_path)
         import mmap
@@ -282,7 +298,12 @@ class SafeTensorFile:
             map_idx = next(iter_tensor_no)
             offset = current_pos - maps[map_idx][1]
             if len(shape) == 0:
-
+                if length == 0:
+                    t = torch.empty(0, dtype=dtype)
+                else:
+                    # don't waste a memory view for a scalar
+                    t = torch.frombuffer(bytearray(maps[map_idx][0][offset:offset + length]), dtype=torch.uint8)
+                    t = t.view(dtype)
             else:
                 mv = memoryview(maps[map_idx][0])[offset:offset + length]
                 t = torch.frombuffer(mv, dtype=dtype)
@@ -293,8 +314,33 @@ class SafeTensorFile:
 
         return sd
 
+    def create_tensors_without_mmap(self):
+        sd = OrderedDict()
+
+        with open(self._file_path, 'rb') as f:
+            f.seek(self._skip_bytes, 0)
+            for k,v in self._catalog.items():
+                dtypestr = v["dtype"]
+                dtype= _map_to_dtype[dtypestr]
+                shape = v["shape"]
+                data_offsets = v["data_offsets"]
+                length = data_offsets[1]-data_offsets[0]
+                buffer = f.read(length)
+                if len(shape) == 0:
+                    if length == 0:
+                        t = torch.empty(0, dtype=dtype)
+                    else:
+                        t = torch.frombuffer(bytearray(buffer), dtype=torch.uint8)
+                        t = t.view(dtype)
+                else:
+                    t = torch.frombuffer(bytearray(buffer), dtype=dtype)
+                    t = torch.reshape(t, shape)
+                sd[k] = t
+        return sd
+
     def get_tensor(self, name: str) -> torch.tensor:
         """Get a tensor by name"""
+        # To do : switch to a JIT tensor creation per tensor
         self.init_tensors()
         return self.sd[name]
 
@@ -310,7 +356,7 @@ class SafeTensorFile:
 
     def tensors(self) -> Dict[str, torch.tensor]:
         """Get dictionary of all tensors"""
-        self.init_tensors()
+        self.init_tensors(self.lazy_loading)
         return self.sd
 
     def metadata(self) -> Optional[Dict[str, str]]:
@@ -319,7 +365,7 @@ class SafeTensorFile:
 
     def __len__(self) -> int:
         """Get number of tensors"""
-        self.init_tensors()
+        self.init_tensors(self.lazy_loading)
         return len(self.keys())
 
     def __contains__(self, key: str) -> bool:
@@ -337,10 +383,9 @@ class SafeTensorFile:
 class _SafeTensorLoader:
     """Context manager for loading SafeTensorFile"""
 
-    def __init__(self, filename: str):
+    def __init__(self, filename: str ):
         self.filename = Path(filename)
         self.sft = None
-
         if not self.filename.exists():
             raise FileNotFoundError(f"File not found: {filename}")
 
@@ -367,7 +412,6 @@ class _SafeTensorLoader:
 
 def safe_open(filename: str, framework: str = "pt",device = "cpu") -> _SafeTensorLoader:
     if device != "cpu" or framework !="pt":
-        pass
         return _old_safe_open(filename =filename, framework=framework, device=device)
     return _SafeTensorLoader(filename)
 
{mmgp-3.0.1.dist-info → mmgp-3.0.9.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.2
 Name: mmgp
-Version: 3.0.
+Version: 3.0.9
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -13,10 +13,11 @@ Requires-Dist: optimum-quanto
 Requires-Dist: accelerate
 Requires-Dist: safetensors
 Requires-Dist: psutil
+Requires-Dist: peft
 
 
 <p align="center">
-<H2>Memory Management 3.0 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.0.9 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
@@ -38,8 +39,9 @@ Each profile may use a combination of the following:
 - Ability to pin models to reserved RAM to accelerate transfers to VRAM
 - Async transfers to VRAM to avoid a pause when loading a new slice of a model
 - Automated on the fly quantization or ability to load pre quantized models
--
-
+- Pretrained Lora support with low RAM requirements
+- Support for pytorch compilation on Linux and WSL (supported on pure Windows but requires a complex Triton installation).
+-
 ## Installation
 First you need to install the module in your current project with:
 ```shell
@@ -69,7 +71,8 @@ You can choose between 5 profiles depending on your hardware:
 - VerylowRAM_LowVRAM (5): at least 24 GB of RAM and 10 GB of VRAM : if you don't have much it won't be fast but maybe it will work
 
 Profile 2 (High RAM) and 4 (Low RAM)are the most recommended profiles since they are versatile (support for long videos for a slight performance cost).\
-
+If you use a Flux derived application, profiles 1 and 3 will offer much faster generation times.
+In any case, a safe approach is to start from profile 5 (default profile) and then go down progressively to profile 4 and then to profile 2 as long as the app remains responsive or doesn't trigger any out of memory error.
 
 By default the 'transformer' will be quantized to 8 bits for all profiles. If you don't want that you may specify the optional parameter *quantizeTransformer = False*.
 
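A minimal sketch of how the profiles described above are applied, using only the names visible in this diff (`offload.profile`, `profile_type`, `quantizeTransformer`, `verboseLevel`); `pipe` is a placeholder for an already-constructed pipeline or dict of modules:

```python
from mmgp import offload, profile_type   # the README below recommends making this the first import

# pipe = SomePipeline.from_pretrained(...)   # hypothetical diffusers-style pipeline
offload.profile(pipe, profile_type.VerylowRAM_LowVRAM,
                quantizeTransformer=False,   # override the profile default
                verboseLevel=2)
```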
@@ -80,6 +83,9 @@ Every parameter set automatically by a profile can be overridden with one or mul
 ```
 If you want to know which parameter are set by one specific profile you can use the parameter *verboseLevel=2*
 
+**It is highly recommended to put the *from mmgp import offload, profile_type* at the top of your main python file (that is as the first import) so that all the existing safetensors calls are redirected to mmgp.**
+
+
 ## Alternatively you may want to create your own profile with specific parameters:
 
 For example:
@@ -101,20 +107,22 @@ If you are short on RAM and plan to work with quantized models, it is recommende
 ## Going further
 
 The module includes several tools to package a light version of your favorite video / image generator:
-- *save_model(model, file_path, do_quantize = False,
+- *save_model(model, file_path, do_quantize = False, quantizationType = qint8 )*\
 Save tensors of a model already loaded in memory in a safetensor format (much faster to reload). You can save it in a quantized format (default qint8 quantization recommended).
 The resulting safetensor file will contain extra fields in its metadata such as the quantization map and its configuration, so you will be able to move the file around without files such as *config.json* or *file_map.json*.
 You will need *load_model_data* or *fast_load_transformers_model* to read the file again . You may also load it using the default *safetensor* librar however you will need to provide in the same directory any complementary file that are usually requested (for instance *config.json*)
 
-- *load_model_data(model, file_path: str, do_quantize = False,
+- *load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToRAM = False, partialPin = False)*\
 Load the tensors data of a model in RAM of a model already initialized with no data. Detect and handle quantized models saved previously with *save_model*.A model can also be quantized on the fly while being loaded. The model which is loaded can be pinned to RAM while it is loaded, this is more RAM efficient than pinning tensors later using *offline.all* or *offline.profile*
 
-- *fast_load_transformers_model(model_path: str, do_quantize = False,
+- *fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToRAM = False, partialPin = False)*\
 Initialize (build the model hierarchy in memory) and fast load the corresponding tensors of a 'transformers' or 'diffusers' library model.
 The advantages over the original *from_pretrained* method is that a full model can fit into a single file with a filename of your choosing (thefore you can have multiple 'transformers' versions of the same model in the same directory) and prequantized models are processed in a transparent way.
 Last but not least, you can also on the fly pin to RAM the whole model or the most important part of it (partialPin = True) in a more efficient way (faster and requires less RAM) than if you did through *offload.all* or *offload.profile*.
 
-
+- *load_loras_into_model(model, lora_path, lora_multi)*\
+Load into a model a list of Loras described by a list of paths *lora_path* and a list of weight coefficients *lora_multi*.
+The Lora files must be in the *diffusers* format. This function also works on non diffusers models. However, if there is already official Lora support for a model it is recommended to use the official diffusers functions.
 
 The typical workflow wil be:
 1) temporarly insert the *save_model* function just after a model has been fully loaded to save a copy of the model / quantized model.
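To make the "Going further" workflow above concrete, a hedged sketch of the save/reload round trip (the `model` object, module paths and file names are placeholders; parameter names follow the function signatures in this diff, which use `pinToMemory`/`partialPinning` even though the README text above says `pinToRAM`/`partialPin`):

```python
from mmgp import offload
from optimum.quanto import qint8

# One-off: persist an already-loaded model as a single quantized safetensors file.
offload.save_model(model, "ckpts/transformer-qint8.safetensors",
                   do_quantize=True, quantizationType=qint8)

# Later runs: rebuild the empty model, then reload (and optionally pin) the weights.
offload.load_model_data(model, "ckpts/transformer-qint8.safetensors",
                        pinToMemory=True, partialPinning=True)
```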
mmgp-3.0.9.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+mmgp/offload.py,sha256=bYjpbAHbVX2Vf3nBJXYEc1u9B5JIYvJxv4eMS8L5Tco,64209
+mmgp/safetensors2.py,sha256=G6uzvpGauJLPEvN74MX1ib4YK0E4wzNMyrZO5wOX2k0,15812
+mmgp-3.0.9.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+mmgp-3.0.9.dist-info/METADATA,sha256=0vNt8lNKfMkyBrFUN8pOfkDRf8i_jmndgH2ePIekmdg,12570
+mmgp-3.0.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+mmgp-3.0.9.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+mmgp-3.0.9.dist-info/RECORD,,
mmgp-3.0.1.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
-mmgp/offload.py,sha256=T9RBAibAyAnKV-8AiYmop_UOGl_N1l5EJo5ucCZfxK8,61611
-mmgp/safetensors2.py,sha256=CSv8HdrjURUzBazpaBDU1WNwUL1lhzpCyzG0GWygbGE,13602
-mmgp-3.0.1.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
-mmgp-3.0.1.dist-info/METADATA,sha256=uSsBc5pBaYBL4Ek3TR99J9hP7AQQlwnnUM_JQlkNwbE,11765
-mmgp-3.0.1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-mmgp-3.0.1.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
-mmgp-3.0.1.dist-info/RECORD,,
{mmgp-3.0.1.dist-info → mmgp-3.0.9.dist-info}/LICENSE.md
File without changes
{mmgp-3.0.1.dist-info → mmgp-3.0.9.dist-info}/top_level.txt
File without changes