mmgp 3.3.4-py3-none-any.whl → 3.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mmgp/offload.py +84 -48
- {mmgp-3.3.4.dist-info → mmgp-3.4.0.dist-info}/METADATA +197 -197
- mmgp-3.4.0.dist-info/RECORD +9 -0
- {mmgp-3.3.4.dist-info → mmgp-3.4.0.dist-info}/WHEEL +1 -1
- {mmgp-3.3.4.dist-info → mmgp-3.4.0.dist-info}/licenses/LICENSE.md +1 -1
- mmgp-3.3.4.dist-info/RECORD +0 -9
- {mmgp-3.3.4.dist-info → mmgp-3.4.0.dist-info}/top_level.txt +0 -0
mmgp/offload.py
CHANGED
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.
+# ------------------ Memory Management 3.4.0 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -89,7 +89,7 @@ mmm = safetensors2.mmm
 default_verboseLevel = 1

 ONE_MB = 1048576
-
+sizeofhalffloat = torch.bfloat16.itemsize
 sizeofint8 = torch.int8.itemsize
 total_pinned_bytes = 0
 max_pinnable_bytes = 0
@@ -149,10 +149,17 @@ def _compute_verbose_level(level):
     safetensors2.verboseLevel = level
     return level

-def _get_perc_reserved_mem_max(perc_reserved_mem_max):
-    if perc_reserved_mem_max<=0:
+def _get_perc_reserved_mem_max(perc_reserved_mem_max = 0):
+    if perc_reserved_mem_max <=0:
+        perc_reserved_mem_max = os.getenv("perc_reserved_mem_max", 0)
+
+    if perc_reserved_mem_max <= 0:
         perc_reserved_mem_max = 0.40 if os.name == 'nt' else 0.5
-    return
+    return perc_reserved_mem_max
+
+def _get_max_reservable_memory(perc_reserved_mem_max = 0):
+    max_reservable_memory = perc_reserved_mem_max * physical_memory
+    return max_reservable_memory

 def _detect_main_towers(model, min_floors = 5):
     cur_blocks_prefix = None
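The new `_get_perc_reserved_mem_max` / `_get_max_reservable_memory` helpers above centralise how the pinned-RAM budget is resolved: an explicit argument wins, then the `perc_reserved_mem_max` environment variable, then an OS-dependent default (0.40 on Windows, 0.5 elsewhere), and the fraction is finally turned into a byte budget against total physical RAM. A minimal standalone sketch of that resolution order follows; it assumes `physical_memory` is the total RAM reported by psutil (mmgp computes this elsewhere in the module), and the explicit `float()` conversion of the environment variable is an addition for clarity, not part of the diff.

```python
import os
import psutil

# Assumption: mmgp's module-level `physical_memory` is the total RAM in bytes.
physical_memory = psutil.virtual_memory().total

def get_perc_reserved_mem_max(perc_reserved_mem_max: float = 0) -> float:
    """Resolve the maximum fraction of RAM that may be pinned (page-locked)."""
    if perc_reserved_mem_max <= 0:
        # Environment variable fallback introduced in 3.4.0; env values are strings.
        perc_reserved_mem_max = float(os.getenv("perc_reserved_mem_max", 0))
    if perc_reserved_mem_max <= 0:
        # OS-dependent default, matching the diff: 40% on Windows, 50% elsewhere.
        perc_reserved_mem_max = 0.40 if os.name == "nt" else 0.5
    return perc_reserved_mem_max

def get_max_reservable_memory(perc_reserved_mem_max: float = 0) -> float:
    """Turn the fraction into an absolute pinned-RAM budget in bytes."""
    return perc_reserved_mem_max * physical_memory
```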
@@ -214,7 +221,7 @@ def _get_model(model_path):
     _filename = _path[-1]
     _path = _path[:-1]
     if len(_path)<=1:
-        raise("file not found")
+        raise Exception("file not found")
     else:
         try:
             from huggingface_hub import hf_hub_download #snapshot_download,
@@ -290,8 +297,9 @@ def _get_tensor_ref(p):
     return p.data_ptr()


-
-
+BIG_TENSOR_MAX_SIZE = 2**28 # 256 MB
+BIG_TENSOR_MIN_SIZE = 2**26 # 64 MB
+RESERVED_RAM_MIN_AVAILABLE = 2**27 # 128 MB

 def _extract_tie_weights_from_sd(sd , sd_name, verboseLevel =1):
     tied_weights = {}
@@ -322,7 +330,7 @@ def _extract_tie_weights_from_sd(sd , sd_name, verboseLevel =1):

 def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
     global max_pinnable_bytes, total_pinned_bytes
-    if max_pinnable_bytes > 0 and
+    if max_pinnable_bytes > 0 and total_pinned_bytes >= max_pinnable_bytes:

         if verboseLevel>=1 :
             print(f"Unable pin data of '{sd_name}' to reserved RAM as there is no reserved RAM left")
@@ -357,6 +365,12 @@ def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TE
     last_big_tensor = 0
     total = 0

+    try:
+        dummy_pinned_tensor = torch.empty( RESERVED_RAM_MIN_AVAILABLE, dtype= torch.uint8, pin_memory=True, device="cpu")
+    except:
+        print("There isn't any Reserved RAM left, you may need to choose a profile with a higher number that requires less Reserved RAM or set OS env 'perc_reserved_mem_max' to a value less 0.3")
+        return
+
     for size in big_tensors_sizes:
         try:
             current_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True, device="cpu")
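Both pinning paths in 3.4.0 now start by allocating a small "probe" pinned buffer of RESERVED_RAM_MIN_AVAILABLE (128 MB) and bail out with a readable message when even that fails, rather than failing mid-way through the large allocations; the probe is released once the real buffers are in place. A sketch of that probe-before-pinning pattern follows, with a hypothetical `reserved_ram_probe` helper name (mmgp inlines the logic):

```python
import torch

RESERVED_RAM_MIN_AVAILABLE = 2**27  # 128 MB, as defined in the hunk above

def reserved_ram_probe() -> bool:
    """Return True if ~128 MB of page-locked (pinned) CPU RAM can still be allocated."""
    try:
        probe = torch.empty(RESERVED_RAM_MIN_AVAILABLE, dtype=torch.uint8,
                            pin_memory=True, device="cpu")
    except Exception:
        # Mirrors the diff's behaviour: give up on pinning and tell the user to pick
        # a lighter profile or lower the 'perc_reserved_mem_max' environment variable.
        return False
    del probe  # the probe only tests availability; free it right away
    return True
```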
@@ -367,6 +381,7 @@ def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TE

         last_big_tensor += 1
         total += size
+    del dummy_pinned_tensor


     tensor_no = 0
@@ -401,10 +416,10 @@ def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TE
     return


-def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = True, big_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
+def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = True, big_tensor_size = BIG_TENSOR_MAX_SIZE, perc_reserved_mem_max = 0,verboseLevel = 1):

     global max_pinnable_bytes, total_pinned_bytes
-    if max_pinnable_bytes > 0 and
+    if max_pinnable_bytes > 0 and total_pinned_bytes >= max_pinnable_bytes:

         if verboseLevel>=1 :
             print(f"Unable pin data of '{model_id}' to reserved RAM as there is no reserved RAM left")
@@ -414,6 +429,8 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
     towers_names, _ = _detect_main_towers(model)


+    perc_reserved_mem_max = _get_perc_reserved_mem_max(perc_reserved_mem_max)
+    max_reservable_memory = _get_max_reservable_memory(perc_reserved_mem_max)

     current_big_tensor_size = 0
     big_tensor_no = 0
@@ -502,8 +519,12 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru


     failed_planned_allocation = False
-
     gc.collect()
+    try:
+        dummy_pinned_tensor = torch.empty( RESERVED_RAM_MIN_AVAILABLE, dtype= torch.uint8, pin_memory=True, device="cpu")
+    except:
+        print("There isn't any Reserved RAM left, you may need to choose a profile with a higher number that requires less Reserved RAM or set OS env 'perc_reserved_mem_max' to a value less than{perc_reserved_mem_max}")
+        return

     last_allocated_big_tensor = -1
     tensor_no = 0
@@ -530,16 +551,21 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
             big_tensor_no, offset, length = tensor_map_indexes[tensor_no]
             if last_allocated_big_tensor < big_tensor_no:
                 last_allocated_big_tensor += 1
-                size = big_tensors_sizes[last_allocated_big_tensor]
+                size = max(big_tensors_sizes[last_allocated_big_tensor], BIG_TENSOR_MIN_SIZE)
                 try:
-
-
+                    if max_reservable_memory > 0 and ( (total_pinned_bytes + total + size) >= max_reservable_memory):
+                        dummy_pinned_tensor = None
+                        failed_planned_allocation = True
+                        max_pinnable_bytes = total_pinned_bytes + total
+                        break
+
                     current_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True, device="cpu")
                     big_tensors.append(current_big_tensor)
                 except:
-                    print(f"Unable to pin more tensors for this model as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
-
+                    print(f"Unable to pin more tensors for this model as the maximum reservable memory has been reached ({total/ONE_MB:.2f}).")
+                    dummy_pinned_tensor = None
                     failed_planned_allocation = True
+                    max_pinnable_bytes = total_pinned_bytes + total
                     break

                 total += size
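Inside the per-model pinning loop, 3.4.0 adds a second stop condition: before each large pinned buffer is allocated, the running total is compared against the global reservable-memory budget, and pinning stops cleanly (recording `max_pinnable_bytes` so later models skip pinning immediately) instead of waiting for an allocation failure. A sketch of that guard as a hypothetical standalone helper, since mmgp writes the condition inline:

```python
def exceeds_pinning_budget(total_pinned_bytes: int, pinned_this_model: int,
                           next_buffer_size: int, max_reservable_memory: float) -> bool:
    """Guard used before each big pinned allocation in the hunk above;
    a budget of 0 (or less) means no explicit cap was configured."""
    if max_reservable_memory <= 0:
        return False
    return (total_pinned_bytes + pinned_this_model + next_buffer_size) >= max_reservable_memory
```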
@@ -569,6 +595,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
             p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
         tensor_no += 1
         del p
+    del dummy_pinned_tensor
     model._pinned_bytes = total
     total_pinned_bytes += total
     del params_dict
@@ -591,7 +618,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.4.0) by DeepBeepMeep ************{ENDC}{UNBOLD}")

 def _extract_num_from_str(num_in_str):
     size = len(num_in_str)
@@ -728,10 +755,10 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 2*
                 tied_weights[submodule_name]= (n, ) + match
             else:
                 cache_ref[ref] = (submodule_name, n)
-            size += torch.numel(p.data) *
+            size += torch.numel(p.data) * sizeofhalffloat

         for p in submodule.buffers(recurse=False):
-            size += torch.numel(p.data) *
+            size += torch.numel(p.data) * sizeofhalffloat



@@ -853,7 +880,7 @@ def split_linear_modules(model, map ):
             sub_bias = torch.split(bias, split_sizes, dim=0)
             for sub_name, _subdata, _subbias, _subscale in zip(mapped_modules, sub_data, sub_bias, sub_scale):
                 with init_empty_weights():
-                    sub_module = QLinear(_subdata.shape[1], _subdata.shape[0], bias=bias != None, device ="cpu", dtype=
+                    sub_module = QLinear(_subdata.shape[1], _subdata.shape[0], bias=bias != None, device ="cpu", dtype=weight.dtype)
                 sub_module.weight = torch.nn.Parameter(WeightQBytesTensor.create(weight.qtype, weight.axis, _subdata.size(), weight.stride(), _subdata, _subscale, activation_qtype=weight.activation_qtype, requires_grad=weight.requires_grad ))
                 if bias != None:
                     sub_module.bias = torch.nn.Parameter(_subbias)
@@ -866,7 +893,7 @@ def split_linear_modules(model, map ):
             sub_bias = torch.split(bias, split_sizes, dim=0)
             for sub_name, subdata, subbias in zip(mapped_modules, sub_data, sub_bias):
                 with init_empty_weights():
-                    sub_module = torch.nn.Linear( subdata.shape[1], subdata.shape[0], bias=bias != None, device ="cpu", dtype=
+                    sub_module = torch.nn.Linear( subdata.shape[1], subdata.shape[0], bias=bias != None, device ="cpu", dtype=weight.dtype)
                 sub_module.weight = torch.nn.Parameter(subdata , requires_grad=False)
                 if bias != None:
                     sub_module.bias = torch.nn.Parameter(subbias)
@@ -1045,8 +1072,8 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
                 # loras_module_data = dict()
                 # loras_model_data[module] = loras_module_data
             loras_adapter_data = loras_module_data.get(adapter_name, None)
-            lora_A = None if lora_A == None else lora_A.to(
-            lora_B = None if lora_B == None else lora_B.to(
+            lora_A = None if lora_A == None else lora_A.to(module.weight.dtype)
+            lora_B = None if lora_B == None else lora_B.to(module.weight.dtype)
             if loras_adapter_data == None:
                 alpha = lora_alphas.get(k[:-len("lora_X.weight")] + "alpha", 1.)
                 loras_adapter_data = [lora_A, lora_B, alpha]
@@ -1483,6 +1510,7 @@ class offload:
         self.transfer_stream = torch.cuda.Stream()
         self.async_transfers = False
         self.parameters_ref = {}
+        self.max_reservable_memory = 0

         global last_offload_obj
         last_offload_obj = self
@@ -1729,22 +1757,22 @@ class offload:
             gc.collect()
             self.last_reserved_mem_check = time.time()

-    def move_args_to_gpu(self, *args, **kwargs):
+    def move_args_to_gpu(self, dtype, *args, **kwargs):
         new_args= []
         new_kwargs={}
+
         for arg in args:
             if torch.is_tensor(arg):
                 if arg.dtype == torch.float32:
-                    arg = arg.to(
+                    arg = arg.to(dtype).cuda(non_blocking=True)
                 elif not arg.is_cuda:
                     arg = arg.cuda(non_blocking=True)
             new_args.append(arg)
-
         for k in kwargs:
             arg = kwargs[k]
             if torch.is_tensor(arg):
                 if arg.dtype == torch.float32:
-                    arg = arg.to(
+                    arg = arg.to(dtype).cuda(non_blocking=True)
                 elif not arg.is_cuda:
                     arg = arg.cuda(non_blocking=True)
             new_kwargs[k]= arg
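`move_args_to_gpu` now receives the model's working dtype so that stray float32 inputs are cast to it before being sent to CUDA, instead of being cast to a hard-coded type. A condensed, self-contained sketch of the behaviour the hunk shows (the real method lives on the `offload` class and takes `self`):

```python
import torch

def move_args_to_gpu(dtype, *args, **kwargs):
    """Cast float32 tensors to the model dtype and move CPU tensors to the GPU."""
    def convert(arg):
        if torch.is_tensor(arg):
            if arg.dtype == torch.float32:
                return arg.to(dtype).cuda(non_blocking=True)
            if not arg.is_cuda:
                return arg.cuda(non_blocking=True)
        return arg  # non-tensor arguments and tensors already on the GPU pass through

    new_args = [convert(arg) for arg in args]
    new_kwargs = {k: convert(v) for k, v in kwargs.items()}
    return new_args, new_kwargs
```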
@@ -1805,7 +1833,7 @@ class offload:
             loras_scaling = model._loras_scaling
             training = False

-
+            dtype = weight.dtype
             if weight.shape[-1] < x.shape[-2] : # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
                 if len(active_adapters) > 0:
                     if isinstance(submodule, QModuleMixin):
@@ -1833,7 +1861,7 @@ class offload:
                 result = torch.nn.functional.linear(x, weight, bias=submodule.bias)

             if len(active_adapters) > 0:
-                x = x.to(
+                x = x.to(dtype)

                 for active_adapter in active_adapters:
                     data = loras_data.get(active_adapter + '_GPU', None)
@@ -1873,7 +1901,7 @@ class offload:
         return functools.update_wrapper(functools.partial(lora_linear_forward, submodule), old_forward)

     def ensure_model_loaded(self, model_id):
-        if
+        if model_id in self.active_models_ids:
             return
         # new_model_id = getattr(module, "_mm_id")
         # do not always unload existing models if it is more efficient to keep in them in the GPU
@@ -1895,8 +1923,9 @@ class offload:
         target_module.register_forward_pre_hook(preload_blocks_for_compile)


-    def hook_check_empty_cache_needed(self, target_module, model_id, blocks_name, previous_method, context):
+    def hook_check_empty_cache_needed(self, target_module, model, model_id, blocks_name, previous_method, context):

+        dtype = model._dtype
         qint4quantization = isinstance(target_module, QModuleMixin) and target_module.weight!= None and target_module.weight.qtype == qint4
         if qint4quantization:
             pass
@@ -1921,8 +1950,8 @@ class offload:
                 self.empty_cache_if_needed()
         elif blocks_name != self.loaded_blocks[model_id] and blocks_name not in self.preloaded_blocks_per_model[model_id]:
             self.gpu_load_blocks(model_id, blocks_name)
-        if qint4quantization:
-            args, kwargs = self.move_args_to_gpu(*args, **kwargs)
+        if qint4quantization and dtype !=None:
+            args, kwargs = self.move_args_to_gpu(dtype, *args, **kwargs)

         return previous_method(*args, **kwargs)

@@ -1933,11 +1962,13 @@ class offload:


     def hook_change_module(self, target_module, model, model_id, module_id, previous_method):
+        dtype = model._dtype

         def check_change_module(module, *args, **kwargs):
             self.ensure_model_loaded(model_id)
             # transfer leftovers inputs that were incorrectly created in the RAM (mostly due to some .device tests that returned incorrectly "cpu")
-
+            if dtype != None:
+                args, kwargs = self.move_args_to_gpu(dtype, *args, **kwargs)
             return previous_method(*args, **kwargs)

         if hasattr(target_module, "_mm_id"):
@@ -2069,7 +2100,7 @@



-def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False,
+def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
     """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
     pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
     quantizeTransformer: set True by default will quantize on the fly the video / image model
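The widened `offload.all` signature above adds `convertWeightsFloatTo`, `perc_reserved_mem_max` and `coTenantsMap` keyword arguments. A hedged usage sketch, assuming `text_encoder`, `transformer` and `vae` are models already loaded on the CPU (the keyword names come from the signature in this hunk; the values are illustrative only):

```python
import torch
from mmgp import offload

# Hypothetical pipe dictionary; a diffusers pipeline object works as well.
pipe = {"text_encoder": text_encoder, "transformer": transformer, "vae": vae}

offload.all(
    pipe,
    pinnedMemory=True,                     # pin models to reserved RAM for faster transfers
    convertWeightsFloatTo=torch.bfloat16,  # new in 3.4.0: downcast leftover float32 weights
    perc_reserved_mem_max=0.3,             # new in 3.4.0: cap pinned RAM at 30% of physical RAM
)
```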
@@ -2083,6 +2114,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
     self.verboseLevel = verboseLevel
     safetensors2.verboseLevel = verboseLevel
     self.modules_data = {}
+
     model_budgets = {}

     windows_os = os.name == 'nt'
@@ -2159,9 +2191,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
     # torch._logging.set_logs(recompiles=True)
     # torch._inductor.config.realize_opcount_threshold = 100 # workaround bug "AssertionError: increase TRITON_MAX_BLOCK['X'] to 4096."

-
-
-    max_reservable_memory = perc_reserved_mem_max * physical_memory
+    perc_reserved_mem_max = _get_perc_reserved_mem_max(perc_reserved_mem_max)
+    max_reservable_memory = _get_max_reservable_memory(perc_reserved_mem_max)

     estimatesBytesToPin = 0
     for model_id in models:
@@ -2176,12 +2207,10 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
         modelPinned = (pinAllModels or model_id in modelsToPin) and not hasattr(current_model,"_already_pinned")

         current_model_size = 0
+        model_dtype = None
         for n, p in current_model.named_parameters():
             p.requires_grad = False
             if isinstance(p, QTensor):
-                # # fix quanto bug (seems to have been fixed)
-                # if not modelPinned and p._scale.dtype == torch.float32:
-                #     p._scale = p._scale.to(torch.bfloat16)
                 if p._qtype == qint4:
                     if hasattr(p,"_scale_shift"):
                         current_model_size += torch.numel(p._scale_shift) * p._scale_shift.element_size()
@@ -2193,13 +2222,20 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                 else:
                     current_model_size += torch.numel(p._scale) * p._scale.element_size()
                     current_model_size += torch.numel(p._data) * p._data.element_size()
+                    dtype = p._scale.dtype

             else:
-
-
-
+                dtype = p.data.dtype
+                if convertWeightsFloatTo != None and dtype == torch.float32:
+                    # convert any left overs float32 weight to bfloat16 / float16 to divide by 2 the model memory footprint
+                    dtype = convertWeightsFloatTo if model_dtype == None else model_dtype
+                    p.data = p.data.to(dtype)
+                    if model_dtype== None:
+                        model_dtype = dtype
+                    else:
+                        assert model_dtype == dtype
                 current_model_size += torch.numel(p.data) * p.data.element_size()
-
+        current_model._dtype = model_dtype
         for b in current_model.buffers():
             # do not convert 32 bits float to 16 bits since buffers are few (and potential gain low) and usually they are needed for precision calculation (for instance Rope)
             current_model_size += torch.numel(b.data) * b.data.element_size()
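The block above is where the new `convertWeightsFloatTo` parameter takes effect: while `offload.all` sizes each model, any plain float32 parameter is cast down (bfloat16 by default) and the resulting dtype is stored on the model as `_dtype` for the dtype-aware hooks shown earlier. A simplified sketch of that pass, leaving out the quantized-tensor bookkeeping:

```python
import torch

def downcast_float32_weights(model: torch.nn.Module, convert_to=torch.bfloat16):
    """Cast leftover float32 parameters to `convert_to`, halving their memory footprint,
    and record the chosen dtype on the model (buffers are deliberately left untouched)."""
    model_dtype = None
    for p in model.parameters():
        p.requires_grad = False
        if convert_to is not None and p.data.dtype == torch.float32:
            dtype = convert_to if model_dtype is None else model_dtype
            p.data = p.data.to(dtype)
            if model_dtype is None:
                model_dtype = dtype
            else:
                assert model_dtype == dtype
    model._dtype = model_dtype  # consumed later by the dtype-aware GPU hooks
    return model_dtype
```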
@@ -2267,7 +2303,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
             if self.verboseLevel >=1:
                 print(f"Model '{model_id}' already pinned to reserved memory")
         else:
-            _pin_to_memory(current_model, model_id, partialPinning= partialPinning, pinnedPEFTLora = pinnedPEFTLora, verboseLevel=verboseLevel)
+            _pin_to_memory(current_model, model_id, partialPinning= partialPinning, pinnedPEFTLora = pinnedPEFTLora, perc_reserved_mem_max = perc_reserved_mem_max, verboseLevel=verboseLevel)

         current_budget = model_budgets[model_id]
         cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False
@@ -2317,7 +2353,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                 elif compilationInThisOne and submodule in towers_modules:
                     self.hook_preload_blocks_for_compilation(submodule, model_id, cur_blocks_name, context = submodule_name )
                 else:
-                    self.hook_check_empty_cache_needed(submodule, model_id, cur_blocks_name, submodule_method, context = submodule_name )
+                    self.hook_check_empty_cache_needed(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )

                 self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name, submodule_name)

{mmgp-3.3.4.dist-info → mmgp-3.4.0.dist-info}/METADATA
CHANGED
@@ -1,197 +1,197 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.
+Version: 3.4.0
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE
 Version 3, 29 June 2007
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
 Requires-Dist: torch>=2.1.0
 Requires-Dist: optimum-quanto
 Requires-Dist: accelerate
 Requires-Dist: safetensors
 Requires-Dist: psutil
 Dynamic: license-file


 <p align="center">
-<H2>Memory Management 3.3.
+<H2>Memory Management 3.3.1 for the GPU Poor by DeepBeepMeep</H2>
 </p>


 This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 12 to 24 GB GPU limited card.
 This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
 times in a pipe (eg VAE).

 Requirements:
 - VRAM: minimum 6 GB, recommended 24 GB (RTX 3090/ RTX 4090)
 - RAM: minimum 24 GB, recommended 48 GB

 This module features 5 profiles in order to able to run the model at a decent speed on a low end consumer config (24 GB of RAM and 6 VRAM) and to run it at a very good speed (if not the best) on a high end consumer config (48 GB of RAM and 24 GB of VRAM).\
 These RAM requirements are for Linux systems. Due to different memory management Windows will require an extra 16 GB of RAM to run the corresponding profile.

 Each profile may use a combination of the following:
 - Low RAM consumption (thanks to a rewritten safetensors library) that allows low RAM on the fly quantization
 - Smart automated loading / unloading of models in the GPU to avoid unloading models that may be needed again soon
 - Smart slicing of models to reduce memory occupied by models in the VRAM
 - Ability to pin models to reserved RAM to accelerate transfers to VRAM
 - Async transfers to VRAM to avoid a pause when loading a new slice of a model
 - Automated on the fly quantization or ability to load pre quantized models
 - Pretrained Lora support with low RAM requirements
 - Support for pytorch compilation on Linux and WSL (supported on pure Windows but requires a complex Triton Installation).

 ## Sample applications that use mmgp
 It is recommended to have a look at these applications to see how mmgp was implemented in each of them:
 - Wan2GP: https://github.com/deepbeepmeep/Wan2GP :\
 An excellent text to video and image to video generator by Alibaba

 - Hunyuan3D-2GP: https://github.com/deepbeepmeep/Hunyuan3D-2GP :\
 A great image to 3D and text to 3D tool by the Tencent team. Thanks to mmgp it can run with less than 6 GB of VRAM

 - HuanyuanVideoGP: https://github.com/deepbeepmeep/HunyuanVideoGP :\
 One of the best open source Text to Video generator

 - FluxFillGP: https://github.com/deepbeepmeep/FluxFillGP :\
 One of the best inpainting / outpainting tools based on Flux that can run with less than 12 GB of VRAM.

 - Cosmos1GP: https://github.com/deepbeepmeep/Cosmos1GP :\
 This application include two models: a text to world generator and a image / video to world (probably the best open source image to video generator).

 - OminiControlGP: https://github.com/deepbeepmeep/OminiControlGP :\
 A Flux derived application very powerful that can be used to transfer an object of your choice in a prompted scene. With mmgp you can run it with only 6 GB of VRAM.

 - YuE GP: https://github.com/deepbeepmeep/YuEGP :\
 A great song generator (instruments + singer's voice) based on prompted Lyrics and a genre description. Thanks to mmgp you can run it with less than 10 GB of VRAM without waiting forever.

 ## Installation
 First you need to install the module in your current project with:
 ```shell
 pip install mmgp
 ```


 ## Usage

 It is almost plug and play and just needs to be invoked from the main app just after the model pipeline has been created.
 1) First make sure that the pipeline explictly loads the models in the CPU device, for instance:
 ```
 pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16).to("cpu")
 ```

 2) Once every potential Lora has been loaded and merged, add the following lines for a quick setup:
 ```
 from mmgp import offload, profile_type
 offload.profile(pipe, profile_type.HighRAM_LowVRAM_Fast)
 ```

 You can choose between 5 profiles depending on your hardware:
 - HighRAM_HighVRAM (1): at least 48 GB of RAM and 24 GB of VRAM : the fastest well suited for a RTX 3090 / RTX 4090 but consumes much more VRAM, adapted for fast shorter video or small batches of pictures
 - HighRAM_LowVRAM (2): at least 48 GB of RAM and 12 GB of VRAM : a bit slower, better suited for RTX 3070/3080/4070/4080 or for RTX 3090 / RTX 4090 with large pictures batches or long videos
 - LowRAM_HighVRAM (3): at least 32 GB of RAM and 24 GB of VRAM : adapted for RTX 3090 / RTX 4090 with limited RAM but at the cost of VRAM (shorter videos / fewer images)
 - LowRAM_LowVRAM (4): at least 32 GB of RAM and 12 GB of VRAM : if you have little VRAM or want to generate longer videos / more images
 - VerylowRAM_LowVRAM (5): at least 24 GB of RAM and 10 GB of VRAM : if you don't have much it won't be fast but maybe it will work

 Profile 2 (High RAM) and 4 (Low RAM) are the most recommended profiles since they are versatile (support for long videos for a slight performance cost).\
 If you use Flux derived applciation profile 1 and 3 will offer much faster generation times.
 In any case, a safe approach is to start from profile 5 (default profile) and then go down progressively to profile 4 and then to profile 2 as long as the app remains responsive or doesn't trigger any out of memory error.

 By default the model named 'transformer' will be quantized to 8 bits for all profiles. If you don't want that you may specify the optional parameter *quantizeTransformer = False*.

 Every parameter set automatically by a profile can be overridden with one or multiple parameters accepted by *offload.all* (see below):
 ```
 from mmgp import offload, profile_type
 offload.profile(pipe, profile_type.HighRAM_LowVRAM, budgets = 1000)
 ```
 If you want to know which parameter are set by one specific profile you can use the parameter *verboseLevel=2*

 **It is highly recommended to put the *from mmgp import offload, profile_type* at the top of your main python file (that is as the first import) so that all the existing safetensors calls are redirected to mmpg.**


 ## Alternatively you may want to create your own profile with specific parameters:

 For example:
 ```
 from mmgp import offload
 offload.all(pipe, pinnedMemory=True, ExtraModelsToQuantize = ["text_encoder_2"] )
 ```
 - pinnedMemory: Boolean (for all models) or List of models ids to pin to RAM. Every model pinned to RAM will load much faster (up to 2 times) but this requires more RAM
 - quantizeTransformer: boolean by default True. The 'transformer' model in the pipe contains usually the video or image generator is by defaut; quantized on the fly by default to 8 bits. If you want to save time on disk and reduce the loading time, you may want to load directly a prequantized model. If you don't want to quantize the image generator, you need to set the option *quantizeTransformer* to *False* to turn off on the fly quantization.
 - extraModelsToQuantize: list of additional modelids of models to quantize on the fly. If the corresponding model is already quantized, this option will be ignored.
 - budgets: either a number in mega bytes, (for all models, if 0 unlimited budget) a string that is perecentage of the total VRAM or a dictionary that maps model ids to mega bytes : define the approximate budget in mega bytes that is allocated in VRAM for a model. Try not to allocate all the available VRAM so that the rest can be used to process the data. To define the default value in the dictionary, you may add entry named "*".
 The smaller this number, the more VRAM left for image data / longer video but also the slower because there will be lots of loading / unloading between the RAM and the VRAM. If model is too big to fit in a budget, it will be broken down in multiples parts that will be unloaded / loaded consequently. The speed of low budget can be increased (up to 2 times) by turning on the options pinnedMemory and asyncTransfers.
 - workingVRAM: either a number in mega bytes, a string that is perecentage of the total VRAM or a dictionary that maps a model ids to a number in mega bytes that corresponds to a minimum amount of VRAM that should be left for the data processed by the model. This number will prevail if it is in conflict with a too high budget defined for the same model.
 - asyncTransfers: boolean, load to the GPU the next model part while the current part is being processed. This requires twice the budget if any is defined. This may increase speed by 20% (mostly visible on fast modern GPUs).
 - verboseLevel: number between 0 and 2 (1 by default), provides various level of feedback of the different processes
 - compile: list of model ids to compile, may accelerate up x2 depending on the type of GPU. It makes sense to compile only the model that is frequently used such as the "transformer" model in the case of video or image generation. Compilation requires Triton to be installed. Triton is available out of the box on Linux or WSL but requires to be installed with Windows: https://github.com/woct0rdho/triton-windows
 - coTenantsMap: a dictionary that maps a model id to a list of other models with which it accepts to share the VRAM at the same time. This is useful to avoid unefficient loading / unloading when two models processes are interleaved. For instance *coTenantsMap = { "text_encoder_2": ["text_encoder"] }* , here when *text_encoder_2* is loaded it won't unload *text_encoder*. Please note that the reverse is not true as these maps by design are not symetrical to allow tailored workflows. If you need to have as well *text_encoder* that won't unload *text_encoder_2* if it is already loaded *coTenantsMap = { "text_encoder_2": ["text_encoder"], "text_encoder": ["text_encoder_2"] }*

 If you are short on RAM and plan to work with quantized models, it is recommended to load pre-quantized models direclty rather than using on the fly quantization, it will be faster and consume slightly less RAM.

 ## Going further

 The module includes several tools to package a light version of your favorite video / image generator:
 - *extract_models(string prefix, obj to explore)*\
 This tool will try to detect for you models that are embedded in a pipeline or in some custom class. It will save you time by building a pipe dictionary required by *offload.all* or "offload.profile*. The prefix correponds to the text that will appear before the name of each model in the dictionary.

 - *load_loras_into_model(model, lora_path, lora_multi, activate_all_loras = True)*\
 Load in a model a list of Lora described by a list of path *lora_path* and a list of *weights coefficients*.
 The Lora file must be in the *diffusers* format. This function works also on non diffusers models. However if there is already an official Lora support for a model it is recommended to use the official diffusers functions. By default all the load loras will be activated or they can be activated later using *activate_loras*.

 -*activate_loras(model, lora_nos, lora_multi = None )*\
 Activate the loras whose nos are in the list of nos. Every lora that is not this list and that was activated previously will be disactivated.

 - *save_model(model, file_path, do_quantize = False, quantizationType = qint8 )*\
 Save tensors of a model already loaded in memory in a safetensor format (much faster to reload). You can save it in a quantized format (default qint8 quantization recommended).
 The resulting safetensor file will contain extra fields in its metadata such as the quantization map and its configuration, so you will be able to move the file around without files such as *config.json* or *file_map.json*.
 You will need *load_model_data* or *fast_load_transformers_model* to read the file again . You may also load it using the default *safetensor* librar however you will need to provide in the same directory any complementary file that are usually requested (for instance *config.json*)

 - *load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToRAM = False, partialPin = False)*\
 Load the tensors data of a model in RAM of a model already initialized with no data. Detect and handle quantized models saved previously with *save_model*.A model can also be quantized on the fly while being loaded. The model which is loaded can be pinned to RAM while it is loaded, this is more RAM efficient than pinning tensors later using *offline.all* or *offline.profile*

 - *fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToRAM = False, partialPin = False)*\
 Initialize (build the model hierarchy in memory) and fast load the corresponding tensors of a 'transformers' or 'diffusers' library model.
 The advantages over the original *from_pretrained* method is that a full model can fit into a single file with a filename of your choosing (thefore you can have multiple 'transformers' versions of the same model in the same directory) and prequantized models are processed in a transparent way.
 Last but not least, you can also on the fly pin to RAM the whole model or the most important part of it (partialPin = True) in a more efficient way (faster and requires less RAM) than if you did through *offload.all* or *offload.profile*.


 The typical workflow wil be:
 1) temporarly insert the *save_model* function just after a model has been fully loaded to save a copy of the model / quantized model.
 2) replace the full initalizing / loading logic with *fast_load_transformers_model* (if there is a *from_pretrained* call to a transformers object) or only the tensor loading functions (*torch.load_model_file* and *torch.load_state_dict*) with *load_model_data after* the initializing logic.

 ## Special cases
 Sometime there isn't an explicit pipe object as each submodel is loaded separately in the main app. If this is the case, you may try to use *extract_models* or create a dictionary that manually maps all the models.\
 For instance :


 - for flux derived models:
 ```
 pipe = { "text_encoder": clip, "text_encoder_2": t5, "transformer": model, "vae":ae }
 ```
 - for mochi:
 ```
 pipe = { "text_encoder": self.text_encoder, "transformer": self.dit, "vae":self.decoder }
 ```


 Please note it is recommended to have always one model whose Id is 'transformer' so that you can leverage predefined profiles. The 'transformer' corresponds to the main image / video model which usually needs to be quantized (this is done on the fly by default when loading the model).

 Be careful, lots of models use the T5 XXL as a text encoder. However, quite often their corresponding pipeline configurations point at the official Google T5 XXL repository
 where there is a huge 40GB model to download and load. It is cumbersorme as it is a 32 bits model and contains the decoder part of T5 that is not used.
 I suggest you use instead one of the 16 bits encoder only version available around, for instance:
 ```
 text_encoder_2 = T5EncoderModel.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="text_encoder_2", torch_dtype=torch.float16)
 ```

 Sometime just providing the pipe won't be sufficient as you will need to change the content of the core model:
 - For instance you may need to disable an existing CPU offload logic that already exists (such as manual calls to move tensors between cuda and the cpu)
 - mmpg to tries to fake the device as being "cuda" but sometimes some code won't be fooled and it will create tensors in the cpu device and this may cause some issues.

 You are free to use my module for non commercial use as long you give me proper credits. You may contact me on twitter @deepbeepmeep

 Thanks to
 ---------
 - Huggingface / accelerate for the hooking examples
 - Huggingface / quanto for their very useful quantizer
 - gau-nernst for his Pinnig RAM samples
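To make the parameter list in the README above concrete, here is a hedged sketch that combines a per-model VRAM budget dictionary (with the documented "*" default entry), a workingVRAM floor and a co-tenancy map; the model ids and megabyte values are illustrative, and `pipe` is assumed to be a pipeline already moved to the CPU as shown earlier:

```python
from mmgp import offload, profile_type

offload.profile(
    pipe,
    profile_type.HighRAM_LowVRAM,
    budgets={"transformer": 3000, "*": 1000},           # MB per model, "*" = default entry
    workingVRAM={"transformer": 2000},                   # keep at least 2 GB free for the data
    coTenantsMap={"text_encoder_2": ["text_encoder"]},   # text_encoder_2 may share VRAM with text_encoder
)
```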
mmgp-3.4.0.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+mmgp/offload.py,sha256=LOx2y6wTxktpbhkPmXx8oVOCq2uMjR4BfIGGEoWY12A,108582
+mmgp/safetensors2.py,sha256=rmUbBmK3Dra5prUTTRSVi6-XUFAa9Mj6B5CNPgzt9To,17333
+mmgp-3.4.0.dist-info/licenses/LICENSE.md,sha256=DD-WIS0BkPoWJ_8hQO3J8hMP9K_1-dyrYv1YCbkxcDU,94
+mmgp-3.4.0.dist-info/METADATA,sha256=Nv5a1uTGAse-G6FpoWChJv_gScHwMcSt_NMd7c99JQA,16350
+mmgp-3.4.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+mmgp-3.4.0.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+mmgp-3.4.0.dist-info/RECORD,,
{mmgp-3.3.4.dist-info → mmgp-3.4.0.dist-info}/licenses/LICENSE.md
CHANGED
@@ -1,2 +1,2 @@
-GNU GENERAL PUBLIC LICENSE
+GNU GENERAL PUBLIC LICENSE
 Version 3, 29 June 2007
mmgp-3.3.4.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
-mmgp/offload.py,sha256=WpQK1af2g0qcAm32EguTX8oBHZGKumPX2EqYS-df69Y,106583
-mmgp/safetensors2.py,sha256=rmUbBmK3Dra5prUTTRSVi6-XUFAa9Mj6B5CNPgzt9To,17333
-mmgp-3.3.4.dist-info/licenses/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
-mmgp-3.3.4.dist-info/METADATA,sha256=Yk2eSpNITRDHK0lclsP6VXhW0_5hkUNVvXSfk25f7Ds,16154
-mmgp-3.3.4.dist-info/WHEEL,sha256=DK49LOLCYiurdXXOXwGJm6U4DkHkg4lcxjhqwRa0CP4,91
-mmgp-3.3.4.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
-mmgp-3.3.4.dist-info/RECORD,,
{mmgp-3.3.4.dist-info → mmgp-3.4.0.dist-info}/top_level.txt
File without changes