mmgp 3.2.8-py3-none-any.whl → 3.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mmgp/offload.py +41 -14
- {mmgp-3.2.8.dist-info → mmgp-3.3.1.dist-info}/METADATA +4 -3
- mmgp-3.3.1.dist-info/RECORD +9 -0
- {mmgp-3.2.8.dist-info → mmgp-3.3.1.dist-info}/WHEEL +1 -1
- mmgp-3.2.8.dist-info/RECORD +0 -9
- {mmgp-3.2.8.dist-info → mmgp-3.3.1.dist-info/licenses}/LICENSE.md +0 -0
- {mmgp-3.2.8.dist-info → mmgp-3.3.1.dist-info}/top_level.txt +0 -0
mmgp/offload.py
CHANGED
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.2.8 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.3.1 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several

@@ -92,6 +92,8 @@ ONE_MB = 1048576
 sizeofbfloat16 = torch.bfloat16.itemsize
 sizeofint8 = torch.int8.itemsize
 total_pinned_bytes = 0
+max_pinnable_bytes = 0
+
 physical_memory= psutil.virtual_memory().total

 HEADER = '\033[95m'
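
3.3.1 pairs the existing `total_pinned_bytes` counter with a new module-level `max_pinnable_bytes` ceiling (0 until a pinning failure establishes it). A minimal sketch, not mmgp code, of the page-locked allocations these counters account for:

import torch

ONE_MB = 1048576
# Page-locked ("pinned") host memory cannot be swapped out, so the OS caps
# how much a process may reserve; exceeding the cap makes allocation fail.
buf = torch.empty(64 * ONE_MB, dtype=torch.uint8, pin_memory=True, device="cpu")
print(f"Reserved {buf.numel() / ONE_MB:.0f} MB of pinned RAM")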

@@ -319,6 +321,13 @@ def _extract_tie_weights_from_sd(sd , sd_name, verboseLevel =1):
         print(f"Found {tied_weights_count} tied weights for a total of {tied_weights_total/ONE_MB:0.2f} MB, last : {tied_weights_last}")

 def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
+    global max_pinnable_bytes, total_pinned_bytes
+    if max_pinnable_bytes > 0 and max_pinnable_bytes >= max_pinnable_bytes:
+
+        if verboseLevel>=1 :
+            print(f"Unable pin data of '{sd_name}' to reserved RAM as there is no reserved RAM left")
+        return
+
     current_big_tensor_size = 0
     big_tensor_no = 0
     big_tensors_sizes = []

@@ -393,10 +402,19 @@ def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TE


 def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = True, gig_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
+
+    global max_pinnable_bytes, total_pinned_bytes
+    if max_pinnable_bytes > 0 and max_pinnable_bytes >= max_pinnable_bytes:
+
+        if verboseLevel>=1 :
+            print(f"Unable pin data of '{model_id}' to reserved RAM as there is no reserved RAM left")
+        return
+
     if partialPinning:
         towers_names, _ = _detect_main_towers(model)


+
     current_big_tensor_size = 0
     big_tensor_no = 0
     big_tensors_sizes = []
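
Both `_pin_sd_to_memory` and `_pin_to_memory` now open with the same guard: once a failed allocation has established the `max_pinnable_bytes` ceiling, further pin requests are refused up front. As rendered above, the guard compares `max_pinnable_bytes` with itself, which is always true once the ceiling is set; the surrounding bookkeeping suggests the intended test is against `total_pinned_bytes`, as in this illustrative sketch (the comparison shown here is an assumption, not the published code):

max_pinnable_bytes = 0   # 0 means the ceiling has not been discovered yet
total_pinned_bytes = 0

def can_pin(name, verboseLevel=1):
    # Presumed intent: skip pinning once everything up to the reserved-RAM
    # ceiling discovered by an earlier failed allocation is already in use.
    if max_pinnable_bytes > 0 and total_pinned_bytes >= max_pinnable_bytes:
        if verboseLevel >= 1:
            print(f"Unable to pin data of '{name}' to reserved RAM as there is no reserved RAM left")
        return False
    return True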

@@ -484,13 +502,18 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
     total = 0


+    failed_planned_allocation = False

     for size in big_tensors_sizes:
         try:
+            # if total > 7000 * ONE_MB:
+            #     raise Exception ("test no more reserved RAM")
             current_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True, device="cpu")
             big_tensors.append(current_big_tensor)
         except:
             print(f"Unable to pin more tensors for this model as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
+            max_pinnable_bytes = total + total_pinned_bytes
+            failed_planned_allocation = True
             break

         last_big_tensor += 1
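
The allocation loop now degrades gracefully instead of aborting: each big block is requested as pinned host memory, and the first refusal records the ceiling (bytes pinned so far in this call plus bytes already pinned by other models). A condensed, self-contained sketch of the pattern, assuming only PyTorch:

import torch

def pin_big_blocks(block_sizes, already_pinned_bytes):
    # Pin blocks until the OS refuses; return the blocks, the bytes pinned
    # here, and the discovered ceiling (0 if every block fit).
    blocks, total, ceiling = [], 0, 0
    for size in block_sizes:
        try:
            blocks.append(torch.empty(size, dtype=torch.uint8, pin_memory=True, device="cpu"))
            total += size
        except Exception:
            ceiling = total + already_pinned_bytes
            break
    return blocks, total, ceiling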

@@ -553,13 +576,13 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
             p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
             tensor_no += 1
         del p
-
+    model._pinned_bytes = total
     total_pinned_bytes += total
     del params_dict
     gc.collect()

     if verboseLevel >=1:
-        if partialPinning:
+        if partialPinning or failed_planned_allocation:
             print(f"The model was partially pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
         else:
             print(f"The whole model was pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")

@@ -575,7 +598,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.2.8) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.3.1) by DeepBeepMeep ************{ENDC}{UNBOLD}")

 def _extract_num_from_str(num_in_str):
     size = len(num_in_str)

@@ -882,10 +905,11 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
         return source + CrLf + text

     def trunc(text, sz):
+        text = str(text)
         if len(text) < sz:
-            return
+            return text
         else:
-            return
+            return text[0:sz] + '...'

     if not isinstance(lora_path, list):
         lora_path = [lora_path]
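
In 3.2.8 both branches of `trunc` returned None; the fixed helper coerces its argument to str and clips long values with an ellipsis. Expected behaviour, shown on made-up inputs:

trunc("HunyuanVideo", 6)   # -> 'Hunyua...'
trunc(3.14, 10)            # -> '3.14'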

@@ -1408,7 +1432,9 @@ def extract_models(obj = None, prefix = None):
     elif prefix[ -1:] != "/":
         prefix + "/"

-    for name in dir(obj):
+    for name in dir(obj):
+        if name in ["_execution_device"]:
+            continue
         element = getattr(obj,name)
         if name in ("pipeline", "pipe"):
             pipeline = element

@@ -1550,7 +1576,7 @@ class offload:
             lora_A, lora_B, alpha = lora_data
             key = adapter + '_GPU'
             if to_GPU:
-                lora_module[key] = [lora_A.cuda(), lora_B.cuda(), alpha]
+                lora_module[key] = [lora_A.cuda(non_blocking=True), lora_B.cuda(non_blocking=True), alpha]
             elif key in lora_module:
                 del lora_module[key]

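Switching the LoRA weights to `.cuda(non_blocking=True)` makes the host-to-GPU copy asynchronous, but only because the tensors sit in pinned memory thanks to the machinery above; from ordinary pageable memory PyTorch falls back to a synchronous copy. A minimal sketch of the contract:

import torch

src = torch.randn(1024, 1024, pin_memory=True)  # pinned CPU tensor
dst = src.cuda(non_blocking=True)               # copy is queued, CPU moves on
# ... GPU work submitted here can overlap with the transfer ...
torch.cuda.synchronize()                        # wait before trusting dst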

@@ -1594,8 +1620,8 @@ class offload:
             lora_data = loras_model_data.get(parent_module, None)
             if lora_data != None:
                 loras_modules[parent_module]= lora_data
-
-
+        if len(loras_modules) > 0:
+            self._move_loras(loras_active_adapters, loras_modules, True)

         loaded_block = self.loaded_blocks[model_id]

@@ -2019,7 +2045,7 @@ class offload:
             print(f"Async loading plan for model '{model_id}' : {(preload_total+base_size)/ONE_MB:0.2f} MB will be preloaded (base size of {base_size/ONE_MB:0.2f} MB + {preload_total/total_size*100:0.1f}% of recurrent layers data) with a {max_blocks_fetch/ONE_MB:0.2f} MB async" + (" circular" if len(towers) == 1 else "") + " shuttle")

     def release(self):
-        global last_offload_obj
+        global last_offload_obj, total_pinned_bytes

         if last_offload_obj == self:
             last_offload_obj = None

@@ -2035,6 +2061,8 @@ class offload:

         for model_id, model in self.models.items():
             move_loras_to_device(model, "cpu")
+            if hasattr(model, "_pinned_bytes"):
+                total_pinned_bytes -= model._pinned_bytes
             if hasattr(model, "_loras_model_data"):
                 unload_loras_from_model(model)

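Because `_pin_to_memory` now stamps each model with `_pinned_bytes`, `release()` can hand that budget back to the global counter so later models may pin again. The reclaim amounts to (illustrative sketch, simplified from the loop above):

def reclaim_pinned_budget(models, total_pinned_bytes):
    # Return each released model's pinned budget to the global counter.
    for model in models:
        total_pinned_bytes -= getattr(model, "_pinned_bytes", 0)
    return total_pinned_bytes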

@@ -2046,7 +2074,7 @@ class offload:



-def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
+def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
     """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
     pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
     quantizeTransformer: set True by default will quantize on the fly the video / image model

@@ -2216,9 +2244,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, l

         model_budgets[model_id] = model_budget

-    partialPinning = False

-    if estimatesBytesToPin > 0 and estimatesBytesToPin >= (max_reservable_memory - total_pinned_bytes):
+    if not partialPinning and estimatesBytesToPin > 0 and estimatesBytesToPin >= (max_reservable_memory - total_pinned_bytes):
         if self.verboseLevel >=1:
             print(f"Switching to partial pinning since full requirements for pinned models is {estimatesBytesToPin/ONE_MB:0.1f} MB while estimated available reservable RAM is {(max_reservable_memory-total_pinned_bytes)/ONE_MB:0.1f} MB. You may increase the value of parameter 'perc_reserved_mem_max' to a value higher than {perc_reserved_mem_max:0.2f} to force full pinnning." )
         partialPinning = True
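
With `partialPinning` promoted to a parameter of `all(...)` rather than a local always reset to False, callers can opt into partial pinning before the reservable-RAM estimate forces it. A hypothetical call, assuming a diffusers-style pipeline object named `pipe`:

from mmgp import offload

# Hypothetical usage: pin what fits into reservable RAM, accept partial
# pinning for the remainder, and print progress messages.
offload.all(pipe, pinnedMemory=True, partialPinning=True, verboseLevel=1)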

{mmgp-3.2.8.dist-info → mmgp-3.3.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: mmgp
-Version: 3.2.8
+Version: 3.3.1
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE

@@ -13,10 +13,11 @@ Requires-Dist: optimum-quanto
 Requires-Dist: accelerate
 Requires-Dist: safetensors
 Requires-Dist: psutil
+Dynamic: license-file


 <p align="center">
-    <H2>Memory Management 3.2.8 for the GPU Poor by DeepBeepMeep</H2>
+    <H2>Memory Management 3.3.1 for the GPU Poor by DeepBeepMeep</H2>
 </p>

mmgp-3.3.1.dist-info/RECORD
ADDED

@@ -0,0 +1,9 @@
+__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+mmgp/offload.py,sha256=hG-gMFeHsRkjaPan_lwiTsQOctkXylJMiWhyL3KvGQA,106337
+mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
+mmgp-3.3.1.dist-info/licenses/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+mmgp-3.3.1.dist-info/METADATA,sha256=SF0kLwi8zGHF1F53ZxFDZq5bDCWE39l-A24tYeyyhHo,16153
+mmgp-3.3.1.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
+mmgp-3.3.1.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+mmgp-3.3.1.dist-info/RECORD,,

mmgp-3.2.8.dist-info/RECORD
DELETED

@@ -1,9 +0,0 @@
-__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
-mmgp/offload.py,sha256=Wwk3uV3ZJv3ApyX-vpzukOllkBOTkLwGm5qDadmqVqQ,105209
-mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
-mmgp-3.2.8.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
-mmgp-3.2.8.dist-info/METADATA,sha256=_3nE_8-UHpItfJsJsb4KUIs_WdROc68SCTNTP5lj_ho,16131
-mmgp-3.2.8.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
-mmgp-3.2.8.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
-mmgp-3.2.8.dist-info/RECORD,,

{mmgp-3.2.8.dist-info → mmgp-3.3.1.dist-info/licenses}/LICENSE.md
File without changes

{mmgp-3.2.8.dist-info → mmgp-3.3.1.dist-info}/top_level.txt
File without changes