mmgp 3.3.0__py3-none-any.whl → 3.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mmgp/offload.py +32 -8
- {mmgp-3.3.0.dist-info → mmgp-3.3.1.dist-info}/METADATA +2 -2
- mmgp-3.3.1.dist-info/RECORD +9 -0
- mmgp-3.3.0.dist-info/RECORD +0 -9
- {mmgp-3.3.0.dist-info → mmgp-3.3.1.dist-info}/WHEEL +0 -0
- {mmgp-3.3.0.dist-info → mmgp-3.3.1.dist-info}/licenses/LICENSE.md +0 -0
- {mmgp-3.3.0.dist-info → mmgp-3.3.1.dist-info}/top_level.txt +0 -0
mmgp/offload.py
CHANGED
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.3.0 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.3.1 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -92,6 +92,8 @@ ONE_MB = 1048576
 sizeofbfloat16 = torch.bfloat16.itemsize
 sizeofint8 = torch.int8.itemsize
 total_pinned_bytes = 0
+max_pinnable_bytes = 0
+
 physical_memory= psutil.virtual_memory().total
 
 HEADER = '\033[95m'
@@ -319,6 +321,13 @@ def _extract_tie_weights_from_sd(sd , sd_name, verboseLevel =1):
         print(f"Found {tied_weights_count} tied weights for a total of {tied_weights_total/ONE_MB:0.2f} MB, last : {tied_weights_last}")
 
 def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
+    global max_pinnable_bytes, total_pinned_bytes
+    if max_pinnable_bytes > 0 and total_pinned_bytes >= max_pinnable_bytes:
+
+        if verboseLevel>=1 :
+            print(f"Unable pin data of '{sd_name}' to reserved RAM as there is no reserved RAM left")
+        return
+
     current_big_tensor_size = 0
     big_tensor_no = 0
     big_tensors_sizes = []
@@ -393,10 +402,19 @@ def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TE
 
 
 def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = True, gig_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
+
+    global max_pinnable_bytes, total_pinned_bytes
+    if max_pinnable_bytes > 0 and total_pinned_bytes >= max_pinnable_bytes:
+
+        if verboseLevel>=1 :
+            print(f"Unable pin data of '{model_id}' to reserved RAM as there is no reserved RAM left")
+        return
+
     if partialPinning:
         towers_names, _ = _detect_main_towers(model)
 
 
+
     current_big_tensor_size = 0
     big_tensor_no = 0
     big_tensors_sizes = []
@@ -484,13 +502,18 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
     total = 0
 
 
+    failed_planned_allocation = False
 
     for size in big_tensors_sizes:
         try:
+            # if total > 7000 * ONE_MB:
+            #     raise Exception ("test no more reserved RAM")
             current_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True, device="cpu")
             big_tensors.append(current_big_tensor)
         except:
             print(f"Unable to pin more tensors for this model as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
+            max_pinnable_bytes = total + total_pinned_bytes
+            failed_planned_allocation = True
             break
 
         last_big_tensor += 1
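Taken together, these hunks implement a discover-once cap on pinned memory: the first failed page-locked allocation records how many bytes were actually pinnable (max_pinnable_bytes), and the guards added to _pin_sd_to_memory and _pin_to_memory refuse further pinning once total_pinned_bytes reaches that cap. A minimal, self-contained sketch of the same pattern (try_pin_blocks is a hypothetical helper for illustration, not part of mmgp):

import torch

ONE_MB = 1048576
total_pinned_bytes = 0   # bytes successfully pinned so far, across all models
max_pinnable_bytes = 0   # 0 means no cap has been discovered yet

def try_pin_blocks(block_sizes):
    """Pin page-locked CPU buffers until the OS refuses, then record the cap."""
    global total_pinned_bytes, max_pinnable_bytes
    # Guard added in 3.3.1: once a cap is known and reached, skip pinning entirely.
    if max_pinnable_bytes > 0 and total_pinned_bytes >= max_pinnable_bytes:
        return []
    pinned, total = [], 0
    for size in block_sizes:
        try:
            # pin_memory=True requests page-locked RAM; the allocation
            # raises once the reservable-memory limit is exhausted.
            buf = torch.empty(size, dtype=torch.uint8, pin_memory=True, device="cpu")
        except RuntimeError:
            # The practical cap is everything pinned so far, globally.
            max_pinnable_bytes = total + total_pinned_bytes
            break
        pinned.append(buf)
        total += size
    total_pinned_bytes += total
    return pinned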
@@ -553,13 +576,13 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
             p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
         tensor_no += 1
     del p
-
+    model._pinned_bytes = total
     total_pinned_bytes += total
     del params_dict
     gc.collect()
 
     if verboseLevel >=1:
-        if partialPinning:
+        if partialPinning or failed_planned_allocation:
             print(f"The model was partially pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
         else:
             print(f"The whole model was pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
@@ -575,7 +598,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.3.0) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.3.1) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def _extract_num_from_str(num_in_str):
     size = len(num_in_str)
@@ -2022,7 +2045,7 @@ class offload:
         print(f"Async loading plan for model '{model_id}' : {(preload_total+base_size)/ONE_MB:0.2f} MB will be preloaded (base size of {base_size/ONE_MB:0.2f} MB + {preload_total/total_size*100:0.1f}% of recurrent layers data) with a {max_blocks_fetch/ONE_MB:0.2f} MB async" + (" circular" if len(towers) == 1 else "") + " shuttle")
 
     def release(self):
-        global last_offload_obj
+        global last_offload_obj, total_pinned_bytes
 
         if last_offload_obj == self:
             last_offload_obj = None
@@ -2038,6 +2061,8 @@ class offload:
 
         for model_id, model in self.models.items():
             move_loras_to_device(model, "cpu")
+            if hasattr(model, "_pinned_bytes"):
+                total_pinned_bytes -= model._pinned_bytes
             if hasattr(model, "_loras_model_data"):
                 unload_loras_from_model(model)
 
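This release() change is the other half of the bookkeeping added in _pin_to_memory above: each model is tagged with the number of bytes it pinned (model._pinned_bytes), and releasing the model returns those bytes to the global budget so a later pipeline can pin into the freed reserved RAM. An illustrative sketch with a stand-in object, not the real offload class:

ONE_MB = 1048576
total_pinned_bytes = 0

class StandInModel:
    """Stand-in for a module managed by offload; illustration only."""
    pass

def pin(model, nbytes):
    global total_pinned_bytes
    model._pinned_bytes = nbytes         # tagged at pin time (new in 3.3.1)
    total_pinned_bytes += nbytes

def release(model):
    global total_pinned_bytes
    if hasattr(model, "_pinned_bytes"):  # mirrors offload.release()
        total_pinned_bytes -= model._pinned_bytes
        del model._pinned_bytes

m = StandInModel()
pin(m, 512 * ONE_MB)
release(m)
assert total_pinned_bytes == 0           # budget fully returned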
@@ -2049,7 +2074,7 @@ class offload:
 
 
 
-def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
+def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
     """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
     pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
     quantizeTransformer: set True by default will quantize on the fly the video / image model
@@ -2219,9 +2244,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, l
 
         model_budgets[model_id] = model_budget
 
-    partialPinning = False
 
-    if estimatesBytesToPin > 0 and estimatesBytesToPin >= (max_reservable_memory - total_pinned_bytes):
+    if not partialPinning and estimatesBytesToPin > 0 and estimatesBytesToPin >= (max_reservable_memory - total_pinned_bytes):
         if self.verboseLevel >=1:
             print(f"Switching to partial pinning since full requirements for pinned models is {estimatesBytesToPin/ONE_MB:0.1f} MB while estimated available reservable RAM is {(max_reservable_memory-total_pinned_bytes)/ONE_MB:0.1f} MB. You may increase the value of parameter 'perc_reserved_mem_max' to a value higher than {perc_reserved_mem_max:0.2f} to force full pinnning." )
         partialPinning = True
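Since partialPinning is now a keyword of all() rather than a local flag, callers can request partial pinning up front instead of waiting for the automatic fallback above to trigger. A usage sketch, assuming pipe is an already-constructed pipeline object:

from mmgp import offload

# Pin what fits into reservable RAM and leave the rest unpinned,
# without waiting for the full-pinning estimate to fail first.
offload.all(pipe, pinnedMemory=True, partialPinning=True, verboseLevel=1)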
{mmgp-3.3.0.dist-info → mmgp-3.3.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.3.0
+Version: 3.3.1
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -17,7 +17,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.3.0 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.3.1 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
mmgp-3.3.1.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+mmgp/offload.py,sha256=hG-gMFeHsRkjaPan_lwiTsQOctkXylJMiWhyL3KvGQA,106337
+mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
+mmgp-3.3.1.dist-info/licenses/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+mmgp-3.3.1.dist-info/METADATA,sha256=SF0kLwi8zGHF1F53ZxFDZq5bDCWE39l-A24tYeyyhHo,16153
+mmgp-3.3.1.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
+mmgp-3.3.1.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+mmgp-3.3.1.dist-info/RECORD,,
mmgp-3.3.0.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
-mmgp/offload.py,sha256=xdlYbB8nKUywAAMPcfCzJmCxYHvBB5vcZgv2wEQTtbE,105329
-mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
-mmgp-3.3.0.dist-info/licenses/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
-mmgp-3.3.0.dist-info/METADATA,sha256=33eB_YmC6PciTkzi_Z_gsWWzoz6RJgyLbEItFatVghk,16153
-mmgp-3.3.0.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
-mmgp-3.3.0.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
-mmgp-3.3.0.dist-info/RECORD,,
{mmgp-3.3.0.dist-info → mmgp-3.3.1.dist-info}/WHEEL
File without changes
{mmgp-3.3.0.dist-info → mmgp-3.3.1.dist-info}/licenses/LICENSE.md
File without changes
{mmgp-3.3.0.dist-info → mmgp-3.3.1.dist-info}/top_level.txt
File without changes