mmgp 3.5.10.tar.gz → 3.5.12.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mmgp has been flagged as potentially problematic; see the registry's advisory for more details.
- {mmgp-3.5.10 → mmgp-3.5.12}/LICENSE.md +1 -1
- {mmgp-3.5.10/src/mmgp.egg-info → mmgp-3.5.12}/PKG-INFO +2 -2
- {mmgp-3.5.10 → mmgp-3.5.12}/README.md +1 -1
- {mmgp-3.5.10 → mmgp-3.5.12}/pyproject.toml +1 -1
- {mmgp-3.5.10 → mmgp-3.5.12}/src/mmgp/offload.py +49 -11
- {mmgp-3.5.10 → mmgp-3.5.12}/src/mmgp/safetensors2.py +13 -3
- {mmgp-3.5.10 → mmgp-3.5.12/src/mmgp.egg-info}/PKG-INFO +2 -2
- {mmgp-3.5.10 → mmgp-3.5.12}/setup.cfg +0 -0
- {mmgp-3.5.10 → mmgp-3.5.12}/src/__init__.py +0 -0
- {mmgp-3.5.10 → mmgp-3.5.12}/src/mmgp/__init__.py +0 -0
- {mmgp-3.5.10 → mmgp-3.5.12}/src/mmgp.egg-info/SOURCES.txt +0 -0
- {mmgp-3.5.10 → mmgp-3.5.12}/src/mmgp.egg-info/dependency_links.txt +0 -0
- {mmgp-3.5.10 → mmgp-3.5.12}/src/mmgp.egg-info/requires.txt +0 -0
- {mmgp-3.5.10 → mmgp-3.5.12}/src/mmgp.egg-info/top_level.txt +0 -0
{mmgp-3.5.10 → mmgp-3.5.12}/LICENSE.md

@@ -1,2 +1,2 @@
-GNU GENERAL PUBLIC LICENSE
+GNU GENERAL PUBLIC LICENSE
 Version 3, 29 June 2007

(The visible text is identical on both sides; the change appears to be whitespace-only.)
{mmgp-3.5.10/src/mmgp.egg-info → mmgp-3.5.12}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.10
+Version: 3.5.12
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file


 <p align="center">
-<H2>Memory Management 3.5.10 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.5.12 for the GPU Poor by DeepBeepMeep</H2>
 </p>

{mmgp-3.5.10 → mmgp-3.5.12}/src/mmgp/offload.py

@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.5.10 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.5.12 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -66,7 +66,6 @@ from accelerate import init_empty_weights

 import functools
 import types
-from functools import lru_cache
 import torch

@@ -90,6 +89,23 @@ class QEmbedding(QModuleMixin, torch.nn.Embedding):

 shared_state = {}

+def get_cache(cache_name):
+    all_cache = shared_state.get("_cache", None)
+    if all_cache is None:
+        all_cache = {}
+        shared_state["_cache"]= all_cache
+    cache = all_cache.get(cache_name, None)
+    if cache is None:
+        cache = {}
+        all_cache[cache_name] = cache
+    return cache
+
+def clear_caches():
+    all_cache = shared_state.get("_cache", None)
+    if all_cache is not None:
+        all_cache.clear()
+
+
 mmm = safetensors2.mmm

 default_verboseLevel = 1
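A minimal usage sketch of the new cache helpers, assuming only what the added code above shows (the cache name "rope" is purely illustrative):

from mmgp.offload import get_cache, clear_caches

cache = get_cache("rope")            # creates the named cache on first access
cache["freqs"] = 123                 # later calls return the same dict
assert get_cache("rope")["freqs"] == 123
clear_caches()                       # drops every named cache held in shared_state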
@@ -623,6 +639,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
         total += size

         current_big_tensor = big_tensors[big_tensor_no]
+
         if is_buffer :
             _force_load_buffer(p) # otherwise potential memory leak
         if isinstance(p, QTensor):
@@ -671,7 +688,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.10) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.12) by DeepBeepMeep ************{ENDC}{UNBOLD}")

 def change_dtype(model, new_dtype, exclude_buffers = False):
     for submodule_name, submodule in model.named_modules():
@@ -1032,7 +1049,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora

     if split_linear_modules_map != None:
         new_state_dict = dict()
-        suffixes = [(".alpha", -2, False), (".lora_B.weight", -3, True), (".lora_A.weight", -3, False)]
+        suffixes = [(".alpha", -2, False), (".lora_B.weight", -3, True), (".lora_A.weight", -3, False), (".lora_up.weight", -3, True), (".lora_down.weight", -3, False)]
         for module_name, module_data in state_dict.items():
             name_parts = module_name.split(".")
             for suffix, pos, any_split in suffixes:
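Context for the two new suffix entries, restated outside the diff (general LoRA naming, not mmgp-specific code): Kohya-style checkpoints name the two low-rank factors lora_up / lora_down, while PEFT-style checkpoints call the same factors lora_B / lora_A, which is why each new tuple mirrors the flags of its existing counterpart.

# Assumed correspondence, illustrated as data only:
kohya_to_peft = {
    ".lora_up.weight": ".lora_B.weight",     # output-side factor
    ".lora_down.weight": ".lora_A.weight",   # input-side factor
}
for kohya, peft in kohya_to_peft.items():
    print(f"{kohya} plays the role of {peft}")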
@@ -1306,7 +1323,7 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantiza
     model_path = [model_path]


-    if not builtins.all(file_name.endswith(".sft") or file_name.endswith(".safetensors") or file_name.endswith(".pt") for file_name in model_path):
+    if not builtins.all(file_name.endswith(".sft") or file_name.endswith(".safetensors") or file_name.endswith(".pt") or file_name.endswith(".ckpt") for file_name in model_path):
         raise Exception("full model path to file expected")

     model_path = [ _get_model(file) for file in model_path]
@@ -1314,7 +1331,7 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantiza
         raise Exception("Unable to find file")

     verboseLevel = _compute_verbose_level(verboseLevel)
-    if model_path[-1].endswith(".pt"):
+    if model_path[-1].endswith(".pt") or model_path[-1].endswith(".ckpt"):
         metadata = None
     else:
         with safetensors2.safe_open(model_path[-1], writable_tensors =writable_tensors) as f:
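A standalone restatement of the relaxed extension gate (not mmgp code; the file names are made up): with 3.5.12, paths ending in ".ckpt" pass the check that previously accepted only ".sft", ".safetensors" and ".pt", and, like ".pt" files, they are loaded without safetensors metadata.

accepted = (".sft", ".safetensors", ".pt", ".ckpt")
paths = ["text_encoder.safetensors", "vae.ckpt"]
print(all(p.endswith(accepted) for p in paths))  # True: str.endswith accepts a tuple of suffixes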
@@ -2481,7 +2498,7 @@ def {fname}(module, *args, **kwargs):



-def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
+def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, vram_safety_coefficient = 0.8, verboseLevel = -1):
     """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
     pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
     quantizeTransformer: set True by default will quantize on the fly the video / image model
@@ -2490,6 +2507,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
     budgets: 0 by default (unlimited). If non 0, it corresponds to the maximum size in MB that every model will occupy at any moment
      (in fact the real usage is twice this number). It is very efficient to reduce VRAM consumption but this feature may be very slow
      if pinnedMemory is not enabled
+    vram_safety_coefficient: float between 0 and 1 (exclusive), default 0.8. Sets the maximum portion of VRAM that can be used for models.
+     Lower values provide more safety margin but may reduce performance.
     """
     self = offload()
     self.verboseLevel = verboseLevel
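A hedged usage sketch of the new argument (the pipeline variable and the chosen value are illustrative; only the argument name, its 0.8 default and its valid range come from the diff):

from mmgp import offload

pipe = ...  # a diffusers-style pipeline or dict of modules created elsewhere
# Cap automatic model placement at 70% of VRAM instead of the 0.8 default,
# leaving a larger working margin; values outside (0, 1) raise a ValueError.
offload.all(pipe, vram_safety_coefficient = 0.7)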
@@ -2505,7 +2524,11 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
             return float(b[:-1]) * self.device_mem_capacity
         else:
             return b * ONE_MB
-
+
+    # Validate vram_safety_coefficient
+    if not isinstance(vram_safety_coefficient, float) or vram_safety_coefficient <= 0 or vram_safety_coefficient >= 1:
+        raise ValueError("vram_safety_coefficient must be a float between 0 and 1 (exclusive)")
+
     budget = 0
     if not budgets is None:
         if isinstance(budgets , dict):
@@ -2650,14 +2673,14 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
         model_budget = new_budget if model_budget == 0 or new_budget < model_budget else model_budget
         if model_budget > 0 and model_budget > current_model_size:
             model_budget = 0
-        coef =
+        coef =vram_safety_coefficient
         if current_model_size > coef * self.device_mem_capacity and model_budget == 0 or model_budget > coef * self.device_mem_capacity:
             if verboseLevel >= 1:
                 if model_budget == 0:
-                    print(f"Model '{model_id}' is too large ({current_model_size/ONE_MB:0.1f} MB) to fit entirely in {coef * 100}% of the VRAM (max capacity is {coef * self.device_mem_capacity/ONE_MB}) MB)")
+                    print(f"Model '{model_id}' is too large ({current_model_size/ONE_MB:0.1f} MB) to fit entirely in {coef * 100:.0f}% of the VRAM (max capacity is {coef * self.device_mem_capacity/ONE_MB:0.1f}) MB)")
                 else:
                     print(f"Budget ({budget/ONE_MB:0.1f} MB) for Model '{model_id}' is too important so that this model can fit in the VRAM (max capacity is {self.device_mem_capacity/ONE_MB}) MB)")
-                print(f"Budget allocation for this model has been consequently reduced to the
+                print(f"Budget allocation for this model has been consequently reduced to the {coef * 100:.0f}% of max GPU Memory ({coef * self.device_mem_capacity/ONE_MB:0.1f} MB). This may not leave enough working VRAM and you will probably need to define manually a lower budget for this model.")
             model_budget = coef * self.device_mem_capacity

@@ -2681,6 +2704,21 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                     print(f"Model '{model_id}' already pinned to reserved memory")
                 else:
                     _pin_to_memory(current_model, model_id, partialPinning= partialPinning, pinnedPEFTLora = pinnedPEFTLora, perc_reserved_mem_max = perc_reserved_mem_max, verboseLevel=verboseLevel)
+        # empty_tensor = torch.empty((1,))
+        # for sub_module_name, sub_module in current_model.named_modules():
+        #     for k, p in sub_module.named_parameters(recurse=False):
+        #         if p is not None:
+        #             if isinstance(p, QTensor):
+        #                 p._data.data = empty_tensor
+        #                 p._scale.data = empty_tensor
+        #             else:
+        #                 p.data = empty_tensor
+        #         del k
+        #     for k, v in sub_module.named_buffers(recurse=False):
+        #         del k
+        #     sub_module = None
+        #     v = None
+        # gc.collect()
         current_budget = model_budgets[model_id]
         cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False
         self.loaded_blocks[model_id] = None
{mmgp-3.5.10 → mmgp-3.5.12}/src/mmgp/safetensors2.py

@@ -46,7 +46,16 @@ class MmapTracker:
         file_path = os.path.join(*s)
         self.file_path = file_path # os.path.abspath(file_path)
         self.count = 0
-
+        key = file_path
+        i = 1
+        while True:
+            if key not in mmm:
+                mmm[key] = self
+                break
+            i +=1
+            key = key + "#" + str(i)
+        self.mmm_key = key
+        # print(f"MMAP Add: {file_path}: {mmm.keys()}")

     def register(self, mmap_obj, map_id, start, size):

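A standalone illustration of the de-duplication scheme used by the new __init__ code (this is not mmgp code, it only mirrors the key construction above): trackers for the same file path are registered under path, path#2, path#3, and so on, and each instance remembers its own key so the release path can delete exactly the right entry.

registry = {}

def unique_key(path, registry):
    key, i = path, 1
    while key in registry:
        i += 1
        key = f"{path}#{i}"
    return key

for _ in range(3):
    registry[unique_key("model.safetensors", registry)] = object()

print(sorted(registry))  # ['model.safetensors', 'model.safetensors#2', 'model.safetensors#3']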
@@ -61,7 +70,8 @@ class MmapTracker:

             print(f"MMap Manager of file '{self.file_path}' : MMap no {map_id} has been released" + text)
         if self.count == self._already_released:
-
+            # print(f"MMAP Del: {self.file_path}: {mmm.keys()}")
+            del mmm[self.mmm_key ]

         self._maps.pop(map_id, None)

@@ -240,7 +250,7 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None, extr
             t = t.view(torch.uint16)
         elif dtype == torch.float8_e5m2 or dtype == torch.float8_e4m3fn:
             t = t.view(torch.uint8)
-        buffer = t.numpy().tobytes()
+        buffer = t.cpu().numpy().tobytes()
         bytes_written = writer.write(buffer)
         assert bytes_written == size
         i+=1
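Why the added .cpu() matters, shown with plain PyTorch rather than mmgp code: calling .numpy() on a CUDA tensor raises a TypeError, while moving the tensor to host memory first works for both CPU and CUDA tensors.

import torch

t = torch.ones(4, dtype = torch.float32)
if torch.cuda.is_available():
    t = t.cuda()                     # without .cpu(), t.numpy() would raise a TypeError here
buffer = t.cpu().numpy().tobytes()   # safe for both CPU and CUDA tensors
print(len(buffer))                   # 16 bytes: four float32 values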
{mmgp-3.5.10 → mmgp-3.5.12/src/mmgp.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.10
+Version: 3.5.12
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file


 <p align="center">
-<H2>Memory Management 3.5.10 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.5.12 for the GPU Poor by DeepBeepMeep</H2>
 </p>

The remaining files (setup.cfg, src/__init__.py, src/mmgp/__init__.py, and the src/mmgp.egg-info SOURCES.txt, dependency_links.txt, requires.txt and top_level.txt) are unchanged between the two releases.