mmgp 3.5.5.tar.gz → 3.5.7.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mmgp might be problematic.
- {mmgp-3.5.5/src/mmgp.egg-info → mmgp-3.5.7}/PKG-INFO +2 -2
- {mmgp-3.5.5 → mmgp-3.5.7}/README.md +1 -1
- {mmgp-3.5.5 → mmgp-3.5.7}/pyproject.toml +1 -1
- {mmgp-3.5.5 → mmgp-3.5.7}/src/mmgp/offload.py +34 -29
- {mmgp-3.5.5 → mmgp-3.5.7/src/mmgp.egg-info}/PKG-INFO +2 -2
- {mmgp-3.5.5 → mmgp-3.5.7}/LICENSE.md +0 -0
- {mmgp-3.5.5 → mmgp-3.5.7}/setup.cfg +0 -0
- {mmgp-3.5.5 → mmgp-3.5.7}/src/__init__.py +0 -0
- {mmgp-3.5.5 → mmgp-3.5.7}/src/mmgp/__init__.py +0 -0
- {mmgp-3.5.5 → mmgp-3.5.7}/src/mmgp/safetensors2.py +0 -0
- {mmgp-3.5.5 → mmgp-3.5.7}/src/mmgp.egg-info/SOURCES.txt +0 -0
- {mmgp-3.5.5 → mmgp-3.5.7}/src/mmgp.egg-info/dependency_links.txt +0 -0
- {mmgp-3.5.5 → mmgp-3.5.7}/src/mmgp.egg-info/requires.txt +0 -0
- {mmgp-3.5.5 → mmgp-3.5.7}/src/mmgp.egg-info/top_level.txt +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.5
+Version: 3.5.7
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10

@@ -15,7 +15,7 @@ Dynamic: license-file


 <p align="center">
-<H2>Memory Management 3.5.5 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.5.7 for the GPU Poor by DeepBeepMeep</H2>
 </p>

src/mmgp/offload.py

@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.5.5 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.5.7 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several

@@ -122,8 +122,6 @@ class clock:
     def format_time_gap(self):
         return f"{self.stop_time - self.start_time:.2f}s"

-
-
 # useful functions to move a group of tensors (to design custom offload patches)
 def move_tensors(obj, device):
     if torch.is_tensor(obj):
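move_tensors, the helper visible at the end of this hunk, is what the accompanying comment recommends for designing custom offload patches. A brief usage sketch follows; it assumes move_tensors returns its argument with every nested tensor moved to the requested device, which only the first lines of the function shown above hint at, and the batch contents are made up.

    import torch
    from mmgp.offload import move_tensors

    # Hypothetical batch; move_tensors is assumed to walk nested containers
    # and hand back the same structure with each tensor on the target device.
    batch = {"latents": torch.randn(1, 4, 64, 64), "timestep": torch.tensor([10])}
    if torch.cuda.is_available():
        batch = move_tensors(batch, "cuda")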
@@ -668,7 +666,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.5) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.7) by DeepBeepMeep ************{ENDC}{UNBOLD}")

 def change_dtype(model, new_dtype, exclude_buffers = False):
     for submodule_name, submodule in model.named_modules():
@@ -1295,7 +1293,7 @@ def move_loras_to_device(model, device="cpu" ):
         if ".lora_" in k:
             m.to(device)

-def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, defaultConfigPath = None, modelClass=None, modelPrefix = None, writable_tensors = True, verboseLevel = -1, modules = None, return_shared_modules = None, configKwargs ={}):
+def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, defaultConfigPath = None, modelClass=None, modelPrefix = None, writable_tensors = True, verboseLevel = -1, preprocess_sd = None, modules = None, return_shared_modules = None, configKwargs ={}):
     """
     quick version of .LoadfromPretrained of the transformers library
     used to build a model and load the corresponding weights (quantized or not)

@@ -1383,13 +1381,13 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantiza

     model._config = transformer_config

-    load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, modelPrefix = modelPrefix, writable_tensors =writable_tensors, modules = modules, return_shared_modules = return_shared_modules, verboseLevel=verboseLevel )
+    load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, modelPrefix = modelPrefix, writable_tensors =writable_tensors, preprocess_sd = preprocess_sd , modules = modules, return_shared_modules = return_shared_modules, verboseLevel=verboseLevel )

     return model



-def load_model_data(model, file_path, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, modules = None, return_shared_modules = None, verboseLevel = -1):
+def load_model_data(model, file_path, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, preprocess_sd = None, modules = None, return_shared_modules = None, verboseLevel = -1):
     """
     Load a model, detect if it has been previously quantized using quanto and do the extra setup if necessary
     """

@@ -1506,6 +1504,9 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
     full_state_dict, full_quantization_map, full_tied_weights_map = None, None, None

     # deal if we are trying to load just a sub part of a larger model
+    if preprocess_sd != None:
+        state_dict, quantization_map = preprocess_sd(state_dict, quantization_map)
+
     if modelPrefix != None:
         base_model_prefix = modelPrefix + "."
         state_dict = filter_state_dict(state_dict,base_model_prefix)
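The preprocess_sd parameter added in 3.5.7 gives callers a chance to rewrite the checkpoint's state dict (and its quantization map) before the weights are applied to the model, as the new call site above shows. A minimal sketch of how it might be used: the preprocess_sd name and its (state_dict, quantization_map) in/out contract come from the diff, while the key-renaming callback and the checkpoint path are invented for illustration.

    from mmgp.offload import fast_load_transformers_model

    # Hypothetical callback: drop a legacy "model." prefix from every key before
    # the weights are loaded, and do the same for the quantization map if present.
    def strip_legacy_prefix(state_dict, quantization_map):
        state_dict = {k.removeprefix("model."): v for k, v in state_dict.items()}
        if quantization_map is not None:
            quantization_map = {k.removeprefix("model."): v for k, v in quantization_map.items()}
        return state_dict, quantization_map

    # "ckpt/text_encoder.safetensors" is a placeholder path, not one from this release.
    model = fast_load_transformers_model(
        "ckpt/text_encoder.safetensors",
        do_quantize=True,
        preprocess_sd=strip_legacy_prefix,
    )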
@@ -2095,24 +2096,23 @@ class offload:
     if scaling == 0:
         continue
     if first_weight:
-        original_weight= weight.clone() if weight
+        original_weight= weight.clone() if weight is not None else None
         first_weight = False
     if first_bias:
-        original_bias= bias.clone() if bias
+        original_bias= bias.clone() if bias is not None else None
         first_bias = False

-    if diff_w
+    if diff_w is not None:
         weight.add_(diff_w, alpha= scaling)
         diff_w = None
-    if diff_b
+    if diff_b is not None:
         bias.add_(diff_b, alpha= scaling)
         diff_b = None

 ret = func(*args, **kwargs )

-
-if original_bias
-    bias.data = original_bias
+if original_weight is not None: weight.data = original_weight
+if original_bias is not None: bias.data = original_bias

 return ret

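This hunk hardens the merge-then-restore pattern used when LoRA deltas are folded into a layer for a single call: the original weight and bias are cloned, the scaled deltas are added in place, the wrapped function runs, and the saved tensors are put back, now with explicit is-not-None checks and with the weight restored as well as the bias. A standalone sketch of the same pattern, using simplified, invented names rather than mmgp's internals:

    import torch

    def call_with_merged_lora(layer, diff_w, diff_b, scaling, func, *args, **kwargs):
        # Save the originals so the in-place merge can be undone afterwards.
        original_weight = layer.weight.data.clone() if layer.weight is not None else None
        original_bias = layer.bias.data.clone() if layer.bias is not None else None

        # Fold the deltas into the live tensors, scaled by the LoRA multiplier.
        if diff_w is not None:
            layer.weight.data.add_(diff_w, alpha=scaling)
        if diff_b is not None and layer.bias is not None:
            layer.bias.data.add_(diff_b, alpha=scaling)

        ret = func(*args, **kwargs)

        # Restore the unmerged tensors, mirroring the checks added in 3.5.7.
        if original_weight is not None:
            layer.weight.data = original_weight
        if original_bias is not None:
            layer.bias.data = original_bias
        return ret

    # Example: run a Linear layer once with a zero weight delta at strength 0.8.
    linear = torch.nn.Linear(16, 16)
    out = call_with_merged_lora(linear, torch.zeros(16, 16), None, 0.8, linear, torch.randn(2, 16))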
@@ -2277,7 +2277,7 @@ class offload:
         setattr(target_module, "forward", functools.update_wrapper(functools.partial(check_empty_cuda_cache, target_module), previous_method) )


-    def hook_change_module(self, target_module, model, model_id, module_id, previous_method):
+    def hook_change_module(self, target_module, model, model_id, module_id, previous_method, previous_method_name ):
         if hasattr(target_module, "_lock_dtype"):
             dtype = target_module._lock_dtype
         else:

@@ -2290,11 +2290,12 @@ class offload:
             args, kwargs = self.move_args_to_gpu(dtype, *args, **kwargs)
             return previous_method(*args, **kwargs)

-        if hasattr(target_module, "
+        if hasattr(target_module, "_mm_" + previous_method_name):
             return
-        setattr(target_module, "
+        setattr(target_module, "_mm_Id", model_id)
+        setattr(target_module, "_mm_" + previous_method_name, previous_method)

-        setattr(target_module,
+        setattr(target_module, previous_method_name, functools.update_wrapper(functools.partial(check_change_module, target_module), previous_method) )

         if not self.verboseLevel >=1:
             return
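With previous_method_name threaded through, hook_change_module can now wrap any named method rather than only forward, stashing the original under an attribute prefixed with _mm_ and installing a functools wrapper in its place. The sketch below reproduces that wrapping idiom in isolation; wrap_method and to_module_device_then_call are invented names, and the toy wrapper stands in for mmgp's check_change_module.

    import functools
    import torch

    def wrap_method(target_module, method_name, wrapper):
        # Skip if this method has already been wrapped on this module.
        if hasattr(target_module, "_mm_" + method_name):
            return
        previous_method = getattr(target_module, method_name)
        # Keep the original reachable, e.g. for unhooking or delegating to it.
        setattr(target_module, "_mm_" + method_name, previous_method)
        setattr(target_module, method_name,
                functools.update_wrapper(
                    functools.partial(wrapper, target_module, previous_method),
                    previous_method))

    # Toy wrapper: move tensor arguments to the module's device, then delegate.
    def to_module_device_then_call(module, previous_method, *args, **kwargs):
        device = next(module.parameters()).device
        args = [a.to(device) if torch.is_tensor(a) else a for a in args]
        return previous_method(*args, **kwargs)

    layer = torch.nn.Linear(4, 4)
    wrap_method(layer, "forward", to_module_device_then_call)
    out = layer(torch.randn(2, 4))  # goes through the wrapper first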
@@ -2662,23 +2663,27 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                 cur_blocks_prefix, prev_blocks_name, cur_blocks_seq, is_mod_seq = pre, None, num, False
                 cur_blocks_name = submodule_name
                 # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
-
-
-            if
-
-
+            top_submodule = len(submodule_name.split("."))==1
+            offload_hooks = submodule._offload_hooks if hasattr(submodule, "_offload_hooks") else []
+            if len(offload_hooks) > 0:
+                pass
+            assert top_submodule or len(offload_hooks) == 0, "custom offload hooks can only be set at the of the module"
+            submodule_method_names = ["forward"] + offload_hooks
+            for submodule_method_name in submodule_method_names:
+                if not hasattr(submodule, submodule_method_name ): continue
+                if submodule_method_name == "forward" and any_lora and hasattr(submodule,"weight"):
                     submodule_method = self.hook_lora(submodule, current_model, model_id, loras_model_data, loras_model_shortcuts, submodule_name)
                 else:
-                    submodule_method = getattr(submodule,
-                if callable(submodule_method):
-                    if
-                        self.hook_change_module(submodule, current_model, model_id, submodule_name, submodule_method)
+                    submodule_method = getattr(submodule, submodule_method_name)
+                if callable(submodule_method):
+                    if top_submodule and cur_blocks_name is None:
+                        self.hook_change_module(submodule, current_model, model_id, submodule_name, submodule_method, submodule_method_name)
                 elif compilationInThisOne and submodule in towers_modules:
                     self.hook_preload_blocks_for_compilation(submodule, model_id, cur_blocks_name, context = submodule_name )
                 else:
                     self.hook_check_empty_cache_needed(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
-
-
+
+            self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name, submodule_name)


         self.tune_preloading(model_id, current_budget, towers_names)
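The new _offload_hooks attribute lets a top-level submodule advertise extra methods, beyond forward, that the offloading pass should hook; the assert above restricts the feature to modules at the top of a component. A sketch of how a component might declare such hooks, assuming a made-up VAE-like module: the attribute name and the forward-plus-hooks wrapping come from the diff, everything else is illustrative.

    import torch

    class ToyVAE(torch.nn.Module):
        # Ask the offloader to hook these methods in addition to forward, so that
        # calling encode()/decode() directly also goes through its wrappers.
        _offload_hooks = ["encode", "decode"]

        def __init__(self):
            super().__init__()
            self.enc = torch.nn.Linear(8, 4)
            self.dec = torch.nn.Linear(4, 8)

        def encode(self, x):
            return self.enc(x)

        def decode(self, z):
            return self.dec(z)

        def forward(self, x):
            return self.decode(self.encode(x))

    # Hypothetical wiring through offload.all, the function this hunk belongs to:
    # from mmgp import offload
    # pipe = {"vae": ToyVAE(), "transformer": some_transformer}
    # offload.all(pipe, pinnedMemory=False)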
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.5
+Version: 3.5.7
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10

@@ -15,7 +15,7 @@ Dynamic: license-file


 <p align="center">
-<H2>Memory Management 3.5.5 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.5.7 for the GPU Poor by DeepBeepMeep</H2>
 </p>

The remaining nine files listed above (LICENSE.md, setup.cfg, the __init__.py files, safetensors2.py, and the egg-info metadata) are unchanged between 3.5.5 and 3.5.7.
|