mmgp 3.2.0.tar.gz → 3.2.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mmgp has been flagged as potentially problematic.
- {mmgp-3.2.0/src/mmgp.egg-info → mmgp-3.2.1}/PKG-INFO +1 -1
- {mmgp-3.2.0 → mmgp-3.2.1}/pyproject.toml +1 -1
- {mmgp-3.2.0 → mmgp-3.2.1}/src/mmgp/offload.py +28 -16
- {mmgp-3.2.0 → mmgp-3.2.1/src/mmgp.egg-info}/PKG-INFO +1 -1
- {mmgp-3.2.0 → mmgp-3.2.1}/LICENSE.md +0 -0
- {mmgp-3.2.0 → mmgp-3.2.1}/README.md +0 -0
- {mmgp-3.2.0 → mmgp-3.2.1}/setup.cfg +0 -0
- {mmgp-3.2.0 → mmgp-3.2.1}/src/__init__.py +0 -0
- {mmgp-3.2.0 → mmgp-3.2.1}/src/mmgp/__init__.py +0 -0
- {mmgp-3.2.0 → mmgp-3.2.1}/src/mmgp/safetensors2.py +0 -0
- {mmgp-3.2.0 → mmgp-3.2.1}/src/mmgp.egg-info/SOURCES.txt +0 -0
- {mmgp-3.2.0 → mmgp-3.2.1}/src/mmgp.egg-info/dependency_links.txt +0 -0
- {mmgp-3.2.0 → mmgp-3.2.1}/src/mmgp.egg-info/requires.txt +0 -0
- {mmgp-3.2.0 → mmgp-3.2.1}/src/mmgp.egg-info/top_level.txt +0 -0
{mmgp-3.2.0 → mmgp-3.2.1}/src/mmgp/offload.py

```diff
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.2.
+# ------------------ Memory Management 3.2.1 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -479,7 +479,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.2.
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.2.1) by DeepBeepMeep ************{ENDC}{UNBOLD}")

 def _extract_num_from_str(num_in_str):
     size = len(num_in_str)
@@ -603,8 +603,6 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 2*
     tied_weights= {}

     for submodule_name, submodule in model_to_quantize.named_modules():
-        if "embed_token" in submodule_name:
-            pass
        if isinstance(submodule, QModuleMixin):
            if verboseLevel>=1:
                print("No quantization to do as model is already quantized")
@@ -802,7 +800,7 @@ def _lora_linear_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor
         scaling = get_scaling(active_adapter)
         lora_A_weight = lora_A.weight
         lora_B_weight = lora_B.weight
-        if new_weights:
+        if new_weights:
             base_weight = torch.addmm(base_weight, lora_B_weight, lora_A_weight, alpha= scaling )
             # base_weight = base_weight + scaling * lora_B_weight @ lora_A_weight
         else:
```
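For context on the `_lora_linear_forward` hunk: `torch.addmm(base, B, A, alpha=s)` computes `base + s * (B @ A)` in one fused call, which is exactly the commented-out expression kept beside it. A minimal standalone sketch, with invented shapes rather than mmgp's real tensors, to illustrate the equivalence:

```python
import torch

# Toy shapes for illustration only; in mmgp these come from the base Linear
# weight and the LoRA adapter weights.
out_features, in_features, rank = 16, 32, 4
scaling = 0.5

base_weight = torch.randn(out_features, in_features)
lora_A_weight = torch.randn(rank, in_features)   # LoRA "A": rank x in_features
lora_B_weight = torch.randn(out_features, rank)  # LoRA "B": out_features x rank

# Fused form used by the patch ...
fused = torch.addmm(base_weight, lora_B_weight, lora_A_weight, alpha=scaling)
# ... and the explicit form from the comment in the diff.
explicit = base_weight + scaling * lora_B_weight @ lora_A_weight

assert torch.allclose(fused, explicit, atol=1e-6)
```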
```diff
@@ -1017,7 +1015,7 @@ def move_loras_to_device(model, device="cpu" ):
         if ".lora_" in k:
             m.to(device)

-def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, verboseLevel = -1):
+def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, modelClass=None, verboseLevel = -1):
     """
     quick version of .LoadfromPretrained of the transformers library
     used to build a model and load the corresponding weights (quantized or not)
@@ -1031,6 +1029,7 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizat
         raise Exception("full model path to file expected")

     model_path = _get_model(model_path)
+
     verboseLevel = _compute_verbose_level(verboseLevel)

     with safetensors2.safe_open(model_path) as f:
@@ -1058,11 +1057,13 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizat
     if "architectures" in transformer_config:
         architectures = transformer_config["architectures"]
         class_name = architectures[0]
-
-
-
-
-
+        if modelClass !=None:
+            transfomer_class = modelClass
+        else:
+            module = __import__("transformers")
+            map = { "T5WithLMHeadModel" : "T5EncoderModel"}
+            class_name = map.get(class_name, class_name)
+            transfomer_class = getattr(module, class_name)
         from transformers import AutoConfig

         import tempfile
@@ -1081,8 +1082,11 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizat
     elif "_class_name" in transformer_config:
         class_name = transformer_config["_class_name"]

-
-
+        if modelClass !=None:
+            transfomer_class = modelClass
+        else:
+            module = __import__("diffusers")
+            transfomer_class = getattr(module, class_name)

     with init_empty_weights():
         model = transfomer_class.from_config(transformer_config)
```
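The main functional change in 3.2.1 is the new `modelClass` argument of `fast_load_transformers_model`: when it is supplied, that class is used directly instead of being resolved from the config's `architectures` / `_class_name` entry via `transformers` or `diffusers`. A hedged usage sketch follows; the checkpoint path and the chosen class are placeholders, not values from the mmgp documentation:

```python
from transformers import T5EncoderModel
from mmgp import offload

# Placeholder path: per the docstring and the "full model path to file expected"
# check, the function wants the full path to a .safetensors file.
text_encoder = offload.fast_load_transformers_model(
    "ckpts/text_encoder/model.safetensors",
    do_quantize=True,            # optional on-the-fly quantization (qint8 by default)
    modelClass=T5EncoderModel,   # new in 3.2.1: bypasses the class-name lookup
)
```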
```diff
@@ -1104,6 +1108,8 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
     """

     file_path = _get_model(file_path)
+    if file_path == None:
+        raise Exception("Unable to find file")
     verboseLevel = _compute_verbose_level(verboseLevel)

     model = _remove_model_wrapper(model)
@@ -1153,9 +1159,16 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
         _requantize(model, state_dict, quantization_map)

     missing_keys , unexpected_keys = model.load_state_dict(state_dict, False, assign = True )
-    if len(missing_keys) > 0
+    if len(missing_keys) > 0 :
         # if there is a key mismatch maybe we forgot to remove some prefix or we are trying to load just a sub part of a larger model
-
+        if hasattr(model, "base_model_prefix"):
+            base_model_prefix = model.base_model_prefix + "."
+        else:
+            for k,v in state_dict.items():
+                if k.endswith(missing_keys[0]):
+                    base_model_prefix = k[:-len(missing_keys[0])]
+                    break
+
         new_state_dict= {}
         start = -1
         for k,v in state_dict.items():
```
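The `load_model_data` hunk above adds a fallback for key mismatches: when the model has no `base_model_prefix` attribute, the prefix is inferred by matching the first missing key against the checkpoint keys, so that a sub-model can be loaded from a larger checkpoint. A toy illustration of that matching step (invented keys, not mmgp data):

```python
# Invented example; the real inputs are model.load_state_dict's missing_keys
# and the checkpoint's state_dict.
missing_keys = ["encoder.block.0.layer.0.weight"]
state_dict = {
    "text_model.encoder.block.0.layer.0.weight": 0,
    "text_model.shared.weight": 0,
}

base_model_prefix = None
for k in state_dict:
    # The checkpoint key that ends with the missing model key reveals the prefix.
    if k.endswith(missing_keys[0]):
        base_model_prefix = k[: -len(missing_keys[0])]
        break

print(base_model_prefix)  # -> "text_model."
# Stripping that prefix yields keys in the form the model expects.
print({k[len(base_model_prefix):]: v for k, v in state_dict.items()
       if k.startswith(base_model_prefix)})
```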
```diff
@@ -1521,7 +1534,6 @@ class offload:
         model = self.models[model_id]
         self.active_models.append(model)
         self.active_models_ids.append(model_id)
-
         self.gpu_load_blocks(model_id, None, True)
         for block_name in self.preloaded_blocks_per_model[model_id]:
             self.gpu_load_blocks(model_id, block_name, True)
```