mmgp 3.4.5.tar.gz → 3.4.7.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mmgp might be problematic.
- {mmgp-3.4.5/src/mmgp.egg-info → mmgp-3.4.7}/PKG-INFO +3 -3
- {mmgp-3.4.5 → mmgp-3.4.7}/README.md +2 -2
- {mmgp-3.4.5 → mmgp-3.4.7}/pyproject.toml +1 -1
- {mmgp-3.4.5 → mmgp-3.4.7}/src/mmgp/offload.py +195 -60
- {mmgp-3.4.5 → mmgp-3.4.7/src/mmgp.egg-info}/PKG-INFO +3 -3
- {mmgp-3.4.5 → mmgp-3.4.7}/LICENSE.md +0 -0
- {mmgp-3.4.5 → mmgp-3.4.7}/setup.cfg +0 -0
- {mmgp-3.4.5 → mmgp-3.4.7}/src/__init__.py +0 -0
- {mmgp-3.4.5 → mmgp-3.4.7}/src/mmgp/__init__.py +0 -0
- {mmgp-3.4.5 → mmgp-3.4.7}/src/mmgp/safetensors2.py +0 -0
- {mmgp-3.4.5 → mmgp-3.4.7}/src/mmgp.egg-info/SOURCES.txt +0 -0
- {mmgp-3.4.5 → mmgp-3.4.7}/src/mmgp.egg-info/dependency_links.txt +0 -0
- {mmgp-3.4.5 → mmgp-3.4.7}/src/mmgp.egg-info/requires.txt +0 -0
- {mmgp-3.4.5 → mmgp-3.4.7}/src/mmgp.egg-info/top_level.txt +0 -0
{mmgp-3.4.5/src/mmgp.egg-info → mmgp-3.4.7}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.4.5
+Version: 3.4.7
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.4.5 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.4.7 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
@@ -43,7 +43,7 @@ Each profile may use a combination of the following:
 ## Sample applications that use mmgp
 It is recommended to have a look at these applications to see how mmgp was implemented in each of them:
 - Wan2GP: https://github.com/deepbeepmeep/Wan2GP :\
-An excellent text to video and image to video generator
+An excellent text to video and image to video generator that supports the best Open Source Video Architectures: Wan, Hunyuan and LTX Video
 
 - Hunyuan3D-2GP: https://github.com/deepbeepmeep/Hunyuan3D-2GP :\
 A great image to 3D and text to 3D tool by the Tencent team. Thanks to mmgp it can run with less than 6 GB of VRAM
{mmgp-3.4.5 → mmgp-3.4.7}/README.md

@@ -1,6 +1,6 @@
 
 <p align="center">
-<H2>Memory Management 3.4.5 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.4.7 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
@@ -28,7 +28,7 @@ Each profile may use a combination of the following:
 ## Sample applications that use mmgp
 It is recommended to have a look at these applications to see how mmgp was implemented in each of them:
 - Wan2GP: https://github.com/deepbeepmeep/Wan2GP :\
-An excellent text to video and image to video generator
+An excellent text to video and image to video generator that supports the best Open Source Video Architectures: Wan, Hunyuan and LTX Video
 
 - Hunyuan3D-2GP: https://github.com/deepbeepmeep/Hunyuan3D-2GP :\
 A great image to 3D and text to 3D tool by the Tencent team. Thanks to mmgp it can run with less than 6 GB of VRAM
{mmgp-3.4.5 → mmgp-3.4.7}/src/mmgp/offload.py

@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.4.5 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.4.7 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -331,12 +331,35 @@ def _extract_tie_weights_from_sd(sd , sd_name, verboseLevel =1):
 
 def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
     global max_pinnable_bytes, total_pinned_bytes
+
+
+    names_list = sd_name if isinstance(sd, list) else [sd_name]
+
     if max_pinnable_bytes > 0 and total_pinned_bytes >= max_pinnable_bytes:
 
         if verboseLevel>=1 :
-            print(f"Unable pin data of '{sd_name}' to reserved RAM as there is no reserved RAM left")
+            print(f"Unable pin data of '{','.join(names_list)}' to reserved RAM as there is no reserved RAM left")
         return
 
+
+    if isinstance(sd, list):
+        new_sd = {}
+        for i, sub_sd, in enumerate(sd):
+            for k, v in sub_sd.items():
+                new_sd[str(i) + "#" + k] =v
+        sd = new_sd
+        del new_sd
+        sub_sd = None
+
+    if isinstance(tied_weights, list):
+        new_tied_weights = {}
+        for i, sub_tied_weights, in enumerate(tied_weights):
+            for k, v in sub_tied_weights.items():
+                new_tied_weights[str(i) + "#" + k] =v
+        sd = new_tied_weights
+        del new_tied_weights
+        sub_tied_weights = None
+
     current_big_tensor_size = 0
     big_tensor_no = 0
     big_tensors_sizes = []
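The hunk above lets `_pin_sd_to_memory` accept either a single state dict or a list of them by flattening the list into one dict whose keys are prefixed with the sub-dict index. A minimal standalone sketch of that keying scheme, with a hypothetical helper name and toy data that are not part of offload.py:

```python
# Illustration only: flatten several state dicts into one dict keyed "<index>#<key>",
# mirroring the loop added in the hunk above, so they can be pinned together in one pass.
def flatten_state_dicts(sd_list):
    flat = {}
    for i, sub_sd in enumerate(sd_list):
        for k, v in sub_sd.items():
            flat[str(i) + "#" + k] = v
    return flat

# Two tiny "state dicts" of plain floats stand in for tensor dicts.
merged = flatten_state_dicts([{"w": 1.0}, {"w": 2.0, "b": 0.5}])
assert set(merged) == {"0#w", "1#w", "1#b"}
```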
@@ -365,11 +388,14 @@ def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TE
     big_tensors = []
     last_big_tensor = 0
     total = 0
+    incomplete_pinning = False
 
     try:
         dummy_pinned_tensor = torch.empty( RESERVED_RAM_MIN_AVAILABLE, dtype= torch.uint8, pin_memory=True, device="cpu")
     except:
         print("There isn't any Reserved RAM left, you may need to choose a profile with a higher number that requires less Reserved RAM or set OS env 'perc_reserved_mem_max' to a value less 0.3")
+        gc.collect()
+        torch.cuda.empty_cache()
         return
 
     for size in big_tensors_sizes:
@@ -377,6 +403,7 @@ def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TE
             current_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True, device="cpu")
             big_tensors.append(current_big_tensor)
         except:
+            incomplete_pinning = True
             print(f"Unable to pin more tensors for '{sd_name}' as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
             break
 
@@ -410,9 +437,21 @@ def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TE
     # global total_pinned_bytes
     # total_pinned_bytes += total
     gc.collect()
+    torch.cuda.empty_cache()
+
 
     if verboseLevel >=1:
-        print(f"'{sd_name}' was pinned entirely to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
+        if incomplete_pinning :
+            if len(names_list) > 0:
+                print(f"'{','.join(names_list)}' were partially pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
+            else:
+                print(f"'{','.join(names_list)}' was partially pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
+        else:
+            if len(names_list) > 0:
+                print(f"'{','.join(names_list)}' was pinned entirely to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
+            else:
+                print(f"'{','.join(names_list)}' were pinned entirely to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
+
 
     return
 
@@ -619,7 +658,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.4.5) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.4.7) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def change_dtype(model, new_dtype, exclude_buffers = False):
     for submodule_name, submodule in model.named_modules():
@@ -961,6 +1000,8 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
     errors = []
     adapters = {}
     adapter_no = 0
+    pinned_sd_list = []
+    pinned_names_list = []
     for i, path in enumerate(lora_path):
         adapter_name = str(adapter_no)
         error_msg = ""
@@ -1042,28 +1083,37 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
         invalid_keys = []
         unexpected_keys = []
         for k, v in state_dict.items():
-            pos = k.rfind(".lora_")
-            if pos <=0:
-                invalid_keys.append(k)
-                continue
-            module_name = k[ : pos]
-            lora_key = k[ pos+1:]
             lora_A = None
             lora_B = None
-            if lora_key in ("lora_A.weight", "lora_down.weight"):
-                lora_A = v
-            elif lora_key in ("lora_B.weight", "lora_up.weight"):
-                lora_B = v
+            diff_b = None
+            diff = None
+            if k.endswith(".diff"):
+                diff = v
+                module_name = k[ : -5]
+            elif k.endswith(".diff_b"):
+                diff_b = v
+                module_name = k[ : -7]
             else:
-                invalid_keys.append(k)
-                continue
+                pos = k.rfind(".lora_")
+                if pos <=0:
+                    invalid_keys.append(k)
+                    continue
+                module_name = k[ : pos]
+                lora_key = k[ pos+1:]
+                if lora_key in ("lora_A.weight", "lora_down.weight"):
+                    lora_A = v
+                elif lora_key in ("lora_B.weight", "lora_up.weight"):
+                    lora_B = v
+                else:
+                    invalid_keys.append(k)
+                    continue
 
             module = modules_dict.get(module_name, None)
             if module == None:
                 unexpected_keys.append(k)
                 continue
-            if not isinstance(module, (QLinear, torch.nn.Linear)):
-                msg = f"Lora '{path}' contains a non supported type of layer '{k}'"
+            if False: #not isinstance(module, (QLinear, torch.nn.Linear, torch.nn.Conv3d, torch.nn.LayerNorm)):
+                msg = f"Lora '{path}' contains a non supported type of layer '{k}'"
                 error_msg = append(error_msg, msg)
                 fail = True
                 break
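The rewritten key loop above now recognises three kinds of entries in a LoRA checkpoint: full weight deltas ending in `.diff`, bias deltas ending in `.diff_b`, and the classic factored pairs (`.lora_A`/`.lora_B` or `.lora_down`/`.lora_up`). A small self-contained sketch of that classification; the function name and sample keys are illustrative, not the library's API:

```python
# Hypothetical helper mirroring the key parsing in the hunk above:
# returns the target module name and the kind of delta carried by the key.
def classify_lora_key(k):
    if k.endswith(".diff"):
        return k[:-5], "diff"          # full weight delta
    if k.endswith(".diff_b"):
        return k[:-7], "diff_b"        # bias delta
    pos = k.rfind(".lora_")
    if pos <= 0:
        return None, "invalid"
    lora_key = k[pos + 1:]
    if lora_key in ("lora_A.weight", "lora_down.weight"):
        return k[:pos], "lora_A"
    if lora_key in ("lora_B.weight", "lora_up.weight"):
        return k[:pos], "lora_B"
    return None, "invalid"

assert classify_lora_key("blocks.0.attn.q.lora_A.weight") == ("blocks.0.attn.q", "lora_A")
assert classify_lora_key("blocks.0.attn.q.diff_b") == ("blocks.0.attn.q", "diff_b")
```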
@@ -1077,7 +1127,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
                         error_msg = append(error_msg, msg)
                         fail = True
                         break
-
+            elif lora_B != None:
                 if module_shape[0] != v.shape[0]:
                     if ignore_model_variations:
                         skip = True
@@ -1086,28 +1136,56 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
                         error_msg = append(error_msg, msg)
                         fail = True
                         break
+            elif diff != None:
+                lora_B = diff
+                if module_shape != v.shape:
+                    if ignore_model_variations:
+                        skip = True
+                    else:
+                        msg = f"Lora '{path}': Lora shape is not compatible with model '{_get_module_name(model)}' (model = {module_shape[0]}, lora = {v.shape[0]}). It is likely this Lora has been made for another version of this model."
+                        error_msg = append(error_msg, msg)
+                        fail = True
+                        break
+            elif diff_b != None:
+                if module.bias == None:
+                    msg = f"Lora '{path}': Lora Basis is defined while it doesnt exist in model '{_get_module_name(model)}'. It is likely this Lora has been made for another version of this model."
+                    fail = True
+                    break
+                else:
+                    module_shape = module.bias.shape
+                    if module_shape != v.shape:
+                        if ignore_model_variations:
+                            skip = True
+                        else:
+                            msg = f"Lora '{path}': Lora Basis dimension is not compatible with model '{_get_module_name(model)}' (model = {module_shape[0]}, lora Basis = {v.shape[0]}). It is likely this Lora has been made for another version of this model."
+                            error_msg = append(error_msg, msg)
+                            fail = True
+                            break
+
             if not check_only:
                 loras_module_data = loras_model_data.get(module, None)
+                if loras_module_data == None:
+                    pass
                 assert loras_module_data != None
-                # if loras_module_data == None:
-                #     loras_module_data = dict()
-                #     loras_model_data[module] = loras_module_data
                 loras_adapter_data = loras_module_data.get(adapter_name, None)
                 lora_A = None if lora_A == None else lora_A.to(module.weight.dtype)
                 lora_B = None if lora_B == None else lora_B.to(module.weight.dtype)
+                diff_b = None if diff_b == None else diff_b.to(module.weight.dtype)
                 if loras_adapter_data == None:
                     alpha = lora_alphas.get(k[:-len("lora_X.weight")] + "alpha", 1.)
-                    loras_adapter_data = [lora_A, lora_B, alpha]
+                    loras_adapter_data = [lora_A, lora_B, diff_b, alpha]
                     loras_module_data[adapter_name] = loras_adapter_data
                 elif lora_A != None:
                     loras_adapter_data[0] = lora_A
-
+                elif lora_B != None:
                     loras_adapter_data[1] = lora_B
-
+                else:
+                    loras_adapter_data[2] = diff_b
+                lora_A, lora_B, diff, diff_b, v, loras_module_data, loras_adapter_data = None, None, None, None, None, None, None
             lora_alphas = None
 
         if len(invalid_keys) > 0:
-            msg = "Lora '{path}' contains non Lora keys '{trunc(invalid_keys,200)}'"
+            msg = f"Lora '{path}' contains non Lora keys '{trunc(invalid_keys,200)}'"
             error_msg = append(error_msg, msg)
             fail = True
         if len(unexpected_keys) > 0:
@@ -1127,7 +1205,9 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
         if not check_only:
             # model._loras_tied_weights[adapter_name] = tied_weights
             if pinnedLora:
-                _pin_sd_to_memory(state_dict, path)
+                pinned_sd_list.append(state_dict)
+                pinned_names_list.append(path)
+                # _pin_sd_to_memory(state_dict, path)
 
         del state_dict
 
@@ -1146,6 +1226,8 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 
     model._loras_errors = errors
     if not check_only:
+        if pinnedLora:
+            _pin_sd_to_memory(pinned_sd_list, pinned_names_list)
         model._loras_adapters = adapters
         if activate_all_loras:
             activate_loras(model, loras_nos, loras_multi)
@@ -1193,7 +1275,7 @@ def move_loras_to_device(model, device="cpu" ):
         if ".lora_" in k:
             m.to(device)
 
-def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, modelClass=None, modelPrefix = None, writable_tensors = True, verboseLevel = -1):
+def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, modelClass=None, modelPrefix = None, writable_tensors = True, verboseLevel = -1, configKwargs ={}):
     """
     quick version of .LoadfromPretrained of the transformers library
     used to build a model and load the corresponding weights (quantized or not)
@@ -1235,6 +1317,7 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizat
         text = reader.read()
     transformer_config= json.loads(text)
 
+    transformer_config.update( configKwargs )
 
     if "architectures" in transformer_config:
         architectures = transformer_config["architectures"]
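The new `configKwargs` parameter is merged into the JSON config with `transformer_config.update( configKwargs )` before the model class is instantiated, so individual config fields can be overridden at load time. A possible call; the checkpoint path and the overridden field below are placeholders, not values from this release:

```python
from mmgp import offload

# Hypothetical usage sketch: override one config entry while fast-loading a model.
model = offload.fast_load_transformers_model(
    "ckpts/text_encoder_quanto_int8.safetensors",   # hypothetical checkpoint path
    configKwargs={"num_hidden_layers": 30},          # hypothetical config override
)
```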
@@ -1254,7 +1337,6 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizat
         fp.close()
         config_obj = AutoConfig.from_pretrained(fp.name)
         os.remove(fp.name)
-
         #needed to keep inits of non persistent buffers
         with init_empty_weights():
             model = transfomer_class(config_obj)
@@ -1270,7 +1352,7 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizat
         transfomer_class = getattr(module, class_name)
 
         with init_empty_weights():
-            model = transfomer_class.from_config(transformer_config)
+            model = transfomer_class.from_config(transformer_config )
 
 
     torch.set_default_device('cpu')
@@ -1650,10 +1732,9 @@ class offload:
                 lora_data = lora_module.get(adapter, None)
                 if lora_data == None:
                     continue
-                lora_A, lora_B, alpha = lora_data
                 key = adapter + '_GPU'
                 if to_GPU:
-                    lora_module[key] = [
+                    lora_module[key] = [None if item == None else item.cuda(non_blocking=True) for item in lora_data[ :-1] ] + lora_data[ -1:]
                 elif key in lora_module:
                     del lora_module[key]
 
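The transfer above copies every tensor entry of an adapter's data list to the GPU with `non_blocking=True` while leaving the trailing scalar (the alpha) untouched. A standalone sketch with toy data; the helper name and the sample list are illustrative, not the library's internal structures:

```python
import torch

# Hypothetical helper mirroring the comprehension above: tensors go to the GPU,
# None entries stay None, and the trailing alpha float is kept as-is.
def adapter_to_gpu(lora_data):
    return [None if item is None else item.cuda(non_blocking=True)
            for item in lora_data[:-1]] + lora_data[-1:]

if torch.cuda.is_available():                       # guard so the sketch runs anywhere
    data = [torch.randn(4, 8), torch.randn(8, 4), None, 1.0]
    gpu_data = adapter_to_gpu(data)
    assert gpu_data[0].is_cuda and gpu_data[2] is None and gpu_data[3] == 1.0
```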
@@ -1876,27 +1957,64 @@ class offload:
 
         return False
 
-    def
+    def _get_lora_scaling(self, loras_scaling, model, active_adapter):
+        scaling_list = loras_scaling[active_adapter]
+        if isinstance(scaling_list, list):
+            step_no =getattr(model, "_lora_step_no", 0)
+            return scaling_list[step_no]
+        else:
+            return float(scaling_list)
 
-        def get_scaling(active_adapter):
-            scaling_list = loras_scaling[active_adapter]
-            if isinstance(scaling_list, list):
-                step_no =getattr(model, "_lora_step_no", 0)
-                return scaling_list[step_no]
-            else:
-                return float(scaling_list)
 
-        weight = submodule.weight
 
-
-
+    def _lora_generic_forward(self, model, submodule, loras_data, func, *args, **kwargs) -> torch.Tensor:
+
+        weight = submodule.weight
+        bias = getattr(submodule, "bias", None)
+        original_weight = None
+        original_bias = None
+        active_adapters = model._loras_active_adapters
+        loras_scaling = model._loras_scaling
+        first_weight = True
+        first_bias = True
+        for active_adapter in active_adapters:
+            data = loras_data.get(active_adapter + '_GPU', None)
+            if data == None:
+                continue
+            diff_w , _ , diff_b, alpha = data
+            if first_weight:
+                original_weight= weight.clone() if weight != None else None
+                first_weight = False
+            if first_bias:
+                original_bias= bias.clone() if bias != None else None
+                first_bias = False
+            scaling = self._get_lora_scaling( loras_scaling, model, active_adapter) * alpha
+            if diff_w != None:
+                weight.add_(diff_w, alpha= scaling)
+                diff_w = None
+            if diff_b != None:
+                bias.add_(diff_b, alpha= scaling)
+                diff_b = None
+
+        ret = func(*args, **kwargs )
+
+        weight.data = original_weight if original_weight != None else None
+        if original_bias != None:
+            bias.data = original_bias
+
+        return ret
+
 
+    def _lora_linear_forward(self, model, submodule, loras_data, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+        weight = submodule.weight
         active_adapters = model._loras_active_adapters
         loras_scaling = model._loras_scaling
         training = False
 
         dtype = weight.dtype
         if weight.shape[-1] < x.shape[-2] : # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
+            bias = submodule.bias
+            original_bias = True
             if len(active_adapters) > 0:
                 if isinstance(submodule, QModuleMixin):
                     weight = weight.view(weight.shape) # get a persistent copy of the on the fly dequantized weights
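`_get_lora_scaling` above allows a per-adapter multiplier to be either a single float or a list indexed by the model's current denoising step (`model._lora_step_no`). A minimal sketch of that lookup as a free function; the helper name and the toy inputs are assumptions for illustration:

```python
# Hypothetical standalone version of the scaling lookup shown in the hunk above.
def get_lora_scaling(loras_scaling, step_no, active_adapter):
    scaling = loras_scaling[active_adapter]
    if isinstance(scaling, list):
        return scaling[step_no]       # per-step multiplier
    return float(scaling)             # single multiplier for all steps

assert get_lora_scaling({"0": [1.0, 0.5, 0.0]}, 1, "0") == 0.5
assert get_lora_scaling({"0": 0.8}, 1, "0") == 0.8
```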
@@ -1908,16 +2026,25 @@ class offload:
                 data = loras_data.get(active_adapter + '_GPU', None)
                 if data == None:
                     continue
-                lora_A_weight, lora_B_weight, alpha = data
-                scaling = get_scaling(active_adapter) * alpha
+                lora_A_weight, lora_B_weight, diff_b, alpha = data
+                scaling = self._get_lora_scaling(loras_scaling, model, active_adapter) * alpha
                 weight.addmm_(lora_B_weight, lora_A_weight, alpha= scaling )
+                if diff_b != None:
+                    if bias == None:
+                        bias = diff_b.clone()
+                        original_bias = False
+                    elif original_bias:
+                        bias = bias.clone()
+                        original_bias = False
+                    bias.add_(diff_b, alpha=scaling)
+
                 # base_weight += scaling * lora_B_weight @ lora_A_weight
 
             if training:
                 pass
                 # result = torch.nn.functional.linear(dropout(x), base_weight, bias=submodule.bias)
             else:
-                result = torch.nn.functional.linear(x, weight, bias=submodule.bias)
+                result = torch.nn.functional.linear(x, weight, bias=bias)
 
         else:
             result = torch.nn.functional.linear(x, weight, bias=submodule.bias)
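The large-input path above folds each adapter into a copy of the base weight with `weight.addmm_(lora_B_weight, lora_A_weight, alpha= scaling)`, i.e. W + scaling * B @ A, and then performs a single linear call. A self-contained check with toy shapes (all values below are made up) that the merged path matches the explicit two-matmul path:

```python
import torch

# Toy dimensions: an 8x16 base weight with a rank-4 LoRA pair.
out_f, in_f, rank = 8, 16, 4
weight = torch.randn(out_f, in_f)
lora_A = torch.randn(rank, in_f)
lora_B = torch.randn(out_f, rank)
scaling = 0.7
x = torch.randn(32, in_f)

merged = weight.clone()
merged.addmm_(lora_B, lora_A, alpha=scaling)   # merged = weight + scaling * (B @ A)

y_merged = torch.nn.functional.linear(x, merged)
y_split = (torch.nn.functional.linear(x, weight)
           + scaling * torch.nn.functional.linear(torch.nn.functional.linear(x, lora_A), lora_B))
assert torch.allclose(y_merged, y_split, atol=1e-4)
```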
@@ -1929,9 +2056,9 @@ class offload:
                 data = loras_data.get(active_adapter + '_GPU', None)
                 if data == None:
                     continue
-                lora_A, lora_B, alpha = data
+                lora_A, lora_B, diff_b, alpha = data
                 # dropout = self.lora_dropout[active_adapter]
-                scaling = get_scaling(active_adapter) * alpha
+                scaling = self._get_lora_scaling(loras_scaling, model, active_adapter) * alpha
                 x = x.to(lora_A.dtype)
 
                 if training:
@@ -1939,8 +2066,7 @@ class offload:
                     # y = lora_A(dropout(x))
                 else:
                     y = torch.nn.functional.linear(x, lora_A, bias=None)
-
-                    y = torch.nn.functional.linear(y, lora_B, bias=None)
+                    y = torch.nn.functional.linear(y, lora_B, bias=diff_b)
                 y*= scaling
                 result+= y
                 del y
@@ -1948,19 +2074,27 @@ class offload:
         return result
 
 
-    def
+    def hook_lora(self, submodule, current_model, model_id, loras_model_data, submodule_name):
         old_forward = submodule.forward
 
         loras_data = {}
         loras_model_data[submodule] = loras_data
 
-
-
-
-
-
-
-
+        if isinstance(submodule, torch.nn.Linear):
+            def lora_linear_forward(module, *args, **kwargs):
+                if len(loras_data) == 0:
+                    return old_forward(*args, **kwargs)
+                else:
+                    return self._lora_linear_forward(current_model, submodule, loras_data, *args, **kwargs)
+            target_fn = lora_linear_forward
+        else:
+            def lora_generic_forward(module, *args, **kwargs):
+                if len(loras_data) == 0:
+                    return old_forward(*args, **kwargs)
+                else:
+                    return self._lora_generic_forward(current_model, submodule, loras_data, old_forward, *args, **kwargs)
+            target_fn = lora_generic_forward
+        return functools.update_wrapper(functools.partial(target_fn, submodule), old_forward)
 
     def ensure_model_loaded(self, model_id):
         if model_id in self.active_models_ids:
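`hook_lora` above replaces a module's `forward` with a closure that falls back to the original forward when no adapter data is attached, and it uses `functools.partial` plus `functools.update_wrapper` so the module is passed through and the wrapper keeps the original method's metadata. A hypothetical, simplified illustration of the same wrapping pattern; the names and the placeholder LoRA path are not the library's:

```python
import functools
import torch

# Sketch only: capture the original forward in a closure and swap in a wrapper.
def hook_forward(submodule, loras_data):
    old_forward = submodule.forward

    def wrapped_forward(module, *args, **kwargs):
        if len(loras_data) == 0:
            return old_forward(*args, **kwargs)   # fast path: no adapters attached
        return old_forward(*args, **kwargs)       # placeholder for a LoRA-aware path

    # partial binds the module as the first argument; update_wrapper copies metadata.
    return functools.update_wrapper(functools.partial(wrapped_forward, submodule), old_forward)

layer = torch.nn.Linear(4, 4)
layer.forward = hook_forward(layer, loras_data={})
_ = layer(torch.randn(2, 4))                      # calls still dispatch through the wrapper
```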
@@ -2413,8 +2547,9 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
 
 
             if hasattr(submodule, "forward"):
-                if any_lora and isinstance(submodule, torch.nn.Linear):
-
+                # if any_lora and isinstance(submodule, ( torch.nn.Linear, torch.nn.Conv3d, torch.nn.LayerNorm)):
+                if any_lora and hasattr(submodule,"weight"):
+                    submodule_method = self.hook_lora(submodule, current_model, model_id, loras_model_data, submodule_name)
                 else:
                     submodule_method = getattr(submodule, "forward")
                 if callable(submodule_method):
{mmgp-3.4.5 → mmgp-3.4.7/src/mmgp.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.4.5
+Version: 3.4.7
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.4.5 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.4.7 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
@@ -43,7 +43,7 @@ Each profile may use a combination of the following:
 ## Sample applications that use mmgp
 It is recommended to have a look at these applications to see how mmgp was implemented in each of them:
 - Wan2GP: https://github.com/deepbeepmeep/Wan2GP :\
-An excellent text to video and image to video generator
+An excellent text to video and image to video generator that supports the best Open Source Video Architectures: Wan, Hunyuan and LTX Video
 
 - Hunyuan3D-2GP: https://github.com/deepbeepmeep/Hunyuan3D-2GP :\
 A great image to 3D and text to 3D tool by the Tencent team. Thanks to mmgp it can run with less than 6 GB of VRAM
Files without changes: LICENSE.md, setup.cfg, src/__init__.py, src/mmgp/__init__.py, src/mmgp/safetensors2.py, src/mmgp.egg-info/SOURCES.txt, src/mmgp.egg-info/dependency_links.txt, src/mmgp.egg-info/requires.txt, src/mmgp.egg-info/top_level.txt.