mmgp 3.5.12__tar.gz → 3.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mmgp might be problematic.
- {mmgp-3.5.12/src/mmgp.egg-info → mmgp-3.6.1}/PKG-INFO +2 -2
- {mmgp-3.5.12 → mmgp-3.6.1}/README.md +1 -1
- {mmgp-3.5.12 → mmgp-3.6.1}/pyproject.toml +1 -1
- {mmgp-3.5.12 → mmgp-3.6.1}/src/mmgp/offload.py +25 -15
- {mmgp-3.5.12 → mmgp-3.6.1/src/mmgp.egg-info}/PKG-INFO +2 -2
- {mmgp-3.5.12 → mmgp-3.6.1}/LICENSE.md +0 -0
- {mmgp-3.5.12 → mmgp-3.6.1}/setup.cfg +0 -0
- {mmgp-3.5.12 → mmgp-3.6.1}/src/__init__.py +0 -0
- {mmgp-3.5.12 → mmgp-3.6.1}/src/mmgp/__init__.py +0 -0
- {mmgp-3.5.12 → mmgp-3.6.1}/src/mmgp/safetensors2.py +0 -0
- {mmgp-3.5.12 → mmgp-3.6.1}/src/mmgp.egg-info/SOURCES.txt +0 -0
- {mmgp-3.5.12 → mmgp-3.6.1}/src/mmgp.egg-info/dependency_links.txt +0 -0
- {mmgp-3.5.12 → mmgp-3.6.1}/src/mmgp.egg-info/requires.txt +0 -0
- {mmgp-3.5.12 → mmgp-3.6.1}/src/mmgp.egg-info/top_level.txt +0 -0
{mmgp-3.5.12/src/mmgp.egg-info → mmgp-3.6.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.12
+Version: 3.6.1
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.5.12 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.6.1 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
{mmgp-3.5.12 → mmgp-3.6.1}/src/mmgp/offload.py

@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.5.12 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.6.1 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -688,7 +688,7 @@ def _welcome():
 if welcome_displayed:
 return
 welcome_displayed = True
-print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.12) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.1) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def change_dtype(model, new_dtype, exclude_buffers = False):
 for submodule_name, submodule in model.named_modules():
@@ -1097,7 +1097,9 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 
 invalid_keys = []
 unexpected_keys = []
-
+new_state_dict = {}
+for k in list(state_dict.keys()):
+v = state_dict.pop(k)
 lora_A = lora_B = diff_b = diff = lora_key = None
 if k.endswith(".diff"):
 diff = v
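The new loop above pops each entry out of the source state dict and stores the processed value in a fresh new_state_dict, so only one full set of tensor references is held at a time. A minimal sketch of that pattern, using illustrative names rather than mmgp's own helpers:

import torch

def rebuild_state_dict(state_dict, target_dtype=torch.bfloat16):
    # Pop entries as we go so the source dict releases its reference
    # to each tensor once it has been processed.
    new_state_dict = {}
    for k in list(state_dict.keys()):
        v = state_dict.pop(k)
        new_state_dict[k] = v.to(target_dtype) if v.is_floating_point() else v
        v = None  # drop the local reference before the next iteration
    return new_state_dict

sd = {"w": torch.randn(4, 4), "idx": torch.arange(4)}
sd = rebuild_state_dict(sd)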
@@ -1141,6 +1143,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 error_msg = append(error_msg, msg)
 fail = True
 break
+v = lora_A = lora_A.to(module.weight.dtype)
 elif lora_B != None:
 rank = lora_B.shape[1]
 if module_shape[0] != v.shape[0]:
@@ -1151,6 +1154,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 error_msg = append(error_msg, msg)
 fail = True
 break
+v = lora_B = lora_B.to(module.weight.dtype)
 elif diff != None:
 lora_B = diff
 if module_shape != v.shape:
@@ -1161,6 +1165,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 error_msg = append(error_msg, msg)
 fail = True
 break
+v = lora_B = lora_B.to(module.weight.dtype)
 elif diff_b != None:
 rank = diff_b.shape[0]
 if not hasattr(module, "bias"):
@@ -1179,8 +1184,11 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 error_msg = append(error_msg, msg)
 fail = True
 break
+v = diff_b = diff_b.to(module.weight.dtype)
 
 if not check_only:
+new_state_dict[k] = v
+v = None
 loras_module_data = loras_model_data.get(module, None)
 assert loras_module_data != None
 loras_adapter_data = loras_module_data.get(adapter_name, None)
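The added .to(module.weight.dtype) casts make each LoRA tensor match the dtype of the layer it patches before it is stored. A hedged sketch of the idea, using a plain nn.Linear rather than mmgp's module bookkeeping:

import torch
from torch import nn

layer = nn.Linear(8, 8).to(torch.bfloat16)

# LoRA factors often ship in float32/float16; align them with the target
# layer's dtype so later matmuls and merges do not mix dtypes.
lora_A = torch.randn(4, 8)   # rank x in_features
lora_B = torch.randn(8, 4)   # out_features x rank
lora_A = lora_A.to(layer.weight.dtype)
lora_B = lora_B.to(layer.weight.dtype)

assert lora_A.dtype == layer.weight.dtype == torch.bfloat16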
@@ -1188,11 +1196,11 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 loras_adapter_data = [None, None, None, 1.]
 loras_module_data[adapter_name] = loras_adapter_data
 if lora_A != None:
-loras_adapter_data[0] = lora_A
+loras_adapter_data[0] = lora_A
 elif lora_B != None:
-loras_adapter_data[1] = lora_B
+loras_adapter_data[1] = lora_B
 else:
-loras_adapter_data[2] = diff_b
+loras_adapter_data[2] = diff_b
 if rank != None and lora_key is not None and "lora" in lora_key:
 alpha_key = k[:-len(lora_key)] + "alpha"
 alpha = lora_alphas.get(alpha_key, None)
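The per-adapter slots filled here hold the low-rank factors (and an optional bias diff) plus what appears to be a multiplier. For reference, the usual way such factors modify a weight is W' = W + scale * (B @ A), with scale derived from the alpha read a few lines below; a hedged sketch independent of mmgp's internal layout:

import torch

out_features, in_features, rank = 8, 8, 4
W = torch.randn(out_features, in_features)
lora_A = torch.randn(rank, in_features)
lora_B = torch.randn(out_features, rank)
alpha, multiplier = 4.0, 1.0

# Common LoRA convention: scale the low-rank product by alpha / rank
# and by the user-supplied multiplier before adding it to the weight.
scale = (alpha / rank) * multiplier
W_patched = W + scale * (lora_B @ lora_A)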
@@ -1220,7 +1228,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 if not check_only:
 # model._loras_tied_weights[adapter_name] = tied_weights
 if pinnedLora:
-pinned_sd_list.append(
+pinned_sd_list.append(new_state_dict)
 pinned_names_list.append(path)
 # _pin_sd_to_memory(state_dict, path)
 
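pinned_sd_list collects state dicts whose tensors are later pinned in host RAM; page-locked memory is what makes asynchronous CPU-to-GPU copies possible. A small generic illustration of that mechanism (not mmgp's _pin_sd_to_memory helper):

import torch

def pin_state_dict(state_dict):
    # pin_memory() copies each tensor into page-locked host RAM,
    # which is what allows non_blocking host-to-GPU transfers.
    return {k: v.pin_memory() for k, v in state_dict.items()}

sd = {"w": torch.randn(1024, 1024)}
if torch.cuda.is_available():
    sd = pin_state_dict(sd)
    w_gpu = sd["w"].to("cuda", non_blocking=True)  # can overlap with compute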
@@ -2287,9 +2295,10 @@ class offload:
 src = f"""
 def {fname}(module, *args, **kwargs):
 _ = __TYPE_CONST # anchor type as a constant to make code object unique per class
+nada = "{fname}"
 mgr = module._mm_manager
 mgr._pre_check(module)
-return module._mm_forward(*args, **kwargs)
+return module._mm_forward(*args, **kwargs) #{fname}
 """
 ns = {"__TYPE_CONST": mod_cls}
 exec(src, ns) # compile a new function object/code object for this class
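The exec-built wrapper gives every wrapped module class its own function object, and the added nada = "{fname}" line plus the trailing #{fname} comment make the generated source, and hence the compiled code object, unique per class. A reduced sketch of that code-generation trick, with hypothetical names in place of mmgp's manager plumbing:

import torch
from torch import nn

def make_forward_wrapper(mod_cls):
    fname = f"wrapped_forward_{mod_cls.__name__}"
    src = f"""
def {fname}(module, *args, **kwargs):
    _ = __TYPE_CONST  # anchor the class so each wrapper gets a distinct code object
    return module._orig_forward(*args, **kwargs)  # {fname}
"""
    ns = {"__TYPE_CONST": mod_cls}
    exec(src, ns)  # compile a new function object for this class
    return ns[fname]

layer = nn.Linear(4, 4)
layer._orig_forward = layer.forward
wrapper = make_forward_wrapper(nn.Linear)
print(wrapper(layer, torch.randn(1, 4)).shape)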
@@ -2310,7 +2319,8 @@ def {fname}(module, *args, **kwargs):
 wrapper_fn = self._get_wrapper_for_type(type(target_module))
 
 # bind as a bound method (no partial/closures)
-target_module.forward = types.MethodType(wrapper_fn, target_module)
+# target_module.forward = types.MethodType(wrapper_fn, target_module)
+target_module.forward = functools.update_wrapper(functools.partial(wrapper_fn, target_module), previous_method)
 
 def hook_check_load_into_GPU_if_needed_default(self, target_module, model, model_id, blocks_name, previous_method, context):
 
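Instead of binding the wrapper with types.MethodType, the new code attaches it with functools.partial and copies the original method's metadata via functools.update_wrapper, so the replacement still looks like the wrapped forward to introspection. A minimal stand-alone sketch of that binding style, with an illustrative wrapper:

import functools
import torch
from torch import nn

def wrapper_fn(module, *args, **kwargs):
    # ... pre-forward bookkeeping would go here ...
    return module._orig_forward(*args, **kwargs)

layer = nn.Linear(4, 4)
layer._orig_forward = layer.forward
previous_method = layer.forward

# partial pre-binds the module; update_wrapper copies __name__, __doc__,
# __wrapped__, etc. from the method being replaced.
layer.forward = functools.update_wrapper(
    functools.partial(wrapper_fn, layer), previous_method
)
print(layer.forward(torch.randn(2, 4)).shape)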
@@ -2345,12 +2355,12 @@ def {fname}(module, *args, **kwargs):
 if isinstance(target_module, torch.nn.Linear):
 def check_load_into_GPU_needed_linear(module, *args, **kwargs):
 check_load_into_GPU_needed()
-return previous_method(*args, **kwargs)
+return previous_method(*args, **kwargs) # linear
 check_load_into_GPU_needed_module = check_load_into_GPU_needed_linear
 else:
 def check_load_into_GPU_needed_other(module, *args, **kwargs):
 check_load_into_GPU_needed()
-return previous_method(*args, **kwargs)
+return previous_method(*args, **kwargs) # other
 check_load_into_GPU_needed_module = check_load_into_GPU_needed_other
 
 setattr(target_module, "_mm_id", model_id)
@@ -2498,7 +2508,7 @@ def {fname}(module, *args, **kwargs):
 
 
 
-def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, vram_safety_coefficient = 0.8, verboseLevel = -1):
+def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, vram_safety_coefficient = 0.8, compile_mode ="default", verboseLevel = -1):
 """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
 pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
 quantizeTransformer: set True by default will quantize on the fly the video / image model
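The only signature change here is the new compile_mode parameter (default "default"), which is forwarded to torch.compile further down. A hedged usage sketch; the import path and argument values are assumptions, only the parameter names come from the signature above:

# `pipe` stands for a diffusers-style pipeline already loaded on CPU.
from mmgp import offload

offload.all(
    pipe,
    pinnedMemory=True,
    quantizeTransformer=True,
    asyncTransfers=True,
    compile=True,
    compile_mode="max-autotune-no-cudagraphs",  # passed through to torch.compile
    verboseLevel=1,
)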
@@ -2771,8 +2781,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
 elif compilationInThisOne and submodule in towers_modules:
 self.hook_preload_blocks_for_compilation(submodule, model_id, cur_blocks_name, context = submodule_name )
 else:
-if compilationInThisOne and False
-self.
+if compilationInThisOne: #and False
+self.hook_check_load_into_GPU_if_needed(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
 else:
 self.hook_check_load_into_GPU_if_needed_default(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
 
@@ -2789,7 +2799,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
 print(f"Pytorch compilation of model '{model_id}' is not yet supported.")
 
 for submodel in towers_modules:
-submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode=
+submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode= compile_mode) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
 #dynamic=True,
 
 self.tune_preloading(model_id, current_budget, towers_names)
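compile_mode is handed straight to torch.compile as its mode argument; the alternatives left in the comment ("reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs") are the standard inductor modes. A small self-contained illustration of compiling only a submodule's forward, which is the pattern used above:

import torch
from torch import nn

block = nn.Sequential(nn.Linear(64, 64), nn.GELU(), nn.Linear(64, 64))

# Compile just this block's forward; mode trades compile time for runtime speed.
compile_mode = "default"  # or "reduce-overhead", "max-autotune", ...
block.forward = torch.compile(block.forward, backend="inductor", mode=compile_mode)

out = block(torch.randn(8, 64))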
{mmgp-3.5.12 → mmgp-3.6.1/src/mmgp.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.12
+Version: 3.6.1
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.5.12 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.6.1 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 