mmgp 3.5.7__py3-none-any.whl → 3.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mmgp might be problematic.
- mmgp/offload.py +99 -28
- {mmgp-3.5.7.dist-info → mmgp-3.5.8.dist-info}/METADATA +2 -2
- mmgp-3.5.8.dist-info/RECORD +9 -0
- mmgp-3.5.7.dist-info/RECORD +0 -9
- {mmgp-3.5.7.dist-info → mmgp-3.5.8.dist-info}/WHEEL +0 -0
- {mmgp-3.5.7.dist-info → mmgp-3.5.8.dist-info}/licenses/LICENSE.md +0 -0
- {mmgp-3.5.7.dist-info → mmgp-3.5.8.dist-info}/top_level.txt +0 -0
mmgp/offload.py CHANGED

```diff
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.5.7 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.5.8 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
```
```diff
@@ -64,6 +64,11 @@ import psutil
 import builtins
 from accelerate import init_empty_weights
 
+import functools
+import types
+from functools import lru_cache
+import torch
+
 
 from mmgp import safetensors2
 from mmgp import profile_type
```
```diff
@@ -666,7 +671,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.7) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.8) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def change_dtype(model, new_dtype, exclude_buffers = False):
     for submodule_name, submodule in model.named_modules():
```
```diff
@@ -1757,6 +1762,7 @@ class offload:
         global last_offload_obj
         last_offload_obj = self
 
+        self._type_wrappers = {}
 
     def add_module_to_blocks(self, model_id, blocks_name, submodule, prev_block_name, submodule_name):
 
```
```diff
@@ -2204,7 +2210,7 @@ class offload:
                 if len(loras_data) == 0:
                     return old_forward(*args, **kwargs)
                 else:
-                    submodule.aaa = submodule_name
+                    #submodule.aaa = submodule_name # just for debugging if uncommented will cause pytorch recompilation
                     return self._lora_linear_forward(current_model, submodule, loras_data, *args, **kwargs)
             target_fn = lora_linear_forward
         else:
```
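The comment added on the disabled line above explains the motivation: assigning a per-submodule Python string inside a forward that torch.compile traces makes Dynamo specialize on that value, so each submodule can end up triggering a fresh compilation. A hypothetical, self-contained illustration of that effect (the `Tagged` module and its names are invented for this sketch, not part of mmgp):

```python
# Hypothetical illustration (not mmgp code): a per-instance string written
# inside the traced forward becomes a value Dynamo guards on, so instances
# with different strings can trigger recompilations instead of cache hits.
import torch

class Tagged(torch.nn.Module):
    def __init__(self, tag: str):
        super().__init__()
        self.tag = tag                      # differs per instance, like submodule_name
        self.linear = torch.nn.Linear(8, 8)

    def forward(self, x):
        self.last_tag = self.tag            # analogous to `submodule.aaa = submodule_name`
        return self.linear(x)

x = torch.randn(2, 8)
torch.compile(Tagged("block_0"), backend="eager")(x)
# A second instance with a different tag shares the same code object but
# generally misses the guard on the string value, so Dynamo compiles again.
torch.compile(Tagged("block_1"), backend="eager")(x)
```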
```diff
@@ -2236,10 +2242,63 @@ class offload:
 
         # need to be registered before the forward not to be break the efficiency of the compilation chain
         # it should be at the top of the compilation as this type of hook in the middle of a chain seems to break memory performance
-        target_module.register_forward_pre_hook(preload_blocks_for_compile)
+        target_module.register_forward_pre_hook(preload_blocks_for_compile)
 
 
-
+
+
+    @torch._dynamo.disable
+    def _pre_check(self, module):
+        model_id = getattr(module, "_mm_model_id", None)
+        blocks_name = getattr(module, "_mm_blocks_name", None)
+
+        self.ensure_model_loaded(model_id)
+        if blocks_name is None:
+            if self.ready_to_check_mem():
+                self.empty_cache_if_needed()
+        elif blocks_name != self.loaded_blocks[model_id] and \
+                blocks_name not in self.preloaded_blocks_per_model[model_id]:
+            self.gpu_load_blocks(model_id, blocks_name)
+
+    def _get_wrapper_for_type(self, mod_cls):
+        fn = self._type_wrappers.get(mod_cls)
+        if fn is not None:
+            return fn
+
+        # Unique function name per class -> unique compiled code object
+        fname = f"_mm_wrap_{mod_cls.__module__.replace('.', '_')}_{mod_cls.__name__}"
+
+        # Keep body minimal; all heavy/offload logic runs out-of-graph in _pre_check
+        # Include __TYPE_CONST in the code so the bytecode/consts differ per class.
+        src = f"""
+def {fname}(module, *args, **kwargs):
+    _ = __TYPE_CONST # anchor type as a constant to make code object unique per class
+    mgr = module._mm_manager
+    mgr._pre_check(module)
+    return module._mm_forward(*args, **kwargs)
+"""
+        ns = {"__TYPE_CONST": mod_cls}
+        exec(src, ns) # compile a new function object/code object for this class
+        fn = ns[fname]
+        self._type_wrappers[mod_cls] = fn
+        return fn
+
+    def hook_check_load_into_GPU_if_needed(
+        self, target_module, model, model_id, blocks_name, previous_method, context
+    ):
+        # store instance data on the module (not captured by the wrapper)
+        target_module._mm_manager = self
+        target_module._mm_model_id = model_id
+        target_module._mm_blocks_name = blocks_name
+        target_module._mm_forward = previous_method
+
+        # per-TYPE wrapper (unique bytecode per class, reused across instances of that class)
+        wrapper_fn = self._get_wrapper_for_type(type(target_module))
+
+        # bind as a bound method (no partial/closures)
+        target_module.forward = types.MethodType(wrapper_fn, target_module)
+
+    def hook_check_load_into_GPU_if_needed_default(self, target_module, model, model_id, blocks_name, previous_method, context):
 
         dtype = model._dtype
         qint4quantization = isinstance(target_module, QModuleMixin) and target_module.weight!= None and target_module.weight.qtype == qint4
```
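Taken together, the new `_pre_check`, `_get_wrapper_for_type` and `hook_check_load_into_GPU_if_needed` methods generate one wrapper function per module class with `exec`, park all per-instance state on the module itself, and keep the offload bookkeeping out of the compiled graph via `@torch._dynamo.disable`. A stripped-down sketch of the same pattern, with invented names and no mmgp internals, assuming the caller supplies the pre-call check:

```python
# Illustrative sketch of the per-class wrapper pattern above (names are made up).
# Each class gets its own exec-compiled function, hence its own code object, so
# a compiler cache keyed on code objects (e.g. torch.compile/Dynamo) does not
# keep rebuilding one shared closure for every wrapped instance.
import types

_wrappers = {}  # class -> generated wrapper function

def _wrapper_for_class(cls):
    fn = _wrappers.get(cls)
    if fn is None:
        fname = f"_wrap_{cls.__module__.replace('.', '_')}_{cls.__name__}"
        src = (
            f"def {fname}(module, *args, **kwargs):\n"
            f"    _ = __TYPE_CONST  # per-class reference baked into the generated source\n"
            f"    module._pre_call(module)  # cheap check; heavy work stays out of graph\n"
            f"    return module._orig_forward(*args, **kwargs)\n"
        )
        ns = {"__TYPE_CONST": cls}
        exec(src, ns)  # fresh function/code object for this class
        fn = _wrappers[cls] = ns[fname]
    return fn

def install_hook(module, pre_call):
    """Replace module.forward with a per-class wrapper bound to this instance."""
    module._orig_forward = module.forward   # original bound forward
    module._pre_call = pre_call             # plain function taking the module
    module.forward = types.MethodType(_wrapper_for_class(type(module)), module)
```

Because per-instance data lives on the module and the wrapper closes over nothing, one compiled artifact per class can be reused across all instances of that class, which mirrors the "no partial/closures" comment in the diff.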
```diff
@@ -2259,22 +2318,32 @@ class offload:
             target_module.forward = target_module._mm_forward
             return
 
-        def
+        def check_load_into_GPU_needed():
             self.ensure_model_loaded(model_id)
             if blocks_name == None:
                 if self.ready_to_check_mem():
                     self.empty_cache_if_needed()
             elif blocks_name != self.loaded_blocks[model_id] and blocks_name not in self.preloaded_blocks_per_model[model_id]:
                 self.gpu_load_blocks(model_id, blocks_name)
-            if qint4quantization and dtype !=None:
-
-
-
+            # if qint4quantization and dtype !=None:
+            #     args, kwargs = self.move_args_to_gpu(dtype, *args, **kwargs)
+
+        if isinstance(target_module, torch.nn.Linear):
+            def check_load_into_GPU_needed_linear(module, *args, **kwargs):
+                check_load_into_GPU_needed()
+                return previous_method(*args, **kwargs)
+            check_load_into_GPU_needed_module = check_load_into_GPU_needed_linear
+        else:
+            def check_load_into_GPU_needed_other(module, *args, **kwargs):
+                check_load_into_GPU_needed()
+                return previous_method(*args, **kwargs)
+            check_load_into_GPU_needed_module = check_load_into_GPU_needed_other
 
         setattr(target_module, "_mm_id", model_id)
         setattr(target_module, "_mm_forward", previous_method)
 
-        setattr(target_module, "forward", functools.update_wrapper(functools.partial(
+        setattr(target_module, "forward", functools.update_wrapper(functools.partial(check_load_into_GPU_needed_module, target_module), previous_method) )
+        # target_module.register_forward_pre_hook(check_empty_cuda_cache)
 
 
     def hook_change_module(self, target_module, model, model_id, module_id, previous_method, previous_method_name ):
```
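The default (non-compiled) hook keeps the earlier style: the original forward is stashed on the module as `_mm_forward` and replaced by a `functools.partial` wrapper that performs the load check before delegating. A minimal standalone sketch of that patching idiom (the `ensure_on_gpu` callback and the function names are illustrative, not mmgp's API):

```python
# Standalone sketch of the default forward patch (illustrative names only).
import functools
import torch

def patch_forward(module: torch.nn.Module, ensure_on_gpu):
    previous_method = module.forward       # original bound method of this instance

    def check_then_forward(mod, *args, **kwargs):
        ensure_on_gpu()                    # e.g. load the block's weights if needed
        return previous_method(*args, **kwargs)

    module._mm_forward = previous_method   # kept so the original can be restored
    module.forward = functools.update_wrapper(
        functools.partial(check_then_forward, module), previous_method
    )

# usage (no-op check, just to show the call shape):
# patch_forward(torch.nn.Linear(4, 4), lambda: None)
```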
```diff
@@ -2300,7 +2369,7 @@ class offload:
         if not self.verboseLevel >=1:
             return
 
-        if module_id == None or module_id =='':
+        if previous_method_name =="forward" and (module_id == None or module_id ==''):
             model_name = model._get_name()
             print(f"Hooked to model '{model_id}' ({model_name})")
 
```
```diff
@@ -2607,19 +2676,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
     for model_id in models:
         current_model: torch.nn.Module = models[model_id]
         towers_names, towers_modules = _detect_main_towers(current_model)
-        # compile main iterative modules stacks ("towers")
         compilationInThisOne = compileAllModels or model_id in modelsToCompile
-        if compilationInThisOne:
-            if self.verboseLevel>=1:
-                if len(towers_modules)>0:
-                    formated_tower_names = [name + '*' for name in towers_names]
-                    print(f"Pytorch compilation of '{model_id}' is scheduled for these modules : {formated_tower_names}.")
-                else:
-                    print(f"Pytorch compilation of model '{model_id}' is not yet supported.")
-
-            for submodel in towers_modules:
-                submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode="default" ) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
-                #dynamic=True,
 
         if pinAllModels or model_id in modelsToPin:
             if hasattr(current_model,"_already_pinned"):
```
```diff
@@ -2665,8 +2722,6 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
             # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
             top_submodule = len(submodule_name.split("."))==1
             offload_hooks = submodule._offload_hooks if hasattr(submodule, "_offload_hooks") else []
-            if len(offload_hooks) > 0:
-                pass
             assert top_submodule or len(offload_hooks) == 0, "custom offload hooks can only be set at the of the module"
             submodule_method_names = ["forward"] + offload_hooks
             for submodule_method_name in submodule_method_names:
```
```diff
@@ -2681,11 +2736,27 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                 elif compilationInThisOne and submodule in towers_modules:
                     self.hook_preload_blocks_for_compilation(submodule, model_id, cur_blocks_name, context = submodule_name )
                 else:
-
-
+                    if compilationInThisOne and False:
+                        self.hook_check_load_into_GPU_needed(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
+                    else:
+                        self.hook_check_load_into_GPU_if_needed_default(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
+
                 self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name, submodule_name)
 
 
+        # compile main iterative modules stacks ("towers")
+        if compilationInThisOne:
+            if self.verboseLevel>=1:
+                if len(towers_modules)>0:
+                    formated_tower_names = [name + '*' for name in towers_names]
+                    print(f"Pytorch compilation of '{model_id}' is scheduled for these modules : {formated_tower_names}.")
+                else:
+                    print(f"Pytorch compilation of model '{model_id}' is not yet supported.")
+
+            for submodel in towers_modules:
+                submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode="default" ) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
+                #dynamic=True,
+
         self.tune_preloading(model_id, current_budget, towers_names)
     self.parameters_ref = {}
 
```
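One structural change in this release is that the tower-compilation block now runs after the hook-installation loop instead of before it. In isolation, compiling only selected submodules' forwards with the same `torch.compile` call looks roughly like this (toy model, assuming a working inductor backend; not mmgp code):

```python
# Toy example: compile only the repeated heavy blocks of a model and leave
# everything else eager, as the relocated block above does for the detected
# "towers".
import torch

model = torch.nn.Sequential(
    torch.nn.Linear(32, 32),
    torch.nn.GELU(),
    torch.nn.Linear(32, 32),
)

towers = [m for m in model if isinstance(m, torch.nn.Linear)]
for block in towers:
    block.forward = torch.compile(block.forward, backend="inductor", mode="default")

out = model(torch.randn(4, 32))
print(out.shape)
```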
{mmgp-3.5.7.dist-info → mmgp-3.5.8.dist-info}/METADATA CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.7
+Version: 3.5.8
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.5.7 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.5.8 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
```
mmgp-3.5.8.dist-info/RECORD ADDED

```diff
@@ -0,0 +1,9 @@
+__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+mmgp/offload.py,sha256=Hc3h2JZIQ7rpmDB_Ozq8VM0oVqm2XEV7arJwT_AIwCI,130372
+mmgp/safetensors2.py,sha256=4nKV13qCMabnNEB1TA_ueFbfGYYmiQ9racR_C6SsGug,18693
+mmgp-3.5.8.dist-info/licenses/LICENSE.md,sha256=DD-WIS0BkPoWJ_8hQO3J8hMP9K_1-dyrYv1YCbkxcDU,94
+mmgp-3.5.8.dist-info/METADATA,sha256=23S2c-Z4z6_npl-w2-0hOllnw13dWhiL3dew02XEknU,16309
+mmgp-3.5.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mmgp-3.5.8.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+mmgp-3.5.8.dist-info/RECORD,,
```
mmgp-3.5.7.dist-info/RECORD DELETED

```diff
@@ -1,9 +0,0 @@
-__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
-mmgp/offload.py,sha256=SKt-EunQrH6omBFI7aNLe82GIoXBKW9y1i0HMPFrKLY,127089
-mmgp/safetensors2.py,sha256=4nKV13qCMabnNEB1TA_ueFbfGYYmiQ9racR_C6SsGug,18693
-mmgp-3.5.7.dist-info/licenses/LICENSE.md,sha256=DD-WIS0BkPoWJ_8hQO3J8hMP9K_1-dyrYv1YCbkxcDU,94
-mmgp-3.5.7.dist-info/METADATA,sha256=s420bK-WQuSZM2RpVwYjzXY-QmtIHkRbIiL9hAyV7sA,16309
-mmgp-3.5.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mmgp-3.5.7.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
-mmgp-3.5.7.dist-info/RECORD,,
```
File without changes: {mmgp-3.5.7.dist-info → mmgp-3.5.8.dist-info}/WHEEL
File without changes: {mmgp-3.5.7.dist-info → mmgp-3.5.8.dist-info}/licenses/LICENSE.md
File without changes: {mmgp-3.5.7.dist-info → mmgp-3.5.8.dist-info}/top_level.txt