mmgp 3.5.12__tar.gz → 3.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of mmgp has been flagged as potentially problematic; consult the package registry's advisory page for details.
- {mmgp-3.5.12/src/mmgp.egg-info → mmgp-3.6.0}/PKG-INFO +2 -2
- {mmgp-3.5.12 → mmgp-3.6.0}/README.md +1 -1
- {mmgp-3.5.12 → mmgp-3.6.0}/pyproject.toml +1 -1
- {mmgp-3.5.12 → mmgp-3.6.0}/src/mmgp/offload.py +12 -10
- {mmgp-3.5.12 → mmgp-3.6.0/src/mmgp.egg-info}/PKG-INFO +2 -2
- {mmgp-3.5.12 → mmgp-3.6.0}/LICENSE.md +0 -0
- {mmgp-3.5.12 → mmgp-3.6.0}/setup.cfg +0 -0
- {mmgp-3.5.12 → mmgp-3.6.0}/src/__init__.py +0 -0
- {mmgp-3.5.12 → mmgp-3.6.0}/src/mmgp/__init__.py +0 -0
- {mmgp-3.5.12 → mmgp-3.6.0}/src/mmgp/safetensors2.py +0 -0
- {mmgp-3.5.12 → mmgp-3.6.0}/src/mmgp.egg-info/SOURCES.txt +0 -0
- {mmgp-3.5.12 → mmgp-3.6.0}/src/mmgp.egg-info/dependency_links.txt +0 -0
- {mmgp-3.5.12 → mmgp-3.6.0}/src/mmgp.egg-info/requires.txt +0 -0
- {mmgp-3.5.12 → mmgp-3.6.0}/src/mmgp.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mmgp
|
|
3
|
-
Version: 3.
|
|
3
|
+
Version: 3.6.0
|
|
4
4
|
Summary: Memory Management for the GPU Poor
|
|
5
5
|
Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -15,7 +15,7 @@ Dynamic: license-file
|
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
<p align="center">
|
|
18
|
-
<H2>Memory Management 3.
|
|
18
|
+
<H2>Memory Management 3.6.0 for the GPU Poor by DeepBeepMeep</H2>
|
|
19
19
|
</p>
|
|
20
20
|
|
|
21
21
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# ------------------ Memory Management 3.
|
|
1
|
+
# ------------------ Memory Management 3.6.0 for the GPU Poor by DeepBeepMeep (mmgp)------------------
|
|
2
2
|
#
|
|
3
3
|
# This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
|
|
4
4
|
# This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
|
|
@@ -688,7 +688,7 @@ def _welcome():
|
|
|
688
688
|
if welcome_displayed:
|
|
689
689
|
return
|
|
690
690
|
welcome_displayed = True
|
|
691
|
-
print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.
|
|
691
|
+
print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.0) by DeepBeepMeep ************{ENDC}{UNBOLD}")
|
|
692
692
|
|
|
693
693
|
def change_dtype(model, new_dtype, exclude_buffers = False):
|
|
694
694
|
for submodule_name, submodule in model.named_modules():
|
|
@@ -2287,9 +2287,10 @@ class offload:
|
|
|
2287
2287
|
src = f"""
|
|
2288
2288
|
def {fname}(module, *args, **kwargs):
|
|
2289
2289
|
_ = __TYPE_CONST # anchor type as a constant to make code object unique per class
|
|
2290
|
+
nada = "{fname}"
|
|
2290
2291
|
mgr = module._mm_manager
|
|
2291
2292
|
mgr._pre_check(module)
|
|
2292
|
-
return module._mm_forward(*args, **kwargs)
|
|
2293
|
+
return module._mm_forward(*args, **kwargs) #{fname}
|
|
2293
2294
|
"""
|
|
2294
2295
|
ns = {"__TYPE_CONST": mod_cls}
|
|
2295
2296
|
exec(src, ns) # compile a new function object/code object for this class
|
|
@@ -2310,7 +2311,8 @@ def {fname}(module, *args, **kwargs):
|
|
|
2310
2311
|
wrapper_fn = self._get_wrapper_for_type(type(target_module))
|
|
2311
2312
|
|
|
2312
2313
|
# bind as a bound method (no partial/closures)
|
|
2313
|
-
target_module.forward = types.MethodType(wrapper_fn, target_module)
|
|
2314
|
+
# target_module.forward = types.MethodType(wrapper_fn, target_module)
|
|
2315
|
+
target_module.forward = functools.update_wrapper(functools.partial(wrapper_fn, target_module), previous_method)
|
|
2314
2316
|
|
|
2315
2317
|
def hook_check_load_into_GPU_if_needed_default(self, target_module, model, model_id, blocks_name, previous_method, context):
|
|
2316
2318
|
|
|
@@ -2345,12 +2347,12 @@ def {fname}(module, *args, **kwargs):
|
|
|
2345
2347
|
if isinstance(target_module, torch.nn.Linear):
|
|
2346
2348
|
def check_load_into_GPU_needed_linear(module, *args, **kwargs):
|
|
2347
2349
|
check_load_into_GPU_needed()
|
|
2348
|
-
return previous_method(*args, **kwargs)
|
|
2350
|
+
return previous_method(*args, **kwargs) # linear
|
|
2349
2351
|
check_load_into_GPU_needed_module = check_load_into_GPU_needed_linear
|
|
2350
2352
|
else:
|
|
2351
2353
|
def check_load_into_GPU_needed_other(module, *args, **kwargs):
|
|
2352
2354
|
check_load_into_GPU_needed()
|
|
2353
|
-
return previous_method(*args, **kwargs)
|
|
2355
|
+
return previous_method(*args, **kwargs) # other
|
|
2354
2356
|
check_load_into_GPU_needed_module = check_load_into_GPU_needed_other
|
|
2355
2357
|
|
|
2356
2358
|
setattr(target_module, "_mm_id", model_id)
|
|
@@ -2498,7 +2500,7 @@ def {fname}(module, *args, **kwargs):
|
|
|
2498
2500
|
|
|
2499
2501
|
|
|
2500
2502
|
|
|
2501
|
-
def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, vram_safety_coefficient = 0.8, verboseLevel = -1):
|
|
2503
|
+
def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, vram_safety_coefficient = 0.8, compile_mode ="default", verboseLevel = -1):
|
|
2502
2504
|
"""Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
|
|
2503
2505
|
pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
|
|
2504
2506
|
quantizeTransformer: set True by default will quantize on the fly the video / image model
|
|
@@ -2771,8 +2773,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
|
|
|
2771
2773
|
elif compilationInThisOne and submodule in towers_modules:
|
|
2772
2774
|
self.hook_preload_blocks_for_compilation(submodule, model_id, cur_blocks_name, context = submodule_name )
|
|
2773
2775
|
else:
|
|
2774
|
-
if compilationInThisOne and False
|
|
2775
|
-
self.
|
|
2776
|
+
if compilationInThisOne: #and False
|
|
2777
|
+
self.hook_check_load_into_GPU_if_needed(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
|
|
2776
2778
|
else:
|
|
2777
2779
|
self.hook_check_load_into_GPU_if_needed_default(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
|
|
2778
2780
|
|
|
@@ -2789,7 +2791,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
|
|
|
2789
2791
|
print(f"Pytorch compilation of model '{model_id}' is not yet supported.")
|
|
2790
2792
|
|
|
2791
2793
|
for submodel in towers_modules:
|
|
2792
|
-
submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode=
|
|
2794
|
+
submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode= compile_mode) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
|
|
2793
2795
|
#dynamic=True,
|
|
2794
2796
|
|
|
2795
2797
|
self.tune_preloading(model_id, current_budget, towers_names)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mmgp
|
|
3
|
-
Version: 3.
|
|
3
|
+
Version: 3.6.0
|
|
4
4
|
Summary: Memory Management for the GPU Poor
|
|
5
5
|
Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -15,7 +15,7 @@ Dynamic: license-file
|
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
<p align="center">
|
|
18
|
-
<H2>Memory Management 3.
|
|
18
|
+
<H2>Memory Management 3.6.0 for the GPU Poor by DeepBeepMeep</H2>
|
|
19
19
|
</p>
|
|
20
20
|
|
|
21
21
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|