mmgp 3.5.11__py3-none-any.whl → 3.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mmgp might be problematic.
- mmgp/offload.py +23 -15
- {mmgp-3.5.11.dist-info → mmgp-3.6.0.dist-info}/METADATA +2 -2
- mmgp-3.6.0.dist-info/RECORD +9 -0
- mmgp-3.5.11.dist-info/RECORD +0 -9
- {mmgp-3.5.11.dist-info → mmgp-3.6.0.dist-info}/WHEEL +0 -0
- {mmgp-3.5.11.dist-info → mmgp-3.6.0.dist-info}/licenses/LICENSE.md +0 -0
- {mmgp-3.5.11.dist-info → mmgp-3.6.0.dist-info}/top_level.txt +0 -0
mmgp/offload.py
CHANGED
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.5.11 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.6.0 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -94,7 +94,7 @@ def get_cache(cache_name):
     if all_cache is None:
         all_cache = {}
         shared_state["_cache"]= all_cache
-    cache =
+    cache = all_cache.get(cache_name, None)
     if cache is None:
         cache = {}
        all_cache[cache_name] = cache
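For orientation, a minimal self-contained sketch of the pattern this hunk restores: a named sub-cache lazily created inside a shared registry. `shared_state` stands in for mmgp's module-level state dict; the snippet is illustrative, not the library's exact code.

```python
# Sketch of the named sub-cache pattern from the hunk above: a per-name
# dict is lazily created inside a shared "_cache" registry.
shared_state = {}

def get_cache(cache_name):
    all_cache = shared_state.get("_cache")
    if all_cache is None:
        all_cache = {}
        shared_state["_cache"] = all_cache
    cache = all_cache.get(cache_name, None)  # the line restored in 3.6.0
    if cache is None:
        cache = {}
        all_cache[cache_name] = cache
    return cache

get_cache("kv")["step"] = 1
assert get_cache("kv")["step"] == 1  # repeat lookups return the same sub-cache
```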
@@ -688,7 +688,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.11) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.0) by DeepBeepMeep ************{ENDC}{UNBOLD}")

 def change_dtype(model, new_dtype, exclude_buffers = False):
     for submodule_name, submodule in model.named_modules():
@@ -2287,9 +2287,10 @@ class offload:
         src = f"""
 def {fname}(module, *args, **kwargs):
     _ = __TYPE_CONST # anchor type as a constant to make code object unique per class
+    nada = "{fname}"
     mgr = module._mm_manager
     mgr._pre_check(module)
-    return module._mm_forward(*args, **kwargs)
+    return module._mm_forward(*args, **kwargs) #{fname}
 """
         ns = {"__TYPE_CONST": mod_cls}
         exec(src, ns) # compile a new function object/code object for this class
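This hunk extends mmgp's trick of exec-compiling one forward wrapper per module class. Note that the `nada = "{fname}"` string literal is what makes each compiled code object unique (comments like `#{fname}` are stripped at compile time, so they only help when reading the generated source). A standalone sketch of the technique, where `Tiny` and its `_mm_forward` are illustrative stand-ins:

```python
def make_wrapper_for_type(mod_cls):
    # Generate a distinct function per class; the embedded string literal
    # bakes a class-specific constant into the compiled code object.
    fname = f"mm_forward_{mod_cls.__name__}"
    src = f"""
def {fname}(module, *args, **kwargs):
    _ = __TYPE_CONST  # anchor the type as a constant
    nada = "{fname}"  # class-specific literal, unique per code object
    return module._mm_forward(*args, **kwargs)  #{fname}
"""
    ns = {"__TYPE_CONST": mod_cls}
    exec(src, ns)  # compile a fresh function object for this class
    return ns[fname]

class Tiny:
    def _mm_forward(self, x):
        return x * 2

wrapper = make_wrapper_for_type(Tiny)
print(wrapper(Tiny(), 21))  # 42
```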
@@ -2310,7 +2311,8 @@ def {fname}(module, *args, **kwargs):
         wrapper_fn = self._get_wrapper_for_type(type(target_module))

         # bind as a bound method (no partial/closures)
-        target_module.forward = types.MethodType(wrapper_fn, target_module)
+        # target_module.forward = types.MethodType(wrapper_fn, target_module)
+        target_module.forward = functools.update_wrapper(functools.partial(wrapper_fn, target_module), previous_method)

     def hook_check_load_into_GPU_if_needed_default(self, target_module, model, model_id, blocks_name, previous_method, context):
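The binding change swaps `types.MethodType` for a `functools.partial` passed through `update_wrapper`, so the hooked `forward` carries the original method's metadata (`__doc__`, `__wrapped__`, ...). A minimal sketch with stand-in names, assuming only the standard library:

```python
import functools
import types

class Mod:
    def forward(self, x):
        """Original forward."""
        return x + 1

def wrapper_fn(module, *args, **kwargs):
    # stand-in for the generated per-class wrapper
    return Mod.forward(module, *args, **kwargs)

m = Mod()
previous_method = m.forward  # capture the bound original once

# Old style: a bound method that keeps the wrapper's own metadata.
m.forward = types.MethodType(wrapper_fn, m)

# New style: a partial bound to the module, with metadata copied from
# the original forward (partial objects accept attribute assignment).
m.forward = functools.update_wrapper(functools.partial(wrapper_fn, m), previous_method)

print(m.forward(1))                               # 2
print(m.forward.__wrapped__ is previous_method)   # True
```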
@@ -2345,12 +2347,12 @@ def {fname}(module, *args, **kwargs):
         if isinstance(target_module, torch.nn.Linear):
             def check_load_into_GPU_needed_linear(module, *args, **kwargs):
                 check_load_into_GPU_needed()
-                return previous_method(*args, **kwargs)
+                return previous_method(*args, **kwargs) # linear
             check_load_into_GPU_needed_module = check_load_into_GPU_needed_linear
         else:
             def check_load_into_GPU_needed_other(module, *args, **kwargs):
                 check_load_into_GPU_needed()
-                return previous_method(*args, **kwargs)
+                return previous_method(*args, **kwargs) # other
             check_load_into_GPU_needed_module = check_load_into_GPU_needed_other

         setattr(target_module, "_mm_id", model_id)
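For readers new to this part of the code, a standalone sketch of the closure-hook pattern in the hunk: the replacement forward first runs a load check, then delegates to the saved original. All names below are stand-ins for mmgp internals, and the attachment via `functools.partial` is illustrative.

```python
import functools

class FakeLinear:
    def forward(self, x):
        return x + 1

def hook(target_module):
    previous_method = target_module.forward  # bound original, kept in the closure

    def check_load_into_GPU_needed():
        # stand-in: here mmgp would move the module's weights onto the GPU
        print("load check")

    def check_load_into_GPU_needed_linear(module, *args, **kwargs):
        check_load_into_GPU_needed()
        return previous_method(*args, **kwargs)  # linear

    target_module.forward = functools.partial(check_load_into_GPU_needed_linear, target_module)

m = FakeLinear()
hook(m)
print(m.forward(41))  # prints "load check", then 42
```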
@@ -2498,7 +2500,7 @@ def {fname}(module, *args, **kwargs):



-def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
+def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, vram_safety_coefficient = 0.8, compile_mode ="default", verboseLevel = -1):
     """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
     pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
     quantizeTransformer: set True by default will quantize on the fly the video / image model
@@ -2507,6 +2509,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
     budgets: 0 by default (unlimited). If non 0, it corresponds to the maximum size in MB that every model will occupy at any moment
         (in fact the real usage is twice this number). It is very efficient to reduce VRAM consumption but this feature may be very slow
         if pinnedMemory is not enabled
+    vram_safety_coefficient: float between 0 and 1 (exclusive), default 0.8. Sets the maximum portion of VRAM that can be used for models.
+        Lower values provide more safety margin but may reduce performance.
     """
     self = offload()
     self.verboseLevel = verboseLevel
@@ -2522,7 +2526,11 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
             return float(b[:-1]) * self.device_mem_capacity
         else:
             return b * ONE_MB
-
+
+    # Validate vram_safety_coefficient
+    if not isinstance(vram_safety_coefficient, float) or vram_safety_coefficient <= 0 or vram_safety_coefficient >= 1:
+        raise ValueError("vram_safety_coefficient must be a float between 0 and 1 (exclusive)")
+
     budget = 0
     if not budgets is None:
         if isinstance(budgets , dict):
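Taken together, the two new parameters surface in the public entry point. A hedged usage sketch, assuming the parameter names from the signature above; the toy module dict keeps it self-contained (real use would pass e.g. a loaded diffusers pipeline), and runtime behavior on a given machine is not guaranteed:

```python
import torch
from mmgp import offload

# Per the docstring, either a pipeline object or a dict of modules works.
modules = {"transformer": torch.nn.Linear(8, 8)}

offload.all(
    modules,
    pinnedMemory=False,
    quantizeTransformer=False,
    budgets=5000,                 # per-model cap, in MB
    vram_safety_coefficient=0.7,  # new in 3.6.0: cap each model at 70% of VRAM
    compile=False,
    compile_mode="default",       # new in 3.6.0: forwarded to torch.compile(mode=...)
)
```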
@@ -2667,14 +2675,14 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
         model_budget = new_budget if model_budget == 0 or new_budget < model_budget else model_budget
         if model_budget > 0 and model_budget > current_model_size:
             model_budget = 0
-        coef =
+        coef =vram_safety_coefficient
         if current_model_size > coef * self.device_mem_capacity and model_budget == 0 or model_budget > coef * self.device_mem_capacity:
             if verboseLevel >= 1:
                 if model_budget == 0:
-                    print(f"Model '{model_id}' is too large ({current_model_size/ONE_MB:0.1f} MB) to fit entirely in {coef * 100}% of the VRAM (max capacity is {coef * self.device_mem_capacity/ONE_MB}) MB)")
+                    print(f"Model '{model_id}' is too large ({current_model_size/ONE_MB:0.1f} MB) to fit entirely in {coef * 100:.0f}% of the VRAM (max capacity is {coef * self.device_mem_capacity/ONE_MB:0.1f}) MB)")
                 else:
                     print(f"Budget ({budget/ONE_MB:0.1f} MB) for Model '{model_id}' is too important so that this model can fit in the VRAM (max capacity is {self.device_mem_capacity/ONE_MB}) MB)")
-            print(f"Budget allocation for this model has been consequently reduced to the
+            print(f"Budget allocation for this model has been consequently reduced to the {coef * 100:.0f}% of max GPU Memory ({coef * self.device_mem_capacity/ONE_MB:0.1f} MB). This may not leave enough working VRAM and you will probably need to define manually a lower budget for this model.")
             model_budget = coef * self.device_mem_capacity

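To make the arithmetic concrete, here is the cap the coefficient imposes on, say, a 24 GB card (the numbers are illustrative; `ONE_MB` matches the constant used in the hunk):

```python
ONE_MB = 1024 * 1024
device_mem_capacity = 24 * 1024 * ONE_MB  # a 24 GB card, as in the module header
vram_safety_coefficient = 0.8             # the 3.6.0 default

cap = vram_safety_coefficient * device_mem_capacity
print(f"per-model VRAM cap: {cap / ONE_MB:0.1f} MB")  # 19660.8 MB
```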
@@ -2765,8 +2773,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
             elif compilationInThisOne and submodule in towers_modules:
                 self.hook_preload_blocks_for_compilation(submodule, model_id, cur_blocks_name, context = submodule_name )
             else:
-                if compilationInThisOne and False:
-                    self.
+                if compilationInThisOne: #and False
+                    self.hook_check_load_into_GPU_if_needed(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
                 else:
                     self.hook_check_load_into_GPU_if_needed_default(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )

@@ -2783,7 +2791,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                 print(f"Pytorch compilation of model '{model_id}' is not yet supported.")

             for submodel in towers_modules:
-                submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode=
+                submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode= compile_mode) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
                 #dynamic=True,

             self.tune_preloading(model_id, current_budget, towers_names)
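The new `compile_mode` is handed straight to `torch.compile`'s `mode` argument. A minimal runnable sketch, where the `Linear` module is a stand-in for a tower submodule:

```python
import torch

submodel = torch.nn.Linear(8, 8)  # stand-in for a tower submodule

# Valid modes include "default", "reduce-overhead", "max-autotune",
# and "max-autotune-no-cudagraphs", matching the comment in the hunk.
submodel.forward = torch.compile(submodel.forward, backend="inductor", mode="default")
print(submodel(torch.randn(2, 8)).shape)  # torch.Size([2, 8])
```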
{mmgp-3.5.11.dist-info → mmgp-3.6.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.11
+Version: 3.6.0
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file


 <p align="center">
-    <H2>Memory Management 3.5.11 for the GPU Poor by DeepBeepMeep</H2>
+    <H2>Memory Management 3.6.0 for the GPU Poor by DeepBeepMeep</H2>
 </p>

mmgp-3.6.0.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+mmgp/offload.py,sha256=UaqWLw8jrNs9ibbIWplbLO5Cym84Txuu4lzttgxmnXs,132411
+mmgp/safetensors2.py,sha256=zYNMprt1KoxgVALbcz6DawxsQDNNRImvgO9cYRChUiY,19028
+mmgp-3.6.0.dist-info/licenses/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+mmgp-3.6.0.dist-info/METADATA,sha256=epm8_KuIB_c4W9iB31KIbHtNjdVuLyvW-DZoc8RR434,16309
+mmgp-3.6.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mmgp-3.6.0.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+mmgp-3.6.0.dist-info/RECORD,,
mmgp-3.5.11.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
-mmgp/offload.py,sha256=KO5wEuGNKxJPaL_ZHuGZDL8l0ZZIY_zf3yI4vBYzoFQ,131664
-mmgp/safetensors2.py,sha256=zYNMprt1KoxgVALbcz6DawxsQDNNRImvgO9cYRChUiY,19028
-mmgp-3.5.11.dist-info/licenses/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
-mmgp-3.5.11.dist-info/METADATA,sha256=-071YZvgNg093aC0OMNZT1-o3ZXu9RqTquoEzBYsPBE,16311
-mmgp-3.5.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mmgp-3.5.11.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
-mmgp-3.5.11.dist-info/RECORD,,
{mmgp-3.5.11.dist-info → mmgp-3.6.0.dist-info}/WHEEL
File without changes
{mmgp-3.5.11.dist-info → mmgp-3.6.0.dist-info}/licenses/LICENSE.md
File without changes
{mmgp-3.5.11.dist-info → mmgp-3.6.0.dist-info}/top_level.txt
File without changes