mmgp-3.2.5-py3-none-any.whl → mmgp-3.2.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mmgp might be problematic.
- mmgp/offload.py +60 -11
- {mmgp-3.2.5.dist-info → mmgp-3.2.6.dist-info}/METADATA +2 -2
- mmgp-3.2.6.dist-info/RECORD +9 -0
- {mmgp-3.2.5.dist-info → mmgp-3.2.6.dist-info}/WHEEL +1 -1
- mmgp-3.2.5.dist-info/RECORD +0 -9
- {mmgp-3.2.5.dist-info → mmgp-3.2.6.dist-info}/LICENSE.md +0 -0
- {mmgp-3.2.5.dist-info → mmgp-3.2.6.dist-info}/top_level.txt +0 -0
mmgp/offload.py
CHANGED
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.2.5 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.2.6 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -61,7 +61,12 @@ import sys
 import os
 import json
 import psutil
-
+from accelerate import init_empty_weights
+
+try:
+
+    from peft.tuners.tuners_utils import BaseTuner
+
     from diffusers.utils.peft_utils import set_weights_and_activate_adapters, get_peft_kwargs
 except:
     set_weights_and_activate_adapters = None
@@ -297,12 +302,13 @@ def _get_tensor_ref(p):
     return p.data_ptr()


-def _pin_to_memory(model, model_id, partialPinning = False, verboseLevel = 1):
+def _pin_to_memory(model, model_id, partialPinning = False, pinnedLora = True, verboseLevel = 1):
     if partialPinning:
         towers_names, _ = _detect_main_towers(model)


-    BIG_TENSOR_MAX_SIZE = 2**28 # 256 MB
+    # BIG_TENSOR_MAX_SIZE = 2**28 # 256 MB
+    BIG_TENSOR_MAX_SIZE = 2**27 # 128 MB
     current_big_tensor_size = 0
     big_tensor_no = 0
     big_tensors_sizes = []
@@ -314,6 +320,9 @@ def _pin_to_memory(model, model_id, partialPinning = False, verboseLevel = 1):
         include = True
         if partialPinning:
             include = any(k.startswith(pre) for pre in towers_names) if partialPinning else True
+        if include and not pinnedLora and ".lora_" in k:
+            include = False
+
         if include:
             params_dict.update( { k + '.' + n : (p, False) for n, p in sub_module.named_parameters(recurse=False) } )
             params_dict.update( { k + '.' + n : (b, True) for n, b in sub_module.named_buffers(recurse=False) } )
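Together, the two hunks above halve the pinned-buffer chunk size (256 MB to 128 MB) and thread a new pinnedLora flag through _pin_to_memory so LoRA adapter tensors can be kept out of page-locked RAM. A minimal standalone sketch of the new filter logic, with an illustrative helper name and sample keys (should_pin is not part of mmgp):

# Hypothetical helper isolating the new pinning filter; the function name
# and the sample keys below are illustrative, not taken from mmgp.
def should_pin(name, partial_pinning, towers_names, pinned_lora):
    include = any(name.startswith(pre) for pre in towers_names) if partial_pinning else True
    # With pinnedLora False, tensors whose key contains ".lora_" are skipped,
    # so small, frequently swapped adapter weights don't consume pinned RAM.
    if include and not pinned_lora and ".lora_" in name:
        include = False
    return include

print(should_pin("blocks.0.attn.lora_A.weight", False, [], pinned_lora=False))  # False
print(should_pin("blocks.0.attn.to_q.weight", False, [], pinned_lora=False))    # True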
@@ -479,7 +488,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.2.5) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.2.6) by DeepBeepMeep ************{ENDC}{UNBOLD}")

 def _extract_num_from_str(num_in_str):
     size = len(num_in_str)
@@ -800,7 +809,7 @@ def _lora_linear_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
             scaling = get_scaling(active_adapter)
             lora_A_weight = lora_A.weight
             lora_B_weight = lora_B.weight
-            if new_weights:
+            if new_weights or True:
                 base_weight = torch.addmm(base_weight, lora_B_weight, lora_A_weight, alpha= scaling )
                 # base_weight = base_weight + scaling * lora_B_weight @ lora_A_weight
             else:
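Two things change here: the condition becomes `if new_weights or True:`, which makes this branch unconditional (the else path below is now dead code), and the branch folds the LoRA delta into the base weight with a single torch.addmm call. A small sketch, independent of mmgp, checking the fused call against the commented-out naive form (shapes are arbitrary):

import torch

out_f, in_f, rank, scaling = 8, 16, 4, 0.5
W = torch.randn(out_f, in_f)  # base_weight
B = torch.randn(out_f, rank)  # lora_B.weight
A = torch.randn(rank, in_f)   # lora_A.weight

# torch.addmm(input, mat1, mat2, alpha=s) returns input + s * (mat1 @ mat2)
fused = torch.addmm(W, B, A, alpha=scaling)
naive = W + scaling * (B @ A)
print(torch.allclose(fused, naive, atol=1e-6))  # True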
@@ -857,7 +866,47 @@ def _lora_linear_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:

     result = result.to(torch_result_dtype)
     return result
-
+
+def _inject_adapter(
+    self, model: torch.nn.Module, adapter_name: str, autocast_adapter_dtype: bool = True, low_cpu_mem_usage: bool = False
+) -> None:
+
+    def _get_submodules(model, key):
+        parent = model.get_submodule(".".join(key.split(".")[:-1]))
+        target_name = key.split(".")[-1]
+        target = model.get_submodule(key)
+        return parent, target, target_name
+
+    peft_config = self.peft_config[adapter_name]
+    self._check_new_adapter_config(peft_config)
+
+    model_config = self.get_model_config(model)
+
+    peft_config = self._prepare_adapter_config(peft_config, model_config)
+
+    self._prepare_model(peft_config, model)
+
+    target_modules = peft_config.target_modules.copy()
+
+    # unexpected_modules = []
+    for key, target in model.named_modules():
+        if not key:
+            continue
+        if key in target_modules:
+            target_modules.remove(key)
+            self.targeted_module_names.append(key)
+            # pos = key.rfind(".")
+            # parent = key[:pos]
+            # target_name = key[pos+1:]
+            parent, target, target_name = _get_submodules(model, key)
+            with init_empty_weights():
+                self._create_and_replace(peft_config, adapter_name, target, target_name, parent, current_key=key)
+
+    self.set_adapter(self.active_adapters)
+    self._mark_only_adapters_as_trainable(model)
+
+    return target_modules
+
 def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, split_linear_modules_map = None, preprocess_sd = None, verboseLevel = -1,):
     verboseLevel = _compute_verbose_level(verboseLevel)

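The new _inject_adapter reimplements a simplified version of peft's BaseTuner.inject_adapter, with one notable twist: _create_and_replace runs inside accelerate's init_empty_weights context, so the replacement LoRA modules are created on the meta device and no real memory is allocated until their weights are actually loaded. A short sketch of what that context manager does (the layer size is arbitrary):

import torch
from accelerate import init_empty_weights

with init_empty_weights():
    layer = torch.nn.Linear(4096, 4096)

print(layer.weight.device)  # meta: parameters exist as shapes only
print(layer.weight.shape)   # torch.Size([4096, 4096])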
@@ -866,6 +915,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, split_linear_modules_map = None, preprocess_sd = None, verboseLevel = -1,):

     from peft.tuners.lora import Linear
     Linear.forward = _lora_linear_forward
+    BaseTuner.inject_adapter = _inject_adapter

     if not isinstance(lora_path, list):
         lora_path = [lora_path]
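Both overrides are installed by monkey-patching the peft classes: rebinding Linear.forward and BaseTuner.inject_adapter on the class itself makes every later call, from any instance, go through the mmgp versions. The pattern reduced to a toy example (class and names are illustrative):

class Greeter:
    def hello(self):
        return "hello"

def _patched_hello(self):
    return "patched hello"

# Rebinding the method on the class affects all instances, including
# ones created afterwards, the same mechanism as the peft hooks above.
Greeter.hello = _patched_hello
print(Greeter().hello())  # patched hello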
@@ -979,7 +1029,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, split_linear_modules_map = None, preprocess_sd = None, verboseLevel = -1,):
         unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
         if unexpected_keys:
             raise Exception(f"Lora '{path}' contains invalid keys '{unexpected_keys}'")
-
+
         if verboseLevel >=1:
             print(f"Lora '{path}' was loaded in model '{_get_module_name(model)}'")
     if activate_all_loras:
@@ -1025,7 +1075,6 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizat


     import os.path
-    from accelerate import init_empty_weights

     if not (model_path.endswith(".sft") or model_path.endswith(".safetensors")):
         raise Exception("full model path to file expected")
@@ -1811,7 +1860,7 @@ class offload:



-    def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
+    def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedLora = False, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
         """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
         pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
         quantizeTransformer: set True by default will quantize on the fly the video / image model
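offload.all gains a pinnedLora argument (default False), which is forwarded to _pin_to_memory in the hunk below. A hypothetical call, assuming a diffusers-style pipeline object as in the mmgp README (pipe is a placeholder):

from mmgp import offload

offload.all(
    pipe,                    # pipeline object or dict of modules (placeholder)
    pinnedMemory=True,       # pin model weights into page-locked RAM
    pinnedLora=False,        # new in 3.2.6: leave LoRA tensors unpinned
    quantizeTransformer=True,
)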
@@ -2010,7 +2059,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
                 if self.verboseLevel >=1:
                     print(f"Model '{model_id}' already pinned to reserved memory")
                 else:
-                    _pin_to_memory(current_model, model_id, partialPinning= partialPinning, verboseLevel=verboseLevel)
+                    _pin_to_memory(current_model, model_id, partialPinning= partialPinning, pinnedLora = pinnedLora, verboseLevel=verboseLevel)

             current_budget = model_budgets[model_id]
             cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False
{mmgp-3.2.5.dist-info → mmgp-3.2.6.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: mmgp
-Version: 3.2.5
+Version: 3.2.6
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -17,7 +17,7 @@ Requires-Dist: peft


 <p align="center">
-<H2>Memory Management 3.2.5 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.2.6 for the GPU Poor by DeepBeepMeep</H2>
 </p>

mmgp-3.2.6.dist-info/RECORD
ADDED

@@ -0,0 +1,9 @@
+__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+mmgp/offload.py,sha256=sN95BJAvdWOu36AWwJlACdxMDiOzeqL2HXLN90oaec4,98169
+mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
+mmgp-3.2.6.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+mmgp-3.2.6.dist-info/METADATA,sha256=F7LmNAvBTLEEfFT-Wbh7md4s1U4Vdnt4RrBfuBXpH_s,16151
+mmgp-3.2.6.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+mmgp-3.2.6.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+mmgp-3.2.6.dist-info/RECORD,,
mmgp-3.2.5.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
-mmgp/offload.py,sha256=XQOTMMp5UQku3byZwDr_dYgD3tK4DNTZkwotVyPg-Lk,96434
-mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
-mmgp-3.2.5.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
-mmgp-3.2.5.dist-info/METADATA,sha256=s6c1X2ar9DQH1CiLAHdO5X60fuNfKqfmqu-xL_W6j5s,16151
-mmgp-3.2.5.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
-mmgp-3.2.5.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
-mmgp-3.2.5.dist-info/RECORD,,
{mmgp-3.2.5.dist-info → mmgp-3.2.6.dist-info}/LICENSE.md
File without changes

{mmgp-3.2.5.dist-info → mmgp-3.2.6.dist-info}/top_level.txt
File without changes