mmgp 3.1.4.post1592__py3-none-any.whl → 3.1.4.post151926__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mmgp might be problematic.
- mmgp/offload.py +52 -19
- {mmgp-3.1.4.post1592.dist-info → mmgp-3.1.4.post151926.dist-info}/METADATA +1 -1
- mmgp-3.1.4.post151926.dist-info/RECORD +9 -0
- mmgp-3.1.4.post1592.dist-info/RECORD +0 -9
- {mmgp-3.1.4.post1592.dist-info → mmgp-3.1.4.post151926.dist-info}/LICENSE.md +0 -0
- {mmgp-3.1.4.post1592.dist-info → mmgp-3.1.4.post151926.dist-info}/WHEEL +0 -0
- {mmgp-3.1.4.post1592.dist-info → mmgp-3.1.4.post151926.dist-info}/top_level.txt +0 -0
mmgp/offload.py
CHANGED
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.1.4-
+# ------------------ Memory Management 3.1.4-15926 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -457,7 +457,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.1.4-
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.1.4-15192) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def _extract_num_from_str(num_in_str):
     size = len(num_in_str)
@@ -728,15 +728,20 @@ def _lora_linear_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor
                 continue
             if self.use_dora[active_adapter]:
                 raise Exception("Dora not yet supported by mmgp")
+
             lora_A = self.lora_A[active_adapter]
             lora_B = self.lora_B[active_adapter]
+            dropout = self.lora_dropout[active_adapter]
             scaling = self.scaling[active_adapter]
             lora_A_weight = lora_A.weight
             lora_B_weight = lora_B.weight
             lora_BA = lora_B_weight @ lora_A_weight
             base_weight += scaling * lora_BA
 
-
+        if self.training:
+            result = torch.nn.functional.linear(dropout(x), base_weight, bias=self.base_layer.bias)
+        else:
+            result = torch.nn.functional.linear(x, base_weight, bias=self.base_layer.bias)
         torch_result_dtype = result.dtype
 
     else:
@@ -754,14 +759,18 @@ def _lora_linear_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor
             x = x.to(lora_A.weight.dtype)
 
             if not self.use_dora[active_adapter]:
-
+                if self.training:
+                    y = lora_A(dropout(x))
+                else:
+                    y = lora_A(x)
+
                 y = lora_B(y)
                 y*= scaling
                 result+= y
                 del lora_A, lora_B, y
                 # result = result + lora_B(lora_A(dropout(x))) * scaling
             else:
-                if isinstance(dropout, nn.Identity) or not self.training:
+                if isinstance(dropout, torch.nn.Identity) or not self.training:
                     base_result = result
                 else:
                     x = dropout(x)
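In plain terms, the two hunks above make the LoRA linear forward pass dropout-aware: the adapter's dropout module is now fetched per adapter and applied to the input only while the layer is in training mode, both in the merged-weight path and in the per-adapter path. A minimal sketch of that pattern, using plain torch modules (the names base_layer, lora_A, lora_B, scaling and dropout mirror the diff; the standalone function and its signature are illustrative, not mmgp's actual class method):

import torch

def lora_forward_sketch(base_layer: torch.nn.Linear,
                        lora_A: torch.nn.Linear,
                        lora_B: torch.nn.Linear,
                        scaling: float,
                        dropout: torch.nn.Module,
                        x: torch.Tensor,
                        training: bool) -> torch.Tensor:
    # Merged-weight path: fold the low-rank update B @ A into the base weight.
    base_weight = base_layer.weight + scaling * (lora_B.weight @ lora_A.weight)
    # Dropout is applied to the input only during training, as in the diff.
    inp = dropout(x) if training else x
    return torch.nn.functional.linear(inp, base_weight, bias=base_layer.bias)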
@@ -1612,6 +1621,31 @@ class offload:
         if self.verboseLevel >=1:
             print(f"Async loading plan for model '{model_id}' : {(preload_total+base_size)/ONE_MB:0.2f} MB will be preloaded (base size of {base_size/ONE_MB:0.2f} MB + {preload_total/total_size*100:0.1f}% of recurrent layers data) with a {max_blocks_fetch/ONE_MB:0.2f} MB async" + (" circular" if len(towers) == 1 else "") + " shuttle")
 
+    def release(self):
+        global last_offload_obj
+
+        if last_offload_obj == self:
+            last_offload_obj = None
+
+        self.unload_all()
+        self.default_stream = None
+        keys= [k for k in self.blocks_of_modules.keys()]
+        for k in keys:
+            del self.blocks_of_modules[k]
+
+        self.blocks_of_modules = None
+
+
+        for model_id, model in self.models.items():
+            move_loras_to_device(model, "cpu")
+
+        self.models = None
+
+        gc.collect()
+        torch.cuda.empty_cache()
+
+
+
 
     def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
         """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
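The new release() method gives callers an explicit teardown path: it unloads everything, clears the blocks_of_modules bookkeeping, moves LoRA weights back to CPU, then runs gc.collect() and torch.cuda.empty_cache(). A hedged usage sketch, assuming (as the method's presence on the class suggests) that offload.all(...) returns the offload instance it creates; the pipe object and the parameter values are placeholders, not taken from this diff:

from mmgp import offload

# 'pipe' stands in for a diffusers-style pipeline already built by the caller.
offloadobj = offload.all(pipe, pinnedMemory=True, budgets={"*": 3000})

# ... run inference with the pipeline as usual ...

# New in this version: explicitly free mmgp's state (blocks unloaded,
# LoRAs moved back to CPU, CUDA cache emptied) when the model is retired.
offloadobj.release()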
@@ -1893,12 +1927,11 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
     """Apply a configuration profile that depends on your hardware:
     pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
     profile_name : num of the profile:
-        HighRAM_HighVRAM_Fastest (=1):
-        HighRAM_LowVRAM_Fast (=2):
-
-
-
-        VerylowRAM_LowVRAM_Slowest (=5): at least 24 GB of RAM and 10 GB of VRAM : if you don't have much it won't be fast but maybe it will work
+        HighRAM_HighVRAM_Fastest (=1): will try to load entirely a model in VRAM and to keep a copy in reserved RAM for fast loading / unloading
+        HighRAM_LowVRAM_Fast (=2): will try to load only the needed parts of a model in VRAM and to keep a copy in reserved RAM for fast loading / unloading
+        LowRAM_HighVRAM_Medium (=3): will try to load entirely a model in VRAM and to keep a copy in reserved RAM for fast loading / unloading, 8 bits quantization of main model
+        LowRAM_LowVRAM_Slow (=4): will try to load only the needed parts of a model in VRAM and to keep a copy in reserved RAM for fast loading / unloading, 8 bits quantization of main models
+        VerylowRAM_LowVRAM_Slowest (=5): will try to load only the needed parts of a model in VRAM, 8 bits quantization of main models
     overrideKwargs: every parameter accepted by Offload.All can be added here to override the profile choice
     For instance set quantizeTransformer = False to disable transformer quantization which is by default in every profile
     """
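The rewritten docstring also spells out the override mechanism: any keyword accepted by offload.all can be passed to profile() to override what the chosen profile would set. A hedged sketch of such a call, with the import path and the returned object assumed rather than taken from this diff, and pipe again a placeholder:

from mmgp import offload, profile_type

# Profile 4 (LowRAM_LowVRAM) with transformer quantization disabled,
# following the docstring's own quantizeTransformer example.
offloadobj = offload.profile(pipe,
                             profile_no=profile_type.LowRAM_LowVRAM,
                             quantizeTransformer=False)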
@@ -1942,21 +1975,21 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
     if profile_no == profile_type.HighRAM_HighVRAM:
         pinnedMemory= True
         budgets = None
-        info = "You have chosen a profile that may require 48 GB of RAM and up to 24 GB of VRAM on some applications."
+        # info = "You have chosen a profile that may require 48 GB of RAM and up to 24 GB of VRAM on some applications."
     elif profile_no == profile_type.HighRAM_LowVRAM:
         pinnedMemory= True
         budgets["*"] = 3000
-        info = "You have chosen a profile that may require 48 GB of RAM and up to 12 GB of VRAM on some applications."
+        # info = "You have chosen a profile that may require 48 GB of RAM and up to 12 GB of VRAM on some applications."
     elif profile_no == profile_type.LowRAM_HighVRAM:
         pinnedMemory= "transformer"
         extraModelsToQuantize = default_extraModelsToQuantize
         budgets = None
-        info = "You have chosen a Medium speed profile that may require 32 GB of RAM and up to 24 GB of VRAM on some applications."
+        # info = "You have chosen a Medium speed profile that may require 32 GB of RAM and up to 24 GB of VRAM on some applications."
     elif profile_no == profile_type.LowRAM_LowVRAM:
         pinnedMemory= "transformer"
         extraModelsToQuantize = default_extraModelsToQuantize
         budgets["*"] = 3000
-        info = "You have chosen a profile that usually may require 32 GB of RAM and up to 12 GB of VRAM on some applications."
+        # info = "You have chosen a profile that usually may require 32 GB of RAM and up to 12 GB of VRAM on some applications."
     elif profile_no == profile_type.VerylowRAM_LowVRAM:
         pinnedMemory= False
         extraModelsToQuantize = default_extraModelsToQuantize
@@ -1964,11 +1997,11 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
         if "transformer" in modules:
             budgets["transformer"] = 400
         #asyncTransfers = False
-        info = "You have chosen the slowest profile that may require 24 GB of RAM and up to 10 GB of VRAM on some applications."
+        # info = "You have chosen the slowest profile that may require 24 GB of RAM and up to 10 GB of VRAM on some applications."
     else:
         raise Exception("Unknown profile")
-    info += " Actual requirements may varry depending on the application or on the tuning done to the profile."
-
+    # info += " Actual requirements may varry depending on the application or on the tuning done to the profile."
+    info =""
     if budgets != None and len(budgets) == 0:
         budgets = None
 
@@ -1976,7 +2009,7 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
     kwargs = { "pinnedMemory": pinnedMemory, "extraModelsToQuantize" : extraModelsToQuantize, "budgets": budgets, "asyncTransfers" : asyncTransfers, "quantizeTransformer": quantizeTransformer }
 
     if verboseLevel>=2:
-        info = info
+        info = info + f"Profile '{profile_type.tostr(profile_no)}' sets the following options:" #CrLf
         for k,v in kwargs.items():
             if k in overrideKwargs:
                 info = info + CrLf + f"- '{k}': '{kwargs[k]}' overriden with value '{overrideKwargs[k]}'"
mmgp-3.1.4.post151926.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+mmgp/offload.py,sha256=gllpz2NNwJL_ESk-L1fMU96inOwh3WB4mu4NitGKyHI,87180
+mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
+mmgp-3.1.4.post151926.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+mmgp-3.1.4.post151926.dist-info/METADATA,sha256=JHxiA5Y6w4OwT2LV8mZ-biMZ5Sk3UA87fAdD7O2cCko,15950
+mmgp-3.1.4.post151926.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+mmgp-3.1.4.post151926.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+mmgp-3.1.4.post151926.dist-info/RECORD,,
mmgp-3.1.4.post1592.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
-mmgp/offload.py,sha256=dfmplgTm19DPJ8AKqOf8McaY2f63cz3Dqim_-Hvpcqo,86202
-mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
-mmgp-3.1.4.post1592.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
-mmgp-3.1.4.post1592.dist-info/METADATA,sha256=4PHdTr9MliaSu4UO5ET8GSD9sJRKqLyL8PMcbCcnhtQ,15948
-mmgp-3.1.4.post1592.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-mmgp-3.1.4.post1592.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
-mmgp-3.1.4.post1592.dist-info/RECORD,,
{mmgp-3.1.4.post1592.dist-info → mmgp-3.1.4.post151926.dist-info}/LICENSE.md
File without changes

{mmgp-3.1.4.post1592.dist-info → mmgp-3.1.4.post151926.dist-info}/WHEEL
File without changes

{mmgp-3.1.4.post1592.dist-info → mmgp-3.1.4.post151926.dist-info}/top_level.txt
File without changes