mmgp 3.1.4.post1592.tar.gz → 3.1.4.post151926.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mmgp might be problematic.

PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: mmgp
-Version: 3.1.4.post1592
+Version: 3.1.4.post151926
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE
pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "mmgp"
-version = "3.1.4-1592"
+version = "3.1.4-151926"
 authors = [
   { name = "deepbeepmeep", email = "deepbeepmeep@yahoo.com" },
 ]
mmgp offload module (Python source)

@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.1.4-1519 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.1.4-15926 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -457,7 +457,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.1.4-151) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.1.4-15192) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def _extract_num_from_str(num_in_str):
     size = len(num_in_str)
@@ -728,15 +728,20 @@ def _lora_linear_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor
                 continue
             if self.use_dora[active_adapter]:
                 raise Exception("Dora not yet supported by mmgp")
+
             lora_A = self.lora_A[active_adapter]
             lora_B = self.lora_B[active_adapter]
+            dropout = self.lora_dropout[active_adapter]
             scaling = self.scaling[active_adapter]
             lora_A_weight = lora_A.weight
             lora_B_weight = lora_B.weight
             lora_BA = lora_B_weight @ lora_A_weight
             base_weight += scaling * lora_BA
 
-        result = torch.nn.functional.linear(x, base_weight, bias=self.base_layer.bias)
+        if self.training:
+            result = torch.nn.functional.linear(dropout(x), base_weight, bias=self.base_layer.bias)
+        else:
+            result = torch.nn.functional.linear(x, base_weight, bias=self.base_layer.bias)
         torch_result_dtype = result.dtype
 
     else:
@@ -754,14 +759,18 @@ def _lora_linear_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor
             x = x.to(lora_A.weight.dtype)
 
             if not self.use_dora[active_adapter]:
-                y = lora_A(x)
+                if self.training:
+                    y = lora_A(dropout(x))
+                else:
+                    y = lora_A(x)
+
                 y = lora_B(y)
                 y*= scaling
                 result+= y
                 del lora_A, lora_B, y
                 # result = result + lora_B(lora_A(dropout(x))) * scaling
             else:
-                if isinstance(dropout, nn.Identity) or not self.training:
+                if isinstance(dropout, torch.nn.Identity) or not self.training:
                     base_result = result
                 else:
                     x = dropout(x)
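
Taken together, the two hunks above make _lora_linear_forward fetch the adapter's dropout module alongside lora_A and lora_B and route the input through it only while the layer is in training mode, on both the fused-weight path and the per-adapter path; the Identity check also switches to the fully qualified torch.nn.Identity. Below is a minimal, self-contained sketch of that training-only-dropout pattern; the function and variable names are local stand-ins, not mmgp's attributes:

import torch
from torch import nn

def lora_forward(x, base_layer, lora_A, lora_B, dropout, scaling, training):
    # Dropout should only fire during training; at inference x passes through untouched.
    h = dropout(x) if training else x
    # Standard LoRA update: base(x) + B(A(h)) * scaling.
    return base_layer(x) + lora_B(lora_A(h)) * scaling

# Toy usage: a 16-dim linear layer with a rank-4 adapter.
base = nn.Linear(16, 16)
lora_A = nn.Linear(16, 4, bias=False)
lora_B = nn.Linear(4, 16, bias=False)
drop = nn.Dropout(p=0.1)
y = lora_forward(torch.randn(2, 16), base, lora_A, lora_B, drop, scaling=0.5, training=False)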
@@ -1612,6 +1621,31 @@ class offload:
         if self.verboseLevel >=1:
             print(f"Async loading plan for model '{model_id}' : {(preload_total+base_size)/ONE_MB:0.2f} MB will be preloaded (base size of {base_size/ONE_MB:0.2f} MB + {preload_total/total_size*100:0.1f}% of recurrent layers data) with a {max_blocks_fetch/ONE_MB:0.2f} MB async" + (" circular" if len(towers) == 1 else "") + " shuttle")
 
+    def release(self):
+        global last_offload_obj
+
+        if last_offload_obj == self:
+            last_offload_obj = None
+
+        self.unload_all()
+        self.default_stream = None
+        keys= [k for k in self.blocks_of_modules.keys()]
+        for k in keys:
+            del self.blocks_of_modules[k]
+
+        self.blocks_of_modules = None
+
+
+        for model_id, model in self.models.items():
+            move_loras_to_device(model, "cpu")
+
+        self.models = None
+
+        gc.collect()
+        torch.cuda.empty_cache()
+
+
+
 
     def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
         """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
@@ -1893,12 +1927,11 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
     """Apply a configuration profile that depends on your hardware:
     pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
     profile_name : num of the profile:
-        HighRAM_HighVRAM_Fastest (=1): at least 48 GB of RAM and 24 GB of VRAM : the fastest well suited for a RTX 3090 / RTX 4090
-        HighRAM_LowVRAM_Fast (=2): at least 48 GB of RAM and 12 GB of VRAM : a bit slower, better suited for RTX 3070/3080/4070/4080
-            or for RTX 3090 / RTX 4090 with large pictures batches or long videos
-        LowRAM_HighVRAM_Medium (=3): at least 32 GB of RAM and 24 GB of VRAM : so so speed but adapted for RTX 3090 / RTX 4090 with limited RAM
-        LowRAM_LowVRAM_Slow (=4): at least 32 GB of RAM and 12 GB of VRAM : if have little VRAM or generate longer videos
-        VerylowRAM_LowVRAM_Slowest (=5): at least 24 GB of RAM and 10 GB of VRAM : if you don't have much it won't be fast but maybe it will work
+        HighRAM_HighVRAM_Fastest (=1): will try to load entirely a model in VRAM and to keep a copy in reserved RAM for fast loading / unloading
+        HighRAM_LowVRAM_Fast (=2): will try to load only the needed parts of a model in VRAM and to keep a copy in reserved RAM for fast loading / unloading
+        LowRAM_HighVRAM_Medium (=3): will try to load entirely a model in VRAM and to keep a copy in reserved RAM for fast loading / unloading, 8 bits quantization of main model
+        LowRAM_LowVRAM_Slow (=4): will try to load only the needed parts of a model in VRAM and to keep a copy in reserved RAM for fast loading / unloading, 8 bits quantization of main models
+        VerylowRAM_LowVRAM_Slowest (=5): will try to load only the needed parts of a model in VRAM, 8 bits quantization of main models
     overrideKwargs: every parameter accepted by Offload.All can be added here to override the profile choice
     For instance set quantizeTransformer = False to disable transformer quantization which is by default in every profile
     """
@@ -1942,21 +1975,21 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
     if profile_no == profile_type.HighRAM_HighVRAM:
         pinnedMemory= True
         budgets = None
-        info = "You have chosen a profile that may require 48 GB of RAM and up to 24 GB of VRAM on some applications."
+        # info = "You have chosen a profile that may require 48 GB of RAM and up to 24 GB of VRAM on some applications."
     elif profile_no == profile_type.HighRAM_LowVRAM:
         pinnedMemory= True
         budgets["*"] = 3000
-        info = "You have chosen a profile that may require 48 GB of RAM and up to 12 GB of VRAM on some applications."
+        # info = "You have chosen a profile that may require 48 GB of RAM and up to 12 GB of VRAM on some applications."
     elif profile_no == profile_type.LowRAM_HighVRAM:
         pinnedMemory= "transformer"
         extraModelsToQuantize = default_extraModelsToQuantize
         budgets = None
-        info = "You have chosen a Medium speed profile that may require 32 GB of RAM and up to 24 GB of VRAM on some applications."
+        # info = "You have chosen a Medium speed profile that may require 32 GB of RAM and up to 24 GB of VRAM on some applications."
     elif profile_no == profile_type.LowRAM_LowVRAM:
         pinnedMemory= "transformer"
         extraModelsToQuantize = default_extraModelsToQuantize
         budgets["*"] = 3000
-        info = "You have chosen a profile that usually may require 32 GB of RAM and up to 12 GB of VRAM on some applications."
+        # info = "You have chosen a profile that usually may require 32 GB of RAM and up to 12 GB of VRAM on some applications."
     elif profile_no == profile_type.VerylowRAM_LowVRAM:
         pinnedMemory= False
         extraModelsToQuantize = default_extraModelsToQuantize
@@ -1964,11 +1997,11 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
         if "transformer" in modules:
             budgets["transformer"] = 400
         #asyncTransfers = False
-        info = "You have chosen the slowest profile that may require 24 GB of RAM and up to 10 GB of VRAM on some applications."
+        # info = "You have chosen the slowest profile that may require 24 GB of RAM and up to 10 GB of VRAM on some applications."
     else:
         raise Exception("Unknown profile")
-    info += " Actual requirements may varry depending on the application or on the tuning done to the profile."
-
+    # info += " Actual requirements may varry depending on the application or on the tuning done to the profile."
+    info =""
     if budgets != None and len(budgets) == 0:
         budgets = None
 
@@ -1976,7 +2009,7 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
     kwargs = { "pinnedMemory": pinnedMemory, "extraModelsToQuantize" : extraModelsToQuantize, "budgets": budgets, "asyncTransfers" : asyncTransfers, "quantizeTransformer": quantizeTransformer }
 
     if verboseLevel>=2:
-        info = info + CrLf + f"Profile '{profile_type.tostr(profile_no)}' sets the following options:"
+        info = info + f"Profile '{profile_type.tostr(profile_no)}' sets the following options:" #CrLf
         for k,v in kwargs.items():
             if k in overrideKwargs:
                 info = info + CrLf + f"- '{k}': '{kwargs[k]}' overriden with value '{overrideKwargs[k]}'"
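
Net effect of the three hunks above: the per-profile RAM/VRAM advisories are commented out and info now starts empty, so each profile reduces to a plain recipe of keyword arguments (the verbose report also drops its leading CrLf). Summarized from the lines visible in this diff; budget values are presumably MB, and the VerylowRAM entry is partially cut off by the hunk boundary:

# Profile -> settings, as read from the hunks above (not an exhaustive kwargs list).
PROFILE_SETTINGS = {
    "HighRAM_HighVRAM":   {"pinnedMemory": True,          "budgets": None},
    "HighRAM_LowVRAM":    {"pinnedMemory": True,          "budgets": {"*": 3000}},
    "LowRAM_HighVRAM":    {"pinnedMemory": "transformer", "budgets": None},         # + extra models quantized
    "LowRAM_LowVRAM":     {"pinnedMemory": "transformer", "budgets": {"*": 3000}},  # + extra models quantized
    "VerylowRAM_LowVRAM": {"pinnedMemory": False,         "budgets": {"transformer": 400}},  # + extra models quantized
}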
PKG-INFO (sdists carry a second copy under the egg-info directory)

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: mmgp
-Version: 3.1.4.post1592
+Version: 3.1.4.post151926
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE