mmgp-3.2.2-py3-none-any.whl → mmgp-3.2.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mmgp might be problematic.

mmgp/offload.py CHANGED
@@ -1,4 +1,4 @@
- # ------------------ Memory Management 3.2.1 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+ # ------------------ Memory Management 3.2.4 for the GPU Poor by DeepBeepMeep (mmgp)------------------
  #
  # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
  # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -479,7 +479,7 @@ def _welcome():
  if welcome_displayed:
      return
  welcome_displayed = True
- print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.2.1) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+ print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.2.4) by DeepBeepMeep ************{ENDC}{UNBOLD}")

  def _extract_num_from_str(num_in_str):
      size = len(num_in_str)
@@ -858,7 +858,7 @@ def _lora_linear_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor
  result = result.to(torch_result_dtype)
  return result

- def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, split_linear_modules_map = None,verboseLevel = -1,):
+ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, split_linear_modules_map = None, preprocess_sd = None, verboseLevel = -1,):
  verboseLevel = _compute_verbose_level(verboseLevel)

  if inject_adapter_in_model == None or set_weights_and_activate_adapters == None or get_peft_kwargs == None:
@@ -877,7 +877,8 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
  adapter_name = str(i)

  state_dict = safetensors2.torch_load_file(path)
-
+ if preprocess_sd != None:
+     state_dict = preprocess_sd(state_dict)

  if split_linear_modules_map != None:
      new_state_dict = {}
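The two hunks above add a `preprocess_sd` hook to `load_loras_into_model`: when provided, the callable receives the raw LoRA state dict right after it is read from disk and can return a modified one before injection. A minimal usage sketch, not part of the diff; the key-renaming rule, the file name, and the list form of `lora_path` are assumptions for illustration:

```python
from mmgp import offload

def strip_prefix(state_dict):
    # Hypothetical preprocessing: drop a "diffusion_model." prefix from every key
    # so the LoRA keys line up with the target model's module names.
    prefix = "diffusion_model."
    return {k[len(prefix):] if k.startswith(prefix) else k: v for k, v in state_dict.items()}

# `transformer` is a torch.nn.Module built elsewhere; the LoRA file name is made up.
offload.load_loras_into_model(
    transformer,
    ["my_lora.safetensors"],
    lora_multi=[1.0],
    preprocess_sd=strip_prefix,   # new hook added in this release
    verboseLevel=1,
)
```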
@@ -977,7 +978,8 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
  # Check only for unexpected keys.
  unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
  if unexpected_keys:
-     pass
+     raise Exception(f"Lora '{path}' contains invalid keys '{unexpected_keys}'")
+
  if verboseLevel >=1:
      print(f"Lora '{path}' was loaded in model '{_get_module_name(model)}'")
  if activate_all_loras:
@@ -1015,7 +1017,7 @@ def move_loras_to_device(model, device="cpu" ):
  if ".lora_" in k:
      m.to(device)

- def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, modelClass=None, verboseLevel = -1):
+ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, modelClass=None, modelPrefix = None, verboseLevel = -1):
  """
  quick version of .LoadfromPretrained of the transformers library
  used to build a model and load the corresponding weights (quantized or not)
@@ -1096,13 +1098,13 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizat

  model._config = transformer_config

- load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, verboseLevel=verboseLevel )
+ load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, modelPrefix = modelPrefix, verboseLevel=verboseLevel )

  return model



- def load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, verboseLevel = -1):
+ def load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, verboseLevel = -1):
  """
  Load a model, detect if it has been previously quantized using quanto and do the extra setup if necessary
  """
@@ -1113,6 +1115,26 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
  verboseLevel = _compute_verbose_level(verboseLevel)

  model = _remove_model_wrapper(model)
+
+ def filter_state_dict(state_dict, base_model_prefix):
+     new_state_dict= {}
+     start = -1
+     for k,v in state_dict.items():
+         if k.startswith(base_model_prefix):
+
+             new_start = len(base_model_prefix)
+         else:
+             pos = k.find("." + base_model_prefix)
+             if pos < 0:
+                 continue
+             new_start = pos + len(base_model_prefix) +1
+         if start != -1 and start != new_start:
+             new_state_dict = state_dict
+             break
+         start = new_start
+         new_state_dict[k[ start:]] = v
+     return new_state_dict
+
  if not (".safetensors" in file_path or ".sft" in file_path):
      if pinToMemory:
          raise Exception("Pinning to memory while loading only supported for safe tensors files")
@@ -1151,6 +1173,11 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
  quantization_map = json.load(f)


+ # deal if we are trying to load just a sub part of a larger model
+ if modelPrefix != None:
+     base_model_prefix = modelPrefix + "."
+     state_dict = filter_state_dict(state_dict,base_model_prefix)
+     quantization_map = filter_state_dict(quantization_map,base_model_prefix)

  if quantization_map is None :
      if "quanto" in file_path and not do_quantize:
@@ -1160,32 +1187,12 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType

  missing_keys , unexpected_keys = model.load_state_dict(state_dict, False, assign = True )
  if len(missing_keys) > 0 :
-     # if there is a key mismatch maybe we forgot to remove some prefix or we are trying to load just a sub part of a larger model
-     if hasattr(model, "base_model_prefix"):
-         base_model_prefix = model.base_model_prefix + "."
-     else:
-         for k,v in state_dict.items():
-             if k.endswith(missing_keys[0]):
-                 base_model_prefix = k[:-len(missing_keys[0])]
-                 break
-
-     new_state_dict= {}
-     start = -1
+     # if there is a key mismatch maybe we forgot to remove some prefix
      for k,v in state_dict.items():
-         if k.startswith(base_model_prefix):
-             new_start = len(base_model_prefix)
-         else:
-             pos = k.find("." + base_model_prefix)
-             if pos < 0:
-                 continue
-             new_start = pos + len(base_model_prefix) +1
-         if start != -1 and start != new_start:
-             new_state_dict = state_dict
+         if k.endswith(missing_keys[0]):
+             base_model_prefix = k[:-len(missing_keys[0])]
              break
-         start = new_start
-         new_state_dict[k[ start:]] = v
-     state_dict = new_state_dict
-     del new_state_dict
+     state_dict = filter_state_dict(state_dict,base_model_prefix)
  missing_keys , unexpected_keys = model.load_state_dict(state_dict, False, assign = True )
  del state_dict

@@ -1354,6 +1361,8 @@ class offload:

  def add_module_to_blocks(self, model_id, blocks_name, submodule, prev_block_name, submodule_name):

+     if blocks_name!=None and ".lora_" in blocks_name:
+         blocks_name = None
      entry_name = model_id if blocks_name is None else model_id + "/" + blocks_name
      if entry_name in self.blocks_of_modules:
          blocks_params = self.blocks_of_modules[entry_name]
@@ -1372,7 +1381,6 @@ class offload:
  lora_name = None
  if self.lora_parents.get(submodule, None) != None:
      lora_name = str(submodule_name[ submodule_name.rfind(".") + 1: ] )
-
  for k,p in submodule.named_parameters(recurse=False):
      param_size = 0
      ref = _get_tensor_ref(p)
@@ -1457,11 +1465,10 @@ class offload:
  if tied_param != None:
      setattr( tied_param[0], tied_param[1], q)
  del p, q
- any_past_block = False

  loaded_block = self.loaded_blocks[model_id]
+
  if not preload and loaded_block != None:
-     any_past_block = True
      self.gpu_unload_blocks(model_id, loaded_block)
      if self.ready_to_check_mem():
          self.empty_cache_if_needed()
1475
1482
 
1476
1483
 
1477
1484
  if self.async_transfers and blocks_name != None:
1478
- first = self.prev_blocks_names[entry_name] == None or not any_past_block
1485
+ prev = self.prev_blocks_names[entry_name]
1486
+ first = prev == None or prev != loaded_block
1479
1487
  next_blocks_entry = self.next_blocks_names[entry_name] if entry_name in self.next_blocks_names else None
1480
1488
  if first:
1481
1489
  if self.verboseLevel >=2:
@@ -1497,7 +1505,6 @@ class offload:
  print(f"Loading model {entry_name} ({model_name}) in GPU")
  cpu_to_gpu(self.default_stream, self.blocks_of_modules[entry_name])
  torch.cuda.synchronize()
-
  if not preload:
      self.loaded_blocks[model_id] = blocks_name

@@ -1710,7 +1717,7 @@ class offload:
  current_budget -= base_size
  if current_budget <= 0:
      if self.verboseLevel >=1:
-         print(f"Async loading plan for model '{model_id}' : due to limited budget, beside the async shuttle only only base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
+         print(f"Async loading plan for model '{model_id}' : minimum budget management, beside the async shuttle only base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
      return

  towers = []
@@ -1732,7 +1739,7 @@ class offload:
  current_budget -= 2 * max_floor_size
  if current_budget <= 0:
      if self.verboseLevel >=1:
-         print(f"Async loading plan for model '{model_id}' : due to limited budget, beside the async shuttle only the base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
+         print(f"Async loading plan for model '{model_id}' : minimum budget management, beside the async shuttle only the base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
      return


@@ -1743,7 +1750,7 @@ class offload:
  max_blocks_fetch = max(max_floor_size, max_blocks_fetch)
  if preload_blocks_count <= 0:
      if self.verboseLevel >=1:
-         print(f"Async loading plan for model '{model_id}' : due to limited budget, beside the async shuttle only the base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
+         print(f"Async loading plan for model '{model_id}' : minimum budget management, beside the async shuttle only the base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
      return

  nb_blocks= len(floors)
@@ -1821,16 +1828,20 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru

  windows_os = os.name == 'nt'

+ def get_parsed_budget(b):
+     if isinstance(b , str) and b.endswith("%"):
+         return float(b[:-1]) * self.device_mem_capacity
+     else:
+         return b * ONE_MB
+
  budget = 0
  if not budgets is None:
      if isinstance(budgets , dict):
-         model_budgets = budgets
-         budget = budgets.get("*", 0) * ONE_MB
+         model_budgets = { k : get_parsed_budget(b) for k , b in budgets.items() }
+         budget = model_budgets.get("*", 0)
      else:
-         budget = int(budgets) * ONE_MB
+         budget = get_parsed_budget(budget)

- # if (budgets!= None or budget >0) :
- # self.async_transfers = True
  self.async_transfers = asyncTransfers


@@ -1938,18 +1949,19 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
  estimatesBytesToPin += current_model_size


- model_budget = model_budgets[model_id] * ONE_MB if model_id in model_budgets else budget
+ model_budget = model_budgets[model_id] if model_id in model_budgets else budget
  if workingVRAM != None:
      model_minimumVRAM = -1
      if isinstance(workingVRAM, dict):
          if model_id in workingVRAM:
-             model_minimumVRAM = workingVRAM[model_id]
+             model_minimumVRAM = get_parsed_budget(workingVRAM[model_id])
          elif "*" in model_id in workingVRAM:
-             model_minimumVRAM = workingVRAM["*"]
+             model_minimumVRAM = get_parsed_budget(workingVRAM["*"])
      else:
-         model_minimumVRAM = workingVRAM
+         model_minimumVRAM = get_parsed_budget(workingVRAM)
+
      if model_minimumVRAM > 0:
-         new_budget = self.device_mem_capacity - model_minimumVRAM * ONE_MB
+         new_budget = self.device_mem_capacity - model_minimumVRAM
          new_budget = 1 if new_budget < 0 else new_budget
          model_budget = new_budget if model_budget == 0 or new_budget < model_budget else model_budget
  if model_budget > 0 and model_budget > current_model_size:
mmgp-3.2.2.dist-info/METADATA → mmgp-3.2.4.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: mmgp
- Version: 3.2.2
+ Version: 3.2.4
  Summary: Memory Management for the GPU Poor
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
  License: GNU GENERAL PUBLIC LICENSE
@@ -17,7 +17,7 @@ Requires-Dist: peft


  <p align="center">
- <H2>Memory Management 3.2.1 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.2.4 for the GPU Poor by DeepBeepMeep</H2>
  </p>


@@ -119,9 +119,9 @@ For example:
  - pinnedMemory: Boolean (for all models) or List of models ids to pin to RAM. Every model pinned to RAM will load much faster (up to 2 times) but this requires more RAM
  - quantizeTransformer: boolean by default True. The 'transformer' model in the pipe contains usually the video or image generator is by defaut; quantized on the fly by default to 8 bits. If you want to save time on disk and reduce the loading time, you may want to load directly a prequantized model. If you don't want to quantize the image generator, you need to set the option *quantizeTransformer* to *False* to turn off on the fly quantization.
  - extraModelsToQuantize: list of additional modelids of models to quantize on the fly. If the corresponding model is already quantized, this option will be ignored.
- - budgets: either a number in mega bytes (for all models, if 0 unlimited budget) or a dictionary that maps model ids to mega bytes : define the approximate budget in mega bytes that is allocated in VRAM for a model. Try not to allocate all the available VRAM so that the rest can be used to process the data. To define the default value in the dictionary, you may add entry named "*".
+ - budgets: either a number in mega bytes, (for all models, if 0 unlimited budget) a string that is perecentage of the total VRAM or a dictionary that maps model ids to mega bytes : define the approximate budget in mega bytes that is allocated in VRAM for a model. Try not to allocate all the available VRAM so that the rest can be used to process the data. To define the default value in the dictionary, you may add entry named "*".
  The smaller this number, the more VRAM left for image data / longer video but also the slower because there will be lots of loading / unloading between the RAM and the VRAM. If model is too big to fit in a budget, it will be broken down in multiples parts that will be unloaded / loaded consequently. The speed of low budget can be increased (up to 2 times) by turning on the options pinnedMemory and asyncTransfers.
- - workingVRAM: either a number in mega bytes or a dictionary that maps a model ids to a number in mega bytes that corresponds to a minimum amount of VRAM that should be left for the data processed by the model. This number will prevail if it is in conflict with a too high budget defined for the same model.
+ - workingVRAM: either a number in mega bytes, a string that is perecentage of the total VRAM or a dictionary that maps a model ids to a number in mega bytes that corresponds to a minimum amount of VRAM that should be left for the data processed by the model. This number will prevail if it is in conflict with a too high budget defined for the same model.
  - asyncTransfers: boolean, load to the GPU the next model part while the current part is being processed. This requires twice the budget if any is defined. This may increase speed by 20% (mostly visible on fast modern GPUs).
  - verboseLevel: number between 0 and 2 (1 by default), provides various level of feedback of the different processes
  - compile: list of model ids to compile, may accelerate up x2 depending on the type of GPU. It makes sense to compile only the model that is frequently used such as the "transformer" model in the case of video or image generation. Compilation requires Triton to be installed. Triton is available out of the box on Linux or WSL but requires to be installed with Windows: https://github.com/woct0rdho/triton-windows
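Per the updated documentation above, budgets and workingVRAM now also accept percentage strings in addition to megabyte values. A minimal usage sketch, not taken from the package; the pipeline object, model ids, and numbers are illustrative only:

```python
from mmgp import offload

# `pipe` is a diffusers-style pipeline created elsewhere.
offload.all(
    pipe,
    pinnedMemory=True,
    asyncTransfers=True,
    budgets={"transformer": "70%", "*": 3000},  # per-model budget: percentage of VRAM or MB
    workingVRAM="20%",                          # minimum VRAM to keep for the data being processed
)
```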
mmgp-3.2.4.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+ __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+ mmgp/offload.py,sha256=vGxgCcWV8PQQ4JjSlYFOX57Mr9RLlvPBMOOj3f63qL4,96389
+ mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
+ mmgp-3.2.4.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+ mmgp-3.2.4.dist-info/METADATA,sha256=UGZ7ADvrhU5P0hS7gFgu8SHpEnzzpEgE3Ionk-I7ckw,16151
+ mmgp-3.2.4.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+ mmgp-3.2.4.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+ mmgp-3.2.4.dist-info/RECORD,,
mmgp-3.2.2.dist-info/RECORD REMOVED
@@ -1,9 +0,0 @@
- __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
- mmgp/offload.py,sha256=hzirru31j78E88OIT38GJ46iMvddEFM2c3_CCn4N4K4,95676
- mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
- mmgp-3.2.2.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
- mmgp-3.2.2.dist-info/METADATA,sha256=hTjAL-soDwYbUlnD1Om7kefG8D4vaXUTjsHoQDikVQA,16054
- mmgp-3.2.2.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
- mmgp-3.2.2.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
- mmgp-3.2.2.dist-info/RECORD,,