mmgp 3.2.3.tar.gz → 3.2.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mmgp might be problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: mmgp
-Version: 3.2.3
+Version: 3.2.4
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -17,7 +17,7 @@ Requires-Dist: peft
 
 
 <p align="center">
-<H2>Memory Management 3.2.3 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.2.4 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
@@ -1,6 +1,6 @@
 
 <p align="center">
-<H2>Memory Management 3.2.3 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.2.4 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
@@ -1,6 +1,6 @@
 [project]
 name = "mmgp"
-version = "3.2.3"
+version = "3.2.4"
 authors = [
   { name = "deepbeepmeep", email = "deepbeepmeep@yahoo.com" },
 ]
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.2.3 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.2.4 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -479,7 +479,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.2.3) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.2.4) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def _extract_num_from_str(num_in_str):
     size = len(num_in_str)
@@ -858,7 +858,7 @@ def _lora_linear_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor
         result = result.to(torch_result_dtype)
     return result
 
-def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, split_linear_modules_map = None,verboseLevel = -1,):
+def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, split_linear_modules_map = None, preprocess_sd = None, verboseLevel = -1,):
     verboseLevel = _compute_verbose_level(verboseLevel)
 
     if inject_adapter_in_model == None or set_weights_and_activate_adapters == None or get_peft_kwargs == None:
@@ -877,7 +877,8 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
         adapter_name = str(i)
 
         state_dict = safetensors2.torch_load_file(path)
-
+        if preprocess_sd != None:
+            state_dict = preprocess_sd(state_dict)
 
         if split_linear_modules_map != None:
             new_state_dict = {}
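
The two hunks above introduce a preprocess_sd callback on load_loras_into_model: when supplied, each LoRA state dict is passed through it right after being read from disk, before any key splitting or adapter injection. A minimal sketch of a call, assuming the function stays reachable as offload.load_loras_into_model as in previous releases (the renaming helper and file name below are hypothetical):

    from mmgp import offload

    def strip_trainer_prefix(state_dict):
        # Hypothetical preprocessing: drop a "diffusion_model." prefix
        # that some third-party LoRA trainers prepend to every key.
        return {k.removeprefix("diffusion_model."): v for k, v in state_dict.items()}

    offload.load_loras_into_model(
        model,                               # an already-instantiated torch module
        ["my_lora.safetensors"],             # LoRA file(s) to load
        lora_multi=[1.0],                    # per-LoRA multiplier
        preprocess_sd=strip_trainer_prefix,  # new in 3.2.4
    )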
@@ -977,7 +978,8 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
         # Check only for unexpected keys.
         unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
         if unexpected_keys:
-            pass
+            raise Exception(f"Lora '{path}' contains invalid keys '{unexpected_keys}'")
+
         if verboseLevel >=1:
             print(f"Lora '{path}' was loaded in model '{_get_module_name(model)}'")
     if activate_all_loras:
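
Note the behaviour change in this hunk: unexpected LoRA keys used to be silently ignored (pass) and now abort the load. Callers that probe LoRA files of unknown provenance may therefore want to guard the call; an illustrative sketch only:

    try:
        offload.load_loras_into_model(model, ["unverified_lora.safetensors"])
    except Exception as err:
        # As of 3.2.4, a LoRA whose keys don't match the target model is
        # rejected with an exception instead of loading partially.
        print(f"LoRA rejected: {err}")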
@@ -1015,7 +1017,7 @@ def move_loras_to_device(model, device="cpu" ):
         if ".lora_" in k:
             m.to(device)
 
-def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, modelClass=None, verboseLevel = -1):
+def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, modelClass=None, modelPrefix = None, verboseLevel = -1):
     """
     quick version of .LoadfromPretrained of the transformers library
     used to build a model and load the corresponding weights (quantized or not)
@@ -1096,13 +1098,13 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizat
 
     model._config = transformer_config
 
-    load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, verboseLevel=verboseLevel )
+    load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, modelPrefix = modelPrefix, verboseLevel=verboseLevel )
 
     return model
 
 
 
-def load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, verboseLevel = -1):
+def load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, verboseLevel = -1):
     """
     Load a model, detect if it has been previously quantized using quanto and do the extra setup if necessary
     """
@@ -1113,6 +1115,26 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
     verboseLevel = _compute_verbose_level(verboseLevel)
 
     model = _remove_model_wrapper(model)
+
+    def filter_state_dict(state_dict, base_model_prefix):
+        new_state_dict= {}
+        start = -1
+        for k,v in state_dict.items():
+            if k.startswith(base_model_prefix):
+
+                new_start = len(base_model_prefix)
+            else:
+                pos = k.find("." + base_model_prefix)
+                if pos < 0:
+                    continue
+                new_start = pos + len(base_model_prefix) +1
+            if start != -1 and start != new_start:
+                new_state_dict = state_dict
+                break
+            start = new_start
+            new_state_dict[k[ start:]] = v
+        return new_state_dict
+
     if not (".safetensors" in file_path or ".sft" in file_path):
         if pinToMemory:
             raise Exception("Pinning to memory while loading only supported for safe tensors files")
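
The new filter_state_dict helper strips base_model_prefix from every key, matching the prefix either at the start of the key or after an enclosing wrapper segment ("." + prefix). Keys that do not contain the prefix are dropped, and if two keys would be cut at different offsets the helper bails out and returns the original dict unchanged. Lifted out of load_model_data purely for illustration, it behaves like this on made-up keys:

    sd = {
        "transformer.blocks.0.weight": 0,
        "transformer.blocks.1.weight": 1,
        "vae.decoder.weight": 2,  # no "transformer." prefix: dropped
    }
    # filter_state_dict(sd, "transformer.") would return
    # {"blocks.0.weight": 0, "blocks.1.weight": 1}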
@@ -1151,6 +1173,11 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
         quantization_map = json.load(f)
 
 
+    # deal if we are trying to load just a sub part of a larger model
+    if modelPrefix != None:
+        base_model_prefix = modelPrefix + "."
+        state_dict = filter_state_dict(state_dict,base_model_prefix)
+        quantization_map = filter_state_dict(quantization_map,base_model_prefix)
 
     if quantization_map is None :
         if "quanto" in file_path and not do_quantize:
@@ -1160,32 +1187,12 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
 
     missing_keys , unexpected_keys = model.load_state_dict(state_dict, False, assign = True )
     if len(missing_keys) > 0 :
-        # if there is a key mismatch maybe we forgot to remove some prefix or we are trying to load just a sub part of a larger model
-        if hasattr(model, "base_model_prefix"):
-            base_model_prefix = model.base_model_prefix + "."
-        else:
-            for k,v in state_dict.items():
-                if k.endswith(missing_keys[0]):
-                    base_model_prefix = k[:-len(missing_keys[0])]
-                    break
-
-        new_state_dict= {}
-        start = -1
+        # if there is a key mismatch maybe we forgot to remove some prefix
         for k,v in state_dict.items():
-            if k.startswith(base_model_prefix):
-                new_start = len(base_model_prefix)
-            else:
-                pos = k.find("." + base_model_prefix)
-                if pos < 0:
-                    continue
-                new_start = pos + len(base_model_prefix) +1
-            if start != -1 and start != new_start:
-                new_state_dict = state_dict
+            if k.endswith(missing_keys[0]):
+                base_model_prefix = k[:-len(missing_keys[0])]
                 break
-            start = new_start
-            new_state_dict[k[ start:]] = v
-        state_dict = new_state_dict
-        del new_state_dict
+        state_dict = filter_state_dict(state_dict,base_model_prefix)
         missing_keys , unexpected_keys = model.load_state_dict(state_dict, False, assign = True )
     del state_dict
 
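
This refactor deduplicates the prefix-stripping logic: the missing-keys fallback now infers the prefix from the first missing key and delegates to filter_state_dict, and the old hasattr(model, "base_model_prefix") shortcut is gone. The inference step itself works like this (keys are made up):

    missing_key = "blocks.0.attn.weight"  # first key reported missing by load_state_dict
    file_key = "model.diffusion_model.blocks.0.attn.weight"  # matching key on disk
    if file_key.endswith(missing_key):
        prefix = file_key[:-len(missing_key)]  # -> "model.diffusion_model."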
@@ -1354,6 +1361,8 @@ class offload:
 
     def add_module_to_blocks(self, model_id, blocks_name, submodule, prev_block_name, submodule_name):
 
+        if blocks_name!=None and ".lora_" in blocks_name:
+            blocks_name = None
         entry_name = model_id if blocks_name is None else model_id + "/" + blocks_name
         if entry_name in self.blocks_of_modules:
             blocks_params = self.blocks_of_modules[entry_name]
@@ -1372,7 +1381,6 @@
         lora_name = None
         if self.lora_parents.get(submodule, None) != None:
             lora_name = str(submodule_name[ submodule_name.rfind(".") + 1: ] )
-
         for k,p in submodule.named_parameters(recurse=False):
             param_size = 0
             ref = _get_tensor_ref(p)
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: mmgp
-Version: 3.2.3
+Version: 3.2.4
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -17,7 +17,7 @@ Requires-Dist: peft
 
 
 <p align="center">
-<H2>Memory Management 3.2.3 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.2.4 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
6 files without changes