mmgp 3.3.2.tar.gz → 3.3.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mmgp has been flagged; more details are available on the registry page.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mmgp
- Version: 3.3.2
+ Version: 3.3.4
  Summary: Memory Management for the GPU Poor
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
  License: GNU GENERAL PUBLIC LICENSE
@@ -17,7 +17,7 @@ Dynamic: license-file


  <p align="center">
- <H2>Memory Management 3.3.2 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.3.4 for the GPU Poor by DeepBeepMeep</H2>
  </p>

@@ -1,6 +1,6 @@

  <p align="center">
- <H2>Memory Management 3.3.2 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.3.4 for the GPU Poor by DeepBeepMeep</H2>
  </p>

@@ -1,6 +1,6 @@
  [project]
  name = "mmgp"
- version = "3.3.2"
+ version = "3.3.4"
  authors = [
    { name = "deepbeepmeep", email = "deepbeepmeep@yahoo.com" },
  ]
@@ -1,4 +1,4 @@
- # ------------------ Memory Management 3.3.2 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+ # ------------------ Memory Management 3.3.4 for the GPU Poor by DeepBeepMeep (mmgp)------------------
  #
  # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
  # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -401,7 +401,7 @@ def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TE
      return


- def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = True, gig_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
+ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = True, big_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):

      global max_pinnable_bytes, total_pinned_bytes
      if max_pinnable_bytes > 0 and max_pinnable_bytes >= max_pinnable_bytes:
@@ -474,7 +474,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
              length = torch.numel(p.data) * p.data.element_size()

              ref_cache[ref] = (n, length)
-             if current_big_tensor_size + length > gig_tensor_size :
+             if current_big_tensor_size + length > big_tensor_size and current_big_tensor_size !=0 :
                  big_tensors_sizes.append(current_big_tensor_size)
                  current_big_tensor_size = 0
                  big_tensor_no += 1
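For context: `_pin_to_memory` packs the model's tensors into a handful of large pinned-RAM blocks, and the new `current_big_tensor_size !=0` guard keeps an oversized tensor from first closing an empty block. A minimal, self-contained sketch of that greedy packing rule, with a made-up block limit and made-up tensor sizes (not the package's actual code):

    # Greedy packing of tensor byte sizes into "big tensor" blocks.
    # BLOCK_LIMIT and the sizes below are hypothetical, for illustration only.
    BLOCK_LIMIT = 128 * 1024 * 1024  # 128 MB per pinned block

    def plan_blocks(tensor_sizes, block_limit=BLOCK_LIMIT):
        blocks, current = [], 0
        for size in tensor_sizes:
            # Close the current block only if it is non-empty, so a tensor
            # larger than the limit gets a block of its own instead of
            # leaving an empty block behind (the 3.3.4 change).
            if current + size > block_limit and current != 0:
                blocks.append(current)
                current = 0
            current += size
        if current != 0:
            blocks.append(current)
        return blocks

    sizes_mb = [50, 60, 200, 30, 40]
    print([b >> 20 for b in plan_blocks([s << 20 for s in sizes_mb])])  # -> [110, 200, 70]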
@@ -498,28 +498,11 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
      big_tensors_sizes.append(current_big_tensor_size)

      big_tensors = []
-     last_big_tensor = 0
      total = 0


      failed_planned_allocation = False

-     # for size in big_tensors_sizes:
-     #     try:
-     #         # if total > 7000 * ONE_MB:
-     #         #     raise Exception ("test no more reserved RAM")
-     #         current_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True, device="cpu")
-     #         big_tensors.append(current_big_tensor)
-     #     except:
-     #         print(f"Unable to pin more tensors for this model as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
-     #         max_pinnable_bytes = total + total_pinned_bytes
-     #         failed_planned_allocation = True
-     #         break
-
-     #     last_big_tensor += 1
-     #     total += size
-
-
      gc.collect()

      last_allocated_big_tensor = -1
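The deleted lines were already commented out; 3.3.4 simply removes this dead pre-allocation loop and keeps the on-demand path that tracks `last_allocated_big_tensor`. As a rough illustration of the underlying operation (a sketch, not mmgp's code), one pinned CPU block can be reserved with the same `torch.empty(..., pin_memory=True)` call shape, with a fallback when the OS refuses to page-lock more memory:

    import torch

    def try_alloc_pinned_block(size_bytes):
        """Reserve one page-locked CPU block, falling back to pageable RAM."""
        try:
            # Same call shape as the removed pre-allocation loop above.
            return torch.empty(size_bytes, dtype=torch.uint8, pin_memory=True, device="cpu"), True
        except RuntimeError:
            # Pinning typically fails with a RuntimeError once too much
            # host memory is already page-locked.
            return torch.empty(size_bytes, dtype=torch.uint8, device="cpu"), False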
@@ -561,13 +544,6 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru

          total += size

-         # if big_tensor_no != prev_big_tensor:
-         #     gc.collect()
-         #     prev_big_tensor = big_tensor_no
-         # match_param, match_isbuffer = tied_weights.get(n, (None, False))
-         # if match_param != None:
-
-         # if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
          current_big_tensor = big_tensors[big_tensor_no]
          if is_buffer :
              _force_load_buffer(p) # otherwise potential memory leak
@@ -600,9 +576,9 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru

      if verboseLevel >=1:
          if partialPinning or failed_planned_allocation:
-             print(f"The model was partially pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
+             print(f"The model was partially pinned to reserved RAM: {last_allocated_big_tensor + 1} large blocks spread across {total/ONE_MB:.2f} MB")
          else:
-             print(f"The whole model was pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
+             print(f"The whole model was pinned to reserved RAM: {last_allocated_big_tensor + 1} large blocks spread across {total/ONE_MB:.2f} MB")

      model._already_pinned = True

@@ -615,7 +591,7 @@ def _welcome():
      if welcome_displayed:
          return
      welcome_displayed = True
-     print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.3.3) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+     print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.3.4) by DeepBeepMeep ************{ENDC}{UNBOLD}")

  def _extract_num_from_str(num_in_str):
      size = len(num_in_str)
@@ -901,17 +877,15 @@ def split_linear_modules(model, map ):

  def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, check_only = False, ignore_model_variations = False, pinnedLora = False, split_linear_modules_map = None, preprocess_sd = None, verboseLevel = -1,):
      verboseLevel = _compute_verbose_level(verboseLevel)
-     modules_dict = {k: v for k,v in model.named_modules()}

+     loras_model_data = getattr(model, "_loras_model_data", None)
+     if loras_model_data == None:
+         raise Exception(f"No Loras has been declared for this model while creating the corresponding offload object")
+
      if not check_only:
-         loras_model_data = dict()
-         model._loras_model_data = loras_model_data
-         loras_active_adapters = set()
-         model._loras_active_adapters = loras_active_adapters
-         loras_scaling = dict()
-         model._loras_scaling = loras_scaling
-         loras_tied_weights = dict()
-         model._loras_tied_weights = loras_tied_weights
+         unload_loras_from_model(model)
+
+     modules_dict = {k: v for k,v in model.named_modules()}

      CrLf = '\r\n'
      error_msg = ""
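In 3.3.4, `load_loras_into_model` no longer creates the per-model LoRA containers itself: it requires `_loras_model_data` to have been attached when the offload object was built (see the `@@ -2295` hunk further down) and raises otherwise, and a non-check-only call first resets previously loaded adapters through `unload_loras_from_model`. A self-contained toy mirror of that contract, with every name other than `_loras_model_data` invented for illustration:

    class Model:
        pass

    def attach_lora_support(model):
        # In mmgp this happens while the offload object is built.
        model._loras_model_data = {}

    def load_adapters(model, adapters):
        # Mirrors the new guard: fail fast if LoRA support was never declared.
        data = getattr(model, "_loras_model_data", None)
        if data is None:
            raise RuntimeError("declare LoRA support when creating the offload object")
        data.clear()          # reset previous adapters in place, like unload_loras_from_model
        data.update(adapters)

    m = Model()
    attach_lora_support(m)
    load_adapters(m, {"linear1": {"my_adapter": "weights would go here"}})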
@@ -949,10 +923,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
              continue
          fail = False
          skip = False
-         state_dict = safetensors2.torch_load_file(path)
-
-
-
+         state_dict = safetensors2.torch_load_file(path, writable_tensors= False)

          if preprocess_sd != None:
              state_dict = preprocess_sd(state_dict)
@@ -1069,9 +1040,10 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
                  break
          if not check_only:
              loras_module_data = loras_model_data.get(module, None)
-             if loras_module_data == None:
-                 loras_module_data = dict()
-                 loras_model_data[module] = loras_module_data
+             assert loras_module_data != None
+             # if loras_module_data == None:
+             #     loras_module_data = dict()
+             #     loras_model_data[module] = loras_module_data
              loras_adapter_data = loras_module_data.get(adapter_name, None)
              lora_A = None if lora_A == None else lora_A.to(torch.bfloat16)
              lora_B = None if lora_B == None else lora_B.to(torch.bfloat16)
@@ -1132,12 +1104,17 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
      return new_lora_path

  def unload_loras_from_model(model):
-     model._loras_model_data = None
+     for _, v in model._loras_model_data.items():
+         v.clear()
+
+     model._loras_active_adapters = set()
+     model._loras_scaling = dict()
+     model._loras_tied_weights = dict()
      model._loras_errors = None
      model._loras_adapters = None
-     model._loras_active_adapters = None
      model._loras_scaling = None

+
  def set_step_no_for_lora(model, step_no):
      model._lora_step_no = step_no

@@ -1881,14 +1858,14 @@ class offload:
              return result


-     def hook_lora_linear(self, submodule, current_model, model_id, submodule_name):
+     def hook_lora_linear(self, submodule, current_model, model_id, loras_model_data, submodule_name):
          old_forward = submodule.forward
+
+         loras_data = {}
+         loras_model_data[submodule] = loras_data
+
          def lora_linear_forward(module, *args, **kwargs):
-             loras_model_data = getattr(current_model, "_loras_model_data", None)
-             loras_data = None
-             if loras_model_data != None:
-                 loras_data = loras_model_data.get(submodule, None)
-             if loras_data == None:
+             if len(loras_data) == 0:
                  return old_forward(*args, **kwargs)
              else:
                  return self._lora_linear_forward(current_model, submodule, loras_data, *args, **kwargs)
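`hook_lora_linear` now creates the per-module dict once, at hook time, and the wrapped forward only checks whether that captured dict is empty instead of re-reading `_loras_model_data` on every call. This is also why `unload_loras_from_model` (earlier in this diff) clears each dict in place rather than rebinding the attribute: the closures keep referencing the original dict objects. A small self-contained sketch of the pattern, with names that are illustrative rather than mmgp's:

    import torch

    def hook_linear(linear, registry):
        """Wrap a Linear's forward; the LoRA dict is captured once by the closure."""
        old_forward = linear.forward
        lora_data = {}                  # captured below
        registry[linear] = lora_data    # loader/unloader mutate this same dict

        def forward(*args, **kwargs):
            if not lora_data:           # cheap emptiness check, no per-call lookups
                return old_forward(*args, **kwargs)
            # ... apply the adapters stored in lora_data here ...
            return old_forward(*args, **kwargs)

        linear.forward = forward

    registry = {}
    layer = torch.nn.Linear(4, 4)
    hook_linear(layer, registry)
    layer(torch.randn(1, 4))            # plain forward while the dict is empty
    registry[layer].clear()             # "unloading" in place stays visible to the closure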
@@ -2295,7 +2272,10 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
          current_budget = model_budgets[model_id]
          cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False
          self.loaded_blocks[model_id] = None
-         any_lora = loras !=None and model_id in loras or getattr(current_model, "_loras_model_data", False)
+         any_lora = loras !=None and model_id in loras
+         if any_lora:
+             loras_model_data = {}
+             current_model._loras_model_data = loras_model_data
          for submodule_name, submodule in current_model.named_modules():
              # create a fake 'accelerate' parameter so that the _execution_device property returns always "cuda"
              # (it is queried in many pipelines even if offloading is not properly implemented)
@@ -2328,7 +2308,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p

              if hasattr(submodule, "forward"):
                  if any_lora and isinstance(submodule, torch.nn.Linear):
-                     submodule_method = self.hook_lora_linear(submodule, current_model, model_id, submodule_name)
+                     submodule_method = self.hook_lora_linear(submodule, current_model, model_id, loras_model_data, submodule_name)
                  else:
                      submodule_method = getattr(submodule, "forward")
                  if callable(submodule_method):
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mmgp
- Version: 3.3.2
+ Version: 3.3.4
  Summary: Memory Management for the GPU Poor
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
  License: GNU GENERAL PUBLIC LICENSE
@@ -17,7 +17,7 @@ Dynamic: license-file


  <p align="center">
- <H2>Memory Management 3.3.2 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.3.4 for the GPU Poor by DeepBeepMeep</H2>
  </p>

6 files without changes