mmgp-3.3.2-py3-none-any.whl → mmgp-3.3.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mmgp might be problematic.
- mmgp/offload.py +36 -56
- {mmgp-3.3.2.dist-info → mmgp-3.3.4.dist-info}/METADATA +2 -2
- mmgp-3.3.4.dist-info/RECORD +9 -0
- mmgp-3.3.2.dist-info/RECORD +0 -9
- {mmgp-3.3.2.dist-info → mmgp-3.3.4.dist-info}/WHEEL +0 -0
- {mmgp-3.3.2.dist-info → mmgp-3.3.4.dist-info}/licenses/LICENSE.md +0 -0
- {mmgp-3.3.2.dist-info → mmgp-3.3.4.dist-info}/top_level.txt +0 -0
mmgp/offload.py
CHANGED
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.3.2 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.3.4 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -401,7 +401,7 @@ def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TE
     return
 
 
-def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = True,
+def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = True, big_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
 
     global max_pinnable_bytes, total_pinned_bytes
     if max_pinnable_bytes > 0 and max_pinnable_bytes >= max_pinnable_bytes:
@@ -474,7 +474,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
         length = torch.numel(p.data) * p.data.element_size()
 
         ref_cache[ref] = (n, length)
-        if current_big_tensor_size + length >
+        if current_big_tensor_size + length > big_tensor_size and current_big_tensor_size !=0 :
             big_tensors_sizes.append(current_big_tensor_size)
             current_big_tensor_size = 0
             big_tensor_no += 1
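This hunk makes the packing threshold configurable through the new big_tensor_size parameter, and closes the current block only when it is non-empty, so a single oversized tensor still gets a block of its own. A minimal sketch of that packing rule, not mmgp's actual code; the default value assigned to BIG_TENSOR_MAX_SIZE below is invented for illustration:

# Sketch only: group tensor byte-lengths into "big tensor" blocks.
BIG_TENSOR_MAX_SIZE = 256 * 1024 * 1024  # hypothetical 256 MB default

def plan_big_tensors(tensor_lengths, big_tensor_size=BIG_TENSOR_MAX_SIZE):
    sizes, current = [], 0
    for length in tensor_lengths:
        # start a new block only if the current one is non-empty, mirroring
        # the "and current_big_tensor_size != 0" guard added in this hunk
        if current + length > big_tensor_size and current != 0:
            sizes.append(current)
            current = 0
        current += length
    if current != 0:
        sizes.append(current)
    return sizes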
@@ -498,28 +498,11 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
     big_tensors_sizes.append(current_big_tensor_size)
 
     big_tensors = []
-    last_big_tensor = 0
     total = 0
 
 
    failed_planned_allocation = False
 
-    # for size in big_tensors_sizes:
-    #     try:
-    #         # if total > 7000 * ONE_MB:
-    #         #     raise Exception ("test no more reserved RAM")
-    #         current_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True, device="cpu")
-    #         big_tensors.append(current_big_tensor)
-    #     except:
-    #         print(f"Unable to pin more tensors for this model as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
-    #         max_pinnable_bytes = total + total_pinned_bytes
-    #         failed_planned_allocation = True
-    #         break
-
-    #     last_big_tensor += 1
-    #     total += size
-
-
     gc.collect()
 
     last_allocated_big_tensor = -1
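The deleted commented-out loop documents the eager strategy this function experimented with: allocate each planned block as page-locked host memory up front and stop as soon as the OS refuses. A self-contained illustration of that allocate-with-fallback pattern (not mmgp's code; names are illustrative):

import torch

ONE_MB = 1024 * 1024

def try_alloc_pinned(size_bytes):
    # Page-locked ("pinned") host memory cannot be swapped out, so the OS
    # may refuse large reservations; treat failure as "stop pinning here".
    try:
        return torch.empty(size_bytes, dtype=torch.uint8, pin_memory=True, device="cpu")
    except RuntimeError:
        print(f"Unable to pin {size_bytes / ONE_MB:.2f} MB, falling back to pageable memory")
        return None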
@@ -561,13 +544,6 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
 
         total += size
 
-        # if big_tensor_no != prev_big_tensor:
-        #     gc.collect()
-        #     prev_big_tensor = big_tensor_no
-        # match_param, match_isbuffer = tied_weights.get(n, (None, False))
-        # if match_param != None:
-
-        # if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
         current_big_tensor = big_tensors[big_tensor_no]
         if is_buffer :
             _force_load_buffer(p) # otherwise potential memory leak
@@ -600,9 +576,9 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
 
     if verboseLevel >=1:
         if partialPinning or failed_planned_allocation:
-            print(f"The model was partially pinned to reserved RAM: {
+            print(f"The model was partially pinned to reserved RAM: {last_allocated_big_tensor + 1} large blocks spread across {total/ONE_MB:.2f} MB")
         else:
-            print(f"The whole model was pinned to reserved RAM: {
+            print(f"The whole model was pinned to reserved RAM: {last_allocated_big_tensor + 1} large blocks spread across {total/ONE_MB:.2f} MB")
 
     model._already_pinned = True
 
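For context on why the function bothers reporting how much of the model ended up pinned: copies from pinned host memory are the ones that can run asynchronously. A tiny illustration, not part of mmgp, assuming a CUDA device is available:

import torch

if torch.cuda.is_available():
    pinned = torch.randn(1024, 1024, pin_memory=True)  # page-locked source
    # non_blocking only has an effect when the CPU source is pinned; the
    # host-to-device copy can then overlap with other queued GPU work
    on_gpu = pinned.to("cuda", non_blocking=True)
    torch.cuda.synchronize()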
@@ -615,7 +591,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.3.2) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.3.4) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def _extract_num_from_str(num_in_str):
     size = len(num_in_str)
@@ -901,17 +877,15 @@ def split_linear_modules(model, map ):
 
 def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, check_only = False, ignore_model_variations = False, pinnedLora = False, split_linear_modules_map = None, preprocess_sd = None, verboseLevel = -1,):
     verboseLevel = _compute_verbose_level(verboseLevel)
-    modules_dict = {k: v for k,v in model.named_modules()}
 
+    loras_model_data = getattr(model, "_loras_model_data", None)
+    if loras_model_data == None:
+        raise Exception(f"No Loras has been declared for this model while creating the corresponding offload object")
+
     if not check_only:
-
-
-
-        model._loras_active_adapters = loras_active_adapters
-        loras_scaling = dict()
-        model._loras_scaling = loras_scaling
-        loras_tied_weights = dict()
-        model._loras_tied_weights = loras_tied_weights
+        unload_loras_from_model(model)
+
+    modules_dict = {k: v for k,v in model.named_modules()}
 
     CrLf = '\r\n'
     error_msg = ""
@@ -949,10 +923,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
             continue
         fail = False
         skip = False
-        state_dict = safetensors2.torch_load_file(path)
-
-
-
+        state_dict = safetensors2.torch_load_file(path, writable_tensors= False)
 
         if preprocess_sd != None:
             state_dict = preprocess_sd(state_dict)
@@ -1069,9 +1040,10 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
                     break
             if not check_only:
                 loras_module_data = loras_model_data.get(module, None)
-
-
-
+                assert loras_module_data != None
+                # if loras_module_data == None:
+                #     loras_module_data = dict()
+                #     loras_model_data[module] = loras_module_data
                 loras_adapter_data = loras_module_data.get(adapter_name, None)
                 lora_A = None if lora_A == None else lora_A.to(torch.bfloat16)
                 lora_B = None if lora_B == None else lora_B.to(torch.bfloat16)
@@ -1132,12 +1104,17 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
     return new_lora_path
 
 def unload_loras_from_model(model):
-    model._loras_model_data
+    for _, v in model._loras_model_data.items():
+        v.clear()
+
+    model._loras_active_adapters = set()
+    model._loras_scaling = dict()
+    model._loras_tied_weights = dict()
     model._loras_errors = None
     model._loras_adapters = None
-    model._loras_active_adapters = None
     model._loras_scaling = None
 
+
 def set_step_no_for_lora(model, step_no):
     model._lora_step_no = step_no
 
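The rewritten unload_loras_from_model clears each per-module dict in place (v.clear()) instead of rebinding model._loras_model_data. That matters because hook_lora_linear (next hunk) captures each module's dict in a closure: clearing in place empties the very dict the closure holds, so every hooked Linear falls back to its original forward without any re-hooking. A reduced sketch of the idea, with illustrative names:

def hook(module_dicts, submodule):
    data = {}                      # captured by the closure below
    module_dicts[submodule] = data
    old_forward = submodule.forward
    def forward(*args, **kwargs):
        if len(data) == 0:         # empty after unload -> plain forward
            return old_forward(*args, **kwargs)
        ...                        # otherwise apply the LoRA adapters
    return forward

def unload(module_dicts):
    for d in module_dicts.values():
        d.clear()                  # in place, so every closure observes it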
@@ -1881,14 +1858,14 @@ class offload:
             return result
 
 
-    def hook_lora_linear(self, submodule, current_model, model_id, submodule_name):
+    def hook_lora_linear(self, submodule, current_model, model_id, loras_model_data, submodule_name):
         old_forward = submodule.forward
+
+        loras_data = {}
+        loras_model_data[submodule] = loras_data
+
         def lora_linear_forward(module, *args, **kwargs):
-
-            loras_data = None
-            if loras_model_data != None:
-                loras_data = loras_model_data.get(submodule, None)
-            if loras_data == None:
+            if len(loras_data) == 0:
                 return old_forward(*args, **kwargs)
             else:
                 return self._lora_linear_forward(current_model, submodule, loras_data, *args, **kwargs)
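When loras_data is non-empty the hook dispatches to self._lora_linear_forward. Conceptually, a LoRA-augmented linear layer computes the frozen base output plus each active adapter's low-rank update; a sketch of that computation follows (not mmgp's implementation; the shapes are the conventional ones suggested by the lora_A / lora_B tensors loaded above):

import torch

def lora_linear(base: torch.nn.Linear, x: torch.Tensor, adapters: dict, scaling: dict):
    # adapters maps name -> (lora_A, lora_B), with A: (r, in_f), B: (out_f, r)
    y = base(x)
    for name, (lora_A, lora_B) in adapters.items():
        # low-rank update: x @ A^T -> (.., r), then @ B^T -> (.., out_f)
        y = y + scaling.get(name, 1.0) * (x @ lora_A.T @ lora_B.T)
    return y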
@@ -2295,7 +2272,10 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
             current_budget = model_budgets[model_id]
             cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False
             self.loaded_blocks[model_id] = None
-            any_lora = loras !=None and model_id in loras
+            any_lora = loras !=None and model_id in loras
+            if any_lora:
+                loras_model_data = {}
+                current_model._loras_model_data = loras_model_data
             for submodule_name, submodule in current_model.named_modules():
                 # create a fake 'accelerate' parameter so that the _execution_device property returns always "cuda"
                 # (it is queried in many pipelines even if offloading is not properly implemented)
@@ -2328,7 +2308,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
 
                 if hasattr(submodule, "forward"):
                     if any_lora and isinstance(submodule, torch.nn.Linear):
-                        submodule_method = self.hook_lora_linear(submodule, current_model, model_id, submodule_name)
+                        submodule_method = self.hook_lora_linear(submodule, current_model, model_id, loras_model_data, submodule_name)
                     else:
                         submodule_method = getattr(submodule, "forward")
                     if callable(submodule_method):
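Taken together, 3.3.4 moves the LoRA bookkeeping from load time to hook time: offload.all(...) now attaches _loras_model_data while installing the Linear hooks, and load_loras_into_model refuses to run unless that setup happened. A hypothetical call sequence implied by these hunks; the argument values are assumptions, not documented API:

from mmgp import offload

# offload.all must declare LoRA support for a model first; per this diff it
# creates model._loras_model_data and hooks every torch.nn.Linear
offload.all(pipe, loras=["transformer"])          # assumed model_id

# loading now works; calling it on an undeclared model raises an Exception
lora_path = ["my_lora.safetensors"]               # hypothetical file
offload.load_loras_into_model(pipe.transformer, lora_path, lora_multi=[1.0])

# unloading clears the per-module dicts in place, reverting hooked Linears
offload.unload_loras_from_model(pipe.transformer)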
{mmgp-3.3.2.dist-info → mmgp-3.3.4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.3.2
+Version: 3.3.4
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -17,7 +17,7 @@ Dynamic: license-file
 
 
 <p align="center">
-    <H2>Memory Management 3.3.2 for the GPU Poor by DeepBeepMeep</H2>
+    <H2>Memory Management 3.3.4 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
mmgp-3.3.4.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+mmgp/offload.py,sha256=WpQK1af2g0qcAm32EguTX8oBHZGKumPX2EqYS-df69Y,106583
+mmgp/safetensors2.py,sha256=rmUbBmK3Dra5prUTTRSVi6-XUFAa9Mj6B5CNPgzt9To,17333
+mmgp-3.3.4.dist-info/licenses/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+mmgp-3.3.4.dist-info/METADATA,sha256=Yk2eSpNITRDHK0lclsP6VXhW0_5hkUNVvXSfk25f7Ds,16154
+mmgp-3.3.4.dist-info/WHEEL,sha256=DK49LOLCYiurdXXOXwGJm6U4DkHkg4lcxjhqwRa0CP4,91
+mmgp-3.3.4.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+mmgp-3.3.4.dist-info/RECORD,,
mmgp-3.3.2.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
-mmgp/offload.py,sha256=43FnFfWqwhh2qz0uykqEpxb_XP9Jx8MPGzN31PExT2w,107470
-mmgp/safetensors2.py,sha256=rmUbBmK3Dra5prUTTRSVi6-XUFAa9Mj6B5CNPgzt9To,17333
-mmgp-3.3.2.dist-info/licenses/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
-mmgp-3.3.2.dist-info/METADATA,sha256=mVMLkutqhUihIeo8uo_LK71ithm84_AEaNvnyRnzmEA,16153
-mmgp-3.3.2.dist-info/WHEEL,sha256=DK49LOLCYiurdXXOXwGJm6U4DkHkg4lcxjhqwRa0CP4,91
-mmgp-3.3.2.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
-mmgp-3.3.2.dist-info/RECORD,,
{mmgp-3.3.2.dist-info → mmgp-3.3.4.dist-info}/WHEEL
File without changes
{mmgp-3.3.2.dist-info → mmgp-3.3.4.dist-info}/licenses/LICENSE.md
File without changes
{mmgp-3.3.2.dist-info → mmgp-3.3.4.dist-info}/top_level.txt
File without changes