mmgp 3.5.10.tar.gz → 3.5.12.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mmgp has been flagged as potentially problematic; see the registry's advisory for more details.
- {mmgp-3.5.10 → mmgp-3.5.12}/LICENSE.md +1 -1
- {mmgp-3.5.10/src/mmgp.egg-info → mmgp-3.5.12}/PKG-INFO +2 -2
- {mmgp-3.5.10 → mmgp-3.5.12}/README.md +1 -1
- {mmgp-3.5.10 → mmgp-3.5.12}/pyproject.toml +1 -1
- {mmgp-3.5.10 → mmgp-3.5.12}/src/mmgp/offload.py +49 -11
- {mmgp-3.5.10 → mmgp-3.5.12}/src/mmgp/safetensors2.py +13 -3
- {mmgp-3.5.10 → mmgp-3.5.12/src/mmgp.egg-info}/PKG-INFO +2 -2
- {mmgp-3.5.10 → mmgp-3.5.12}/setup.cfg +0 -0
- {mmgp-3.5.10 → mmgp-3.5.12}/src/__init__.py +0 -0
- {mmgp-3.5.10 → mmgp-3.5.12}/src/mmgp/__init__.py +0 -0
- {mmgp-3.5.10 → mmgp-3.5.12}/src/mmgp.egg-info/SOURCES.txt +0 -0
- {mmgp-3.5.10 → mmgp-3.5.12}/src/mmgp.egg-info/dependency_links.txt +0 -0
- {mmgp-3.5.10 → mmgp-3.5.12}/src/mmgp.egg-info/requires.txt +0 -0
- {mmgp-3.5.10 → mmgp-3.5.12}/src/mmgp.egg-info/top_level.txt +0 -0
{mmgp-3.5.10 → mmgp-3.5.12}/LICENSE.md

@@ -1,2 +1,2 @@
-GNU GENERAL PUBLIC LICENSE
+GNU GENERAL PUBLIC LICENSE
 Version 3, 29 June 2007

(The visible text is identical on both sides; the change appears to be whitespace-only.)
{mmgp-3.5.10/src/mmgp.egg-info → mmgp-3.5.12}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.10
+Version: 3.5.12
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file


 <p align="center">
-<H2>Memory Management 3.5.10 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.5.12 for the GPU Poor by DeepBeepMeep</H2>
 </p>

{mmgp-3.5.10 → mmgp-3.5.12}/src/mmgp/offload.py

@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.5.10 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.5.12 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -66,7 +66,6 @@ from accelerate import init_empty_weights

 import functools
 import types
-from functools import lru_cache
 import torch

@@ -90,6 +89,23 @@ class QEmbedding(QModuleMixin, torch.nn.Embedding):

 shared_state = {}

+def get_cache(cache_name):
+    all_cache = shared_state.get("_cache", None)
+    if all_cache is None:
+        all_cache = {}
+        shared_state["_cache"]= all_cache
+    cache = all_cache.get(cache_name, None)
+    if cache is None:
+        cache = {}
+        all_cache[cache_name] = cache
+    return cache
+
+def clear_caches():
+    all_cache = shared_state.get("_cache", None)
+    if all_cache is not None:
+        all_cache.clear()
+
+
 mmm = safetensors2.mmm

 default_verboseLevel = 1
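A minimal usage sketch of the new cache helpers, assuming only what the added code above shows (the cache name "rope" is purely illustrative):

from mmgp.offload import get_cache, clear_caches

cache = get_cache("rope")            # creates the named cache on first access
cache["freqs"] = 123                 # later calls return the same dict
assert get_cache("rope")["freqs"] == 123
clear_caches()                       # drops every named cache held in shared_state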
@@ -623,6 +639,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
         total += size

         current_big_tensor = big_tensors[big_tensor_no]
+
         if is_buffer :
             _force_load_buffer(p) # otherwise potential memory leak
         if isinstance(p, QTensor):
@@ -671,7 +688,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.10) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.12) by DeepBeepMeep ************{ENDC}{UNBOLD}")

 def change_dtype(model, new_dtype, exclude_buffers = False):
     for submodule_name, submodule in model.named_modules():
@@ -1032,7 +1049,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora

     if split_linear_modules_map != None:
         new_state_dict = dict()
-        suffixes = [(".alpha", -2, False), (".lora_B.weight", -3, True), (".lora_A.weight", -3, False)]
+        suffixes = [(".alpha", -2, False), (".lora_B.weight", -3, True), (".lora_A.weight", -3, False), (".lora_up.weight", -3, True), (".lora_down.weight", -3, False)]
         for module_name, module_data in state_dict.items():
             name_parts = module_name.split(".")
             for suffix, pos, any_split in suffixes:
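Context for the two new suffix entries, restated outside the diff (general LoRA naming, not mmgp-specific code): Kohya-style checkpoints name the two low-rank factors lora_up / lora_down, while PEFT-style checkpoints call the same factors lora_B / lora_A, which is why each new tuple mirrors the flags of its existing counterpart.

# Assumed correspondence, illustrated as data only:
kohya_to_peft = {
    ".lora_up.weight": ".lora_B.weight",     # output-side factor
    ".lora_down.weight": ".lora_A.weight",   # input-side factor
}
for kohya, peft in kohya_to_peft.items():
    print(f"{kohya} plays the role of {peft}")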
@@ -1306,7 +1323,7 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantiza
     model_path = [model_path]


-    if not builtins.all(file_name.endswith(".sft") or file_name.endswith(".safetensors") or file_name.endswith(".pt") for file_name in model_path):
+    if not builtins.all(file_name.endswith(".sft") or file_name.endswith(".safetensors") or file_name.endswith(".pt") or file_name.endswith(".ckpt") for file_name in model_path):
         raise Exception("full model path to file expected")

     model_path = [ _get_model(file) for file in model_path]
@@ -1314,7 +1331,7 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantiza
         raise Exception("Unable to find file")

     verboseLevel = _compute_verbose_level(verboseLevel)
-    if model_path[-1].endswith(".pt"):
+    if model_path[-1].endswith(".pt") or model_path[-1].endswith(".ckpt"):
         metadata = None
     else:
         with safetensors2.safe_open(model_path[-1], writable_tensors =writable_tensors) as f:
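A standalone restatement of the relaxed extension gate (not mmgp code; the file names are made up): with 3.5.12, paths ending in ".ckpt" pass the check that previously accepted only ".sft", ".safetensors" and ".pt", and, like ".pt" files, they are loaded without safetensors metadata.

accepted = (".sft", ".safetensors", ".pt", ".ckpt")
paths = ["text_encoder.safetensors", "vae.ckpt"]
print(all(p.endswith(accepted) for p in paths))  # True: str.endswith accepts a tuple of suffixes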
@@ -2481,7 +2498,7 @@ def {fname}(module, *args, **kwargs):



-def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
+def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, vram_safety_coefficient = 0.8, verboseLevel = -1):
     """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
     pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
     quantizeTransformer: set True by default will quantize on the fly the video / image model
@@ -2490,6 +2507,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
     budgets: 0 by default (unlimited). If non 0, it corresponds to the maximum size in MB that every model will occupy at any moment
      (in fact the real usage is twice this number). It is very efficient to reduce VRAM consumption but this feature may be very slow
      if pinnedMemory is not enabled
+    vram_safety_coefficient: float between 0 and 1 (exclusive), default 0.8. Sets the maximum portion of VRAM that can be used for models.
+     Lower values provide more safety margin but may reduce performance.
     """
     self = offload()
     self.verboseLevel = verboseLevel
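A hedged usage sketch of the new argument (the pipeline variable and the chosen value are illustrative; only the argument name, its 0.8 default and its valid range come from the diff):

from mmgp import offload

pipe = ...  # a diffusers-style pipeline or dict of modules created elsewhere
# Cap automatic model placement at 70% of VRAM instead of the 0.8 default,
# leaving a larger working margin; values outside (0, 1) raise a ValueError.
offload.all(pipe, vram_safety_coefficient = 0.7)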
@@ -2505,7 +2524,11 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
             return float(b[:-1]) * self.device_mem_capacity
         else:
             return b * ONE_MB
-
+
+    # Validate vram_safety_coefficient
+    if not isinstance(vram_safety_coefficient, float) or vram_safety_coefficient <= 0 or vram_safety_coefficient >= 1:
+        raise ValueError("vram_safety_coefficient must be a float between 0 and 1 (exclusive)")
+
     budget = 0
     if not budgets is None:
         if isinstance(budgets , dict):
@@ -2650,14 +2673,14 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
         model_budget = new_budget if model_budget == 0 or new_budget < model_budget else model_budget
         if model_budget > 0 and model_budget > current_model_size:
             model_budget = 0
-        coef =
+        coef =vram_safety_coefficient
         if current_model_size > coef * self.device_mem_capacity and model_budget == 0 or model_budget > coef * self.device_mem_capacity:
             if verboseLevel >= 1:
                 if model_budget == 0:
-                    print(f"Model '{model_id}' is too large ({current_model_size/ONE_MB:0.1f} MB) to fit entirely in {coef * 100}% of the VRAM (max capacity is {coef * self.device_mem_capacity/ONE_MB}) MB)")
+                    print(f"Model '{model_id}' is too large ({current_model_size/ONE_MB:0.1f} MB) to fit entirely in {coef * 100:.0f}% of the VRAM (max capacity is {coef * self.device_mem_capacity/ONE_MB:0.1f}) MB)")
                 else:
                     print(f"Budget ({budget/ONE_MB:0.1f} MB) for Model '{model_id}' is too important so that this model can fit in the VRAM (max capacity is {self.device_mem_capacity/ONE_MB}) MB)")
-                print(f"Budget allocation for this model has been consequently reduced to the
+                print(f"Budget allocation for this model has been consequently reduced to the {coef * 100:.0f}% of max GPU Memory ({coef * self.device_mem_capacity/ONE_MB:0.1f} MB). This may not leave enough working VRAM and you will probably need to define manually a lower budget for this model.")
             model_budget = coef * self.device_mem_capacity

@@ -2681,6 +2704,21 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                     print(f"Model '{model_id}' already pinned to reserved memory")
                 else:
                     _pin_to_memory(current_model, model_id, partialPinning= partialPinning, pinnedPEFTLora = pinnedPEFTLora, perc_reserved_mem_max = perc_reserved_mem_max, verboseLevel=verboseLevel)
+        # empty_tensor = torch.empty((1,))
+        # for sub_module_name, sub_module in current_model.named_modules():
+        #     for k, p in sub_module.named_parameters(recurse=False):
+        #         if p is not None:
+        #             if isinstance(p, QTensor):
+        #                 p._data.data = empty_tensor
+        #                 p._scale.data = empty_tensor
+        #             else:
+        #                 p.data = empty_tensor
+        #         del k
+        #     for k, v in sub_module.named_buffers(recurse=False):
+        #         del k
+        #     sub_module = None
+        #     v = None
+        # gc.collect()
         current_budget = model_budgets[model_id]
         cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False
         self.loaded_blocks[model_id] = None
{mmgp-3.5.10 → mmgp-3.5.12}/src/mmgp/safetensors2.py

@@ -46,7 +46,16 @@ class MmapTracker:
         file_path = os.path.join(*s)
         self.file_path = file_path # os.path.abspath(file_path)
         self.count = 0
-
+        key = file_path
+        i = 1
+        while True:
+            if key not in mmm:
+                mmm[key] = self
+                break
+            i +=1
+            key = key + "#" + str(i)
+        self.mmm_key = key
+        # print(f"MMAP Add: {file_path}: {mmm.keys()}")

     def register(self, mmap_obj, map_id, start, size):

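A standalone illustration of the de-duplication scheme used by the new __init__ code (this is not mmgp code, it only mirrors the key construction above): trackers for the same file path are registered under path, path#2, path#3, and so on, and each instance remembers its own key so the release path can delete exactly the right entry.

registry = {}

def unique_key(path, registry):
    key, i = path, 1
    while key in registry:
        i += 1
        key = f"{path}#{i}"
    return key

for _ in range(3):
    registry[unique_key("model.safetensors", registry)] = object()

print(sorted(registry))  # ['model.safetensors', 'model.safetensors#2', 'model.safetensors#3']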
@@ -61,7 +70,8 @@ class MmapTracker:

             print(f"MMap Manager of file '{self.file_path}' : MMap no {map_id} has been released" + text)
         if self.count == self._already_released:
-
+            # print(f"MMAP Del: {self.file_path}: {mmm.keys()}")
+            del mmm[self.mmm_key ]

         self._maps.pop(map_id, None)

@@ -240,7 +250,7 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None, extr
             t = t.view(torch.uint16)
         elif dtype == torch.float8_e5m2 or dtype == torch.float8_e4m3fn:
             t = t.view(torch.uint8)
-        buffer = t.numpy().tobytes()
+        buffer = t.cpu().numpy().tobytes()
         bytes_written = writer.write(buffer)
         assert bytes_written == size
         i+=1
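Why the added .cpu() matters, shown with plain PyTorch rather than mmgp code: calling .numpy() on a CUDA tensor raises a TypeError, while moving the tensor to host memory first works for both CPU and CUDA tensors.

import torch

t = torch.ones(4, dtype = torch.float32)
if torch.cuda.is_available():
    t = t.cuda()                     # without .cpu(), t.numpy() would raise a TypeError here
buffer = t.cpu().numpy().tobytes()   # safe for both CPU and CUDA tensors
print(len(buffer))                   # 16 bytes: four float32 values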
{mmgp-3.5.10 → mmgp-3.5.12/src/mmgp.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.10
+Version: 3.5.12
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file


 <p align="center">
-<H2>Memory Management 3.5.10 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.5.12 for the GPU Poor by DeepBeepMeep</H2>
 </p>

The remaining files (setup.cfg, src/__init__.py, src/mmgp/__init__.py, and the src/mmgp.egg-info SOURCES.txt, dependency_links.txt, requires.txt and top_level.txt) are unchanged between the two releases.