mmgp 3.5.6.tar.gz → 3.5.8.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mmgp might be problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.6
+Version: 3.5.8
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.5.6 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.5.8 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
@@ -1,6 +1,6 @@
 
 <p align="center">
-<H2>Memory Management 3.5.6 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.5.8 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
@@ -1,6 +1,6 @@
 [project]
 name = "mmgp"
-version = "3.5.6"
+version = "3.5.8"
 authors = [
   { name = "deepbeepmeep", email = "deepbeepmeep@yahoo.com" },
 ]
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.5.6 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.5.8 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -64,6 +64,11 @@ import psutil
 import builtins
 from accelerate import init_empty_weights
 
+import functools
+import types
+from functools import lru_cache
+import torch
+
 
 from mmgp import safetensors2
 from mmgp import profile_type
@@ -122,8 +127,6 @@ class clock:
     def format_time_gap(self):
         return f"{self.stop_time - self.start_time:.2f}s"
 
-
-
 # useful functions to move a group of tensors (to design custom offload patches)
 def move_tensors(obj, device):
     if torch.is_tensor(obj):
@@ -668,7 +671,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.6) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.8) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def change_dtype(model, new_dtype, exclude_buffers = False):
     for submodule_name, submodule in model.named_modules():
@@ -1295,7 +1298,7 @@ def move_loras_to_device(model, device="cpu" ):
         if ".lora_" in k:
             m.to(device)
 
-def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, defaultConfigPath = None, modelClass=None, modelPrefix = None, writable_tensors = True, verboseLevel = -1, modules = None, return_shared_modules = None, configKwargs ={}):
+def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, defaultConfigPath = None, modelClass=None, modelPrefix = None, writable_tensors = True, verboseLevel = -1, preprocess_sd = None, modules = None, return_shared_modules = None, configKwargs ={}):
     """
     quick version of .LoadfromPretrained of the transformers library
     used to build a model and load the corresponding weights (quantized or not)
@@ -1383,13 +1386,13 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantiza
 
     model._config = transformer_config
 
-    load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, modelPrefix = modelPrefix, writable_tensors =writable_tensors, modules = modules, return_shared_modules = return_shared_modules, verboseLevel=verboseLevel )
+    load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, modelPrefix = modelPrefix, writable_tensors =writable_tensors, preprocess_sd = preprocess_sd , modules = modules, return_shared_modules = return_shared_modules, verboseLevel=verboseLevel )
 
     return model
 
 
 
-def load_model_data(model, file_path, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, modules = None, return_shared_modules = None, verboseLevel = -1):
+def load_model_data(model, file_path, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, preprocess_sd = None, modules = None, return_shared_modules = None, verboseLevel = -1):
     """
     Load a model, detect if it has been previously quantized using quanto and do the extra setup if necessary
     """
@@ -1506,6 +1509,9 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
         full_state_dict, full_quantization_map, full_tied_weights_map = None, None, None
 
     # deal if we are trying to load just a sub part of a larger model
+    if preprocess_sd != None:
+        state_dict, quantization_map = preprocess_sd(state_dict, quantization_map)
+
     if modelPrefix != None:
         base_model_prefix = modelPrefix + "."
         state_dict = filter_state_dict(state_dict,base_model_prefix)
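
The hunk above is where the new preprocess_sd argument (threaded through fast_load_transformers_model and load_model_data in the previous hunks) takes effect: if a callable is supplied, it receives the checkpoint's state dict and its quanto quantization map and returns the pair, before any prefix filtering or quantization setup runs. Below is a minimal sketch of such a callback; the "model." prefix rename and the commented call site are illustrative assumptions, only the (state_dict, quantization_map) in/out contract comes from the diff.

# Hypothetical preprocess_sd callback: strip a legacy "model." key prefix before
# the weights are filtered and loaded. Per this release, load_model_data() calls it as
# state_dict, quantization_map = preprocess_sd(state_dict, quantization_map).
def strip_legacy_prefix(state_dict, quantization_map):
    state_dict = {k.removeprefix("model."): v for k, v in state_dict.items()}
    if quantization_map is not None:
        quantization_map = {k.removeprefix("model."): v for k, v in quantization_map.items()}
    return state_dict, quantization_map

# Standalone check with plain dicts (real use passes tensors and a quanto map):
sd = {"model.blocks.0.weight": 0, "model.blocks.0.bias": 1}
print(strip_legacy_prefix(sd, None))
# Intended call site (argument added in 3.5.8), for example:
# offload.fast_load_transformers_model(model_path, preprocess_sd=strip_legacy_prefix)
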
@@ -1756,6 +1762,7 @@ class offload:
         global last_offload_obj
         last_offload_obj = self
 
+        self._type_wrappers = {}
 
     def add_module_to_blocks(self, model_id, blocks_name, submodule, prev_block_name, submodule_name):
 
@@ -2203,7 +2210,7 @@ class offload:
            if len(loras_data) == 0:
                return old_forward(*args, **kwargs)
            else:
-                submodule.aaa = submodule_name
+                #submodule.aaa = submodule_name # just for debugging if uncommented will cause pytorch recompilation
                return self._lora_linear_forward(current_model, submodule, loras_data, *args, **kwargs)
        target_fn = lora_linear_forward
    else:
@@ -2235,10 +2242,63 @@
 
        # need to be registered before the forward not to be break the efficiency of the compilation chain
        # it should be at the top of the compilation as this type of hook in the middle of a chain seems to break memory performance
-        target_module.register_forward_pre_hook(preload_blocks_for_compile)
+        target_module.register_forward_pre_hook(preload_blocks_for_compile)
 
 
-    def hook_check_empty_cache_needed(self, target_module, model, model_id, blocks_name, previous_method, context):
+
+
+    @torch._dynamo.disable
+    def _pre_check(self, module):
+        model_id = getattr(module, "_mm_model_id", None)
+        blocks_name = getattr(module, "_mm_blocks_name", None)
+
+        self.ensure_model_loaded(model_id)
+        if blocks_name is None:
+            if self.ready_to_check_mem():
+                self.empty_cache_if_needed()
+        elif blocks_name != self.loaded_blocks[model_id] and \
+             blocks_name not in self.preloaded_blocks_per_model[model_id]:
+            self.gpu_load_blocks(model_id, blocks_name)
+
+    def _get_wrapper_for_type(self, mod_cls):
+        fn = self._type_wrappers.get(mod_cls)
+        if fn is not None:
+            return fn
+
+        # Unique function name per class -> unique compiled code object
+        fname = f"_mm_wrap_{mod_cls.__module__.replace('.', '_')}_{mod_cls.__name__}"
+
+        # Keep body minimal; all heavy/offload logic runs out-of-graph in _pre_check
+        # Include __TYPE_CONST in the code so the bytecode/consts differ per class.
+        src = f"""
+def {fname}(module, *args, **kwargs):
+    _ = __TYPE_CONST  # anchor type as a constant to make code object unique per class
+    mgr = module._mm_manager
+    mgr._pre_check(module)
+    return module._mm_forward(*args, **kwargs)
+"""
+        ns = {"__TYPE_CONST": mod_cls}
+        exec(src, ns)  # compile a new function object/code object for this class
+        fn = ns[fname]
+        self._type_wrappers[mod_cls] = fn
+        return fn
+
+    def hook_check_load_into_GPU_if_needed(
+        self, target_module, model, model_id, blocks_name, previous_method, context
+    ):
+        # store instance data on the module (not captured by the wrapper)
+        target_module._mm_manager = self
+        target_module._mm_model_id = model_id
+        target_module._mm_blocks_name = blocks_name
+        target_module._mm_forward = previous_method
+
+        # per-TYPE wrapper (unique bytecode per class, reused across instances of that class)
+        wrapper_fn = self._get_wrapper_for_type(type(target_module))
+
+        # bind as a bound method (no partial/closures)
+        target_module.forward = types.MethodType(wrapper_fn, target_module)
+
+    def hook_check_load_into_GPU_if_needed_default(self, target_module, model, model_id, blocks_name, previous_method, context):
 
        dtype = model._dtype
        qint4quantization = isinstance(target_module, QModuleMixin) and target_module.weight!= None and target_module.weight.qtype == qint4
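
The new _pre_check / _get_wrapper_for_type pair is arranged so that torch.compile sees a distinct, minimal code object per module class: the offload bookkeeping runs out-of-graph behind torch._dynamo.disable, while the exec-generated wrapper is cached once per class and bound to each instance with types.MethodType. Here is a standalone sketch of that caching-and-binding pattern, with illustrative names rather than mmgp's own.

import types

_wrappers = {}  # one generated wrapper per class, mirroring self._type_wrappers

def get_wrapper_for_type(cls):
    fn = _wrappers.get(cls)
    if fn is None:
        fname = f"_wrap_{cls.__name__}"
        src = (
            f"def {fname}(module, *args, **kwargs):\n"
            f"    _ = __TYPE_CONST  # per-class constant -> distinct code object\n"
            f"    return module._inner(*args, **kwargs)\n"
        )
        ns = {"__TYPE_CONST": cls}
        exec(src, ns)                 # new function/code object for this class only
        fn = _wrappers[cls] = ns[fname]
    return fn

class Linearish:
    def _inner(self, x):
        return x * 2

m = Linearish()
# bound method, no closures or functools.partial involved
m.forward = types.MethodType(get_wrapper_for_type(Linearish), m)
print(m.forward(21))                  # 42; every Linearish instance reuses this wrapper
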
@@ -2258,25 +2318,35 @@
            target_module.forward = target_module._mm_forward
            return
 
-        def check_empty_cuda_cache(module, *args, **kwargs):
+        def check_load_into_GPU_needed():
            self.ensure_model_loaded(model_id)
            if blocks_name == None:
                if self.ready_to_check_mem():
                    self.empty_cache_if_needed()
            elif blocks_name != self.loaded_blocks[model_id] and blocks_name not in self.preloaded_blocks_per_model[model_id]:
                self.gpu_load_blocks(model_id, blocks_name)
-            if qint4quantization and dtype !=None:
-                args, kwargs = self.move_args_to_gpu(dtype, *args, **kwargs)
-
-            return previous_method(*args, **kwargs)
+            # if qint4quantization and dtype !=None:
+            #     args, kwargs = self.move_args_to_gpu(dtype, *args, **kwargs)
+
+        if isinstance(target_module, torch.nn.Linear):
+            def check_load_into_GPU_needed_linear(module, *args, **kwargs):
+                check_load_into_GPU_needed()
+                return previous_method(*args, **kwargs)
+            check_load_into_GPU_needed_module = check_load_into_GPU_needed_linear
+        else:
+            def check_load_into_GPU_needed_other(module, *args, **kwargs):
+                check_load_into_GPU_needed()
+                return previous_method(*args, **kwargs)
+            check_load_into_GPU_needed_module = check_load_into_GPU_needed_other
 
        setattr(target_module, "_mm_id", model_id)
        setattr(target_module, "_mm_forward", previous_method)
 
-        setattr(target_module, "forward", functools.update_wrapper(functools.partial(check_empty_cuda_cache, target_module), previous_method) )
+        setattr(target_module, "forward", functools.update_wrapper(functools.partial(check_load_into_GPU_needed_module, target_module), previous_method) )
+        # target_module.register_forward_pre_hook(check_empty_cuda_cache)
 
 
-    def hook_change_module(self, target_module, model, model_id, module_id, previous_method):
+    def hook_change_module(self, target_module, model, model_id, module_id, previous_method, previous_method_name ):
        if hasattr(target_module, "_lock_dtype"):
            dtype = target_module._lock_dtype
        else:
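
For modules that are not routed through the compiled path, the renamed default hook keeps the earlier mechanism: the replacement forward is a functools.partial that carries the module as its first argument, and functools.update_wrapper copies the original forward's metadata onto it. A compact sketch of that wrapping pattern on a bare nn.Linear, assuming nothing beyond the standard library and PyTorch (the comment stands in for the real block-loading logic):

import functools
import torch

lin = torch.nn.Linear(4, 4)
original_forward = lin.forward          # bound method kept for the wrapper to call

def wrapped_forward(module, *args, **kwargs):
    # real hook: make sure the module's weights / block are on the GPU first
    return original_forward(*args, **kwargs)

# same shape as the setattr(...) calls above: partial binds the module,
# update_wrapper makes the override look like the original forward
lin.forward = functools.update_wrapper(functools.partial(wrapped_forward, lin), original_forward)
print(lin(torch.randn(1, 4)).shape)     # torch.Size([1, 4])
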
@@ -2289,16 +2359,17 @@
            args, kwargs = self.move_args_to_gpu(dtype, *args, **kwargs)
            return previous_method(*args, **kwargs)
 
-        if hasattr(target_module, "_mm_id"):
+        if hasattr(target_module, "_mm_" + previous_method_name):
            return
-        setattr(target_module, "_mm_id", model_id)
+        setattr(target_module, "_mm_Id", model_id)
+        setattr(target_module, "_mm_" + previous_method_name, previous_method)
 
-        setattr(target_module, "forward", functools.update_wrapper(functools.partial(check_change_module, target_module), previous_method) )
+        setattr(target_module, previous_method_name, functools.update_wrapper(functools.partial(check_change_module, target_module), previous_method) )
 
        if not self.verboseLevel >=1:
            return
 
-        if module_id == None or module_id =='':
+        if previous_method_name =="forward" and (module_id == None or module_id ==''):
            model_name = model._get_name()
            print(f"Hooked to model '{model_id}' ({model_name})")
 
@@ -2605,19 +2676,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
    for model_id in models:
        current_model: torch.nn.Module = models[model_id]
        towers_names, towers_modules = _detect_main_towers(current_model)
-        # compile main iterative modules stacks ("towers")
        compilationInThisOne = compileAllModels or model_id in modelsToCompile
-        if compilationInThisOne:
-            if self.verboseLevel>=1:
-                if len(towers_modules)>0:
-                    formated_tower_names = [name + '*' for name in towers_names]
-                    print(f"Pytorch compilation of '{model_id}' is scheduled for these modules : {formated_tower_names}.")
-                else:
-                    print(f"Pytorch compilation of model '{model_id}' is not yet supported.")
-
-            for submodel in towers_modules:
-                submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode="default" ) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
-                #dynamic=True,
 
        if pinAllModels or model_id in modelsToPin:
            if hasattr(current_model,"_already_pinned"):
@@ -2661,24 +2720,42 @@
                cur_blocks_prefix, prev_blocks_name, cur_blocks_seq, is_mod_seq = pre, None, num, False
                cur_blocks_name = submodule_name
                # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
-
-
-            if hasattr(submodule, "forward"):
-                # if any_lora and isinstance(submodule, ( torch.nn.Linear, torch.nn.Conv3d, torch.nn.LayerNorm)):
-                if any_lora and hasattr(submodule,"weight"):
+            top_submodule = len(submodule_name.split("."))==1
+            offload_hooks = submodule._offload_hooks if hasattr(submodule, "_offload_hooks") else []
+            assert top_submodule or len(offload_hooks) == 0, "custom offload hooks can only be set at the of the module"
+            submodule_method_names = ["forward"] + offload_hooks
+            for submodule_method_name in submodule_method_names:
+                if not hasattr(submodule, submodule_method_name ): continue
+                if submodule_method_name == "forward" and any_lora and hasattr(submodule,"weight"):
                    submodule_method = self.hook_lora(submodule, current_model, model_id, loras_model_data, loras_model_shortcuts, submodule_name)
                else:
-                    submodule_method = getattr(submodule, "forward")
-                if callable(submodule_method):
-                    if len(submodule_name.split("."))==1:
-                        self.hook_change_module(submodule, current_model, model_id, submodule_name, submodule_method)
+                    submodule_method = getattr(submodule, submodule_method_name)
+                if callable(submodule_method):
+                    if top_submodule and cur_blocks_name is None:
+                        self.hook_change_module(submodule, current_model, model_id, submodule_name, submodule_method, submodule_method_name)
                    elif compilationInThisOne and submodule in towers_modules:
                        self.hook_preload_blocks_for_compilation(submodule, model_id, cur_blocks_name, context = submodule_name )
                    else:
-                        self.hook_check_empty_cache_needed(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
-
-            self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name, submodule_name)
+                        if compilationInThisOne and False:
+                            self.hook_check_load_into_GPU_needed(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
+                        else:
+                            self.hook_check_load_into_GPU_if_needed_default(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
 
+            self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name, submodule_name)
+
+
+        # compile main iterative modules stacks ("towers")
+        if compilationInThisOne:
+            if self.verboseLevel>=1:
+                if len(towers_modules)>0:
+                    formated_tower_names = [name + '*' for name in towers_names]
+                    print(f"Pytorch compilation of '{model_id}' is scheduled for these modules : {formated_tower_names}.")
+                else:
+                    print(f"Pytorch compilation of model '{model_id}' is not yet supported.")
+
+            for submodel in towers_modules:
+                submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode="default" ) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
+                #dynamic=True,
 
        self.tune_preloading(model_id, current_budget, towers_names)
        self.parameters_ref = {}
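
Two things stand out in the rewritten hook-up loop above: the tower compilation block from the previous hunk is not removed but moved to run after all hooks are registered, and an opt-in _offload_hooks attribute is introduced, letting a top-level submodule list extra method names (beyond forward) that offload.all() should wrap, with the assert restricting this to top-level modules. The sketch below is a reading of that mechanism, hedged because only the attribute name and the ["forward"] + offload_hooks expansion appear in the diff; the encode method is an illustrative example, not an mmgp API.

import torch

class VAELike(torch.nn.Module):
    _offload_hooks = ["encode"]        # extra method to hook in addition to forward

    def forward(self, x):
        return x

    def encode(self, x):
        return 0.5 * x

m = VAELike()
offload_hooks = m._offload_hooks if hasattr(m, "_offload_hooks") else []
print(["forward"] + offload_hooks)     # ['forward', 'encode'] -> both get wrapped
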
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.6
+Version: 3.5.8
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.5.6 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.5.8 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes