mmgp 3.0.9__py3-none-any.whl → 3.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mmgp might be problematic.
- mmgp/offload.py +697 -583
- mmgp/safetensors2.py +40 -30
- {mmgp-3.0.9.dist-info → mmgp-3.1.1.dist-info}/METADATA +3 -3
- mmgp-3.1.1.dist-info/RECORD +9 -0
- mmgp-3.0.9.dist-info/RECORD +0 -9
- {mmgp-3.0.9.dist-info → mmgp-3.1.1.dist-info}/LICENSE.md +0 -0
- {mmgp-3.0.9.dist-info → mmgp-3.1.1.dist-info}/WHEEL +0 -0
- {mmgp-3.0.9.dist-info → mmgp-3.1.1.dist-info}/top_level.txt +0 -0
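To reproduce this kind of comparison locally, here is a minimal sketch. The wheel filenames and the pip download commands in the comments are assumptions based on the versions named above, not something taken from this page; adjust them to the files you actually obtain.

# Minimal sketch: extract mmgp/offload.py from two published wheels and diff them.
# Assumed prerequisites (not shown on this page): both wheels downloaded first, e.g.
#   pip download mmgp==3.0.9 --no-deps
#   pip download mmgp==3.1.1 --no-deps
# The wheel filenames below are assumptions; adjust them to the files actually fetched.
import difflib
import zipfile

OLD_WHEEL = "mmgp-3.0.9-py3-none-any.whl"  # assumed filename
NEW_WHEEL = "mmgp-3.1.1-py3-none-any.whl"  # assumed filename
MEMBER = "mmgp/offload.py"

def read_member(wheel_path, member):
    # A wheel is a zip archive, so its sources can be read without installing it.
    with zipfile.ZipFile(wheel_path) as zf:
        return zf.read(member).decode("utf-8").splitlines(keepends=True)

old_lines = read_member(OLD_WHEEL, MEMBER)
new_lines = read_member(NEW_WHEEL, MEMBER)

# Print a unified diff comparable to the hunks shown below.
for line in difflib.unified_diff(old_lines, new_lines, fromfile="3.0.9/" + MEMBER, tofile="3.1.1/" + MEMBER):
    print(line, end="")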
mmgp/offload.py
CHANGED
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.
+# ------------------ Memory Management 3.1 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -79,7 +79,7 @@ from mmgp import profile_type
 from optimum.quanto import freeze, qfloat8, qint4 , qint8, quantize, QModuleMixin, QTensor, quantize_module


-
+shared_state = {}

 mmm = safetensors2.mmm

@@ -154,33 +154,75 @@ def _get_max_reservable_memory(perc_reserved_mem_max):
 perc_reserved_mem_max = 0.40 if os.name == 'nt' else 0.5
 return perc_reserved_mem_max * physical_memory

-def _detect_main_towers(model, verboseLevel=1):
+def _detect_main_towers(model, min_floors = 5, verboseLevel=1):
 cur_blocks_prefix = None
 towers_modules= []
 towers_names= []

+floors_modules= []
+tower_name = None
+
+
 for submodule_name, submodule in model.named_modules():
+
 if submodule_name=='':
 continue

-if
-
-
-
-
-
-if not submodule_name.startswith(cur_blocks_prefix):
-cur_blocks_prefix = submodule_name + "."
-newList = True
+if cur_blocks_prefix != None:
+if submodule_name.startswith(cur_blocks_prefix):
+depth_prefix = cur_blocks_prefix.split(".")
+depth_name = submodule_name.split(".")
+level = depth_name[len(depth_prefix)-1]
+pre , num = _extract_num_from_str(level)

-
-
-
+if num != cur_blocks_seq:
+floors_modules.append(submodule)
+
+cur_blocks_seq = num
+else:
+if len(floors_modules) >= min_floors:
+towers_modules += floors_modules
+towers_names.append(tower_name)
+tower_name = None
+floors_modules= []
+cur_blocks_prefix, cur_blocks_seq = None, -1
+
+if cur_blocks_prefix == None:
+pre , num = _extract_num_from_str(submodule_name)
+if isinstance(submodule, (torch.nn.ModuleList)):
+cur_blocks_prefix, cur_blocks_seq = pre + ".", -1
+tower_name = submodule_name + ".*"
+elif num >=0:
+cur_blocks_prefix, cur_blocks_seq = pre, num
+tower_name = submodule_name[ :-1] + "*"
+floors_modules.append(submodule)
+
+if len(floors_modules) >= min_floors:
+towers_modules += floors_modules
+towers_names.append(tower_name)
+
+# for submodule_name, submodule in model.named_modules():
+# if submodule_name=='':
+# continue
+
+# if isinstance(submodule, torch.nn.ModuleList):
+# newList =False
+# if cur_blocks_prefix == None:
+# cur_blocks_prefix = submodule_name + "."
+# newList = True
+# else:
+# if not submodule_name.startswith(cur_blocks_prefix):
+# cur_blocks_prefix = submodule_name + "."
+# newList = True
+
+# if newList and len(submodule)>=5:
+# towers_names.append(submodule_name)
+# towers_modules.append(submodule)

-
-
-
-
+# else:
+# if cur_blocks_prefix is not None:
+# if not submodule_name.startswith(cur_blocks_prefix):
+# cur_blocks_prefix = None

 return towers_names, towers_modules

@@ -194,7 +236,7 @@ def _get_model(model_path):
 _path = Path(model_path).parts
 _filename = _path[-1]
 _path = _path[:-1]
-if len(_path)
+if len(_path)<=1:
 raise("file not found")
 else:
 from huggingface_hub import hf_hub_download #snapshot_download,
@@ -219,6 +261,29 @@ def _remove_model_wrapper(model):
 return sub_module
 return model

+# def force_load_tensor(t):
+# c = torch.nn.Parameter(t + 0)
+# torch.utils.swap_tensors(t, c)
+# del c
+
+
+# for n,m in model_to_quantize.named_modules():
+# # do not read quantized weights (detected them directly or behind an adapter)
+# if isinstance(m, QModuleMixin) or hasattr(m, "base_layer") and isinstance(m.base_layer, QModuleMixin):
+# if hasattr(m, "bias") and m.bias is not None:
+# force_load_tensor(m.bias.data)
+# # m.bias.data = m.bias.data + 0
+# else:
+# for n, p in m.named_parameters(recurse = False):
+# data = getattr(m, n)
+# force_load_tensor(data)
+# # setattr(m,n, torch.nn.Parameter(data + 0 ) )
+
+# for b in m.buffers(recurse = False):
+# # b.data = b.data + 0
+# b.data = torch.nn.Buffer(b.data + 0)
+# force_load_tensor(b.data)
+


 def _move_to_pinned_tensor(source_tensor, big_tensor, offset, length):
@@ -248,6 +313,17 @@ def _safetensors_load_file(file_path):

 return sd, metadata

+def _force_load_buffer(p):
+# To do : check if buffer was persistent and transfer state, or maybe swap keep already this property ?
+q = torch.nn.Buffer(p + 0)
+torch.utils.swap_tensors(p, q)
+del q
+
+def _force_load_parameter(p):
+q = torch.nn.Parameter(p + 0)
+torch.utils.swap_tensors(p, q)
+del q
+
 def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_max = 0, verboseLevel = 1):
 if verboseLevel>=1 :
 if partialPinning:
@@ -260,6 +336,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma
 towers_names, _ = _detect_main_towers(model)
 towers_names = [n +"." for n in towers_names]

+
 BIG_TENSOR_MAX_SIZE = 2**28 # 256 MB
 current_big_tensor_size = 0
 big_tensor_no = 0
@@ -273,10 +350,10 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma
 if partialPinning:
 include = any(k.startswith(pre) for pre in towers_names) if partialPinning else True
 if include:
-params_list = params_list +
+params_list = params_list + [ (k + '.' + n, p, False) for n, p in sub_module.named_parameters(recurse=False)] + [ (k + '.' + n, p, True) for n, p in sub_module.named_buffers(recurse=False)]

-
-for p in params_list:
+
+for n, p, _ in params_list:
 if isinstance(p, QTensor):
 if p._qtype == qint4:
 if hasattr(p,"_scale_shift"):
@@ -288,10 +365,16 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma
 else:
 length = torch.numel(p.data) * p.data.element_size()

+
 if current_big_tensor_size + length > BIG_TENSOR_MAX_SIZE:
 big_tensors_sizes.append(current_big_tensor_size)
 current_big_tensor_size = 0
 big_tensor_no += 1
+
+
+itemsize = p.data.dtype.itemsize
+if current_big_tensor_size % itemsize:
+current_big_tensor_size += itemsize - current_big_tensor_size % itemsize
 tensor_map_indexes.append((big_tensor_no, current_big_tensor_size, length ))
 current_big_tensor_size += length

@@ -320,12 +403,18 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma

 gc.collect()

+
 tensor_no = 0
-
+# prev_big_tensor = 0
+for n, p, is_buffer in params_list:
 big_tensor_no, offset, length = tensor_map_indexes[tensor_no]
-
+# if big_tensor_no != prev_big_tensor:
+# gc.collect()
+# prev_big_tensor = big_tensor_no
 if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
 current_big_tensor = big_tensors[big_tensor_no]
+if is_buffer :
+_force_load_buffer(p) # otherwise potential memory leak
 if isinstance(p, QTensor):
 if p._qtype == qint4:
 length1 = torch.numel(p._data._data) * p._data._data.element_size()
@@ -353,7 +442,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma
 gc.collect()

 if verboseLevel >=1:
-if total_tensor_bytes
+if total_tensor_bytes <= total:
 print(f"The whole model was pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
 else:
 print(f"{total/ONE_MB:.2f} MB were pinned to reserved RAM out of {total_tensor_bytes/ONE_MB:.2f} MB")
@@ -369,8 +458,16 @@ def _welcome():
 if welcome_displayed:
 return
 welcome_displayed = True
-print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.
+print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.1) by DeepBeepMeep ************{ENDC}{UNBOLD}")

+def _extract_num_from_str(num_in_str):
+for i in range(len(num_in_str)):
+if not num_in_str[-i-1:].isnumeric():
+if i == 0:
+return num_in_str, -1
+else:
+return num_in_str[: -i], int(num_in_str[-i:])
+return "", int(num_in_str)

 def _quantize_dirty_hack(model):
 # dirty hack: add a hook on state_dict() to return a fake non quantized state_dict if called by Lora Diffusers initialization functions
@@ -479,55 +576,56 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
 if hasattr(model_to_quantize, "_quanto_map"):
 for k, entry in model_to_quantize._quanto_map.items():
 weights = entry["weights"]
-print(f"Model '{model_id}' is already quantized
+print(f"Model '{model_id}' is already quantized in format '{weights}'")
 return False
 print(f"Model '{model_id}' is already quantized")
 return False

 print(f"Quantization of model '{model_id}' started to format '{weights}'")

+tower_names ,_ = _detect_main_towers(model_to_quantize)
+tower_names = [ n[:-1] for n in tower_names]
+
 for submodule_name, submodule in model_to_quantize.named_modules():
 if isinstance(submodule, QModuleMixin):
 if verboseLevel>=1:
 print("No quantization to do as model is already quantized")
 return False

-
 if submodule_name=='':
 continue

-
-
-
-if
-cur_blocks_prefix
-flush = True
-else:
-#if cur_blocks_prefix != submodule_name[:len(cur_blocks_prefix)]:
-if not submodule_name.startswith(cur_blocks_prefix):
+size = compute_submodule_size(submodule)
+if not any(submodule_name.startswith(pre) for pre in tower_names):
+flush = False
+if isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
+if cur_blocks_prefix == None:
 cur_blocks_prefix = submodule_name + "."
 flush = True
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+else:
+if not submodule_name.startswith(cur_blocks_prefix):
+cur_blocks_prefix = submodule_name + "."
+flush = True
+else:
+if cur_blocks_prefix is not None:
+#if not cur_blocks_prefix == submodule_name[0:len(cur_blocks_prefix)]:
+if not submodule_name.startswith(cur_blocks_prefix):
+cur_blocks_prefix = None
+flush = True
+
+if flush :
+if submodule_size <= threshold :
+exclude_list += submodule_names
+if verboseLevel >=2:
+print(f"Excluded size {submodule_size/ONE_MB:.1f} MB: {prev_blocks_prefix} : {submodule_names}")
+total_excluded += submodule_size
+
+submodule_size = 0
+submodule_names = []
+prev_blocks_prefix = cur_blocks_prefix
+submodule_size += size
+submodule_names.append(submodule_name)
 total_size += size
-submodule_names.append(submodule_name)

 if submodule_size > 0 and submodule_size <= threshold:
 exclude_list += submodule_names
@@ -543,28 +641,29 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
 exclude_list = None


-
+quantize(model_to_quantize,weights, exclude= exclude_list)
+# quantize(model_to_quantize,weights, include= [ "*1.block.attn.to_out*"]) #"
+
+# for name, m in model_to_quantize.named_modules():
+# if exclude_list is None or not any( name == module_name for module_name in exclude_list):
+# _quantize_submodule(model_to_quantize, name, m, weights=weights, activations=None, optimizer=None)

-for name, m in model_to_quantize.named_modules():
-if exclude_list is None or not any( name == module_name for module_name in exclude_list):
-_quantize_submodule(model_to_quantize, name, m, weights=weights, activations=None, optimizer=None)

 # force to read non quantized parameters so that their lazy tensors and corresponding mmap are released
 # otherwise we may end up keeping in memory both the quantized and the non quantize model
-for m in model_to_quantize.
+for n,m in model_to_quantize.named_modules():
 # do not read quantized weights (detected them directly or behind an adapter)
 if isinstance(m, QModuleMixin) or hasattr(m, "base_layer") and isinstance(m.base_layer, QModuleMixin):
 if hasattr(m, "bias") and m.bias is not None:
-m.bias
+_force_load_parameter(m.bias)
 else:
-for
-
-setattr(m,n, torch.nn.Parameter(data + 0 ) )
+for p in m.parameters(recurse = False):
+_force_load_parameter(p)

 for b in m.buffers(recurse = False):
-b
+_force_load_buffer(b)
+

-

 freeze(model_to_quantize)
 torch.cuda.empty_cache()
@@ -581,595 +680,609 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10

 return True

-def
-
+def load_loras_into_model(model, lora_path, lora_multi = None, verboseLevel = -1):
+verboseLevel = _compute_verbose_level(verboseLevel)

-
-
-
+if inject_adapter_in_model == None or set_weights_and_activate_adapters == None or get_peft_kwargs == None:
+raise Exception("Unable to load Lora, missing 'peft' and / or 'diffusers' modules")
+
+if not isinstance(lora_path, list):
+lora_path = [lora_path]
+
+if lora_multi is None:
+lora_multi = [1. for _ in lora_path]

-
-
+for i, path in enumerate(lora_path):
+adapter_name = str(i)

-
-class offload:
-def __init__(self):
-self.active_models = []
-self.active_models_ids = []
-self.active_subcaches = {}
-self.models = {}
-self.verboseLevel = 0
-self.blocks_of_modules = {}
-self.blocks_of_modules_sizes = {}
-self.anyCompiledModule = False
-self.device_mem_capacity = torch.cuda.get_device_properties(0).total_memory
-self.last_reserved_mem_check =0
-self.loaded_blocks = {}
-self.prev_blocks_names = {}
-self.next_blocks_names = {}
-self.default_stream = torch.cuda.default_stream(torch.device("cuda")) # torch.cuda.current_stream()
-self.transfer_stream = torch.cuda.Stream()
-self.async_transfers = False
-global last_offload_obj
-last_offload_obj = self
+state_dict = safetensors2.torch_load_file(path)

-
+keys = list(state_dict.keys())
+if len(keys) == 0:
+raise Exception(f"Empty Lora '{path}'")

-entry_name = model_id if blocks_name is None else model_id + "/" + blocks_name
-if entry_name in self.blocks_of_modules:
-blocks_params = self.blocks_of_modules[entry_name]
-blocks_params_size = self.blocks_of_modules_sizes[entry_name]
-else:
-blocks_params = []
-self.blocks_of_modules[entry_name] = blocks_params
-blocks_params_size = 0
-if blocks_name !=None:

-
-
-
-
+network_alphas = {}
+for k in keys:
+if "alpha" in k:
+alpha_value = state_dict.pop(k)
+if not ( (torch.is_tensor(alpha_value) and torch.is_floating_point(alpha_value)) or isinstance(
+alpha_value, float
+)):
+network_alphas[k] = torch.tensor( float(alpha_value.item() ) )

+pos = keys[0].find(".")
+prefix = keys[0][0:pos]
+if not any( prefix.startswith(some_prefix) for some_prefix in ["diffusion_model", "transformer"]):
+msg = f"No compatible weight was found in Lora file '{path}'. Please check that it is compatible with the Diffusers format."
+raise Exception(msg)

-
-if isinstance(p, QTensor):
-blocks_params.append( (submodule, k, p ) )
+transformer = model

-
-
-
-
-else:
-blocks_params_size += torch.numel(p._scale) * p._scale.element_size()
-blocks_params_size += torch.numel(p._shift) * p._shift.element_size()
-blocks_params_size += torch.numel(p._data._data) * p._data._data.element_size()
-else:
-blocks_params_size += torch.numel(p._scale) * p._scale.element_size()
-blocks_params_size += torch.numel(p._data) * p._data.element_size()
-else:
-blocks_params.append( (submodule, k, p ) )
-blocks_params_size += torch.numel(p.data) * p.data.element_size()
+transformer_keys = [k for k in keys if k.startswith(prefix)]
+state_dict = {
+k.replace(f"{prefix}.", ""): v for k, v in state_dict.items() if k in transformer_keys
+}

-
-
-
+sd_keys = state_dict.keys()
+if len(sd_keys) == 0:
+print(f"No compatible weight was found in Lora file '{path}'. Please check that it is compatible with the Diffusers format.")
+return

+# is_correct_format = all("lora" in key for key in state_dict.keys())

-self.blocks_of_modules_sizes[entry_name] = blocks_params_size

-return blocks_params_size


-
-
-if
-
-for existing_cotenant in self.active_models_ids:
-if existing_cotenant not in potential_cotenants:
-return False
-return True
+# check with first key if is not in peft format
+# first_key = next(iter(state_dict.keys()))
+# if "lora_A" not in first_key:
+# state_dict = convert_unet_state_dict_to_peft(state_dict)

+if adapter_name in getattr(transformer, "peft_config", {}):
+raise ValueError(
+f"Adapter name {adapter_name} already in use in the transformer - please select a new adapter name."
+)

-
-
+rank = {}
+for key, val in state_dict.items():
+if "lora_B" in key:
+rank[key] = val.shape[1]

-if
-
+if network_alphas is not None and len(network_alphas) >= 1:
+alpha_keys = [k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix]
+network_alphas = {k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys}

-
+lora_config_kwargs = get_peft_kwargs(rank, network_alpha_dict=network_alphas, peft_state_dict=state_dict)

-
-
-
-
-q = p.to("cuda", non_blocking=True)
-q = torch.nn.Parameter(q , requires_grad=False)
-setattr(parent_module, n , q)
-# if record_for_stream != None:
-# if isinstance(p, QTensor):
-# q._data.record_stream(record_for_stream)
-# q._scale.record_stream(record_for_stream)
-# else:
-# p.data.record_stream(record_for_stream)
+lora_config = LoraConfig(**lora_config_kwargs)
+peft_kwargs = {}
+peft_kwargs["low_cpu_mem_usage"] = True
+inject_adapter_in_model(lora_config, model, adapter_name=adapter_name, **peft_kwargs)

+incompatible_keys = set_peft_model_state_dict(transformer, state_dict, adapter_name, **peft_kwargs)

-
-
-
-
-
+warn_msg = ""
+if incompatible_keys is not None:
+# Check only for unexpected keys.
+unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
+if unexpected_keys:
+pass
+if verboseLevel >=1:
+print(f"Lora '{path}' was loaded in model '{_get_module_name(model)}'")
+set_weights_and_activate_adapters(model,[ str(i) for i in range(len(lora_multi))], lora_multi)

-
-
-
-
-
-
+def move_loras_to_device(model, device="cpu" ):
+if hasattr( model, "_lora_loadable_modules"):
+for k in model._lora_loadable_modules:
+move_loras_to_device(getattr(model,k), device)
+return
+
+for k, m in model.named_modules():
+if ".lora_" in k:
+m.to(device)

-
-
+def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, verboseLevel = -1):
+"""
+quick version of .LoadfromPretrained of the transformers library
+used to build a model and load the corresponding weights (quantized or not)
+"""

-
-
-
-
-
+
+import os.path
+from accelerate import init_empty_weights
+
+if not (model_path.endswith(".sft") or model_path.endswith(".safetensors")):
+raise Exception("full model path to file expected")

+model_path = _get_model(model_path)
+verboseLevel = _compute_verbose_level(verboseLevel)

-
-
-if blocks_name != None:
-self.loaded_blocks[model_id] = None
+with safetensors2.safe_open(model_path) as f:
+metadata = f.metadata()

-
+if metadata is None:
+transformer_config = None
+else:
+transformer_config = metadata.get("config", None)

-
-
-model_name = model._get_name()
-print(f"Unloading model {blocks_name} ({model_name}) from GPU")
-
-blocks_params = self.blocks_of_modules[blocks_name]
-for param in blocks_params:
-parent_module, n, p = param
-q = torch.nn.Parameter(p , requires_grad=False)
-setattr(parent_module, n , q)
-# cl.stop()
-# print(f"unload time: {cl.format_time_gap()}")
+if transformer_config == None:
+config_fullpath = os.path.join(os.path.dirname(model_path), "config.json")

+if not os.path.isfile(config_fullpath):
+raise Exception("a 'config.json' that describes the model is required in the directory of the model or inside the safetensor file")

-
-
-
-self.active_models_ids.append(model_id)
+with open(config_fullpath, "r", encoding="utf-8") as reader:
+text = reader.read()
+transformer_config= json.loads(text)

-self.gpu_load_blocks(model_id, None)

-
+if "architectures" in transformer_config:
+architectures = transformer_config["architectures"]
+class_name = architectures[0]

-
-
-
-
-
-self.gpu_unload_blocks(model_id, loaded_block)
-self.loaded_blocks[model_id] = None
-
-self.active_models = []
-self.active_models_ids = []
-self.active_subcaches = []
-torch.cuda.empty_cache()
-gc.collect()
-self.last_reserved_mem_check = time.time()
+module = __import__("transformers")
+map = { "T5WithLMHeadModel" : "T5EncoderModel"}
+class_name = map.get(class_name, class_name)
+transfomer_class = getattr(module, class_name)
+from transformers import AutoConfig

-
-
-
-
-
-
-arg = arg.to(torch.bfloat16).cuda(non_blocking=True)
-elif not arg.is_cuda:
-arg = arg.cuda(non_blocking=True)
-new_args.append(arg)
+import tempfile
+with tempfile.NamedTemporaryFile("w", delete = False, encoding ="utf-8") as fp:
+fp.write(json.dumps(transformer_config))
+fp.close()
+config_obj = AutoConfig.from_pretrained(fp.name)
+os.remove(fp.name)

-
-
-
-
-
-elif not arg.is_cuda:
-arg = arg.cuda(non_blocking=True)
-new_kwargs[k]= arg
-
-return new_args, new_kwargs
+#needed to keep inits of non persistent buffers
+with init_empty_weights():
+model = transfomer_class(config_obj)
+
+model = model.base_model

-
-
-return
-cur_clock = time.time()
-# can't check at each call if we can empty the cuda cache as quering the reserved memory value is a time consuming operation
-if (cur_clock - self.last_reserved_mem_check)<0.200:
-return False
-self.last_reserved_mem_check = cur_clock
-return True
+elif "_class_name" in transformer_config:
+class_name = transformer_config["_class_name"]

+module = __import__("diffusers")
+transfomer_class = getattr(module, class_name)

-
-
-mem_threshold = 0.9*self.device_mem_capacity
-if mem_reserved >= mem_threshold:
-mem_allocated = torch.cuda.memory_allocated()
-if mem_allocated <= 0.70 * mem_reserved:
-# print(f"Cuda empty cache triggered as Allocated Memory ({mem_allocated/1024000:0f} MB) is lot less than Cached Memory ({mem_reserved/1024000:0f} MB) ")
-torch.cuda.empty_cache()
-tm= time.time()
-if self.verboseLevel >=2:
-print(f"Empty Cuda cache at {tm}")
-# print(f"New cached memory after purge is {torch.cuda.memory_reserved()/1024000:0f} MB) ")
+with init_empty_weights():
+model = transfomer_class.from_config(transformer_config)


-
-
-for _ in target_module.parameters(recurse= False):
-return True
-
-for _ in target_module.buffers(recurse= False):
-return True
-
-return False
+torch.set_default_device('cpu')

-
+model._config = transformer_config
+
+load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, verboseLevel=verboseLevel )

-
-def load_data_if_needed(module, *args, **kwargs):
-some_context = context #for debugging
-if blocks_name == None:
-if self.ready_to_check_mem():
-self.empty_cache_if_needed()
-else:
-loaded_block = self.loaded_blocks[model_id]
-if (loaded_block == None or loaded_block != blocks_name) :
-if loaded_block != None:
-self.gpu_unload_blocks(model_id, loaded_block)
-if self.ready_to_check_mem():
-self.empty_cache_if_needed()
-self.loaded_blocks[model_id] = blocks_name
-self.gpu_load_blocks(model_id, blocks_name)
+return model

-target_module.register_forward_pre_hook(load_data_if_needed)


-
+def load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, verboseLevel = -1):
+"""
+Load a model, detect if it has been previously quantized using quanto and do the extra setup if necessary
+"""

-
-
-pass
+file_path = _get_model(file_path)
+verboseLevel = _compute_verbose_level(verboseLevel)

-
-# if self.ready_to_check_mem():
-# self.empty_cache_if_needed()
-if blocks_name == None:
-if self.ready_to_check_mem():
-self.empty_cache_if_needed()
-else:
-loaded_block = self.loaded_blocks[model_id]
-if (loaded_block == None or loaded_block != blocks_name) :
-if loaded_block != None:
-self.gpu_unload_blocks(model_id, loaded_block)
-if self.ready_to_check_mem():
-self.empty_cache_if_needed()
-self.loaded_blocks[model_id] = blocks_name
-self.gpu_load_blocks(model_id, blocks_name)
-if qint4quantization:
-args, kwargs = self.move_args_to_gpu(*args, **kwargs)
+model = _remove_model_wrapper(model)

-
+if not (".safetensors" in file_path or ".sft" in file_path):
+if pinToMemory:
+raise Exception("Pinning to memory while loading only supported for safe tensors files")
+state_dict = torch.load(file_path, weights_only=True)
+if "module" in state_dict:
+state_dict = state_dict["module"]
+else:
+state_dict, metadata = _safetensors_load_file(file_path)
+
+if metadata is None:
+quantization_map = None
+else:
+quantization_map = metadata.get("quantization_map", None)
+config = metadata.get("config", None)
+if config is not None:
+model._config = config


-if hasattr(target_module, "_mm_id"):
-orig_model_id = getattr(target_module, "_mm_id")
-if self.verboseLevel >=2:
-print(f"Model '{model_id}' shares module '{target_module._get_name()}' with module '{orig_model_id}' ")
-assert not self.any_param_or_buffer(target_module)

-
-
-
+if quantization_map is None:
+pos = str.rfind(file_path, ".")
+if pos > 0:
+quantization_map_path = file_path[:pos]
+quantization_map_path += "_map.json"

-
-
-
-performEmptyCacheTest = False
-if not model_id in self.active_models_ids:
-new_model_id = getattr(module, "_mm_id")
-# do not always unload existing models if it is more efficient to keep in them in the GPU
-# (e.g: small modules whose calls are text encoders)
-if not self.can_model_be_cotenant(new_model_id) :
-self.unload_all()
-performEmptyCacheTest = False
-self.gpu_load(new_model_id)
-# transfer leftovers inputs that were incorrectly created in the RAM (mostly due to some .device tests that returned incorrectly "cpu")
-args, kwargs = self.move_args_to_gpu(*args, **kwargs)
-if performEmptyCacheTest:
-self.empty_cache_if_needed()
-
-return previous_method(*args, **kwargs)
-
-if hasattr(target_module, "_mm_id"):
-return
-setattr(target_module, "_mm_id", model_id)
+if os.path.isfile(quantization_map_path):
+with open(quantization_map_path, 'r') as f:
+quantization_map = json.load(f)

-setattr(target_module, "forward", functools.update_wrapper(functools.partial(check_change_module, target_module), previous_method) )

-if not self.verboseLevel >=1:
-return

-if
-
-
+if quantization_map is None :
+if "quanto" in file_path and not do_quantize:
+print("Model seems to be quantized by quanto but no quantization map was found whether inside the model or in a separate '{file_path[:json]}_map.json' file")
+else:
+_requantize(model, state_dict, quantization_map)

+missing_keys , unexpected_keys = model.load_state_dict(state_dict, False, assign = True )
+# if len(missing_keys) > 0:
+# sd_crap = { k : None for k in missing_keys}
+# missing_keys , unexpected_keys = model.load_state_dict(sd_crap, strict =False, assign = True )
+del state_dict

-
-
-
-
-
-
-
-
-# for module in parent_module.components.items():
-# self.unhook_module(module)
+for k,p in model.named_parameters():
+if p.is_meta:
+txt = f"Incompatible State Dictionary or 'Init_Empty_Weights' not set since parameter '{k}' has no data"
+raise Exception(txt)
+for k,b in model.named_buffers():
+if b.is_meta:
+txt = f"Incompatible State Dictionary or 'Init_Empty_Weights' not set since buffer '{k}' has no data"
+raise Exception(txt)

-
+if do_quantize:
+if quantization_map is None:
+if _quantize(model, quantizationType, verboseLevel=verboseLevel, model_id=file_path):
+quantization_map = model._quanto_map
+else:
+if verboseLevel >=1:
+print("Model already quantized")

+if pinToMemory:
+_pin_to_memory(model, file_path, partialPinning = partialPinning, verboseLevel = verboseLevel)

+return

+def get_model_name(model):
+return model.name

-
-
+class HfHook:
+def __init__(self):
+self.execution_device = "cuda"

-
-
-
-if not isinstance(lora_path, list):
-lora_path = [lora_path]
-
-if lora_multi is None:
-lora_multi = [1. for _ in lora_path]
+def detach_hook(self, module):
+pass

-
-
+last_offload_obj = None
+class offload:
+def __init__(self):
+self.active_models = []
+self.active_models_ids = []
+self.active_subcaches = {}
+self.models = {}
+self.verboseLevel = 0
+self.blocks_of_modules = {}
+self.blocks_of_modules_sizes = {}
+self.anyCompiledModule = False
+self.device_mem_capacity = torch.cuda.get_device_properties(0).total_memory
+self.last_reserved_mem_check =0
+self.loaded_blocks = {}
+self.prev_blocks_names = {}
+self.next_blocks_names = {}
+self.default_stream = torch.cuda.default_stream(torch.device("cuda")) # torch.cuda.current_stream()
+self.transfer_stream = torch.cuda.Stream()
+self.async_transfers = False
+global last_offload_obj
+last_offload_obj = self

-state_dict = safetensors2.torch_load_file(path)

-
-if len(keys) == 0:
-raise Exception(f"Empty Lora '{path}'")
-
+def add_module_to_blocks(self, model_id, blocks_name, submodule, prev_block_name):

-
-
-
-
-
-
-
-
+entry_name = model_id if blocks_name is None else model_id + "/" + blocks_name
+if entry_name in self.blocks_of_modules:
+blocks_params = self.blocks_of_modules[entry_name]
+blocks_params_size = self.blocks_of_modules_sizes[entry_name]
+else:
+blocks_params = []
+self.blocks_of_modules[entry_name] = blocks_params
+blocks_params_size = 0
+if blocks_name !=None:

-
-
-
-
-raise Exception(msg)
+prev_entry_name = None if prev_block_name == None else model_id + "/" + prev_block_name
+self.prev_blocks_names[entry_name] = prev_entry_name
+if not prev_block_name == None:
+self.next_blocks_names[prev_entry_name] = entry_name

-transformer = model

-
-
-
-}
+for k,p in submodule.named_parameters(recurse=False):
+if isinstance(p, QTensor):
+blocks_params.append( (submodule, k, p, False ) )

-
-
-
-
+if p._qtype == qint4:
+if hasattr(p,"_scale_shift"):
+blocks_params_size += torch.numel(p._scale_shift) * p._scale_shift.element_size()
+blocks_params_size += torch.numel(p._data._data) * p._data._data.element_size()
+else:
+blocks_params_size += torch.numel(p._scale) * p._scale.element_size()
+blocks_params_size += torch.numel(p._shift) * p._shift.element_size()
+blocks_params_size += torch.numel(p._data._data) * p._data._data.element_size()
+else:
+blocks_params_size += torch.numel(p._scale) * p._scale.element_size()
+blocks_params_size += torch.numel(p._data) * p._data.element_size()
+else:
+blocks_params.append( (submodule, k, p, False) )
+blocks_params_size += torch.numel(p.data) * p.data.element_size()

-
+for k, p in submodule.named_buffers(recurse=False):
+blocks_params.append( (submodule, k, p, True) )
+blocks_params_size += p.data.nbytes


+self.blocks_of_modules_sizes[entry_name] = blocks_params_size

+return blocks_params_size

-# check with first key if is not in peft format
-# first_key = next(iter(state_dict.keys()))
-# if "lora_A" not in first_key:
-# state_dict = convert_unet_state_dict_to_peft(state_dict)

-
-
-
-
+def can_model_be_cotenant(self, model_id):
+potential_cotenants= cotenants_map.get(model_id, None)
+if potential_cotenants is None:
+return False
+for existing_cotenant in self.active_models_ids:
+if existing_cotenant not in potential_cotenants:
+return False
+return True

-
-
-
-rank[key] = val.shape[1]
+@torch.compiler.disable()
+def gpu_load_blocks(self, model_id, blocks_name):
+# cl = clock.start()

-if
-
-network_alphas = {k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys}
+if blocks_name != None:
+self.loaded_blocks[model_id] = blocks_name

-
+entry_name = model_id if blocks_name is None else model_id + "/" + blocks_name

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+def cpu_to_gpu(stream_to_use, blocks_params): #, record_for_stream = None
+with torch.cuda.stream(stream_to_use):
+for param in blocks_params:
+parent_module, n, p, is_buffer = param
+q = p.to("cuda", non_blocking=True)
+if is_buffer:
+q = torch.nn.Buffer(q)
+else:
+q = torch.nn.Parameter(q , requires_grad=False)
+setattr(parent_module, n , q)
+# if record_for_stream != None:
+# if isinstance(p, QTensor):
+# q._data.record_stream(record_for_stream)
+# q._scale.record_stream(record_for_stream)
+# else:
+# p.data.record_stream(record_for_stream)

-def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, verboseLevel = -1):
-"""
-quick version of .LoadfromPretrained of the transformers library
-used to build a model and load the corresponding weights (quantized or not)
-"""

-
-
-
+if self.verboseLevel >=2:
+model = self.models[model_id]
+model_name = model._get_name()
+print(f"Loading model {entry_name} ({model_name}) in GPU")

-if not (model_path.endswith(".sft") or model_path.endswith(".safetensors")):
-raise Exception("full model path to file expected")
-
-model_path = _get_model(model_path)
-verboseLevel = _compute_verbose_level(verboseLevel)
-
-with safetensors2.safe_open(model_path) as f:
-metadata = f.metadata()
-
-if metadata is None:
-transformer_config = None
-else:
-transformer_config = metadata.get("config", None)

-
-
-
-
-
-
-with open(config_fullpath, "r", encoding="utf-8") as reader:
-text = reader.read()
-transformer_config= json.loads(text)
+if self.async_transfers and blocks_name != None:
+first = self.prev_blocks_names[entry_name] == None
+next_blocks_entry = self.next_blocks_names[entry_name] if entry_name in self.next_blocks_names else None
+if first:
+cpu_to_gpu(torch.cuda.current_stream(), self.blocks_of_modules[entry_name])
+torch.cuda.synchronize()

+if next_blocks_entry != None:
+cpu_to_gpu(self.transfer_stream, self.blocks_of_modules[next_blocks_entry]) #, self.default_stream

-
-
-
+else:
+cpu_to_gpu(self.default_stream, self.blocks_of_modules[entry_name])
+torch.cuda.synchronize()
+# cl.stop()
+# print(f"load time: {cl.format_time_gap()}")

-
-
-
+@torch.compiler.disable()
+def gpu_unload_blocks(self, model_id, blocks_name):
+# cl = clock.start()
+if blocks_name != None:
+self.loaded_blocks[model_id] = None

-
-with tempfile.NamedTemporaryFile("w", delete = False, encoding ="utf-8") as fp:
-fp.write(json.dumps(transformer_config))
-fp.close()
-config_obj = AutoConfig.from_pretrained(fp.name)
-os.remove(fp.name)
+blocks_name = model_id if blocks_name is None else model_id + "/" + blocks_name

-
-
-
-
-
+if self.verboseLevel >=2:
+model = self.models[model_id]
+model_name = model._get_name()
+print(f"Unloading model {blocks_name} ({model_name}) from GPU")
+
+blocks_params = self.blocks_of_modules[blocks_name]
+for param in blocks_params:
+parent_module, n, p, is_buffer = param
+if is_buffer:
+q = torch.nn.Buffer(p)
+else:
+q = torch.nn.Parameter(p , requires_grad=False)
+setattr(parent_module, n , q)
+# cl.stop()
+# print(f"unload time: {cl.format_time_gap()}")

-
-
+# @torch.compiler.disable()
+def gpu_load(self, model_id):
+model = self.models[model_id]
+self.active_models.append(model)
+self.active_models_ids.append(model_id)

-
-transfomer_class = getattr(module, class_name)
+self.gpu_load_blocks(model_id, None)

-
-model = transfomer_class.from_config(transformer_config)
+# torch.cuda.current_stream().synchronize()

+def unload_all(self):
+for model_id in self.active_models_ids:
+self.gpu_unload_blocks(model_id, None)
+loaded_block = self.loaded_blocks[model_id]
+if loaded_block != None:
+self.gpu_unload_blocks(model_id, loaded_block)
+self.loaded_blocks[model_id] = None
+
+self.active_models = []
+self.active_models_ids = []
+self.active_subcaches = []
+torch.cuda.empty_cache()
+gc.collect()
+self.last_reserved_mem_check = time.time()

-
+def move_args_to_gpu(self, *args, **kwargs):
+new_args= []
+new_kwargs={}
+for arg in args:
+if torch.is_tensor(arg):
+if arg.dtype == torch.float32:
+arg = arg.to(torch.bfloat16).cuda(non_blocking=True)
+elif not arg.is_cuda:
+arg = arg.cuda(non_blocking=True)
+new_args.append(arg)

-
-
-
+for k in kwargs:
+arg = kwargs[k]
+if torch.is_tensor(arg):
+if arg.dtype == torch.float32:
+arg = arg.to(torch.bfloat16).cuda(non_blocking=True)
+elif not arg.is_cuda:
+arg = arg.cuda(non_blocking=True)
+new_kwargs[k]= arg
+
+return new_args, new_kwargs
+
+def ready_to_check_mem(self):
+if self.anyCompiledModule:
+return
+cur_clock = time.time()
+# can't check at each call if we can empty the cuda cache as quering the reserved memory value is a time consuming operation
+if (cur_clock - self.last_reserved_mem_check)<0.200:
+return False
+self.last_reserved_mem_check = cur_clock
+return True

-return model

+def empty_cache_if_needed(self):
+mem_reserved = torch.cuda.memory_reserved()
+mem_threshold = 0.9*self.device_mem_capacity
+if mem_reserved >= mem_threshold:
+mem_allocated = torch.cuda.memory_allocated()
+if mem_allocated <= 0.70 * mem_reserved:
+# print(f"Cuda empty cache triggered as Allocated Memory ({mem_allocated/1024000:0f} MB) is lot less than Cached Memory ({mem_reserved/1024000:0f} MB) ")
+torch.cuda.empty_cache()
+tm= time.time()
+if self.verboseLevel >=2:
+print(f"Empty Cuda cache at {tm}")
+# print(f"New cached memory after purge is {torch.cuda.memory_reserved()/1024000:0f} MB) ")


-def
-
-
-
+def any_param_or_buffer(self, target_module: torch.nn.Module):
+
+for _ in target_module.parameters(recurse= False):
+return True
+
+for _ in target_module.buffers(recurse= False):
+return True
+
+return False

-
-verboseLevel = _compute_verbose_level(verboseLevel)
+def hook_preload_blocks_for_compilation(self, target_module, model_id,blocks_name, context):

-
+# @torch.compiler.disable()
+def preload_blocks_for_compile(module, *args, **kwargs):
+some_context = context #for debugging
+if blocks_name == None:
+if self.ready_to_check_mem():
+self.empty_cache_if_needed()
+else:
+loaded_block = self.loaded_blocks[model_id]
+if (loaded_block == None or loaded_block != blocks_name) :
+if loaded_block != None:
+self.gpu_unload_blocks(model_id, loaded_block)
+if self.ready_to_check_mem():
+self.empty_cache_if_needed()
+self.loaded_blocks[model_id] = blocks_name
+self.gpu_load_blocks(model_id, blocks_name)
+# need to be registered before the forward not to be break the efficiency of the compilation chain
+# it should be at the top of the compilation as this type of hook in the middle of a chain seems to break memory performance
+target_module.register_forward_pre_hook(preload_blocks_for_compile)

-# if pinToMemory and do_quantize:
-# raise Exception("Pinning and Quantization can not be used at the same time")

-
-if pinToMemory:
-raise Exception("Pinning to memory while loading only supported for safe tensors files")
-state_dict = torch.load(file_path, weights_only=True)
-if "module" in state_dict:
-state_dict = state_dict["module"]
-else:
-state_dict, metadata = _safetensors_load_file(file_path)
-
-if metadata is None:
-quantization_map = None
-else:
-quantization_map = metadata.get("quantization_map", None)
-config = metadata.get("config", None)
-if config is not None:
-model._config = config
+def hook_check_empty_cache_needed(self, target_module, model_id,blocks_name, previous_method, context):

+qint4quantization = isinstance(target_module, QModuleMixin) and target_module.weight!= None and target_module.weight.qtype == qint4
+if qint4quantization:
+pass

+def check_empty_cuda_cache(module, *args, **kwargs):
+# if self.ready_to_check_mem():
+# self.empty_cache_if_needed()
+if blocks_name == None:
+if self.ready_to_check_mem():
+self.empty_cache_if_needed()
+else:
+loaded_block = self.loaded_blocks[model_id]
+if (loaded_block == None or loaded_block != blocks_name) :
+if loaded_block != None:
+self.gpu_unload_blocks(model_id, loaded_block)
+if self.ready_to_check_mem():
+self.empty_cache_if_needed()
+self.loaded_blocks[model_id] = blocks_name
+self.gpu_load_blocks(model_id, blocks_name)
+if qint4quantization:
+args, kwargs = self.move_args_to_gpu(*args, **kwargs)

-
-pos = str.rfind(file_path, ".")
-if pos > 0:
-quantization_map_path = file_path[:pos]
-quantization_map_path += "_map.json"
+return previous_method(*args, **kwargs)

-if os.path.isfile(quantization_map_path):
-with open(quantization_map_path, 'r') as f:
-quantization_map = json.load(f)

+if hasattr(target_module, "_mm_id"):
+orig_model_id = getattr(target_module, "_mm_id")
+if self.verboseLevel >=2:
+print(f"Model '{model_id}' shares module '{target_module._get_name()}' with module '{orig_model_id}' ")
+assert not self.any_param_or_buffer(target_module)

+return
+setattr(target_module, "_mm_id", model_id)
+setattr(target_module, "forward", functools.update_wrapper(functools.partial(check_empty_cuda_cache, target_module), previous_method) )

-
-
-
-
-
+
+def hook_change_module(self, target_module, model, model_id, module_id, previous_method):
+def check_change_module(module, *args, **kwargs):
+performEmptyCacheTest = False
+if not model_id in self.active_models_ids:
+new_model_id = getattr(module, "_mm_id")
+# do not always unload existing models if it is more efficient to keep in them in the GPU
+# (e.g: small modules whose calls are text encoders)
+if not self.can_model_be_cotenant(new_model_id) :
+self.unload_all()
+performEmptyCacheTest = False
+self.gpu_load(new_model_id)
+# transfer leftovers inputs that were incorrectly created in the RAM (mostly due to some .device tests that returned incorrectly "cpu")
+args, kwargs = self.move_args_to_gpu(*args, **kwargs)
+if performEmptyCacheTest:
+self.empty_cache_if_needed()
+
+return previous_method(*args, **kwargs)
+
+if hasattr(target_module, "_mm_id"):
+return
+setattr(target_module, "_mm_id", model_id)

-
-del state_dict
+setattr(target_module, "forward", functools.update_wrapper(functools.partial(check_change_module, target_module), previous_method) )

-
-
-if _quantize(model, quantizationType, verboseLevel=verboseLevel, model_id=file_path):
-quantization_map = model._quanto_map
-else:
-if verboseLevel >=1:
-print("Model already quantized")
+if not self.verboseLevel >=1:
+return

-
-
+if module_id == None or module_id =='':
+model_name = model._get_name()
+print(f"Hooked in model '{model_id}' ({model_name})")

-return

-def save_model(model, file_path, do_quantize = False, quantizationType = qint8, verboseLevel = -1 ):
+def save_model(model, file_path, do_quantize = False, quantizationType = qint8, verboseLevel = -1, config_file_path = None ):
 """save the weights of a model and quantize them if requested
 These weights can be loaded again using 'load_model_data'
 """

 config = None
-
 verboseLevel = _compute_verbose_level(verboseLevel)
-
-
+if config_file_path !=None:
+with open(config_file_path, "r", encoding="utf-8") as reader:
+text = reader.read()
+config= json.loads(text)
+elif hasattr(model, "_config"):
 config = model._config
 elif hasattr(model, "config"):
 config_fullpath = None
@@ -1195,7 +1308,7 @@ def save_model(model, file_path, do_quantize = False, quantizationType = qint8,
    print(f"Saving file '{file_path}")
    safetensors2.torch_write_file(model.state_dict(), file_path , quantization_map = quantization_map, config = config)
    if verboseLevel >=1:
-        print(f"File '{file_path} saved")
+        print(f"File '{file_path}' saved")
 
 
 
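Taken together, the new `config_file_path` argument lets a caller attach a configuration JSON explicitly instead of relying on a `_config` or `config` attribute on the model. A hypothetical call (the model variable and file names are illustrative only):

from mmgp import offload
from optimum.quanto import qint8

# Quantize to 8-bit and embed the config.json sitting next to the original weights.
offload.save_model(
    my_model,                              # any torch.nn.Module already loaded in RAM
    "transformer_quanto_int8.safetensors",
    do_quantize=True,
    quantizationType=qint8,
    config_file_path="config.json",
)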
@@ -1286,7 +1399,6 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
    max_reservable_memory = _get_max_reservable_memory(perc_reserved_mem_max)
 
    estimatesBytesToPin = 0
-
    for model_id in models:
        current_model: torch.nn.Module = models[model_id]
        # make sure that no RAM or GPU memory is not allocated for gradiant / training
@@ -1302,7 +1414,6 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
 
        for n, p in current_model.named_parameters():
            p.requires_grad = False
-            p = p.detach()
            if isinstance(p, QTensor):
                # # fix quanto bug (seems to have been fixed)
                # if not modelPinned and p._scale.dtype == torch.float32:
@@ -1352,21 +1463,18 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
    # Hook forward methods of modules
    for model_id in models:
        current_model: torch.nn.Module = models[model_id]
-        current_budget = model_budgets[model_id]
-        current_size = 0
-        cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq = None, None, None, -1
-        self.loaded_blocks[model_id] = None
        towers_names, towers_modules = _detect_main_towers(current_model)
-        towers_names = [n +"." for n in towers_names]
-        if self.verboseLevel>=2 and len(towers_names)>0:
-            print(f"Potential iterative blocks found in model '{model_id}':{towers_names}")
        # compile main iterative modules stacks ("towers")
-
+        compilationInThisOne = compileAllModels or model_id in modelsToCompile
+        if compilationInThisOne:
            if self.verboseLevel>=1:
-
-
-
-
+                if len(towers_modules)>0:
+                    print(f"Pytorch compilation of '{model_id}' is scheduled for these modules : {towers_names}.")
+                else:
+                    print(f"Pytorch compilation of model '{model_id}' is not yet supported.")
+
+            for submodel in towers_modules:
+                submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode="default" ) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
                #dynamic=True,
 
        if pinAllModels or model_id in modelsToPin:
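Note that compilation is applied to the `forward` of each detected tower module rather than to the whole model, which keeps the non-repeating glue code out of the compiled graph. A standalone sketch of the same idea on a hypothetical stack of blocks:

import torch

class TinyStack(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.blocks = torch.nn.ModuleList([torch.nn.Linear(16, 16) for _ in range(4)])

    def forward(self, x):
        for block in self.blocks:
            x = block(x)
        return x

model = TinyStack()
# Compile each repeated block individually, as offload.all() does for detected towers.
for block in model.blocks:
    block.forward = torch.compile(block.forward, backend="inductor", mode="default")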
@@ -1376,6 +1484,11 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
            else:
                _pin_to_memory(current_model, model_id, partialPinning= partialPinning, perc_reserved_mem_max=perc_reserved_mem_max, verboseLevel=verboseLevel)
 
+        current_budget = model_budgets[model_id]
+        current_size = 0
+        cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq = None, None, None, -1
+        self.loaded_blocks[model_id] = None
+
        for submodule_name, submodule in current_model.named_modules():
            # create a fake 'accelerate' parameter so that the _execution_device property returns always "cuda"
            # (it is queried in many pipelines even if offloading is not properly implemented)
@@ -1384,44 +1497,43 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
 
            if submodule_name=='':
                continue
-
+
            if current_budget > 0:
-                if
-                if cur_blocks_prefix
-
+                if cur_blocks_prefix != None:
+                    if submodule_name.startswith(cur_blocks_prefix):
+                        depth_prefix = cur_blocks_prefix.split(".")
+                        depth_name = submodule_name.split(".")
+                        level = depth_name[len(depth_prefix)-1]
+                        pre , num = _extract_num_from_str(level)
+                        if num != cur_blocks_seq and (cur_blocks_seq == -1 or current_size > current_budget):
+                            prev_blocks_name = cur_blocks_name
+                            cur_blocks_name = cur_blocks_prefix + str(num)
+                            # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
+                            cur_blocks_seq = num
                    else:
-
-
-
-
-
-
-
-
-
-
-
-
-                        cur_blocks_name = cur_blocks_prefix + str(num)
-                        # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
-                        cur_blocks_seq = num
-                else:
-                    cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq = None, None, None, -1
-
+                        cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq = None, None, None, -1
+
+                if cur_blocks_prefix == None:
+                    pre , num = _extract_num_from_str(submodule_name)
+                    if isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
+                        cur_blocks_prefix, prev_blocks_name, cur_blocks_seq = pre + ".", None, -1
+                    elif num >=0:
+                        cur_blocks_prefix, prev_blocks_name, cur_blocks_seq = pre, None, num
+                        cur_blocks_name = submodule_name
+                        # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
+
+
            if hasattr(submodule, "forward"):
                submodule_method = getattr(submodule, "forward")
                if callable(submodule_method):
                    if len(submodule_name.split("."))==1:
                        self.hook_change_module(submodule, current_model, model_id, submodule_name, submodule_method)
-                    elif
-                        self.
+                    elif compilationInThisOne and submodule in towers_modules:
+                        self.hook_preload_blocks_for_compilation(submodule, model_id, cur_blocks_name, context = submodule_name )
                    else:
                        self.hook_check_empty_cache_needed(submodule, model_id, cur_blocks_name, submodule_method, context = submodule_name )
 
-
-                    current_size = self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name)
-
-
+            current_size = self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name)
 
 
        if self.verboseLevel >=2:
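The budget logic above carves a model into named blocks by walking `named_modules()` and grouping the children of a `ModuleList`/`Sequential` by their numeric suffix (`blocks.0`, `blocks.1`, ...). A simplified sketch of that grouping, using a hypothetical regex helper in place of mmgp's `_extract_num_from_str`:

import re
import torch

def block_index(name):
    # Hypothetical helper: return the trailing block number of 'blocks.3', else -1.
    match = re.search(r"(\d+)$", name)
    return int(match.group(1)) if match else -1

class Stack(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.blocks = torch.nn.ModuleList([torch.nn.Linear(8, 8) for _ in range(3)])

stack = Stack()
groups = {}
for name, module in stack.named_modules():
    if name in ("", "blocks"):  # skip the root module and the container itself
        continue
    groups.setdefault(block_index(name), []).append(name)
print(groups)  # {0: ['blocks.0'], 1: ['blocks.1'], 2: ['blocks.2']}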
@@ -1467,11 +1579,12 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
    models_to_scan = ("text_encoder", "text_encoder_2")
    candidates_to_quantize = ("t5", "llama", "llm")
    for model_id in models_to_scan:
-
-
-
-
-
+        if model_id in module_names:
+            name = module_names[model_id]
+            for candidate in candidates_to_quantize:
+                if candidate in name:
+                    default_extraModelsToQuantize.append(model_id)
+                    break
 
 
    # transformer (video or image generator) should be as small as possible not to occupy space that could be used by actual image data
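The matching above is a plain substring test of known architecture names against the module names collected earlier; a minimal sketch, assuming (the exact contents of `module_names` in mmgp may differ) that it maps each pipeline attribute to a lower-cased class name:

candidates_to_quantize = ("t5", "llama", "llm")
# Hypothetical mapping of pipeline attribute -> lower-cased class name.
module_names = {"text_encoder": "cliptextmodel", "text_encoder_2": "t5encodermodel"}

extra_models_to_quantize = []
for model_id in ("text_encoder", "text_encoder_2"):
    name = module_names.get(model_id, "")
    if any(candidate in name for candidate in candidates_to_quantize):
        extra_models_to_quantize.append(model_id)

print(extra_models_to_quantize)  # ['text_encoder_2'] -- only the T5-based encoder matches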
@@ -1480,6 +1593,7 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
    default_budgets = { "transformer" : 600 , "text_encoder": 3000, "text_encoder_2": 3000 }
    extraModelsToQuantize = None
    asyncTransfers = True
+    budgets = None
 
    if profile_no == profile_type.HighRAM_HighVRAM:
        pinnedMemory= True
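These defaults feed the numbered memory profiles; in practice a whole pipeline is handed to `profile()` together with one of the presets. A minimal usage sketch (the `pipe` variable is assumed to be an already constructed diffusers-style pipeline):

from mmgp import offload, profile_type

# Apply one of the presets to the whole pipeline.
offload.profile(pipe, profile_type.HighRAM_HighVRAM)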