mmgp 3.5.7__py3-none-any.whl → 3.6.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mmgp/fp8_quanto_bridge.py +645 -0
- mmgp/fp8_quanto_bridge_old.py +498 -0
- mmgp/offload.py +1038 -248
- mmgp/quant_router.py +518 -0
- mmgp/quanto_int8_cuda.py +97 -0
- mmgp/quanto_int8_inject.py +335 -0
- mmgp/safetensors2.py +57 -10
- {mmgp-3.5.7.dist-info → mmgp-3.6.11.dist-info}/METADATA +2 -2
- mmgp-3.6.11.dist-info/RECORD +14 -0
- {mmgp-3.5.7.dist-info → mmgp-3.6.11.dist-info}/licenses/LICENSE.md +1 -1
- mmgp-3.5.7.dist-info/RECORD +0 -9
- {mmgp-3.5.7.dist-info → mmgp-3.6.11.dist-info}/WHEEL +0 -0
- {mmgp-3.5.7.dist-info → mmgp-3.6.11.dist-info}/top_level.txt +0 -0
mmgp/offload.py
CHANGED
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.
+# ------------------ Memory Management 3.6.11 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -60,16 +60,23 @@ import functools
 import sys
 import os
 import json
+import inspect
 import psutil
 import builtins
 from accelerate import init_empty_weights
-
+from functools import wraps
+import functools
+import types
 
 from mmgp import safetensors2
 from mmgp import profile_type
-
+from .quant_router import (
+    apply_pre_quantization,
+    cache_quantization_for_file,
+    detect_and_convert,
+    detect_safetensors_format,
+)
 from optimum.quanto import freeze, qfloat8, qint4 , qint8, quantize, QModuleMixin, QLinear, QTensor, quantize_module, register_qmodule
-
 # support for Embedding module quantization that is not supported by default by quanto
 @register_qmodule(torch.nn.Embedding)
 class QEmbedding(QModuleMixin, torch.nn.Embedding):
@@ -83,8 +90,36 @@ class QEmbedding(QModuleMixin, torch.nn.Embedding):
         return torch.nn.functional.embedding( input, self.qweight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse )
 
 
+
+def cudacontext(device):
+    def decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            with torch.device(device):
+                return func(*args, **kwargs)
+        return wrapper
+    return decorator
+
+
 shared_state = {}
 
+def get_cache(cache_name):
+    all_cache = shared_state.get("_cache", None)
+    if all_cache is None:
+        all_cache = {}
+        shared_state["_cache"]= all_cache
+    cache = all_cache.get(cache_name, None)
+    if cache is None:
+        cache = {}
+        all_cache[cache_name] = cache
+    return cache
+
+def clear_caches():
+    all_cache = shared_state.get("_cache", None)
+    if all_cache is not None:
+        all_cache.clear()
+
+
 mmm = safetensors2.mmm
 
 default_verboseLevel = 1
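For orientation, a minimal usage sketch of the helpers introduced in the hunk above (illustrative only, not part of the diff; the decorated function and cache keys are made up):

    import torch
    from mmgp.offload import cudacontext, get_cache, clear_caches

    @cudacontext("cpu")                  # run the body with torch.device("cpu") as the default device
    def build_empty_tensor():
        return torch.empty(8)            # allocated on CPU because of the decorator

    cfg_cache = get_cache("configs")     # lazily creates shared_state["_cache"]["configs"]
    cfg_cache["last_model"] = "flux"
    clear_caches()                       # drops every named cache at once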
@@ -277,23 +312,112 @@ def _safetensors_load_file(file_path, writable_tensors = True):
 
 def _force_load_buffer(p):
     # To do : check if buffer was persistent and transfer state, or maybe swap keep already this property ?
-    q = torch.nn.Buffer(p
+    q = torch.nn.Buffer(p.clone())
     torch.utils.swap_tensors(p, q)
     del q
 
 def _force_load_parameter(p):
-    q = torch.nn.Parameter(p
+    q = torch.nn.Parameter(p.clone())
     torch.utils.swap_tensors(p, q)
     del q
 
-def
-    if
-
-
+def _unwrap_quantized_tensor(tensor):
+    if hasattr(tensor, "_data") and torch.is_tensor(tensor._data):
+        return tensor._data
+    return tensor
+
+def _qtensor_get_quantized_subtensors(self):
+    subtensors = []
+    if getattr(self, "_qtype", None) == qint4:
+        data = _unwrap_quantized_tensor(self._data)
+        subtensors.append(("data", data))
+        if hasattr(self, "_scale_shift") and self._scale_shift is not None:
+            subtensors.append(("scale_shift", self._scale_shift))
+        else:
+            if hasattr(self, "_scale") and self._scale is not None:
+                subtensors.append(("scale", self._scale))
+            if hasattr(self, "_shift") and self._shift is not None:
+                subtensors.append(("shift", self._shift))
+        return subtensors
+
+    if hasattr(self, "_data"):
+        data = _unwrap_quantized_tensor(self._data)
+        subtensors.append(("data", data))
+    if hasattr(self, "_scale") and self._scale is not None:
+        subtensors.append(("scale", self._scale))
+    return subtensors
+
+def _qtensor_set_quantized_subtensors(self, sub_tensors):
+    if isinstance(sub_tensors, dict):
+        sub_map = sub_tensors
+    else:
+        sub_map = {name: tensor for name, tensor in sub_tensors}
+
+    data = sub_map.get("data", None)
+    if data is not None:
+        if hasattr(self, "_data") and hasattr(self._data, "_data") and torch.is_tensor(self._data._data):
+            self._data._data = data
+        else:
+            self._data = data
+
+    if getattr(self, "_qtype", None) == qint4:
+        if "scale_shift" in sub_map and sub_map["scale_shift"] is not None:
+            self._scale_shift = sub_map["scale_shift"]
         else:
-
-
-
+            if "scale" in sub_map and sub_map["scale"] is not None:
+                self._scale = sub_map["scale"]
+            if "shift" in sub_map and sub_map["shift"] is not None:
+                self._shift = sub_map["shift"]
+    else:
+        if "scale" in sub_map and sub_map["scale"] is not None:
+            self._scale = sub_map["scale"]
+
+if not hasattr(QTensor, "get_quantized_subtensors"):
+    QTensor.get_quantized_subtensors = _qtensor_get_quantized_subtensors
+if not hasattr(QTensor, "set_quantized_subtensors"):
+    QTensor.set_quantized_subtensors = _qtensor_set_quantized_subtensors
+
+def _get_quantized_subtensors(p):
+    getter = getattr(p, "get_quantized_subtensors", None)
+    if getter is None:
+        return None
+    sub_tensors = getter()
+    if not sub_tensors:
+        return None
+    if isinstance(sub_tensors, dict):
+        sub_tensors = list(sub_tensors.items())
+    out = []
+    for name, tensor in sub_tensors:
+        if tensor is None:
+            continue
+        if torch.is_tensor(tensor):
+            out.append((name, tensor))
+    return out if out else None
+
+def _set_quantized_subtensors(p, sub_tensors):
+    setter = getattr(p, "set_quantized_subtensors", None)
+    if setter is None:
+        return False
+    setter(sub_tensors)
+    return True
+
+def _subtensors_nbytes(sub_tensors):
+    return sum(torch.numel(t) * t.element_size() for _, t in sub_tensors)
+
+def _subtensors_itemsize(sub_tensors, fallback):
+    for _, t in sub_tensors:
+        return t.element_size()
+    return fallback
+
+def _get_tensor_ref(p):
+    sub_tensors = _get_quantized_subtensors(p)
+    if sub_tensors:
+        for _, t in sub_tensors:
+            ref = t.data_ptr()
+            del sub_tensors
+            return ref
+    del sub_tensors
+    return p.data_ptr()
 
 
 BIG_TENSOR_MAX_SIZE = 2**28 # 256 MB
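A short sketch of how the subtensor accessors patched onto QTensor above are meant to be used; it mirrors what _pin_to_memory does later in this diff (illustrative only; `qweight` is assumed to be a quanto-quantized weight):

    # Read the raw storages behind a quantized weight, transform them, then write them back in place.
    subs = qweight.get_quantized_subtensors()             # e.g. [("data", ...), ("scale", ...)]
    pinned = {name: t.pin_memory() for name, t in subs}   # example transform: page-lock each storage
    qweight.set_quantized_subtensors(pinned)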
@@ -516,25 +640,18 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
             tied_weights_last = f"{match_name} <-> {n}"
             tied_weights[n] = match_name
         else:
-
-
-
-
-
-
-
-            else:
-                length = torch.numel(p._data._data) * p._data._data.element_size() + torch.numel(p._scale) * p._scale.element_size() + torch.numel(p._shift) * p._shift.element_size()
-            else:
-                length = torch.numel(p._data) * p._data.element_size() + torch.numel(p._scale) * p._scale.element_size()
-                if p._data.is_pinned():
-                    params_dict[n] = (None, False)
-                    continue
+            sub_tensors = _get_quantized_subtensors(p)
+            if sub_tensors:
+                if builtins.all(t.is_pinned() for _, t in sub_tensors):
+                    params_dict[n] = (None, False)
+                    del sub_tensors
+                    continue
+                length = _subtensors_nbytes(sub_tensors)
             else:
                 if p.data.is_pinned():
                     params_dict[n] = (None, False)
                     continue
-            length = torch.numel(p.data) * p.data.element_size()
+                length = torch.numel(p.data) * p.data.element_size()
 
         ref_cache[ref] = (n, length)
         if current_big_tensor_size + length > big_tensor_size and current_big_tensor_size !=0 :
@@ -542,8 +659,11 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
             current_big_tensor_size = 0
             big_tensor_no += 1
 
-
-
+        if sub_tensors:
+            itemsize = _subtensors_itemsize(sub_tensors, p.data.dtype.itemsize)
+            del sub_tensors
+        else:
+            itemsize = p.data.dtype.itemsize
         if current_big_tensor_size % itemsize:
             current_big_tensor_size += itemsize - current_big_tensor_size % itemsize
         tensor_map_indexes.append((big_tensor_no, current_big_tensor_size, length ))
@@ -580,15 +700,11 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
             q_name = tied_weights.get(n,None)
             if q_name != None:
                 q , _ = params_dict[q_name]
-
-
-
-
-
-                else:
-                    p._data = q._data
-                    p._scale = q._scale
-                assert p._data.is_pinned()
+                sub_tensors = _get_quantized_subtensors(q)
+                if sub_tensors:
+                    sub_map = {name: tensor for name, tensor in sub_tensors}
+                    _set_quantized_subtensors(p, sub_map)
+                    del sub_map, sub_tensors
                 else:
                     p.data = q.data
                     assert p.data.is_pinned()
@@ -618,27 +734,21 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
         total += size
 
         current_big_tensor = big_tensors[big_tensor_no]
+
         if is_buffer :
             _force_load_buffer(p) # otherwise potential memory leak
-
-
-
-
-
-
-
-
-
-
-                length3 = torch.numel(p._shift) * p._shift.element_size()
-                p._shift = _move_to_pinned_tensor(p._shift, current_big_tensor, offset + length1 + length2, length3)
-            else:
-                length1 = torch.numel(p._data) * p._data.element_size()
-                p._data = _move_to_pinned_tensor(p._data, current_big_tensor, offset, length1)
-                length2 = torch.numel(p._scale) * p._scale.element_size()
-                p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
+        sub_tensors = _get_quantized_subtensors(p)
+        if sub_tensors:
+            sub_offset = offset
+            new_subs = {}
+            for name, tensor in sub_tensors:
+                length = torch.numel(tensor) * tensor.element_size()
+                new_subs[name] = _move_to_pinned_tensor(tensor, current_big_tensor, sub_offset, length)
+                sub_offset += length
+            _set_quantized_subtensors(p, new_subs)
+            del new_subs, sub_tensors
         else:
-            length = torch.numel(p.data) * p.data.element_size()
+            length = torch.numel(p.data) * p.data.element_size()
             p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
 
         tensor_no += 1
@@ -666,18 +776,22 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.11) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def change_dtype(model, new_dtype, exclude_buffers = False):
     for submodule_name, submodule in model.named_modules():
         if hasattr(submodule, "_lock_dtype"):
             continue
         for n, p in submodule.named_parameters(recurse = False):
+            if isinstance(p, QTensor):
+                continue
             if p.data.dtype != new_dtype:
                 p.data = p.data.to(new_dtype)
 
         if not exclude_buffers:
             for p in submodule.buffers(recurse=False):
+                if isinstance(p, QTensor):
+                    continue
                 if p.data.dtype != new_dtype:
                     p.data = p.data.to(new_dtype)
 
@@ -751,7 +865,7 @@ def _quantize_submodule(
         setattr(module, name, None)
         del param
 
-def _requantize(model: torch.nn.Module, state_dict: dict, quantization_map: dict):
+def _requantize(model: torch.nn.Module, state_dict: dict, quantization_map: dict, default_dtype=None):
     # change dtype of current meta model parameters because 'requantize' won't update the dtype on non quantized parameters
     for k, p in model.named_parameters():
         if not k in quantization_map and k in state_dict:
@@ -770,6 +884,11 @@ def _requantize(model: torch.nn.Module, state_dict: dict, quantization_map: dict
         if activations == "none":
             activations = None
         _quantize_submodule(model, name, m, weights=weights, activations=activations)
+        if default_dtype is not None:
+            new_module = model.get_submodule(name)
+            setter = getattr(new_module, "set_default_dtype", None)
+            if callable(setter):
+                setter(default_dtype)
 
     model._quanto_map = quantization_map
 
@@ -803,6 +922,7 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 2*
 
     cache_ref = {}
     tied_weights= {}
+    reversed_tied_weights= {}
 
     for submodule_name, submodule in model_to_quantize.named_modules():
         if isinstance(submodule, QModuleMixin):
@@ -815,7 +935,9 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 2*
             ref = _get_tensor_ref(p)
             match = cache_ref.get(ref, None)
             if match != None:
-                tied_weights[submodule_name]= (n, ) + match
+                tied_weights[submodule_name]= (n, ) + match
+                entries = reversed_tied_weights.get( match, [])
+                reversed_tied_weights[match] = entries + [ (p, submodule_name,n)]
             else:
                 cache_ref[ref] = (submodule_name, n)
                 size += torch.numel(p.data) * sizeofhalffloat
@@ -883,6 +1005,7 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 2*
     # force to read non quantized parameters so that their lazy tensors and corresponding mmap are released
     # otherwise we may end up keeping in memory both the quantized and the non quantize model
     named_modules = {n:m for n,m in model_to_quantize.named_modules()}
+
    for module_name, module in named_modules.items():
         # do not read quantized weights (detected them directly or behind an adapter)
         if isinstance(module, QModuleMixin) or hasattr(module, "base_layer") and isinstance(module.base_layer, QModuleMixin):
@@ -891,12 +1014,18 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 2*
         else:
             tied_w = tied_weights.get(module_name, None)
             for n, p in module.named_parameters(recurse = False):
+
                 if tied_w != None and n == tied_w[0]:
                     if isinstance( named_modules[tied_w[1]], QModuleMixin) :
                         setattr(module, n, None) # release refs of tied weights if source is going to be quantized
                         # otherwise don't force load as it will be loaded in the source anyway
                 else:
                     _force_load_parameter(p)
+                    entries = reversed_tied_weights.get( (module_name, n), [])
+                    for tied_weight, tied_module_name, tied_weight_name in entries:
+                        if n == tied_weight_name:
+                            tied_weight.data = p.data
+
                 del p # del p if not it will still contain a ref to a tensor when leaving the loop
             for b in module.buffers(recurse = False):
                 _force_load_buffer(b)
@@ -927,38 +1056,340 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 2*
 
     return True
 
-def
-
+def _as_field_tuple(value):
+    if not value:
+        return ()
+    if isinstance(value, str):
+        return (value,)
+    return tuple(value)
+
+
+def _get_split_handler(info, field, default_handlers):
+    handlers = info.get("split_handlers") or info.get("field_handlers") or {}
+    if handlers:
+        handler = handlers.get(field)
+        if handler is not None:
+            return handler
+    if default_handlers:
+        return default_handlers.get(field)
+    return None
+
+
+def _get_split_base_fields(info, split_fields):
+    base_fields = _as_field_tuple(info.get("base_fields") or info.get("base_field"))
+    if base_fields:
+        return base_fields
+    if split_fields:
+        return (next(iter(split_fields.keys())),)
+    return ()
+
+
+def _merge_share_fields(info, share_fields):
+    info_fields = _as_field_tuple(info.get("share_fields") or info.get("shared_fields"))
+    return tuple(sorted(set(info_fields).union(_as_field_tuple(share_fields))))
+
+
+def _call_split_handler(handler, *, src, dim, split_sizes, context):
+    if handler is None:
+        return None
+    try:
+        chunks = handler(src=src, dim=dim, split_sizes=split_sizes, context=context)
+    except Exception:
+        return None
+    if not isinstance(chunks, (list, tuple)) or len(chunks) != len(split_sizes):
+        return None
+    return chunks
+
+
+def _fill_sub_maps(sub_maps, name, value):
+    for sub_map in sub_maps:
+        sub_map[name] = value
+
+
+def sd_split_linear(
+    state_dict,
+    split_map,
+    split_fields=None,
+    share_fields=None,
+    verboseLevel=1,
+    split_handlers=None,
+):
+    if not split_map:
+        return state_dict
+    split_fields = split_fields or {}
+    share_fields = share_fields or ()
+    split_handlers = split_handlers or {}
+    base_fields_by_suffix = {
+        suffix: _get_split_base_fields(info or {}, split_fields)
+        for suffix, info in split_map.items()
+    }
+    def _skip(msg):
+        if verboseLevel >= 2:
+            print(f"[sd_split_linear] Skip {msg}")
+
+    bases = {}
+    for key in state_dict.keys():
+        for suffix, base_fields in base_fields_by_suffix.items():
+            for base_field in base_fields:
+                suffix_token = f"{suffix}.{base_field}"
+                if not key.endswith(suffix_token):
+                    continue
+                base = key[: -len("." + base_field)]
+                if base.endswith(suffix):
+                    bases[base] = suffix
+                    break
+
+    if not bases:
+        return state_dict
+
+    for base, suffix in bases.items():
+        info = split_map.get(suffix) or {}
+        mapped = info.get("mapped_modules") or info.get("mapped_suffixes") or info.get("mapped") or []
+        if not mapped:
+            continue
+
+        base_fields = base_fields_by_suffix.get(suffix) or _get_split_base_fields(info, split_fields)
+        size_field = info.get("size_field") or (base_fields[0] if base_fields else None)
+        size_tensor = state_dict.get(base + "." + size_field) if size_field else None
+        split_dim = info.get("split_dim", 0)
+        split_sizes = list(info.get("split_sizes") or [])
+        if not split_sizes:
+            if size_tensor is None:
+                continue
+            if size_tensor.dim() <= split_dim:
+                _skip(f"{base}: dim={size_tensor.dim()} split_dim={split_dim}")
+                continue
+            out_dim = size_tensor.size(split_dim)
+            if out_dim % len(mapped) != 0:
+                _skip(f"{base}: out_dim={out_dim} not divisible by {len(mapped)}")
+                continue
+            split_sizes = [out_dim // len(mapped)] * len(mapped)
+        elif None in split_sizes:
+            if size_tensor is None:
+                continue
+            if size_tensor.dim() <= split_dim:
+                _skip(f"{base}: dim={size_tensor.dim()} split_dim={split_dim}")
+                continue
+            known = sum(size for size in split_sizes if size is not None)
+            none_count = split_sizes.count(None)
+            remaining = size_tensor.size(split_dim) - known
+            if remaining < 0 or remaining % none_count != 0:
+                _skip(f"{base}: cannot resolve split sizes")
+                continue
+            fill = remaining // none_count
+            split_sizes = [fill if size is None else size for size in split_sizes]
+
+        total = sum(split_sizes)
+        prefix = base[: -len(suffix)]
+        target_bases = [prefix + name for name in mapped]
+        added = 0
+
+        field_tensors = {
+            field: state_dict.get(base + "." + field)
+            for field in set(split_fields.keys()).union(share_fields)
+        }
+        base_ctx = {
+            "state_dict": state_dict,
+            "base": base,
+            "suffix": suffix,
+            "split_sizes": split_sizes,
+            "total": total,
+            "mapped": mapped,
+            "target_bases": target_bases,
+            "verboseLevel": verboseLevel,
+            "split_fields": split_fields,
+            "share_fields": share_fields,
+            "field_tensors": field_tensors,
+            "size_field": size_field,
+            "size_tensor": size_tensor,
+            "split_dim": split_dim,
+            "info": info,
+        }
+        fields_iter = list(split_fields.items()) + [(field, None) for field in share_fields]
+        for field, dim in fields_iter:
+            src = field_tensors.get(field)
+            if src is None:
+                continue
+            if dim is None:
+                for target_base in target_bases:
+                    dest_key = target_base + "." + field
+                    if dest_key not in state_dict:
+                        state_dict[dest_key] = src
+                        added += 1
+                continue
+            if src.dim() <= dim:
+                _skip(f"{base}.{field}: dim={src.dim()} split_dim={dim}")
+                continue
+            if src.size(dim) != total:
+                _skip(f"{base}.{field}: size({dim})={src.size(dim)} expected={total}")
+                continue
+            handler = _get_split_handler(info, field, split_handlers)
+            chunks = _call_split_handler(
+                handler,
+                src=src,
+                dim=dim,
+                split_sizes=split_sizes,
+                context=dict(base_ctx, field=field),
+            )
+            if chunks is None:
+                chunks = torch.split(src, split_sizes, dim=dim)
+            for target_base, chunk in zip(target_bases, chunks):
+                if torch.is_tensor(chunk) and not chunk.is_contiguous():
+                    chunk = chunk.contiguous()
+                dest_key = target_base + "." + field
+                if dest_key not in state_dict:
+                    state_dict[dest_key] = chunk
+                    added += 1
+
+        if added:
+            for field in list(split_fields.keys()) + list(share_fields):
+                state_dict.pop(base + "." + field, None)
+            if verboseLevel >= 2:
+                print(f"[sd_split_linear] Split {base} -> {', '.join(mapped)}")
+
+    return state_dict
+
+
+def split_linear_modules(model, map, split_handlers=None, share_fields=None):
+    from optimum.quanto import QModuleMixin
     from accelerate import init_empty_weights
 
+    split_handlers = split_handlers or {}
+    share_fields = share_fields or ()
+
     modules_dict = { k: m for k, m in model.named_modules()}
     for module_suffix, split_info in map.items():
         mapped_modules = split_info["mapped_modules"]
         split_sizes = split_info["split_sizes"]
+        split_share_fields = _merge_share_fields(split_info, share_fields)
+        split_dims = split_info.get("split_dims") or {}
         for k, module in modules_dict.items():
             if k.endswith("." + module_suffix):
                 parent_module = modules_dict[k[:len(k)-len(module_suffix)-1]]
                 weight = module.weight
                 bias = getattr(module, "bias", None)
                 if isinstance(module, QModuleMixin):
-
-
-
-
-
-
+                    out_features_total = weight.size(0)
+                    if sum(split_sizes) != out_features_total:
+                        raise ValueError(
+                            f"Split sizes {split_sizes} do not match out_features {out_features_total} for '{k}'."
+                        )
+                    in_features = weight.size(1)
+                    sub_biases = None
+                    if bias is not None and bias.dim() > 0 and bias.size(0) == out_features_total:
+                        sub_biases = torch.split(bias, split_sizes, dim=0)
+                    else:
+                        sub_biases = [bias] * len(split_sizes)
+
+                    sub_tensors = _get_quantized_subtensors(weight)
+                    if not sub_tensors:
+                        raise ValueError(f"Unable to split quantized weight for '{k}'.")
+                    sub_maps = [dict() for _ in split_sizes]
+                    field_tensors = {name: tensor for name, tensor in sub_tensors}
+                    base_ctx = {
+                        "module": module,
+                        "module_name": k,
+                        "module_suffix": module_suffix,
+                        "mapped_modules": mapped_modules,
+                        "split_sizes": split_sizes,
+                        "out_features": out_features_total,
+                        "in_features": in_features,
+                        "field_tensors": field_tensors,
+                        "info": split_info,
+                    }
+                    for name, tensor in sub_tensors:
+                        if tensor is None or name in split_share_fields or tensor.dim() <= 1:
+                            _fill_sub_maps(sub_maps, name, tensor)
+                            continue
+                        split_dim = split_dims.get(name)
+                        if split_dim is None:
+                            if tensor.size(0) == out_features_total:
+                                split_dim = 0
+                            elif tensor.dim() > 1 and tensor.size(1) == out_features_total:
+                                split_dim = 1
+                            else:
+                                split_dim = 0
+                        handler = _get_split_handler(split_info, name, split_handlers)
+                        chunks = _call_split_handler(
+                            handler,
+                            src=tensor,
+                            dim=split_dim,
+                            split_sizes=split_sizes,
+                            context=dict(base_ctx, split_dim=split_dim),
+                        )
+                        if chunks is None:
+                            if tensor.dim() <= split_dim or tensor.size(split_dim) != out_features_total:
+                                got_size = "n/a" if tensor.dim() <= split_dim else tensor.size(split_dim)
+                                raise ValueError(
+                                    f"Cannot split '{k}' quantized tensor '{name}': "
+                                    f"expected size({split_dim})={out_features_total}, got {got_size}."
+                                )
+                            chunks = torch.split(tensor, split_sizes, dim=split_dim)
+                        for sub_map, chunk in zip(sub_maps, chunks):
+                            sub_map[name] = chunk
+
+                    create_fn = getattr(weight.__class__, "create", None)
+                    if not callable(create_fn):
+                        raise ValueError(f"Quantized weight class '{weight.__class__.__name__}' has no create()")
+                    create_sig = inspect.signature(create_fn)
+                    base_kwargs = {
+                        "qtype": getattr(weight, "qtype", None),
+                        "axis": getattr(weight, "axis", None),
+                        "stride": weight.stride(),
+                        "dtype": weight.dtype,
+                        "activation_qtype": getattr(weight, "activation_qtype", None),
+                        "requires_grad": weight.requires_grad,
+                        "group_size": getattr(weight, "_group_size", None),
+                        "device": weight.device,
+                    }
+
+                    qmodule_cls = module.__class__
+                    for sub_name, sub_size, sub_map, sub_bias in zip(
+                        mapped_modules, split_sizes, sub_maps, sub_biases
+                    ):
                         with init_empty_weights():
-                            sub_module =
-
-
-
+                            sub_module = qmodule_cls(
+                                in_features,
+                                sub_size,
+                                bias=bias is not None,
+                                device="cpu",
+                                dtype=weight.dtype,
+                                weights=module.weight_qtype,
+                                activations=module.activation_qtype,
+                                optimizer=module.optimizer,
+                                quantize_input=True,
+                            )
+                        size = list(weight.size())
+                        if size:
+                            size[0] = sub_size
+                            base_kwargs["size"] = tuple(size)
+                        create_kwargs = {}
+                        missing = []
+                        for name, param in create_sig.parameters.items():
+                            if name == "self":
+                                continue
+                            if name in sub_map:
+                                create_kwargs[name] = sub_map[name]
+                            elif name in base_kwargs and base_kwargs[name] is not None:
+                                create_kwargs[name] = base_kwargs[name]
+                            elif param.default is param.empty:
+                                missing.append(name)
+                        if missing:
+                            raise ValueError(
+                                f"Unable to rebuild quantized weight for '{k}.{sub_name}': "
+                                f"missing {missing}."
+                            )
+                        sub_weight = create_fn(**create_kwargs)
+                        sub_module.weight = torch.nn.Parameter(sub_weight, requires_grad=weight.requires_grad)
+                        if sub_bias is not None:
+                            sub_module.bias = torch.nn.Parameter(sub_bias)
                         sub_module.optimizer = module.optimizer
                         sub_module.weight_qtype = module.weight_qtype
+                        sub_module.activation_qtype = module.activation_qtype
                         setattr(parent_module, sub_name, sub_module)
-                        # del _data, _scale, _subdata, sub_d
                 else:
                     sub_data = torch.split(weight, split_sizes, dim=0)
-                    sub_bias = torch.split(bias, split_sizes, dim=0)
+                    sub_bias = torch.split(bias, split_sizes, dim=0) if bias is not None else [None] * len(split_sizes)
                     for sub_name, subdata, subbias in zip(mapped_modules, sub_data, sub_bias):
                         with init_empty_weights():
                             sub_module = torch.nn.Linear( subdata.shape[1], subdata.shape[0], bias=bias != None, device ="cpu", dtype=weight.dtype)
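A hypothetical split_map for the split_linear_modules() function added above. The suffix, target names and sizes are made up; the required keys ("mapped_modules", "split_sizes") and the rule that the sizes must sum to the fused layer's out_features come from the code itself:

    split_map = {
        "attn.qkv": {                                    # applies to every module whose name ends in ".attn.qkv"
            "mapped_modules": ["attn.q", "attn.k", "attn.v"],
            "split_sizes": [3072, 3072, 3072],           # must add up to the fused out_features
        },
    }
    split_linear_modules(model, split_map)               # replaces each fused layer with three sub-layers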
@@ -975,7 +1406,39 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 
     loras_model_data = getattr(model, "_loras_model_data", None)
     if loras_model_data == None:
-
+        merged_loras_model_data = {}
+        merged_loras_shortcuts = {}
+        sub_loras = {}
+        for submodule_name, submodule in model.named_modules():
+            if submodule is model:
+                continue
+            sub_model_data = getattr(submodule, "_loras_model_data", None)
+            if sub_model_data:
+                submodule._lora_owner = model
+                sub_loras[submodule_name] = submodule
+                for k, v in sub_model_data.items():
+                    if k not in merged_loras_model_data:
+                        merged_loras_model_data[k] = v
+            sub_shortcuts = getattr(submodule, "_loras_model_shortcuts", None)
+            if sub_shortcuts:
+                prefix = f"{submodule_name}." if submodule_name else ""
+                for k, v in sub_shortcuts.items():
+                    merged_key = k
+                    if prefix:
+                        if k:
+                            merged_key = f"{prefix}{k}"
+                        else:
+                            merged_key = submodule_name
+                    if merged_key not in merged_loras_shortcuts:
+                        merged_loras_shortcuts[merged_key] = v
+        if merged_loras_model_data:
+            model._loras_model_data = merged_loras_model_data
+            if merged_loras_shortcuts:
+                model._loras_model_shortcuts = merged_loras_shortcuts
+            model._subloras = sub_loras
+            loras_model_data = merged_loras_model_data
+        else:
+            raise Exception(f"No Loras has been declared for this model while creating the corresponding offload object")
 
     if not check_only:
         unload_loras_from_model(model)
@@ -1027,7 +1490,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 
     if split_linear_modules_map != None:
         new_state_dict = dict()
-        suffixes = [(".alpha", -2, False), (".lora_B.weight", -3, True), (".lora_A.weight", -3, False)]
+        suffixes = [(".alpha", -2, False), (".lora_B.weight", -3, True), (".lora_A.weight", -3, False), (".lora_up.weight", -3, True), (".lora_down.weight", -3, False),(".dora_scale", -2, False),]
         for module_name, module_data in state_dict.items():
             name_parts = module_name.split(".")
             for suffix, pos, any_split in suffixes:
@@ -1052,22 +1515,25 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 
     if not fail:
         pos = first_key.find(".")
-        prefix = first_key[0:pos]
-        if prefix
-
-
-
-
-
+        prefix = first_key[0:pos+1]
+        if prefix in ["diffusion_model.", "transformer."]:
+            prefixes = ("diffusion_model.", "transformer.")
+            new_state_dict = {}
+            for k, v in state_dict.items():
+                for candidate in prefixes:
+                    if k.startswith(candidate):
+                        k = k[len(candidate) :]
+                        break
+                new_state_dict[k] = v
+            state_dict = new_state_dict
 
-            state_dict = { k[ len(prefix) + 1:]: v for k, v in state_dict.items() if k.startswith(prefix) }
             clean_up = True
 
     keys = list(state_dict.keys())
 
     lora_alphas = {}
     for k in keys:
-        if "alpha"
+        if k.endswith(".alpha"):
             alpha_value = state_dict.pop(k)
             if torch.is_tensor(alpha_value):
                 alpha_value = float(alpha_value.item())
@@ -1075,17 +1541,19 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 
     invalid_keys = []
     unexpected_keys = []
-
-
-
-    diff_b = None
-    diff = None
+    new_state_dict = {}
+    for k in list(state_dict.keys()):
+        v = state_dict.pop(k)
+        lora_A = lora_B = diff_b = diff = lora_key = dora_scale = None
         if k.endswith(".diff"):
            diff = v
            module_name = k[ : -5]
         elif k.endswith(".diff_b"):
            diff_b = v
            module_name = k[ : -7]
+        elif k.endswith(".dora_scale"):
+            dora_scale = v
+            module_name = k[ : -11]
         else:
            pos = k.rfind(".lora_")
            if pos <=0:
@@ -1118,30 +1586,33 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
                 if ignore_model_variations:
                     skip = True
                 else:
-                    msg = f"Lora '{path}': Lora A dimension is not compatible with model '{_get_module_name(model)}' (model = {module_shape[1]}, lora A = {v.shape[1]}). It is likely this Lora has been made for another version of this model."
+                    msg = f"Lora '{path}/{module_name}': Lora A dimension is not compatible with model '{_get_module_name(model)}' (model = {module_shape[1]}, lora A = {v.shape[1]}). It is likely this Lora has been made for another version of this model."
                     error_msg = append(error_msg, msg)
                     fail = True
                     break
+            v = lora_A = lora_A.to(module.weight.dtype)
         elif lora_B != None:
            rank = lora_B.shape[1]
            if module_shape[0] != v.shape[0]:
                 if ignore_model_variations:
                     skip = True
                 else:
-                    msg = f"Lora '{path}': Lora B dimension is not compatible with model '{_get_module_name(model)}' (model = {module_shape[0]}, lora B = {v.shape[0]}). It is likely this Lora has been made for another version of this model."
+                    msg = f"Lora '{path}/{module_name}': Lora B dimension is not compatible with model '{_get_module_name(model)}' (model = {module_shape[0]}, lora B = {v.shape[0]}). It is likely this Lora has been made for another version of this model."
                     error_msg = append(error_msg, msg)
                     fail = True
                     break
+            v = lora_B = lora_B.to(module.weight.dtype)
         elif diff != None:
            lora_B = diff
            if module_shape != v.shape:
                 if ignore_model_variations:
                     skip = True
                 else:
-                    msg = f"Lora '{path}': Lora shape is not compatible with model '{_get_module_name(model)}' (model = {module_shape
+                    msg = f"Lora '{path}/{module_name}': Lora shape is not compatible with model '{_get_module_name(model)}' (model = {module_shape}, lora = {v.shape}). It is likely this Lora has been made for another version of this model."
                     error_msg = append(error_msg, msg)
                     fail = True
                     break
+            v = lora_B = lora_B.to(module.weight.dtype)
         elif diff_b != None:
            rank = diff_b.shape[0]
            if not hasattr(module, "bias"):
@@ -1160,26 +1631,42 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
                     error_msg = append(error_msg, msg)
                     fail = True
                     break
-
+            v = diff_b = diff_b.to(module.weight.dtype)
+        elif dora_scale != None:
+            rank = dora_scale.shape[1]
+            if module_shape[0] != v.shape[0]:
+                if ignore_model_variations:
+                    skip = True
+                else:
+                    msg = f"Lora '{path}': Dora Scale dimension is not compatible with model '{_get_module_name(model)}' (model = {module_shape[0]}, dora scale = {v.shape[0]}). It is likely this Dora has been made for another version of this model."
+                    error_msg = append(error_msg, msg)
+                    fail = True
+                    break
+            v = dora_scale = dora_scale.to(module.weight.dtype)
         if not check_only:
+            new_state_dict[k] = v
+            v = None
             loras_module_data = loras_model_data.get(module, None)
             assert loras_module_data != None
             loras_adapter_data = loras_module_data.get(adapter_name, None)
             if loras_adapter_data == None:
-                loras_adapter_data = [None, None, None, 1.]
+                loras_adapter_data = [None, None, None, None, 1.]
+                module.any_dora = False
                 loras_module_data[adapter_name] = loras_adapter_data
             if lora_A != None:
-                loras_adapter_data[0] = lora_A
+                loras_adapter_data[0] = lora_A
             elif lora_B != None:
-                loras_adapter_data[1] = lora_B
+                loras_adapter_data[1] = lora_B
+            elif dora_scale != None:
+                loras_adapter_data[3] = dora_scale
+                loras_module_data["any_dora"] = True
             else:
-                loras_adapter_data[2] = diff_b
-                if rank != None:
-                    alpha_key = k[:-len(
+                loras_adapter_data[2] = diff_b
+            if rank != None and lora_key is not None and "lora" in lora_key:
+                alpha_key = k[:-len(lora_key)] + "alpha"
                 alpha = lora_alphas.get(alpha_key, None)
-
-
-        lora_A = lora_B = diff = diff_b = v = loras_module_data = loras_adapter_data = lora_alphas = None
+                if alpha is not None: loras_adapter_data[4] = alpha / rank
+        lora_A = lora_B = diff = diff_b = v = loras_module_data = loras_adapter_data = lora_alphas = dora_scale = None
 
     if len(invalid_keys) > 0:
         msg = f"Lora '{path}' contains non Lora keys '{trunc(invalid_keys,200)}'"
@@ -1202,7 +1689,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
     if not check_only:
         # model._loras_tied_weights[adapter_name] = tied_weights
         if pinnedLora:
-            pinned_sd_list.append(
+            pinned_sd_list.append(new_state_dict)
            pinned_names_list.append(path)
            # _pin_sd_to_memory(state_dict, path)
 
@@ -1250,6 +1737,7 @@ def sync_models_loras(model, model2):
 
 def unload_loras_from_model(model):
     if model is None: return
+    if not hasattr(model, "_loras_model_data"): return
     for _, v in model._loras_model_data.items():
         v.clear()
     for _, v in model._loras_model_shortcuts.items():
@@ -1264,9 +1752,25 @@ def unload_loras_from_model(model):
 
 
 def set_step_no_for_lora(model, step_no):
+    target = getattr(model, "_lora_owner", None)
+    while target is not None and target is not model:
+        model = target
+        target = getattr(model, "_lora_owner", None)
     model._lora_step_no = step_no
+    sub_loras = getattr(model, "_subloras", None)
+    if sub_loras:
+        submodules = sub_loras.values() if isinstance(sub_loras, dict) else sub_loras
+        for submodule in submodules:
+            if submodule is model:
+                continue
+            submodule._lora_step_no = step_no
 
 def activate_loras(model, lora_nos, lora_multi = None):
+    target = getattr(model, "_lora_owner", None)
+    while target is not None and target is not model:
+        model = target
+        target = getattr(model, "_lora_owner", None)
+
     if not isinstance(lora_nos, list):
         lora_nos = [lora_nos]
     lora_nos = [str(l) for l in lora_nos]
@@ -1281,6 +1785,15 @@ def activate_loras(model, lora_nos, lora_multi = None):
     model._lora_step_no = 0
     model._loras_active_adapters = lora_nos
     model._loras_scaling = lora_scaling_dict
+    sub_loras = getattr(model, "_subloras", None)
+    if sub_loras:
+        submodules = sub_loras.values() if isinstance(sub_loras, dict) else sub_loras
+        for submodule in submodules:
+            if submodule is model:
+                continue
+            submodule._lora_step_no = 0
+            submodule._loras_active_adapters = lora_nos
+            submodule._loras_scaling = lora_scaling_dict
 
 
 def move_loras_to_device(model, device="cpu" ):
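An illustrative calling sequence for the LoRA entry points touched above; the file names and multipliers are assumptions, only the function names and argument order come from this file:

    from mmgp import offload

    offload.load_loras_into_model(model, ["loras/style_a.safetensors", "loras/style_b.safetensors"])
    offload.activate_loras(model, [0, 1], [1.0, 0.8])    # adapter numbers plus per-lora multipliers
    offload.set_step_no_for_lora(model, 0)               # 3.6.x also forwards this to sub-modules that own loras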
@@ -1293,7 +1806,7 @@ def move_loras_to_device(model, device="cpu" ):
         if ".lora_" in k:
             m.to(device)
 
-def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, defaultConfigPath = None, modelClass=None, modelPrefix = None, writable_tensors = True, verboseLevel = -1, preprocess_sd = None, modules = None, return_shared_modules = None,
+def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, defaultConfigPath = None, modelClass=None, modelPrefix = None, writable_tensors = True, verboseLevel = -1, preprocess_sd = None, modules = None, return_shared_modules = None, default_dtype = torch.bfloat16, ignore_unused_weights = False, configKwargs ={}):
     """
     quick version of .LoadfromPretrained of the transformers library
     used to build a model and load the corresponding weights (quantized or not)
@@ -1305,7 +1818,7 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantiza
         model_path = [model_path]
 
 
-    if not builtins.all(file_name.endswith(".sft") or file_name.endswith(".safetensors") or file_name.endswith(".pt") for file_name in model_path):
+    if not builtins.all(file_name.endswith(".sft") or file_name.endswith(".safetensors") or file_name.endswith(".pt") or file_name.endswith(".ckpt") for file_name in model_path):
         raise Exception("full model path to file expected")
 
     model_path = [ _get_model(file) for file in model_path]
@@ -1313,7 +1826,7 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantiza
         raise Exception("Unable to find file")
 
     verboseLevel = _compute_verbose_level(verboseLevel)
-    if model_path[-1].endswith(".pt"):
+    if model_path[-1].endswith(".pt") or model_path[-1].endswith(".ckpt"):
         metadata = None
     else:
         with safetensors2.safe_open(model_path[-1], writable_tensors =writable_tensors) as f:
@@ -1376,18 +1889,18 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantiza
     model = transfomer_class.from_config(transformer_config )
 
 
-    torch.set_default_device('cpu')
     model.eval().requires_grad_(False)
 
     model._config = transformer_config
-
-    load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, modelPrefix = modelPrefix, writable_tensors =writable_tensors, preprocess_sd = preprocess_sd , modules = modules, return_shared_modules = return_shared_modules, verboseLevel=verboseLevel )
+
+    load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, modelPrefix = modelPrefix, writable_tensors =writable_tensors, preprocess_sd = preprocess_sd , modules = modules, return_shared_modules = return_shared_modules, default_dtype = default_dtype, ignore_unused_weights = ignore_unused_weights, verboseLevel=verboseLevel )
 
     return model
 
 
 
-
+@cudacontext("cpu")
+def load_model_data(model, file_path, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, preprocess_sd = None, postprocess_sd = None, modules = None, return_shared_modules = None, default_dtype = torch.bfloat16, ignore_unused_weights = False, verboseLevel = -1, ignore_missing_keys = False):
     """
     Load a model, detect if it has been previously quantized using quanto and do the extra setup if necessary
     """
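A sketch of a call to fast_load_transformers_model() using the keyword arguments added in this version (the checkpoint path is a placeholder; default_dtype and ignore_unused_weights are the new parameters visible in the signature above):

    import torch
    from mmgp import offload
    from optimum.quanto import qint8

    model = offload.fast_load_transformers_model(
        "ckpts/my_model_quanto_int8.safetensors",   # placeholder path
        do_quantize=False,
        quantizationType=qint8,
        pinToMemory=True,
        default_dtype=torch.bfloat16,               # new: forwarded to the quantization / conversion path
        ignore_unused_weights=False,                # new: when True, silences the unexpected-keys report
    )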
@@ -1423,8 +1936,14 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
         file_path += modules
         modules = None
 
-
-
+    normalized_paths = []
+    for file in file_path:
+        if isinstance(file, (dict, tuple)):
+            normalized_paths.append(file)
+        else:
+            normalized_paths.append(_get_model(file))
+    file_path = normalized_paths
+    if any(file is None for file in file_path):
         raise Exception("Unable to find file")
     verboseLevel = _compute_verbose_level(verboseLevel)
 
@@ -1442,18 +1961,24 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
     for no, file in enumerate(file_path):
         quantization_map = None
         tied_weights_map = None
-
+        metadata = None
+        detected_kind = None
+        if isinstance(file, tuple):
+            if len(file) != 2:
+                raise Exception("Expected a tuple of (state_dict, quantization_map)")
+            state_dict, quantization_map = file
+        elif isinstance(file, dict):
+            state_dict = file
+        elif not (".safetensors" in file or ".sft" in file):
             if pinToMemory:
                 raise Exception("Pinning to memory while loading only supported for safe tensors files")
-            state_dict = torch.load(file, weights_only=
+            state_dict = torch.load(file, weights_only=False, map_location="cpu")
             if "module" in state_dict:
                 state_dict = state_dict["module"]
-
         else:
             basename = os.path.basename(file)
 
             if "-of-" in basename:
-                metadata = None
                 file_parts= basename.split("-")
                 parts_max = int(file_parts[-1][:5])
                 state_dict = {}
@@ -1463,29 +1988,50 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
                     state_dict.update(sd)
             else:
                 state_dict, metadata = _safetensors_load_file(file, writable_tensors =writable_tensors)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        if preprocess_sd != None:
+            state_dict = preprocess_sd(state_dict)
+
+        if metadata != None:
+            quantization_map = metadata.get("quantization_map", None)
+            config = metadata.get("config", None)
+            if config is not None:
+                model._config = config
+
+            tied_weights_map = metadata.get("tied_weights_map", None)
+            if tied_weights_map != None:
+                for name, tied_weights_list in tied_weights_map.items():
+                    mapped_weight = state_dict[name]
+                    for tied_weights in tied_weights_list:
+                        state_dict[tied_weights] = mapped_weight
+
+        if quantization_map is None and isinstance(file, str):
+            pos = str.rfind(file, ".")
+            if pos > 0:
+                quantization_map_path = file[:pos]
+                quantization_map_path += "_map.json"
+
+                if os.path.isfile(quantization_map_path):
+                    with open(quantization_map_path, 'r') as f:
+                        quantization_map = json.load(f)
+
+        if quantization_map is None:
+            conv_result = detect_and_convert(state_dict, default_dtype=default_dtype, verboseLevel=verboseLevel)
+            detected_kind = conv_result.get("kind")
+            if conv_result.get("kind") not in ("none", "quanto"):
+                state_dict = conv_result["state_dict"]
+                quantization_map = conv_result["quant_map"]
+            conv_result = None
+            # enable_fp8_fp32_scale_support()
+
+        if detected_kind in (None, "none") and isinstance(file, str) and (".safetensors" in file or ".sft" in file):
+            try:
+                info = detect_safetensors_format(state_dict, verboseLevel=verboseLevel)
+                detected_kind = info.get("kind")
+            except Exception:
+                detected_kind = detected_kind or None
+        if detected_kind not in (None, "none") and isinstance(file, str):
+            cache_quantization_for_file(file, detected_kind or "none")
 
         full_state_dict.update(state_dict)
         if quantization_map != None:
@@ -1504,8 +2050,8 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
|
|
|
1504
2050
|
full_state_dict, full_quantization_map, full_tied_weights_map = None, None, None
|
|
1505
2051
|
|
|
1506
2052
|
# deal if we are trying to load just a sub part of a larger model
|
|
1507
|
-
if
|
|
1508
|
-
state_dict, quantization_map =
|
|
2053
|
+
if postprocess_sd != None:
|
|
2054
|
+
state_dict, quantization_map = postprocess_sd(state_dict, quantization_map)
|
|
1509
2055
|
|
|
1510
2056
|
if modelPrefix != None:
|
|
1511
2057
|
base_model_prefix = modelPrefix + "."
|
|
@@ -1513,11 +2059,21 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
        if quantization_map != None:
            quantization_map = filter_state_dict(quantization_map,base_model_prefix)

+    post_load_hooks = []
+    if quantization_map:
+        quantization_map, post_load_hooks = apply_pre_quantization(
+            model,
+            state_dict,
+            quantization_map,
+            default_dtype=default_dtype,
+            verboseLevel=verboseLevel,
+        )
+
    if len(quantization_map) == 0:
-        if any("quanto" in file for file in file_path) and not do_quantize:
+        if any(isinstance(file, str) and "quanto" in file for file in file_path) and not do_quantize:
            print("Model seems to be quantized by quanto but no quantization map was found whether inside the model or in a separate '{file_path[:json]}_map.json' file")
    else:
-        _requantize(model, state_dict, quantization_map)
+        _requantize(model, state_dict, quantization_map, default_dtype=default_dtype)



@@ -1530,13 +2086,25 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
                base_model_prefix = k[:-len(missing_keys[0])]
                break
        if base_model_prefix == None:
-
-
-
+            if not ignore_missing_keys:
+                raise Exception(f"Missing keys: {missing_keys}")
+        else:
+            state_dict = filter_state_dict(state_dict, base_model_prefix)
+            missing_keys , unexpected_keys = model.load_state_dict(state_dict, False, assign = True )
+            if len(missing_keys) > 0 and not ignore_missing_keys:
+                raise Exception(f"Missing keys: {missing_keys}")

        del state_dict

-    if
+    if post_load_hooks:
+        for hook in post_load_hooks:
+            try:
+                hook(model)
+            except Exception as e:
+                if verboseLevel >= 2:
+                    print(f"Post-load hook skipped: {e}")
+
+    if len(unexpected_keys) > 0 and verboseLevel >=2 and not ignore_unused_weights:
        print(f"Unexpected keys while loading '{file_path}': {unexpected_keys}")

    for k,p in model.named_parameters():
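The retry path above only works if filter_state_dict keeps the keys under a given prefix and strips that prefix, so that a checkpoint of a larger model can feed one of its sub-models. filter_state_dict itself is not shown in this section of the diff; a minimal stand-in with that assumed behaviour:

# Assumed behaviour of filter_state_dict (helper not shown in this hunk): keep keys
# that start with `prefix` and return them with the prefix removed.
def filter_state_dict(state_dict: dict, prefix: str) -> dict:
    return {k[len(prefix):]: v for k, v in state_dict.items() if k.startswith(prefix)}

# {"model.diffusion_model.proj.weight": w} filtered with "model.diffusion_model."
# becomes {"proj.weight": w}, which the sub-model's load_state_dict can consume.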
@@ -1728,6 +2296,35 @@ class HfHook:
    def detach_hook(self, module):
        return module

+def _mm_lora_linear_forward(module, *args, **kwargs):
+    loras_data = getattr(module, "_mm_lora_data", None)
+    if not loras_data:
+        return module._mm_lora_old_forward(*args, **kwargs)
+    if not hasattr(module, "_mm_manager"):
+        pass
+    return module._mm_manager._lora_linear_forward(
+        module._mm_lora_model,
+        module,
+        loras_data,
+        *args,
+        **kwargs,
+    )
+
+
+def _mm_lora_generic_forward(module, *args, **kwargs):
+    loras_data = getattr(module, "_mm_lora_data", None)
+    if not loras_data:
+        return module._mm_lora_old_forward(*args, **kwargs)
+    return module._mm_manager._lora_generic_forward(
+        module._mm_lora_model,
+        module,
+        loras_data,
+        module._mm_lora_old_forward,
+        *args,
+        **kwargs,
+    )
+
+
last_offload_obj = None
class offload:
    def __init__(self):
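These two shims replace the previous per-module closures: every wrapped layer carries its own _mm_lora_data, _mm_lora_model and _mm_lora_old_forward attributes, and one module-level function dispatches on them. Keeping the state on the module instead of in a closure means a single shared code object serves every wrapped layer, which avoids building a new nested function per module and plays better with tools that inspect or compile the forward. A generic illustration of the pattern (names and the extra-bias example are illustrative, not part of mmgp):

import functools
import torch

def shim_forward(module, *args, **kwargs):
    # per-instance state lives on the module, not in a closure
    extra = getattr(module, "_extra_bias", None)
    out = module._old_forward(*args, **kwargs)
    return out if extra is None else out + extra

lin = torch.nn.Linear(4, 4)
lin._old_forward = lin.forward
lin._extra_bias = torch.zeros(4)
lin.forward = functools.update_wrapper(functools.partial(shim_forward, lin), lin._old_forward)
# lin.forward(torch.randn(2, 4)) now routes through the shared shim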
@@ -1757,6 +2354,7 @@ class offload:
        global last_offload_obj
        last_offload_obj = self

+        self._type_wrappers = {}

    def add_module_to_blocks(self, model_id, blocks_name, submodule, prev_block_name, submodule_name):

@@ -1781,22 +2379,12 @@ class offload:
            param_size = 0
            ref = _get_tensor_ref(p)
            tied_param = self.parameters_ref.get(ref, None)
-
-
-
-
-
-                    param_size += torch.numel(p._scale_shift) * p._scale_shift.element_size()
-                    param_size += torch.numel(p._data._data) * p._data._data.element_size()
-                else:
-                    param_size += torch.numel(p._scale) * p._scale.element_size()
-                    param_size += torch.numel(p._shift) * p._shift.element_size()
-                    param_size += torch.numel(p._data._data) * p._data._data.element_size()
-            else:
-                param_size += torch.numel(p._scale) * p._scale.element_size()
-                param_size += torch.numel(p._data) * p._data.element_size()
+                blocks_params.append((submodule, k, p, False, tied_param))
+                sub_tensors = _get_quantized_subtensors(p)
+                if sub_tensors:
+                    param_size += _subtensors_nbytes(sub_tensors)
+                del sub_tensors
            else:
-                blocks_params.append( (submodule, k, p, False, tied_param) )
                param_size += torch.numel(p.data) * p.data.element_size()


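The removed branches sized each quanto tensor layout by hand (_scale / _shift / _scale_shift plus the packed _data); the new code delegates to _get_quantized_subtensors and _subtensors_nbytes, which are defined elsewhere in offload.py and not visible in this section. From their use here and later in all(), they appear to return (name, tensor) pairs and to sum their byte sizes; a minimal sketch under that assumption:

import torch

def subtensors_nbytes_sketch(sub_tensors):
    # sum of numel * element_size over every quantized component (data, scale, shift, ...)
    return sum(t.numel() * t.element_size() for _, t in sub_tensors)

# e.g. an int8 weight [128, 256] plus a float16 per-row scale [128, 1]:
w = torch.zeros(128, 256, dtype=torch.int8)
s = torch.zeros(128, 1, dtype=torch.float16)
print(subtensors_nbytes_sketch([("data", w), ("scale", s)]))  # 32768 + 256 = 33024 bytes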
@@ -2091,7 +2679,7 @@ class offload:
            data = loras_data.get(active_adapter + '_GPU', None)
            if data == None:
                continue
-            diff_w , _ , diff_b, alpha = data
+            diff_w , _ , diff_b, _, alpha = data
            scaling = self._get_lora_scaling( loras_scaling, model, active_adapter) * alpha
            if scaling == 0:
                continue
@@ -2117,15 +2705,117 @@ class offload:
        return ret


+    def _dora_linear_forward(
+        self,
+        model,
+        submodule,
+        adapters_data,   # dict: name+"_GPU" -> (A, B, diff_b, g_abs, alpha); g_abs=None means LoRA
+        weight= None,
+        bias = None,
+        original_bias = True,
+        dora_mode: str = "blend",   # "ref_exact" | "blend"
+    ):
+        active_adapters = getattr(model, "_loras_active_adapters", [])
+        loras_scaling = getattr(model, "_loras_scaling", {})
+        # Snapshot base weight (safe for quantized modules)
+        if weight is None:
+            bias = submodule.bias
+            original_bias = True
+            if isinstance(submodule, QModuleMixin):
+                weight = submodule.weight.view(submodule.weight.shape)
+            else:
+                weight = submodule.weight.clone()
+
+        base_dtype = weight.dtype
+        eps = 1e-8
+        W0 = weight.float()
+        g0 = torch.linalg.vector_norm(W0, dim=1, keepdim=True, dtype=torch.float32).clamp_min(eps)   # [out,1]
+
+        # Keep big mats in low precision
+        # Wc = W0 if W0.dtype == compute_dtype else W0.to(compute_dtype)
+        W0 /= g0
+        weight[...] = W0.to(base_dtype)
+        W0 = None
+
+        dir_update = None   # Σ s * ((B@A)/g0) in compute_dtype
+        g = None            # final magnitude: set absolute (ref_exact) or blended (blend)
+        bias_delta = None   # Σ s * diff_b
+
+        # Accumulate DoRA adapters only (g_abs != None)
+        for name in active_adapters:
+            data = adapters_data.get(name + "_GPU", None)
+            if data is None: continue
+            A, B, diff_b, g_abs, alpha = data
+            if g_abs is None: continue
+
+            s = self._get_lora_scaling(loras_scaling, model, name) * float(alpha)
+            if s == 0: continue
+
+            # Direction update in V-space with row-wise 1/g0
+            if (A is not None) and (B is not None):
+                dV = torch.mm(B, A)   # [out,in], compute_dtype
+                dV /= g0              # row-wise divide
+                dV.mul_(s)
+                dir_update = dV if dir_update is None else dir_update.add_(dV)
+
+
+            if dora_mode == "ref_exact":
+                # absolute magnitude (last one wins if multiple DoRAs present)
+                g = g_abs
+            elif dora_mode == "blend":
+                # blend towards absolute magnitude proportional to s
+                if g is None:
+                    g = g0.clone()
+                g.add_(g_abs.sub(g0), alpha=s)
+            else:
+                raise ValueError(f"Unknown dora_mode: {dora_mode}")
+
+            # Optional bias deltas (not in reference, but harmless if present)
+            if diff_b is not None:
+                db = diff_b.mul(s)
+                bias_delta = db if bias_delta is None else bias_delta.add_(db)
+                db = None
+
+        if g is None:
+            g = g0   # no magnitude provided -> keep original
+
+        # Re-normalize rows if we changed direction
+        if dir_update is not None:
+            weight.add_(dir_update)
+            V = weight.float()
+            Vn = torch.linalg.vector_norm(V, dim=1, keepdim=True, dtype=torch.float32).clamp_min(eps)
+            V /= Vn
+            V *= g
+            weight[...] = V.to(base_dtype)
+            V = None
+        else:
+            weight *= g
+        # Recompose adapted weight; cast back to module dtype
+
+        # Merge DoRA bias delta safely
+        if bias_delta is not None:
+            if bias is None:
+                bias = bias_delta
+            else:
+                bias = bias.clone() if original_bias else bias
+                bias.add_(bias_delta)
+
+        return weight, bias
+
+
+
    def _lora_linear_forward(self, model, submodule, loras_data, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
        weight = submodule.weight
+        bias = submodule.bias
        active_adapters = model._loras_active_adapters
        loras_scaling = model._loras_scaling
+        any_dora = loras_data.get("any_dora", False)
+        is_nvfp4 = getattr(submodule, "is_nvfp4", False)
        training = False

-        dtype = weight.dtype
-        if weight.shape[-1] < x.shape[-2]: # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
-
+        dtype = weight.dtype
+        if (weight.shape[-1] < x.shape[-2] and False or any_dora): # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
+            original_bias = True
            original_bias = True
        if len(active_adapters) > 0:
            if isinstance(submodule, QModuleMixin):
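_dora_linear_forward implements the DoRA recomposition: the base weight is split into a per-row magnitude g0 = ||W0||_row and a direction V0 = W0 / g0, the low-rank update B @ A is applied in direction space, the rows are re-normalised, and the result is rescaled by either the adapter's absolute magnitude ("ref_exact") or a magnitude blended towards it in proportion to the scaling ("blend"). A standalone numeric sketch of the same recomposition on dense tensors (single adapter, no quantization, no bias delta):

import torch

def dora_merge(W0, A, B, g_abs, s=1.0, eps=1e-8):
    # W0: [out, in] base weight, A: [r, in], B: [out, r], g_abs: [out, 1] learned magnitude
    g0 = torch.linalg.vector_norm(W0, dim=1, keepdim=True).clamp_min(eps)
    V = W0 / g0 + s * (B @ A) / g0                                   # update the direction in V-space
    V = V / torch.linalg.vector_norm(V, dim=1, keepdim=True).clamp_min(eps)
    g = g0 + s * (g_abs - g0)                                        # "blend" mode; "ref_exact" uses g_abs directly
    return g * V                                                     # re-attach the magnitude row-wise

W0 = torch.randn(8, 16)
A, B = torch.randn(2, 16), torch.randn(8, 2)
g_abs = torch.linalg.vector_norm(W0 + B @ A, dim=1, keepdim=True)
W_adapted = dora_merge(W0, A, B, g_abs)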
@@ -2136,10 +2826,17 @@ class offload:
                data = loras_data.get(active_adapter + '_GPU', None)
                if data == None:
                    continue
-                lora_A_weight, lora_B_weight, diff_b, alpha = data
+                lora_A_weight, lora_B_weight, diff_b, g_abs, alpha = data
                scaling = self._get_lora_scaling(loras_scaling, model, active_adapter) * alpha
-                if scaling == 0:
+                if scaling == 0 or g_abs is not None:
                    continue
+                target_dtype = weight.dtype
+                if lora_A_weight is not None and lora_A_weight.dtype != target_dtype:
+                    lora_A_weight = lora_A_weight.to(target_dtype)
+                if lora_B_weight is not None and lora_B_weight.dtype != target_dtype:
+                    lora_B_weight = lora_B_weight.to(target_dtype)
+                if diff_b is not None and diff_b.dtype != target_dtype:
+                    diff_b = diff_b.to(target_dtype)
                if lora_A_weight != None:
                    weight.addmm_(lora_B_weight, lora_A_weight, alpha= scaling )

@@ -2152,41 +2849,60 @@ class offload:
                        original_bias = False
                        bias.add_(diff_b, alpha=scaling)
                # base_weight += scaling * lora_B_weight @ lora_A_weight
+
+            if any_dora :
+                weight, bias = self._dora_linear_forward(model, submodule, loras_data, weight, bias, original_bias)
            if training:
                pass
                # result = torch.nn.functional.linear(dropout(x), base_weight, bias=submodule.bias)
            else:
-
+                base_bias = bias
+                if base_bias is not None and base_bias.dtype != x.dtype:
+                    base_bias = base_bias.to(x.dtype)
+                result = torch.nn.functional.linear(x, weight, bias=base_bias)

        else:
-
+            base_bias = bias
+            if base_bias is not None and base_bias.dtype != x.dtype:
+                base_bias = base_bias.to(x.dtype)
+            result = torch.nn.functional.linear(x, weight, bias=base_bias)

            if len(active_adapters) > 0:
-
+                compute_dtype = torch.float32 if is_nvfp4 else result.dtype
+                if result.dtype != compute_dtype:
+                    result = result.to(compute_dtype)
+                    x = x.to(compute_dtype)

                for active_adapter in active_adapters:
                    data = loras_data.get(active_adapter + '_GPU', None)
                    if data == None:
                        continue
-                    lora_A, lora_B, diff_b, alpha = data
+                    lora_A, lora_B, diff_b, g_abs, alpha = data
                    # dropout = self.lora_dropout[active_adapter]
                    scaling = self._get_lora_scaling(loras_scaling, model, active_adapter) * alpha
-                    if scaling == 0:
+                    if scaling == 0 or g_abs is not None:
                        continue
+                    target_dtype = result.dtype
+                    if lora_A is not None and lora_A.dtype != target_dtype:
+                        lora_A = lora_A.to(target_dtype)
+                    if lora_B is not None and lora_B.dtype != target_dtype:
+                        lora_B = lora_B.to(target_dtype)
+                    if diff_b is not None and diff_b.dtype != target_dtype:
+                        diff_b = diff_b.to(target_dtype)
+
                    if lora_A == None:
                        result.add_(diff_b, alpha=scaling)
                    else:
-
-
-
-
-
-
-                        y = torch.nn.functional.linear(x, lora_A, bias=None)
-                        y = torch.nn.functional.linear(y, lora_B, bias=diff_b)
-                        y*= scaling
-                        result+= y
+                        x_2d = x.reshape(-1, x.shape[-1])
+                        result_2d = result.reshape(-1, result.shape[-1])
+                        y = x_2d @ lora_A.T
+                        result_2d.addmm_(y, lora_B.T, beta=1, alpha=scaling)
+                        if diff_b is not None:
+                            result_2d.add_(diff_b, alpha=scaling)
                    del y
+            target_dtype = input_dtype if is_nvfp4 else dtype
+            if result.dtype != target_dtype:
+                result = result.to(target_dtype)

        return result

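For the per-adapter path the old code built a full-size temporary activation with two F.linear calls, scaled it, then added it to the result; the new code flattens the activations to 2-D and folds the second matmul, the scaling and the accumulation into a single in-place addmm_, so only the small rank-sized intermediate is materialised. The two formulations are numerically equivalent, as this small check illustrates:

import torch

x = torch.randn(2, 5, 64)            # [batch, tokens, in_features]
result = torch.randn(2, 5, 32)       # base linear output, [batch, tokens, out_features]
A, B = torch.randn(8, 64), torch.randn(32, 8)   # rank-8 LoRA factors
scaling = 0.7

# old: two full linear passes plus an explicit scaled add
ref = result + scaling * torch.nn.functional.linear(torch.nn.functional.linear(x, A), B)

# new: in-place rank-k update on a 2-D view
out = result.clone()
x_2d, out_2d = x.reshape(-1, 64), out.reshape(-1, 32)
out_2d.addmm_(x_2d @ A.T, B.T, beta=1, alpha=scaling)

print(torch.allclose(ref, out, atol=1e-5))  # True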
@@ -2198,22 +2914,14 @@ class offload:
            assert submodule_name not in loras_model_shortcuts
            loras_model_shortcuts[submodule_name] = loras_data
            loras_model_data[submodule] = loras_data
+            submodule._mm_lora_data = loras_data
+            submodule._mm_lora_model = current_model
+            submodule._mm_lora_old_forward = old_forward

-            if isinstance(submodule, torch.nn.Linear):
-
-                if len(loras_data) == 0:
-                    return old_forward(*args, **kwargs)
-                else:
-                    submodule.aaa = submodule_name
-                    return self._lora_linear_forward(current_model, submodule, loras_data, *args, **kwargs)
-                target_fn = lora_linear_forward
+            if isinstance(submodule, torch.nn.Linear) or getattr(submodule, "is_nvfp4", False):
+                target_fn = _mm_lora_linear_forward
            else:
-
-                if len(loras_data) == 0:
-                    return old_forward(*args, **kwargs)
-                else:
-                    return self._lora_generic_forward(current_model, submodule, loras_data, old_forward, *args, **kwargs)
-                target_fn = lora_generic_forward
+                target_fn = _mm_lora_generic_forward
            return functools.update_wrapper(functools.partial(target_fn, submodule), old_forward)

    def ensure_model_loaded(self, model_id):
@@ -2236,10 +2944,65 @@ class offload:

        # need to be registered before the forward not to be break the efficiency of the compilation chain
        # it should be at the top of the compilation as this type of hook in the middle of a chain seems to break memory performance
-        target_module.register_forward_pre_hook(preload_blocks_for_compile)
+        target_module.register_forward_pre_hook(preload_blocks_for_compile)
+


-
+
+    @torch._dynamo.disable
+    def _pre_check(self, module):
+        model_id = getattr(module, "_mm_model_id", None)
+        blocks_name = getattr(module, "_mm_blocks_name", None)
+
+        self.ensure_model_loaded(model_id)
+        if blocks_name is None:
+            if self.ready_to_check_mem():
+                self.empty_cache_if_needed()
+        elif blocks_name != self.loaded_blocks[model_id] and \
+            blocks_name not in self.preloaded_blocks_per_model[model_id]:
+            self.gpu_load_blocks(model_id, blocks_name)
+
+    def _get_wrapper_for_type(self, mod_cls):
+        fn = self._type_wrappers.get(mod_cls)
+        if fn is not None:
+            return fn
+
+        # Unique function name per class -> unique compiled code object
+        fname = f"_mm_wrap_{mod_cls.__module__.replace('.', '_')}_{mod_cls.__name__}"
+
+        # Keep body minimal; all heavy/offload logic runs out-of-graph in _pre_check
+        # Include __TYPE_CONST in the code so the bytecode/consts differ per class.
+        src = f"""
+def {fname}(module, *args, **kwargs):
+    _ = __TYPE_CONST  # anchor type as a constant to make code object unique per class
+    nada = "{fname}"
+    mgr = module._mm_manager
+    mgr._pre_check(module)
+    return module._mm_forward(*args, **kwargs) #{fname}
+"""
+        ns = {"__TYPE_CONST": mod_cls}
+        exec(src, ns)  # compile a new function object/code object for this class
+        fn = ns[fname]
+        self._type_wrappers[mod_cls] = fn
+        return fn
+
+    def hook_check_load_into_GPU_if_needed(
+        self, target_module, model, model_id, blocks_name, previous_method, context
+    ):
+        # store instance data on the module (not captured by the wrapper)
+        target_module._mm_manager = self
+        target_module._mm_model_id = model_id
+        target_module._mm_blocks_name = blocks_name
+        target_module._mm_forward = previous_method
+
+        # per-TYPE wrapper (unique bytecode per class, reused across instances of that class)
+        wrapper_fn = self._get_wrapper_for_type(type(target_module))
+
+        # bind as a bound method (no partial/closures)
+        # target_module.forward = types.MethodType(wrapper_fn, target_module)
+        target_module.forward = functools.update_wrapper(functools.partial(wrapper_fn, target_module), previous_method)
+
+    def hook_check_load_into_GPU_if_needed_default(self, target_module, model, model_id, blocks_name, previous_method, context):

        dtype = model._dtype
        qint4quantization = isinstance(target_module, QModuleMixin) and target_module.weight!= None and target_module.weight.qtype == qint4
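_get_wrapper_for_type builds one wrapper function per module class by exec-ing a tiny source string whose name and constants embed that class, and caches it in self._type_wrappers. Per the comments in the diff, the goal is a unique compiled code object per layer type, shared by all instances of that type, with the offload bookkeeping kept out of the traced graph behind @torch._dynamo.disable. A stripped-down illustration of just the per-type cache mechanics (independent of mmgp):

_wrappers = {}

def wrapper_for_type(mod_cls):
    fn = _wrappers.get(mod_cls)
    if fn is None:
        fname = f"wrap_{mod_cls.__name__}"
        src = (f"def {fname}(module, *args, **kwargs):\n"
               f"    _ = __TYPE_CONST  # distinct constant -> distinct code object\n"
               f"    return module._inner(*args, **kwargs)\n")
        ns = {"__TYPE_CONST": mod_cls}
        exec(src, ns)
        fn = _wrappers[mod_cls] = ns[fname]
    return fn

class A: pass
class B: pass
print(wrapper_for_type(A) is wrapper_for_type(A))                    # True: cached per class
print(wrapper_for_type(A).__code__ is wrapper_for_type(B).__code__)  # False: one code object per class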
@@ -2259,22 +3022,33 @@ class offload:
            target_module.forward = target_module._mm_forward
            return

-        def
+        def check_load_into_GPU_needed():
            self.ensure_model_loaded(model_id)
            if blocks_name == None:
                if self.ready_to_check_mem():
                    self.empty_cache_if_needed()
            elif blocks_name != self.loaded_blocks[model_id] and blocks_name not in self.preloaded_blocks_per_model[model_id]:
                self.gpu_load_blocks(model_id, blocks_name)
-            if qint4quantization and dtype !=None:
-
-
-
+            # if qint4quantization and dtype !=None:
+            #     args, kwargs = self.move_args_to_gpu(dtype, *args, **kwargs)
+
+        if isinstance(target_module, torch.nn.Linear):
+            def check_load_into_GPU_needed_linear(module, *args, **kwargs):
+                check_load_into_GPU_needed()
+                return previous_method(*args, **kwargs) # linear
+            check_load_into_GPU_needed_module = check_load_into_GPU_needed_linear
+        else:
+            def check_load_into_GPU_needed_other(module, *args, **kwargs):
+                check_load_into_GPU_needed()
+                return previous_method(*args, **kwargs) # other
+            check_load_into_GPU_needed_module = check_load_into_GPU_needed_other

        setattr(target_module, "_mm_id", model_id)
+        setattr(target_module, "_mm_manager", self)
        setattr(target_module, "_mm_forward", previous_method)

-        setattr(target_module, "forward", functools.update_wrapper(functools.partial(
+        setattr(target_module, "forward", functools.update_wrapper(functools.partial(check_load_into_GPU_needed_module, target_module), previous_method) )
+        # target_module.register_forward_pre_hook(check_empty_cuda_cache)


    def hook_change_module(self, target_module, model, model_id, module_id, previous_method, previous_method_name ):
@@ -2300,7 +3074,7 @@ class offload:
        if not self.verboseLevel >=1:
            return

-        if module_id == None or module_id =='':
+        if previous_method_name =="forward" and (module_id == None or module_id ==''):
            model_name = model._get_name()
            print(f"Hooked to model '{model_id}' ({model_name})")

@@ -2415,7 +3189,7 @@ class offload:



-def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
+def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, vram_safety_coefficient = 0.8, compile_mode ="default", verboseLevel = -1):
    """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
    pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
    quantizeTransformer: set True by default will quantize on the fly the video / image model
@@ -2424,6 +3198,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
    budgets: 0 by default (unlimited). If non 0, it corresponds to the maximum size in MB that every model will occupy at any moment
     (in fact the real usage is twice this number). It is very efficient to reduce VRAM consumption but this feature may be very slow
     if pinnedMemory is not enabled
+    vram_safety_coefficient: float between 0 and 1 (exclusive), default 0.8. Sets the maximum portion of VRAM that can be used for models.
+     Lower values provide more safety margin but may reduce performance.
    """
    self = offload()
    self.verboseLevel = verboseLevel
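Two new knobs surface in the all() entry point: vram_safety_coefficient caps the share of VRAM a single model may plan to occupy (validated below to be a float strictly between 0 and 1), and compile_mode is forwarded to torch.compile for the detected towers. A hedged usage sketch following the library's usual offload.all(...) call; the stand-in pipeline and the chosen values are illustrative, and a CUDA device is assumed at runtime:

import torch
from mmgp import offload

# Any dict of nn.Modules is accepted for pipe_or_dict_of_modules; a real pipeline works the same way.
pipe = {"transformer": torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.Linear(64, 64))}

offload.all(
    pipe,
    pinnedMemory=False,
    budgets=0,                       # unlimited, the default
    vram_safety_coefficient=0.8,     # never plan for more than 80% of VRAM per model
    compile=False,                   # set True to compile the detected towers...
    compile_mode="default",          # ...with this torch.compile mode
    verboseLevel=1,
)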
@@ -2439,7 +3215,11 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
            return float(b[:-1]) * self.device_mem_capacity
        else:
            return b * ONE_MB
-
+
+    # Validate vram_safety_coefficient
+    if not isinstance(vram_safety_coefficient, float) or vram_safety_coefficient <= 0 or vram_safety_coefficient >= 1:
+        raise ValueError("vram_safety_coefficient must be a float between 0 and 1 (exclusive)")
+
    budget = 0
    if not budgets is None:
        if isinstance(budgets , dict):
@@ -2523,26 +3303,22 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p

        current_model_size = 0
        model_dtype = getattr(current_model, "_model_dtype", None)
+        fp8_fallback_dtype = None
        # if model_dtype == None:
        #     model_dtype = getattr(current_model, "dtype", None)
        for _ , m in current_model.named_modules():
            ignore_dtype = hasattr(m, "_lock_dtype")
            for n, p in m.named_parameters(recurse = False):
                p.requires_grad = False
-
-
-
-
-
-
-
-
-
-                    else:
-                        current_model_size += torch.numel(p._scale) * p._scale.element_size()
-                        current_model_size += torch.numel(p._data) * p._data.element_size()
-                        dtype = p._scale.dtype
-
+                sub_tensors = _get_quantized_subtensors(p)
+                if sub_tensors:
+                    current_model_size += _subtensors_nbytes(sub_tensors)
+                    dtype = sub_tensors[0][1].dtype
+                    for name, tensor in sub_tensors:
+                        if name in ("scale", "scale_shift"):
+                            dtype = tensor.dtype
+                            break
+                    del sub_tensors
                else:
                    if not ignore_dtype:
                        dtype = p.data.dtype
@@ -2551,14 +3327,25 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                        dtype = convertWeightsFloatTo if model_dtype == None else model_dtype
                        if dtype != torch.float32:
                            p.data = p.data.to(dtype)
-                    if model_dtype
-
+                    if model_dtype is None:
+                        if dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
+                            if fp8_fallback_dtype is None:
+                                fp8_fallback_dtype = dtype
+                        else:
+                            model_dtype = dtype
                    else:
                        if model_dtype != dtype:
-
-
+                            if (
+                                dtype in (torch.float8_e4m3fn, torch.float8_e5m2)
+                                or model_dtype in (torch.float8_e4m3fn, torch.float8_e5m2)
+                            ):
+                                pass
+                            else:
+                                assert model_dtype == dtype
                    current_model_size += torch.numel(p.data) * p.data.element_size()
-
+        if model_dtype is None and fp8_fallback_dtype is not None:
+            model_dtype = fp8_fallback_dtype
+        current_model._dtype = model_dtype
        for b in current_model.buffers():
            # do not convert 32 bits float to 16 bits since buffers are few (and potential gain low) and usually they are needed for precision calculation (for instance Rope)
            current_model_size += torch.numel(b.data) * b.data.element_size()
@@ -2584,14 +3371,14 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
            model_budget = new_budget if model_budget == 0 or new_budget < model_budget else model_budget
        if model_budget > 0 and model_budget > current_model_size:
            model_budget = 0
-        coef =
+        coef =vram_safety_coefficient
        if current_model_size > coef * self.device_mem_capacity and model_budget == 0 or model_budget > coef * self.device_mem_capacity:
            if verboseLevel >= 1:
                if model_budget == 0:
-                    print(f"Model '{model_id}' is too large ({current_model_size/ONE_MB:0.1f} MB) to fit entirely in {coef * 100}% of the VRAM (max capacity is {coef * self.device_mem_capacity/ONE_MB}) MB)")
+                    print(f"Model '{model_id}' is too large ({current_model_size/ONE_MB:0.1f} MB) to fit entirely in {coef * 100:.0f}% of the VRAM (max capacity is {coef * self.device_mem_capacity/ONE_MB:0.1f}) MB)")
                else:
                    print(f"Budget ({budget/ONE_MB:0.1f} MB) for Model '{model_id}' is too important so that this model can fit in the VRAM (max capacity is {self.device_mem_capacity/ONE_MB}) MB)")
-                print(f"Budget allocation for this model has been consequently reduced to the
+                print(f"Budget allocation for this model has been consequently reduced to the {coef * 100:.0f}% of max GPU Memory ({coef * self.device_mem_capacity/ONE_MB:0.1f} MB). This may not leave enough working VRAM and you will probably need to define manually a lower budget for this model.")
                model_budget = coef * self.device_mem_capacity


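The hard-coded coefficient in the old budget clamp is now the user-facing vram_safety_coefficient. A quick worked example of the resulting ceiling, assuming ONE_MB is 2**20 bytes as the constant's name suggests: on a 24 GB card with the default 0.8, any model or budget larger than roughly 19.2 GB gets clamped to that value.

ONE_MB = 1024 * 1024
device_mem_capacity = 24 * 1024 * ONE_MB      # a 24 GB card, for illustration
vram_safety_coefficient = 0.8

model_budget_cap = vram_safety_coefficient * device_mem_capacity
print(f"{model_budget_cap / ONE_MB:0.1f} MB")  # 19660.8 MB, i.e. ~19.2 GB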
@@ -2607,19 +3394,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
    for model_id in models:
        current_model: torch.nn.Module = models[model_id]
        towers_names, towers_modules = _detect_main_towers(current_model)
-        # compile main iterative modules stacks ("towers")
        compilationInThisOne = compileAllModels or model_id in modelsToCompile
-        if compilationInThisOne:
-            if self.verboseLevel>=1:
-                if len(towers_modules)>0:
-                    formated_tower_names = [name + '*' for name in towers_names]
-                    print(f"Pytorch compilation of '{model_id}' is scheduled for these modules : {formated_tower_names}.")
-                else:
-                    print(f"Pytorch compilation of model '{model_id}' is not yet supported.")
-
-            for submodel in towers_modules:
-                submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode="default" ) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
-                #dynamic=True,

        if pinAllModels or model_id in modelsToPin:
            if hasattr(current_model,"_already_pinned"):
@@ -2627,6 +3402,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                print(f"Model '{model_id}' already pinned to reserved memory")
            else:
                _pin_to_memory(current_model, model_id, partialPinning= partialPinning, pinnedPEFTLora = pinnedPEFTLora, perc_reserved_mem_max = perc_reserved_mem_max, verboseLevel=verboseLevel)
+
        current_budget = model_budgets[model_id]
        cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False
        self.loaded_blocks[model_id] = None
@@ -2665,8 +3441,6 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
                top_submodule = len(submodule_name.split("."))==1
                offload_hooks = submodule._offload_hooks if hasattr(submodule, "_offload_hooks") else []
-                if len(offload_hooks) > 0:
-                    pass
                assert top_submodule or len(offload_hooks) == 0, "custom offload hooks can only be set at the of the module"
                submodule_method_names = ["forward"] + offload_hooks
                for submodule_method_name in submodule_method_names:
@@ -2676,16 +3450,32 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                    else:
                        submodule_method = getattr(submodule, submodule_method_name)
                    if callable(submodule_method):
-                        if top_submodule and cur_blocks_name is None:
+                        if top_submodule and cur_blocks_name is None and not (any_lora and len(submodule._parameters)):
                            self.hook_change_module(submodule, current_model, model_id, submodule_name, submodule_method, submodule_method_name)
                        elif compilationInThisOne and submodule in towers_modules:
                            self.hook_preload_blocks_for_compilation(submodule, model_id, cur_blocks_name, context = submodule_name )
                        else:
-
-
+                            if compilationInThisOne: #and False
+                                self.hook_check_load_into_GPU_if_needed(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
+                            else:
+                                self.hook_check_load_into_GPU_if_needed_default(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
+
                self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name, submodule_name)


+        # compile main iterative modules stacks ("towers")
+        if compilationInThisOne:
+            if self.verboseLevel>=1:
+                if len(towers_modules)>0:
+                    formated_tower_names = [name + '*' for name in towers_names]
+                    print(f"Pytorch compilation of '{model_id}' is scheduled for these modules : {formated_tower_names}.")
+                else:
+                    print(f"Pytorch compilation of model '{model_id}' is not yet supported.")
+
+            for submodel in towers_modules:
+                submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode= compile_mode) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
+                #dynamic=True,
+
        self.tune_preloading(model_id, current_budget, towers_names)
        self.parameters_ref = {}

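Note the ordering change in this last hunk: torch.compile of the tower submodules now runs after the offload hooks have been installed (compare the block removed in the @@ -2607 hunk with the one re-added here), and it honours the new compile_mode argument. Presumably this keeps the preload and load-into-GPU checks at the top of the compiled region, which is what the earlier comment about registering the pre-hook "at the top of the compilation" asks for. A tiny, hedged illustration of that wrap-then-compile order on a standalone module:

import functools
import torch

def with_hook(module, hook):
    inner = module.forward
    def hooked(*args, **kwargs):
        hook()                       # offload bookkeeping would run here
        return inner(*args, **kwargs)
    module.forward = functools.update_wrapper(hooked, inner)
    return module

m = torch.nn.Linear(8, 8)
with_hook(m, lambda: None)            # 1) install the hook first
m.forward = torch.compile(m.forward)  # 2) then compile, so the hook sits at the top of the compiled region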