mmgp 3.6.2.tar.gz → 3.6.3.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mmgp might be problematic.
- {mmgp-3.6.2/src/mmgp.egg-info → mmgp-3.6.3}/PKG-INFO +2 -2
- {mmgp-3.6.2 → mmgp-3.6.3}/README.md +1 -1
- {mmgp-3.6.2 → mmgp-3.6.3}/pyproject.toml +1 -1
- {mmgp-3.6.2 → mmgp-3.6.3}/src/mmgp/fp8_quanto_bridge.py +29 -35
- {mmgp-3.6.2 → mmgp-3.6.3}/src/mmgp/offload.py +40 -37
- {mmgp-3.6.2 → mmgp-3.6.3/src/mmgp.egg-info}/PKG-INFO +2 -2
- {mmgp-3.6.2 → mmgp-3.6.3}/LICENSE.md +0 -0
- {mmgp-3.6.2 → mmgp-3.6.3}/setup.cfg +0 -0
- {mmgp-3.6.2 → mmgp-3.6.3}/src/__init__.py +0 -0
- {mmgp-3.6.2 → mmgp-3.6.3}/src/mmgp/__init__.py +0 -0
- {mmgp-3.6.2 → mmgp-3.6.3}/src/mmgp/safetensors2.py +0 -0
- {mmgp-3.6.2 → mmgp-3.6.3}/src/mmgp.egg-info/SOURCES.txt +0 -0
- {mmgp-3.6.2 → mmgp-3.6.3}/src/mmgp.egg-info/dependency_links.txt +0 -0
- {mmgp-3.6.2 → mmgp-3.6.3}/src/mmgp.egg-info/requires.txt +0 -0
- {mmgp-3.6.2 → mmgp-3.6.3}/src/mmgp.egg-info/top_level.txt +0 -0

{mmgp-3.6.2/src/mmgp.egg-info → mmgp-3.6.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.6.2
+Version: 3.6.3
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.6.2 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.6.3 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
{mmgp-3.6.2 → mmgp-3.6.3}/src/mmgp/fp8_quanto_bridge.py

@@ -168,7 +168,7 @@ def convert_scaled_fp8_to_quanto(
     require_scale: bool = False,
     allow_default_scale: bool = True,
     default_missing_scale: float = 1.0,
- …
+    dtype: Union[str, torch.dtype] = "float32",
     add_activation_placeholders: bool = True,
     # dict mode options
     sd_metadata: Optional[Dict[str, str]] = None,
@@ -176,7 +176,7 @@ def convert_scaled_fp8_to_quanto(
     free_cuda_cache: bool = False,
     cuda_cache_interval: int = 32,
 ) -> ConvertResult:
-    sd_scale_dtype = _normalize_scale_dtype(
+    sd_scale_dtype = _normalize_scale_dtype(dtype)
     patch_needed = (sd_scale_dtype == torch.float32)
 
     acc, closer = _as_accessor(
@@ -186,6 +186,7 @@ def convert_scaled_fp8_to_quanto(
         free_cuda_cache=free_cuda_cache,
         cuda_cache_interval=cuda_cache_interval,
     )
+    if not acc.can_delete(): in_place = False
     try:
         meta = acc.metadata() or {}
         meta_scale_map = _maybe_parse_scale_map(meta) or {}
@@ -216,6 +217,7 @@ def convert_scaled_fp8_to_quanto(
             sk = scale_weight_map.get(wk)
             if sk is not None:
                 s_t = acc.get_tensor(sk).to(torch.float32)
+                if in_place: acc.delete(s_t)
                 if s_t.numel() == 1:
                     return torch.full((out_ch,), float(s_t.item()), dtype=torch.float32)
                 if s_t.numel() == out_ch:
@@ -231,49 +233,41 @@ def convert_scaled_fp8_to_quanto(
                 return torch.full((out_ch,), float(meta_scale_map[alt]), dtype=torch.float32)
             return None
 
- …
-        out_sd: Dict[str, torch.Tensor] = acc.sd if isinstance(acc, DictAccessor) and in_place else {}
+        out_sd: Dict[str, torch.Tensor] = {}
         qmap: Dict[str, Dict] = {}
 
         # Single pass: rewrite FP8 weights, copy-through others
         for k in keys:
             # Drop source-only artifacts
-            if k == "scaled_fp8" or k.endswith(".scale_weight"):
-                if acc.can_delete(): acc.delete(k)
+            if k == "scaled_fp8" or k.endswith(".scale_weight") :
                 continue
 
- …
+            t = acc.get_tensor(k)
+            if in_place: acc.delete(k)
+            if _is_weight_key(k) and t.dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
+                # Quantized: keep original FP8 tensor as _data
+                out_sd[k + DATA_SUFFIX] = t
 
- …
+                out_ch = int(t.shape[0])
+                s_vec = get_scale_vec_for_weight(k, out_ch)
+                if s_vec is None:
+                    if require_scale and not allow_default_scale:
+                        raise KeyError(f"No scale found for '{k}' (looked for '.scale_weight' and metadata).")
+                    s_vec = torch.full((out_ch,), float(default_missing_scale), dtype=torch.float32)
 
- …
-                if add_activation_placeholders:
-                    base = k[:-len(".weight")]
-                    out_sd[base + IN_SCALE] = torch.tensor([1], dtype=sd_scale_dtype)
-                    out_sd[base + OUT_SCALE] = torch.tensor([1], dtype=sd_scale_dtype)
+                s_grid = _per_channel_reshape(s_vec, t).to(sd_scale_dtype)
+                out_sd[k + SCALE_SUFFIX] = s_grid
 
+                if add_activation_placeholders:
                     base = k[:-len(".weight")]
- …
-            out_sd[k] = acc.get_tensor(k)
- …
+                    out_sd[base + IN_SCALE] = torch.tensor([1], dtype=sd_scale_dtype)
+                    out_sd[base + OUT_SCALE] = torch.tensor([1], dtype=sd_scale_dtype)
+
+                base = k[:-len(".weight")]
+                qmap[base] = {"weights": _QTYPE_NAME[fmt], "activations": "none"}
+            else:
+                out_sd[k] = t if t.dtype == dtype or t.dtype == torch.float32 else t.to(dtype)
+            t = None
         return ConvertResult(state_dict=out_sd, quant_map=qmap, fp8_format=fmt, patch_needed=patch_needed)
     finally:
         closer()
@@ -481,7 +475,7 @@ def _cli():
     res = convert_scaled_fp8_to_quanto(
         args.in_path,
         fp8_format=args.fp8_format,
- …
+        dtype=args.scale_dtype,
         add_activation_placeholders=not args.no_activation_placeholders,
         default_missing_scale=args.default_missing_scale,
     )
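
The fp8_quanto_bridge.py changes above rename the scale-dtype parameter to `dtype`, stop reusing the source dict for `out_sd`, and delete source tensors as they are converted when `in_place` is set (silently disabled when the accessor cannot delete). Below is a minimal usage sketch of the converter after this change, mirroring the call offload.py now makes; the import path, key names, and tensor shapes are illustrative assumptions, not taken from the package documentation.

import torch
from mmgp.fp8_quanto_bridge import convert_scaled_fp8_to_quanto  # import path assumed from the source layout

# Toy scaled-FP8 checkpoint: one FP8 weight plus its scalar scale
# (key names and shapes are made up for illustration).
sd = {
    "blocks.0.proj.weight": torch.zeros(8, 4, dtype=torch.float8_e4m3fn),
    "blocks.0.proj.scale_weight": torch.tensor(0.5),
}

result = convert_scaled_fp8_to_quanto(
    sd,                     # dict mode, as in load_model_data()
    dtype=torch.bfloat16,   # renamed argument in 3.6.3 (the CLI forwards args.scale_dtype here)
    in_place=True,          # 3.6.3 deletes source tensors as it converts, when the accessor allows it
)
quanto_sd = result["state_dict"]   # FP8 weights kept under a _data suffix, plus scale tensors
quant_map = result["quant_map"]    # per-layer {"weights": ..., "activations": "none"} entries

With in_place=True the converter drops each source tensor once it has been moved into the output dict, which is presumably what keeps peak memory down when load_model_data converts a detected scaled-FP8 checkpoint (see the offload.py diff below).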
{mmgp-3.6.2 → mmgp-3.6.3}/src/mmgp/offload.py

@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.6.2 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.6.3 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -688,7 +688,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.2) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.3) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def change_dtype(model, new_dtype, exclude_buffers = False):
     for submodule_name, submodule in model.named_modules():
@@ -1413,7 +1413,7 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantiza
 
 
 
-def load_model_data(model, file_path, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, preprocess_sd = None, modules = None, return_shared_modules = None, default_dtype = torch.bfloat16, ignore_unused_weights = False, verboseLevel = -1):
+def load_model_data(model, file_path, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, preprocess_sd = None, postprocess_sd = None, modules = None, return_shared_modules = None, default_dtype = torch.bfloat16, ignore_unused_weights = False, verboseLevel = -1):
     """
     Load a model, detect if it has been previously quantized using quanto and do the extra setup if necessary
     """
@@ -1489,38 +1489,41 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
             state_dict.update(sd)
         else:
             state_dict, metadata = _safetensors_load_file(file, writable_tensors =writable_tensors)
- … (3.6.2 lines 1492-1523 not captured in this view)
+
+        if preprocess_sd != None:
+            state_dict = preprocess_sd(state_dict)
+
+        if metadata != None:
+            quantization_map = metadata.get("quantization_map", None)
+            config = metadata.get("config", None)
+            if config is not None:
+                model._config = config
+
+            tied_weights_map = metadata.get("tied_weights_map", None)
+            if tied_weights_map != None:
+                for name, tied_weights_list in tied_weights_map.items():
+                    mapped_weight = state_dict[name]
+                    for tied_weights in tied_weights_list:
+                        state_dict[tied_weights] = mapped_weight
+
+        if quantization_map is None:
+            detection_type = detect_safetensors_format(state_dict)
+            if detection_type["kind"] in ['scaled_fp8','fp8']:
+                conv_result = convert_scaled_fp8_to_quanto(state_dict, dtype = default_dtype, in_place= True)
+                state_dict = conv_result["state_dict"]
+                quantization_map = conv_result["quant_map"]
+                conv_result = None
+                # enable_fp8_fp32_scale_support()
+
+        if quantization_map is None:
+            pos = str.rfind(file, ".")
+            if pos > 0:
+                quantization_map_path = file[:pos]
+                quantization_map_path += "_map.json"
+
+                if os.path.isfile(quantization_map_path):
+                    with open(quantization_map_path, 'r') as f:
+                        quantization_map = json.load(f)
 
         full_state_dict.update(state_dict)
         if quantization_map != None:
@@ -1539,8 +1542,8 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
     full_state_dict, full_quantization_map, full_tied_weights_map = None, None, None
 
     # deal if we are trying to load just a sub part of a larger model
-    if
-    state_dict, quantization_map =
+    if postprocess_sd != None:
+        state_dict, quantization_map = postprocess_sd(state_dict, quantization_map)
 
     if modelPrefix != None:
         base_model_prefix = modelPrefix + "."
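
Besides the automatic scaled-FP8 detection shown above, load_model_data gains a `postprocess_sd` hook that is invoked after the quantization map has been resolved and must return the (possibly modified) state dict and quantization map. Below is a minimal sketch of the new keyword, assuming a hypothetical key-renaming callback; only the hook's signature and position in the load path come from the diff.

import torch
from mmgp import offload

def strip_prefix(state_dict, quantization_map):
    # Hypothetical post-processing step: drop a "model." prefix from checkpoint keys.
    # The (state_dict, quantization_map) -> (state_dict, quantization_map) contract
    # matches the call site added in 3.6.3.
    state_dict = {k.removeprefix("model."): v for k, v in state_dict.items()}
    return state_dict, quantization_map

model = ...  # an already-instantiated torch.nn.Module matching the checkpoint
offload.load_model_data(
    model,
    "model.safetensors",           # illustrative path
    postprocess_sd=strip_prefix,   # new keyword in 3.6.3
    default_dtype=torch.bfloat16,  # also forwarded to the scaled-FP8 -> quanto conversion when triggered
)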
{mmgp-3.6.2 → mmgp-3.6.3/src/mmgp.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.6.2
+Version: 3.6.3
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.6.2 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.6.3 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 