mmgp 3.6.2.tar.gz → 3.6.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mmgp
- Version: 3.6.2
+ Version: 3.6.3
  Summary: Memory Management for the GPU Poor
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
  Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file


  <p align="center">
- <H2>Memory Management 3.6.2 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.6.3 for the GPU Poor by DeepBeepMeep</H2>
  </p>

@@ -1,6 +1,6 @@

  <p align="center">
- <H2>Memory Management 3.6.2 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.6.3 for the GPU Poor by DeepBeepMeep</H2>
  </p>

@@ -1,6 +1,6 @@
  [project]
  name = "mmgp"
- version = "3.6.2"
+ version = "3.6.3"
  authors = [
  { name = "deepbeepmeep", email = "deepbeepmeep@yahoo.com" },
  ]
@@ -168,7 +168,7 @@ def convert_scaled_fp8_to_quanto(
  require_scale: bool = False,
  allow_default_scale: bool = True,
  default_missing_scale: float = 1.0,
- scale_dtype: Union[str, torch.dtype] = "float32",
+ dtype: Union[str, torch.dtype] = "float32",
  add_activation_placeholders: bool = True,
  # dict mode options
  sd_metadata: Optional[Dict[str, str]] = None,
@@ -176,7 +176,7 @@ def convert_scaled_fp8_to_quanto(
  free_cuda_cache: bool = False,
  cuda_cache_interval: int = 32,
  ) -> ConvertResult:
- sd_scale_dtype = _normalize_scale_dtype(scale_dtype)
+ sd_scale_dtype = _normalize_scale_dtype(dtype)
  patch_needed = (sd_scale_dtype == torch.float32)

  acc, closer = _as_accessor(
@@ -186,6 +186,7 @@ def convert_scaled_fp8_to_quanto(
  free_cuda_cache=free_cuda_cache,
  cuda_cache_interval=cuda_cache_interval,
  )
+ if not acc.can_delete(): in_place = False
  try:
  meta = acc.metadata() or {}
  meta_scale_map = _maybe_parse_scale_map(meta) or {}
@@ -216,6 +217,7 @@ def convert_scaled_fp8_to_quanto(
  sk = scale_weight_map.get(wk)
  if sk is not None:
  s_t = acc.get_tensor(sk).to(torch.float32)
+ if in_place: acc.delete(s_t)
  if s_t.numel() == 1:
  return torch.full((out_ch,), float(s_t.item()), dtype=torch.float32)
  if s_t.numel() == out_ch:
@@ -231,49 +233,41 @@ def convert_scaled_fp8_to_quanto(
  return torch.full((out_ch,), float(meta_scale_map[alt]), dtype=torch.float32)
  return None

- # out dict: mutate original dict if in_place, else new dict
- out_sd: Dict[str, torch.Tensor] = acc.sd if isinstance(acc, DictAccessor) and in_place else {}
+ out_sd: Dict[str, torch.Tensor] = {}
  qmap: Dict[str, Dict] = {}

  # Single pass: rewrite FP8 weights, copy-through others
  for k in keys:
  # Drop source-only artifacts
- if k == "scaled_fp8" or k.endswith(".scale_weight"):
- if acc.can_delete(): acc.delete(k)
+ if k == "scaled_fp8" or k.endswith(".scale_weight") :
  continue

- if _is_weight_key(k):
- t = acc.get_tensor(k)
- if t.dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
- # Quantized: keep original FP8 tensor as _data
- out_sd[k + DATA_SUFFIX] = t
+ t = acc.get_tensor(k)
+ if in_place: acc.delete(k)
+ if _is_weight_key(k) and t.dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
+ # Quantized: keep original FP8 tensor as _data
+ out_sd[k + DATA_SUFFIX] = t

- out_ch = int(t.shape[0])
- s_vec = get_scale_vec_for_weight(k, out_ch)
- if s_vec is None:
- if require_scale and not allow_default_scale:
- raise KeyError(f"No scale found for '{k}' (looked for '.scale_weight' and metadata).")
- s_vec = torch.full((out_ch,), float(default_missing_scale), dtype=torch.float32)
+ out_ch = int(t.shape[0])
+ s_vec = get_scale_vec_for_weight(k, out_ch)
+ if s_vec is None:
+ if require_scale and not allow_default_scale:
+ raise KeyError(f"No scale found for '{k}' (looked for '.scale_weight' and metadata).")
+ s_vec = torch.full((out_ch,), float(default_missing_scale), dtype=torch.float32)

- s_grid = _per_channel_reshape(s_vec, t).to(sd_scale_dtype)
- out_sd[k + SCALE_SUFFIX] = s_grid
-
- if add_activation_placeholders:
- base = k[:-len(".weight")]
- out_sd[base + IN_SCALE] = torch.tensor([1], dtype=sd_scale_dtype)
- out_sd[base + OUT_SCALE] = torch.tensor([1], dtype=sd_scale_dtype)
+ s_grid = _per_channel_reshape(s_vec, t).to(sd_scale_dtype)
+ out_sd[k + SCALE_SUFFIX] = s_grid

+ if add_activation_placeholders:
  base = k[:-len(".weight")]
- qmap[base] = {"weights": _QTYPE_NAME[fmt], "activations": "none"}
-
- if acc.can_delete():
- acc.delete(k)
- continue # don't copy original .weight
-
- # Copy-through
- if not (isinstance(acc, DictAccessor) and in_place):
- out_sd[k] = acc.get_tensor(k)
-
+ out_sd[base + IN_SCALE] = torch.tensor([1], dtype=sd_scale_dtype)
+ out_sd[base + OUT_SCALE] = torch.tensor([1], dtype=sd_scale_dtype)
+
+ base = k[:-len(".weight")]
+ qmap[base] = {"weights": _QTYPE_NAME[fmt], "activations": "none"}
+ else:
+ out_sd[k] = t if t.dtype == dtype or t.dtype == torch.float32 else t.to(dtype)
+ t = None
  return ConvertResult(state_dict=out_sd, quant_map=qmap, fp8_format=fmt, patch_needed=patch_needed)
  finally:
  closer()
@@ -481,7 +475,7 @@ def _cli():
  res = convert_scaled_fp8_to_quanto(
  args.in_path,
  fp8_format=args.fp8_format,
- scale_dtype=args.scale_dtype,
+ dtype=args.scale_dtype,
  add_activation_placeholders=not args.no_activation_placeholders,
  default_missing_scale=args.default_missing_scale,
  )
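Taken together, the converter hunks above rename the `scale_dtype` keyword to `dtype`, fold the old copy-through and mutate-in-place branches into a single pass that deletes source tensors through the accessor when `in_place` is requested (and silently disables `in_place` when the accessor cannot delete), and cast non-FP8 tensors to `dtype` unless they are already float32; the CLI keeps its `--scale_dtype` flag but now forwards it as `dtype=`. A minimal call-site sketch under the new keyword; the import path, file name, and variable names are assumptions, not taken from this diff:

import torch
from safetensors.torch import load_file
# Assumed import location: the module defining convert_scaled_fp8_to_quanto
# is not named in this diff, so adjust the import to the actual mmgp layout.
from mmgp.safetensors2 import convert_scaled_fp8_to_quanto

sd = load_file("model_scaled_fp8.safetensors")  # hypothetical scaled-FP8 checkpoint

res = convert_scaled_fp8_to_quanto(
    sd,
    dtype=torch.bfloat16,  # this keyword was scale_dtype before 3.6.3
    in_place=True,         # source tensors are released as they are converted
)
state_dict = res["state_dict"]        # FP8 weights split into _data plus per-channel scale entries
quantization_map = res["quant_map"]   # per-layer quanto qtype map, as consumed by load_model_data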
@@ -1,4 +1,4 @@
- # ------------------ Memory Management 3.6.2 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+ # ------------------ Memory Management 3.6.3 for the GPU Poor by DeepBeepMeep (mmgp)------------------
  #
  # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
  # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -688,7 +688,7 @@ def _welcome():
  if welcome_displayed:
  return
  welcome_displayed = True
- print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.2) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+ print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.3) by DeepBeepMeep ************{ENDC}{UNBOLD}")

  def change_dtype(model, new_dtype, exclude_buffers = False):
  for submodule_name, submodule in model.named_modules():
@@ -1413,7 +1413,7 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantiza



- def load_model_data(model, file_path, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, preprocess_sd = None, modules = None, return_shared_modules = None, default_dtype = torch.bfloat16, ignore_unused_weights = False, verboseLevel = -1):
+ def load_model_data(model, file_path, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, preprocess_sd = None, postprocess_sd = None, modules = None, return_shared_modules = None, default_dtype = torch.bfloat16, ignore_unused_weights = False, verboseLevel = -1):
  """
  Load a model, detect if it has been previously quantized using quanto and do the extra setup if necessary
  """
@@ -1489,38 +1489,41 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
  state_dict.update(sd)
  else:
  state_dict, metadata = _safetensors_load_file(file, writable_tensors =writable_tensors)
-
- if metadata != None:
- quantization_map = metadata.get("quantization_map", None)
- config = metadata.get("config", None)
- if config is not None:
- model._config = config
-
- tied_weights_map = metadata.get("tied_weights_map", None)
- if tied_weights_map != None:
- for name, tied_weights_list in tied_weights_map.items():
- mapped_weight = state_dict[name]
- for tied_weights in tied_weights_list:
- state_dict[tied_weights] = mapped_weight
-
- if quantization_map is None:
- detection_type = detect_safetensors_format(state_dict)
- if detection_type["kind"] in ['scaled_fp8','fp8']:
- conv_result = convert_scaled_fp8_to_quanto(state_dict, scale_dtype = default_dtype)
- state_dict = conv_result["state_dict"]
- quantization_map = conv_result["quant_map"]
- conv_result = None
- # enable_fp8_fp32_scale_support()
-
- if quantization_map is None:
- pos = str.rfind(file, ".")
- if pos > 0:
- quantization_map_path = file[:pos]
- quantization_map_path += "_map.json"
-
- if os.path.isfile(quantization_map_path):
- with open(quantization_map_path, 'r') as f:
- quantization_map = json.load(f)
+
+ if preprocess_sd != None:
+ state_dict = preprocess_sd(state_dict)
+
+ if metadata != None:
+ quantization_map = metadata.get("quantization_map", None)
+ config = metadata.get("config", None)
+ if config is not None:
+ model._config = config
+
+ tied_weights_map = metadata.get("tied_weights_map", None)
+ if tied_weights_map != None:
+ for name, tied_weights_list in tied_weights_map.items():
+ mapped_weight = state_dict[name]
+ for tied_weights in tied_weights_list:
+ state_dict[tied_weights] = mapped_weight
+
+ if quantization_map is None:
+ detection_type = detect_safetensors_format(state_dict)
+ if detection_type["kind"] in ['scaled_fp8','fp8']:
+ conv_result = convert_scaled_fp8_to_quanto(state_dict, dtype = default_dtype, in_place= True)
+ state_dict = conv_result["state_dict"]
+ quantization_map = conv_result["quant_map"]
+ conv_result = None
+ # enable_fp8_fp32_scale_support()
+
+ if quantization_map is None:
+ pos = str.rfind(file, ".")
+ if pos > 0:
+ quantization_map_path = file[:pos]
+ quantization_map_path += "_map.json"
+
+ if os.path.isfile(quantization_map_path):
+ with open(quantization_map_path, 'r') as f:
+ quantization_map = json.load(f)

  full_state_dict.update(state_dict)
  if quantization_map != None:
@@ -1539,8 +1542,8 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
  full_state_dict, full_quantization_map, full_tied_weights_map = None, None, None

  # deal if we are trying to load just a sub part of a larger model
- if preprocess_sd != None:
- state_dict, quantization_map = preprocess_sd(state_dict, quantization_map)
+ if postprocess_sd != None:
+ state_dict, quantization_map = postprocess_sd(state_dict, quantization_map)

  if modelPrefix != None:
  base_model_prefix = modelPrefix + "."
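In `load_model_data`, the old single hook is split in two: `preprocess_sd` is now called with just the raw state dict immediately after each safetensors file is read, before metadata handling, scaled-FP8 detection, and the quantization-map lookup, while the new `postprocess_sd` receives `(state_dict, quantization_map)` at the point where the old `preprocess_sd` used to run. A hedged migration sketch for callers that relied on the old two-argument hook; the import assumes load_model_data is exposed via mmgp's offload module, and the hook bodies, model, and file name are placeholders, not code from this release:

import torch
from mmgp import offload  # assumed import; adjust if your layout differs

model = torch.nn.Linear(4, 4)  # stand-in module; use your real model here

def my_preprocess(state_dict):
    # New contract: raw state dict in, state dict out, before any quantization handling.
    return {k.removeprefix("model."): v for k, v in state_dict.items()}

def my_postprocess(state_dict, quantization_map):
    # Runs where the old preprocess_sd hook ran, once the quantization map is known.
    return state_dict, quantization_map

# mmgp <= 3.6.2: the two-argument hook was passed as preprocess_sd
# mmgp >= 3.6.3: pass it as postprocess_sd and keep preprocess_sd single-argument
offload.load_model_data(
    model, "model.safetensors",
    preprocess_sd=my_preprocess,
    postprocess_sd=my_postprocess,
)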
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mmgp
- Version: 3.6.2
+ Version: 3.6.3
  Summary: Memory Management for the GPU Poor
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
  Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file


  <p align="center">
- <H2>Memory Management 3.6.2 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.6.3 for the GPU Poor by DeepBeepMeep</H2>
  </p>

6 files without changes