mmgp 3.6.2.tar.gz → 3.6.3.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mmgp might be problematic.
- {mmgp-3.6.2/src/mmgp.egg-info → mmgp-3.6.3}/PKG-INFO +2 -2
- {mmgp-3.6.2 → mmgp-3.6.3}/README.md +1 -1
- {mmgp-3.6.2 → mmgp-3.6.3}/pyproject.toml +1 -1
- {mmgp-3.6.2 → mmgp-3.6.3}/src/mmgp/fp8_quanto_bridge.py +29 -35
- {mmgp-3.6.2 → mmgp-3.6.3}/src/mmgp/offload.py +40 -37
- {mmgp-3.6.2 → mmgp-3.6.3/src/mmgp.egg-info}/PKG-INFO +2 -2
- {mmgp-3.6.2 → mmgp-3.6.3}/LICENSE.md +0 -0
- {mmgp-3.6.2 → mmgp-3.6.3}/setup.cfg +0 -0
- {mmgp-3.6.2 → mmgp-3.6.3}/src/__init__.py +0 -0
- {mmgp-3.6.2 → mmgp-3.6.3}/src/mmgp/__init__.py +0 -0
- {mmgp-3.6.2 → mmgp-3.6.3}/src/mmgp/safetensors2.py +0 -0
- {mmgp-3.6.2 → mmgp-3.6.3}/src/mmgp.egg-info/SOURCES.txt +0 -0
- {mmgp-3.6.2 → mmgp-3.6.3}/src/mmgp.egg-info/dependency_links.txt +0 -0
- {mmgp-3.6.2 → mmgp-3.6.3}/src/mmgp.egg-info/requires.txt +0 -0
- {mmgp-3.6.2 → mmgp-3.6.3}/src/mmgp.egg-info/top_level.txt +0 -0

{mmgp-3.6.2/src/mmgp.egg-info → mmgp-3.6.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.6.2
+Version: 3.6.3
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.6.2 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.6.3 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
{mmgp-3.6.2 → mmgp-3.6.3}/src/mmgp/fp8_quanto_bridge.py

@@ -168,7 +168,7 @@ def convert_scaled_fp8_to_quanto(
     require_scale: bool = False,
     allow_default_scale: bool = True,
     default_missing_scale: float = 1.0,
- …
+    dtype: Union[str, torch.dtype] = "float32",
     add_activation_placeholders: bool = True,
     # dict mode options
     sd_metadata: Optional[Dict[str, str]] = None,
@@ -176,7 +176,7 @@ def convert_scaled_fp8_to_quanto(
     free_cuda_cache: bool = False,
     cuda_cache_interval: int = 32,
 ) -> ConvertResult:
-    sd_scale_dtype = _normalize_scale_dtype(
+    sd_scale_dtype = _normalize_scale_dtype(dtype)
     patch_needed = (sd_scale_dtype == torch.float32)
 
     acc, closer = _as_accessor(
@@ -186,6 +186,7 @@ def convert_scaled_fp8_to_quanto(
         free_cuda_cache=free_cuda_cache,
         cuda_cache_interval=cuda_cache_interval,
     )
+    if not acc.can_delete(): in_place = False
     try:
         meta = acc.metadata() or {}
         meta_scale_map = _maybe_parse_scale_map(meta) or {}
@@ -216,6 +217,7 @@ def convert_scaled_fp8_to_quanto(
             sk = scale_weight_map.get(wk)
             if sk is not None:
                 s_t = acc.get_tensor(sk).to(torch.float32)
+                if in_place: acc.delete(s_t)
                 if s_t.numel() == 1:
                     return torch.full((out_ch,), float(s_t.item()), dtype=torch.float32)
                 if s_t.numel() == out_ch:
@@ -231,49 +233,41 @@ def convert_scaled_fp8_to_quanto(
                 return torch.full((out_ch,), float(meta_scale_map[alt]), dtype=torch.float32)
             return None
 
- …
-        out_sd: Dict[str, torch.Tensor] = acc.sd if isinstance(acc, DictAccessor) and in_place else {}
+        out_sd: Dict[str, torch.Tensor] = {}
         qmap: Dict[str, Dict] = {}
 
         # Single pass: rewrite FP8 weights, copy-through others
         for k in keys:
             # Drop source-only artifacts
-            if k == "scaled_fp8" or k.endswith(".scale_weight"):
-                if acc.can_delete(): acc.delete(k)
+            if k == "scaled_fp8" or k.endswith(".scale_weight") :
                 continue
 
- …
+            t = acc.get_tensor(k)
+            if in_place: acc.delete(k)
+            if _is_weight_key(k) and t.dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
+                # Quantized: keep original FP8 tensor as _data
+                out_sd[k + DATA_SUFFIX] = t
 
- …
+                out_ch = int(t.shape[0])
+                s_vec = get_scale_vec_for_weight(k, out_ch)
+                if s_vec is None:
+                    if require_scale and not allow_default_scale:
+                        raise KeyError(f"No scale found for '{k}' (looked for '.scale_weight' and metadata).")
+                    s_vec = torch.full((out_ch,), float(default_missing_scale), dtype=torch.float32)
 
- …
-                if add_activation_placeholders:
-                    base = k[:-len(".weight")]
-                    out_sd[base + IN_SCALE] = torch.tensor([1], dtype=sd_scale_dtype)
-                    out_sd[base + OUT_SCALE] = torch.tensor([1], dtype=sd_scale_dtype)
+                s_grid = _per_channel_reshape(s_vec, t).to(sd_scale_dtype)
+                out_sd[k + SCALE_SUFFIX] = s_grid
 
+                if add_activation_placeholders:
                     base = k[:-len(".weight")]
- …
-            out_sd[k] = acc.get_tensor(k)
- …
+                    out_sd[base + IN_SCALE] = torch.tensor([1], dtype=sd_scale_dtype)
+                    out_sd[base + OUT_SCALE] = torch.tensor([1], dtype=sd_scale_dtype)
+
+                base = k[:-len(".weight")]
+                qmap[base] = {"weights": _QTYPE_NAME[fmt], "activations": "none"}
+            else:
+                out_sd[k] = t if t.dtype == dtype or t.dtype == torch.float32 else t.to(dtype)
+            t = None
         return ConvertResult(state_dict=out_sd, quant_map=qmap, fp8_format=fmt, patch_needed=patch_needed)
     finally:
         closer()
@@ -481,7 +475,7 @@ def _cli():
     res = convert_scaled_fp8_to_quanto(
         args.in_path,
         fp8_format=args.fp8_format,
- …
+        dtype=args.scale_dtype,
         add_activation_placeholders=not args.no_activation_placeholders,
         default_missing_scale=args.default_missing_scale,
     )
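
The fp8_quanto_bridge.py changes above rename the scale-dtype parameter to `dtype`, stop reusing the source dict for `out_sd`, and delete source tensors as they are converted when `in_place` is set (silently disabled when the accessor cannot delete). Below is a minimal usage sketch of the converter after this change, mirroring the call offload.py now makes; the import path, key names, and tensor shapes are illustrative assumptions, not taken from the package documentation.

import torch
from mmgp.fp8_quanto_bridge import convert_scaled_fp8_to_quanto  # import path assumed from the source layout

# Toy scaled-FP8 checkpoint: one FP8 weight plus its scalar scale
# (key names and shapes are made up for illustration).
sd = {
    "blocks.0.proj.weight": torch.zeros(8, 4, dtype=torch.float8_e4m3fn),
    "blocks.0.proj.scale_weight": torch.tensor(0.5),
}

result = convert_scaled_fp8_to_quanto(
    sd,                     # dict mode, as in load_model_data()
    dtype=torch.bfloat16,   # renamed argument in 3.6.3 (the CLI forwards args.scale_dtype here)
    in_place=True,          # 3.6.3 deletes source tensors as it converts, when the accessor allows it
)
quanto_sd = result["state_dict"]   # FP8 weights kept under a _data suffix, plus scale tensors
quant_map = result["quant_map"]    # per-layer {"weights": ..., "activations": "none"} entries

With in_place=True the converter drops each source tensor once it has been moved into the output dict, which is presumably what keeps peak memory down when load_model_data converts a detected scaled-FP8 checkpoint (see the offload.py diff below).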
{mmgp-3.6.2 → mmgp-3.6.3}/src/mmgp/offload.py

@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.6.2 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.6.3 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -688,7 +688,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.2) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.3) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def change_dtype(model, new_dtype, exclude_buffers = False):
     for submodule_name, submodule in model.named_modules():
@@ -1413,7 +1413,7 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantiza
 
 
 
-def load_model_data(model, file_path, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, preprocess_sd = None, modules = None, return_shared_modules = None, default_dtype = torch.bfloat16, ignore_unused_weights = False, verboseLevel = -1):
+def load_model_data(model, file_path, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, preprocess_sd = None, postprocess_sd = None, modules = None, return_shared_modules = None, default_dtype = torch.bfloat16, ignore_unused_weights = False, verboseLevel = -1):
     """
     Load a model, detect if it has been previously quantized using quanto and do the extra setup if necessary
     """
@@ -1489,38 +1489,41 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
             state_dict.update(sd)
         else:
             state_dict, metadata = _safetensors_load_file(file, writable_tensors =writable_tensors)
- … (3.6.2 lines 1492-1523 not captured in this view)
+
+        if preprocess_sd != None:
+            state_dict = preprocess_sd(state_dict)
+
+        if metadata != None:
+            quantization_map = metadata.get("quantization_map", None)
+            config = metadata.get("config", None)
+            if config is not None:
+                model._config = config
+
+            tied_weights_map = metadata.get("tied_weights_map", None)
+            if tied_weights_map != None:
+                for name, tied_weights_list in tied_weights_map.items():
+                    mapped_weight = state_dict[name]
+                    for tied_weights in tied_weights_list:
+                        state_dict[tied_weights] = mapped_weight
+
+        if quantization_map is None:
+            detection_type = detect_safetensors_format(state_dict)
+            if detection_type["kind"] in ['scaled_fp8','fp8']:
+                conv_result = convert_scaled_fp8_to_quanto(state_dict, dtype = default_dtype, in_place= True)
+                state_dict = conv_result["state_dict"]
+                quantization_map = conv_result["quant_map"]
+                conv_result = None
+                # enable_fp8_fp32_scale_support()
+
+        if quantization_map is None:
+            pos = str.rfind(file, ".")
+            if pos > 0:
+                quantization_map_path = file[:pos]
+                quantization_map_path += "_map.json"
+
+                if os.path.isfile(quantization_map_path):
+                    with open(quantization_map_path, 'r') as f:
+                        quantization_map = json.load(f)
 
         full_state_dict.update(state_dict)
         if quantization_map != None:
@@ -1539,8 +1542,8 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
     full_state_dict, full_quantization_map, full_tied_weights_map = None, None, None
 
     # deal if we are trying to load just a sub part of a larger model
-    if
-    state_dict, quantization_map =
+    if postprocess_sd != None:
+        state_dict, quantization_map = postprocess_sd(state_dict, quantization_map)
 
     if modelPrefix != None:
         base_model_prefix = modelPrefix + "."
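
Besides the automatic scaled-FP8 detection shown above, load_model_data gains a `postprocess_sd` hook that is invoked after the quantization map has been resolved and must return the (possibly modified) state dict and quantization map. Below is a minimal sketch of the new keyword, assuming a hypothetical key-renaming callback; only the hook's signature and position in the load path come from the diff.

import torch
from mmgp import offload

def strip_prefix(state_dict, quantization_map):
    # Hypothetical post-processing step: drop a "model." prefix from checkpoint keys.
    # The (state_dict, quantization_map) -> (state_dict, quantization_map) contract
    # matches the call site added in 3.6.3.
    state_dict = {k.removeprefix("model."): v for k, v in state_dict.items()}
    return state_dict, quantization_map

model = ...  # an already-instantiated torch.nn.Module matching the checkpoint
offload.load_model_data(
    model,
    "model.safetensors",           # illustrative path
    postprocess_sd=strip_prefix,   # new keyword in 3.6.3
    default_dtype=torch.bfloat16,  # also forwarded to the scaled-FP8 -> quanto conversion when triggered
)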
{mmgp-3.6.2 → mmgp-3.6.3/src/mmgp.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.6.2
+Version: 3.6.3
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.6.2 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.6.3 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 