mmgp 3.6.2.tar.gz → 3.6.4.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mmgp might be problematic.
- {mmgp-3.6.2/src/mmgp.egg-info → mmgp-3.6.4}/PKG-INFO +2 -2
- {mmgp-3.6.2 → mmgp-3.6.4}/README.md +1 -1
- {mmgp-3.6.2 → mmgp-3.6.4}/pyproject.toml +1 -1
- {mmgp-3.6.2 → mmgp-3.6.4}/src/mmgp/fp8_quanto_bridge.py +29 -35
- {mmgp-3.6.2 → mmgp-3.6.4}/src/mmgp/offload.py +180 -54
- {mmgp-3.6.2 → mmgp-3.6.4/src/mmgp.egg-info}/PKG-INFO +2 -2
- {mmgp-3.6.2 → mmgp-3.6.4}/LICENSE.md +0 -0
- {mmgp-3.6.2 → mmgp-3.6.4}/setup.cfg +0 -0
- {mmgp-3.6.2 → mmgp-3.6.4}/src/__init__.py +0 -0
- {mmgp-3.6.2 → mmgp-3.6.4}/src/mmgp/__init__.py +0 -0
- {mmgp-3.6.2 → mmgp-3.6.4}/src/mmgp/safetensors2.py +0 -0
- {mmgp-3.6.2 → mmgp-3.6.4}/src/mmgp.egg-info/SOURCES.txt +0 -0
- {mmgp-3.6.2 → mmgp-3.6.4}/src/mmgp.egg-info/dependency_links.txt +0 -0
- {mmgp-3.6.2 → mmgp-3.6.4}/src/mmgp.egg-info/requires.txt +0 -0
- {mmgp-3.6.2 → mmgp-3.6.4}/src/mmgp.egg-info/top_level.txt +0 -0
{mmgp-3.6.2/src/mmgp.egg-info → mmgp-3.6.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.6.2
+Version: 3.6.4
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.6.2 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.6.4 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
{mmgp-3.6.2 → mmgp-3.6.4}/src/mmgp/fp8_quanto_bridge.py

@@ -168,7 +168,7 @@ def convert_scaled_fp8_to_quanto(
     require_scale: bool = False,
     allow_default_scale: bool = True,
     default_missing_scale: float = 1.0,
-
+    dtype: Union[str, torch.dtype] = "float32",
     add_activation_placeholders: bool = True,
     # dict mode options
     sd_metadata: Optional[Dict[str, str]] = None,
@@ -176,7 +176,7 @@ def convert_scaled_fp8_to_quanto(
     free_cuda_cache: bool = False,
     cuda_cache_interval: int = 32,
 ) -> ConvertResult:
-    sd_scale_dtype = _normalize_scale_dtype(
+    sd_scale_dtype = _normalize_scale_dtype(dtype)
     patch_needed = (sd_scale_dtype == torch.float32)
 
     acc, closer = _as_accessor(
@@ -186,6 +186,7 @@ def convert_scaled_fp8_to_quanto(
         free_cuda_cache=free_cuda_cache,
         cuda_cache_interval=cuda_cache_interval,
     )
+    if not acc.can_delete(): in_place = False
     try:
         meta = acc.metadata() or {}
         meta_scale_map = _maybe_parse_scale_map(meta) or {}
@@ -216,6 +217,7 @@ def convert_scaled_fp8_to_quanto(
             sk = scale_weight_map.get(wk)
             if sk is not None:
                 s_t = acc.get_tensor(sk).to(torch.float32)
+                if in_place: acc.delete(s_t)
                 if s_t.numel() == 1:
                     return torch.full((out_ch,), float(s_t.item()), dtype=torch.float32)
                 if s_t.numel() == out_ch:
@@ -231,49 +233,41 @@ def convert_scaled_fp8_to_quanto(
                 return torch.full((out_ch,), float(meta_scale_map[alt]), dtype=torch.float32)
             return None
 
-
-        out_sd: Dict[str, torch.Tensor] = acc.sd if isinstance(acc, DictAccessor) and in_place else {}
+        out_sd: Dict[str, torch.Tensor] = {}
         qmap: Dict[str, Dict] = {}
 
         # Single pass: rewrite FP8 weights, copy-through others
         for k in keys:
             # Drop source-only artifacts
-            if k == "scaled_fp8" or k.endswith(".scale_weight"):
-                if acc.can_delete(): acc.delete(k)
+            if k == "scaled_fp8" or k.endswith(".scale_weight") :
                 continue
 
-
-
-
-
-
+            t = acc.get_tensor(k)
+            if in_place: acc.delete(k)
+            if _is_weight_key(k) and t.dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
+                # Quantized: keep original FP8 tensor as _data
+                out_sd[k + DATA_SUFFIX] = t
 
-
-
-
-
-
-
+                out_ch = int(t.shape[0])
+                s_vec = get_scale_vec_for_weight(k, out_ch)
+                if s_vec is None:
+                    if require_scale and not allow_default_scale:
+                        raise KeyError(f"No scale found for '{k}' (looked for '.scale_weight' and metadata).")
+                    s_vec = torch.full((out_ch,), float(default_missing_scale), dtype=torch.float32)
 
-
-
-
-                if add_activation_placeholders:
-                    base = k[:-len(".weight")]
-                    out_sd[base + IN_SCALE] = torch.tensor([1], dtype=sd_scale_dtype)
-                    out_sd[base + OUT_SCALE] = torch.tensor([1], dtype=sd_scale_dtype)
+                s_grid = _per_channel_reshape(s_vec, t).to(sd_scale_dtype)
+                out_sd[k + SCALE_SUFFIX] = s_grid
 
+                if add_activation_placeholders:
                     base = k[:-len(".weight")]
-
-
-
-
-
-
-
-
-                out_sd[k] = acc.get_tensor(k)
-
+                    out_sd[base + IN_SCALE] = torch.tensor([1], dtype=sd_scale_dtype)
+                    out_sd[base + OUT_SCALE] = torch.tensor([1], dtype=sd_scale_dtype)
+
+                base = k[:-len(".weight")]
+                qmap[base] = {"weights": _QTYPE_NAME[fmt], "activations": "none"}
+            else:
+                out_sd[k] = t if t.dtype == dtype or t.dtype == torch.float32 else t.to(dtype)
+                t = None
         return ConvertResult(state_dict=out_sd, quant_map=qmap, fp8_format=fmt, patch_needed=patch_needed)
     finally:
         closer()
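For orientation, the rewritten loop above emits, for every FP8 weight key `k`, the raw FP8 payload under `k + DATA_SUFFIX`, a per-channel scale grid under `k + SCALE_SUFFIX`, optional activation-scale placeholders, and a quant-map entry keyed by the module base name, while non-weight tensors are passed through (cast to `dtype` unless already that dtype or float32). The sketch below only illustrates that key layout; the suffix constants and the qtype name are assumptions in quanto-style naming, since their definitions are not part of this diff.

```python
# Illustration only. DATA_SUFFIX / SCALE_SUFFIX / IN_SCALE / OUT_SCALE are
# constants defined elsewhere in fp8_quanto_bridge.py; the values below are
# assumed (quanto-style naming) purely so the sketch runs on its own.
DATA_SUFFIX, SCALE_SUFFIX = "._data", "._scale"
IN_SCALE, OUT_SCALE = ".input_scale", ".output_scale"

k = "blk.0.ffn.weight"                 # hypothetical FP8 weight key from the source checkpoint
base = k[:-len(".weight")]

converted_keys = [k + DATA_SUFFIX, k + SCALE_SUFFIX, base + IN_SCALE, base + OUT_SCALE]
quant_map_entry = {base: {"weights": "qfloat8_e4m3fn", "activations": "none"}}  # qtype name assumed

print(converted_keys)
print(quant_map_entry)
```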
@@ -481,7 +475,7 @@ def _cli():
     res = convert_scaled_fp8_to_quanto(
         args.in_path,
         fp8_format=args.fp8_format,
-
+        dtype=args.scale_dtype,
         add_activation_placeholders=not args.no_activation_placeholders,
         default_missing_scale=args.default_missing_scale,
     )
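The API-visible change in `fp8_quanto_bridge.py` is therefore that the scale-dtype option is now exposed as `dtype` and handed straight to `_normalize_scale_dtype`. A hedged usage sketch, assuming only the signature and the indexable `ConvertResult` visible in the hunks above; the input path is a placeholder, and `fp8_format` is left at its default here:

```python
# Sketch only: exercises the renamed `dtype` keyword from the 3.6.4 signature.
# "checkpoint_scaled_fp8.safetensors" is a placeholder file name.
from mmgp.fp8_quanto_bridge import convert_scaled_fp8_to_quanto

res = convert_scaled_fp8_to_quanto(
    "checkpoint_scaled_fp8.safetensors",
    dtype="float32",                     # formerly the scale-dtype option (old name not shown above)
    add_activation_placeholders=True,
    default_missing_scale=1.0,
)
state_dict = res["state_dict"]           # ConvertResult supports item access, as load_model_data relies on further down
quant_map = res["quant_map"]
```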
{mmgp-3.6.2 → mmgp-3.6.4}/src/mmgp/offload.py

@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.6.2 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.6.4 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -66,8 +66,6 @@ from accelerate import init_empty_weights
 
 import functools
 import types
-import torch
-
 
 from mmgp import safetensors2
 from mmgp import profile_type
@@ -87,6 +85,9 @@ class QEmbedding(QModuleMixin, torch.nn.Embedding):
         return torch.nn.functional.embedding( input, self.qweight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse )
 
 
+
+
+
 shared_state = {}
 
 def get_cache(cache_name):
@@ -688,7 +689,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.2) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.4) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def change_dtype(model, new_dtype, exclude_buffers = False):
     for submodule_name, submodule in model.named_modules():
@@ -1049,7 +1050,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 
     if split_linear_modules_map != None:
         new_state_dict = dict()
-        suffixes = [(".alpha", -2, False), (".lora_B.weight", -3, True), (".lora_A.weight", -3, False), (".lora_up.weight", -3, True), (".lora_down.weight", -3, False)]
+        suffixes = [(".alpha", -2, False), (".lora_B.weight", -3, True), (".lora_A.weight", -3, False), (".lora_up.weight", -3, True), (".lora_down.weight", -3, False),(".dora_scale", -2, False),]
         for module_name, module_data in state_dict.items():
             name_parts = module_name.split(".")
             for suffix, pos, any_split in suffixes:
@@ -1089,7 +1090,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 
     lora_alphas = {}
     for k in keys:
-        if "alpha"
+        if k.endswith(".alpha"):
             alpha_value = state_dict.pop(k)
             if torch.is_tensor(alpha_value):
                 alpha_value = float(alpha_value.item())
@@ -1100,13 +1101,16 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
     new_state_dict = {}
     for k in list(state_dict.keys()):
         v = state_dict.pop(k)
-        lora_A = lora_B = diff_b = diff = lora_key = None
+        lora_A = lora_B = diff_b = diff = lora_key = dora_scale = None
         if k.endswith(".diff"):
             diff = v
             module_name = k[ : -5]
         elif k.endswith(".diff_b"):
             diff_b = v
             module_name = k[ : -7]
+        elif k.endswith(".dora_scale"):
+            dora_scale = v
+            module_name = k[ : -11]
         else:
             pos = k.rfind(".lora_")
             if pos <=0:
@@ -1185,7 +1189,17 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
                         fail = True
                         break
                 v = diff_b = diff_b.to(module.weight.dtype)
-
+            elif dora_scale != None:
+                rank = dora_scale.shape[1]
+                if module_shape[0] != v.shape[0]:
+                    if ignore_model_variations:
+                        skip = True
+                    else:
+                        msg = f"Lora '{path}': Dora Scale dimension is not compatible with model '{_get_module_name(model)}' (model = {module_shape[0]}, dora scale = {v.shape[0]}). It is likely this Dora has been made for another version of this model."
+                        error_msg = append(error_msg, msg)
+                        fail = True
+                        break
+                v = dora_scale = dora_scale.to(module.weight.dtype)
             if not check_only:
                 new_state_dict[k] = v
                 v = None
@@ -1193,19 +1207,23 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
             assert loras_module_data != None
             loras_adapter_data = loras_module_data.get(adapter_name, None)
             if loras_adapter_data == None:
-                loras_adapter_data = [None, None, None, 1.]
+                loras_adapter_data = [None, None, None, None, 1.]
+                module.any_dora = False
                 loras_module_data[adapter_name] = loras_adapter_data
             if lora_A != None:
                 loras_adapter_data[0] = lora_A
             elif lora_B != None:
                 loras_adapter_data[1] = lora_B
+            elif dora_scale != None:
+                loras_adapter_data[3] = dora_scale
+                loras_module_data["any_dora"] = True
             else:
                 loras_adapter_data[2] = diff_b
             if rank != None and lora_key is not None and "lora" in lora_key:
                 alpha_key = k[:-len(lora_key)] + "alpha"
                 alpha = lora_alphas.get(alpha_key, None)
-                if alpha is not None: loras_adapter_data[
-            lora_A = lora_B = diff = diff_b = v = loras_module_data = loras_adapter_data = lora_alphas = None
+                if alpha is not None: loras_adapter_data[4] = alpha / rank
+            lora_A = lora_B = diff = diff_b = v = loras_module_data = loras_adapter_data = lora_alphas = dora_scale = None
 
     if len(invalid_keys) > 0:
         msg = f"Lora '{path}' contains non Lora keys '{trunc(invalid_keys,200)}'"
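Taken together, the loader changes above extend the per-adapter slot list from four to five entries, `[lora_A, lora_B, diff_b, dora_scale, alpha]`, and flag any module that receives a `.dora_scale` tensor via the `any_dora` marker. The snippet below is an illustration only, with hypothetical key names and shapes rather than mmgp API, of how such keys are told apart from plain LoRA factors:

```python
# Illustration only: mimics the suffix handling added above. The state dict,
# key names and tensor shapes are invented for the example.
import torch

sample_sd = {
    "blocks.0.attn.q.lora_A.weight": torch.zeros(16, 4096),
    "blocks.0.attn.q.lora_B.weight": torch.zeros(4096, 16),
    "blocks.0.attn.q.alpha": torch.tensor(16.0),
    "blocks.0.attn.q.dora_scale": torch.ones(4096, 1),   # its presence marks the adapter as a DoRA
}

for k, v in sample_sd.items():
    if k.endswith(".dora_scale"):
        print(k[: -11], "-> DoRA magnitude", tuple(v.shape))   # same slicing as module_name = k[ : -11] above
    elif k.endswith(".alpha"):
        print(k[: -6], "-> alpha", float(v))
    else:
        print(k, "-> low-rank factor", tuple(v.shape))
```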
@@ -1413,7 +1431,7 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantiza
 
 
 
-def load_model_data(model, file_path, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, preprocess_sd = None, modules = None, return_shared_modules = None, default_dtype = torch.bfloat16, ignore_unused_weights = False, verboseLevel = -1):
+def load_model_data(model, file_path, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, preprocess_sd = None, postprocess_sd = None, modules = None, return_shared_modules = None, default_dtype = torch.bfloat16, ignore_unused_weights = False, verboseLevel = -1):
     """
     Load a model, detect if it has been previously quantized using quanto and do the extra setup if necessary
     """
@@ -1489,38 +1507,41 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
             state_dict.update(sd)
         else:
             state_dict, metadata = _safetensors_load_file(file, writable_tensors =writable_tensors)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        if preprocess_sd != None:
+            state_dict = preprocess_sd(state_dict)
+
+        if metadata != None:
+            quantization_map = metadata.get("quantization_map", None)
+            config = metadata.get("config", None)
+            if config is not None:
+                model._config = config
+
+            tied_weights_map = metadata.get("tied_weights_map", None)
+            if tied_weights_map != None:
+                for name, tied_weights_list in tied_weights_map.items():
+                    mapped_weight = state_dict[name]
+                    for tied_weights in tied_weights_list:
+                        state_dict[tied_weights] = mapped_weight
+
+        if quantization_map is None:
+            detection_type = detect_safetensors_format(state_dict)
+            if detection_type["kind"] in ['scaled_fp8','fp8']:
+                conv_result = convert_scaled_fp8_to_quanto(state_dict, dtype = default_dtype, in_place= True)
+                state_dict = conv_result["state_dict"]
+                quantization_map = conv_result["quant_map"]
+                conv_result = None
+                # enable_fp8_fp32_scale_support()
+
+        if quantization_map is None:
+            pos = str.rfind(file, ".")
+            if pos > 0:
+                quantization_map_path = file[:pos]
+                quantization_map_path += "_map.json"
+
+                if os.path.isfile(quantization_map_path):
+                    with open(quantization_map_path, 'r') as f:
+                        quantization_map = json.load(f)
 
         full_state_dict.update(state_dict)
         if quantization_map != None:
@@ -1539,8 +1560,8 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
         full_state_dict, full_quantization_map, full_tied_weights_map = None, None, None
 
     # deal if we are trying to load just a sub part of a larger model
-    if
-        state_dict, quantization_map =
+    if postprocess_sd != None:
+        state_dict, quantization_map = postprocess_sd(state_dict, quantization_map)
 
     if modelPrefix != None:
         base_model_prefix = modelPrefix + "."
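The two new hooks in `load_model_data` are plain callables: `preprocess_sd` receives the raw state dict right after each safetensors file is read, and `postprocess_sd` receives the state dict together with the quantization map once loading is complete and returns both. A hedged wiring sketch; the transformations themselves are made-up examples, not part of mmgp:

```python
# Hypothetical callbacks matching the call sites added above:
#   state_dict = preprocess_sd(state_dict)
#   state_dict, quantization_map = postprocess_sd(state_dict, quantization_map)

def strip_wrapper_prefix(sd):
    # example transform: drop a hypothetical "model." wrapper prefix from every key
    return {k.removeprefix("model."): v for k, v in sd.items()}

def drop_text_encoder(sd, qmap):
    # example transform: discard keys the target model does not use
    sd = {k: v for k, v in sd.items() if not k.startswith("text_encoder.")}
    return sd, qmap

# from mmgp import offload
# offload.load_model_data(model, "weights.safetensors",        # placeholders
#                         preprocess_sd=strip_wrapper_prefix,
#                         postprocess_sd=drop_text_encoder)
```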
@@ -2127,7 +2148,7 @@ class offload:
                 data = loras_data.get(active_adapter + '_GPU', None)
                 if data == None:
                     continue
-                diff_w , _ , diff_b, alpha = data
+                diff_w , _ , diff_b, _, alpha = data
                 scaling = self._get_lora_scaling( loras_scaling, model, active_adapter) * alpha
                 if scaling == 0:
                     continue
@@ -2153,15 +2174,116 @@ class offload:
         return ret
 
 
+    def _dora_linear_forward(
+        self,
+        model,
+        submodule,
+        adapters_data,             # dict: name+"_GPU" -> (A, B, diff_b, g_abs, alpha); g_abs=None means LoRA
+        weight= None,
+        bias = None,
+        original_bias = True,
+        dora_mode: str = "blend",  # "ref_exact" | "blend"
+    ):
+        active_adapters = getattr(model, "_loras_active_adapters", [])
+        loras_scaling = getattr(model, "_loras_scaling", {})
+        # Snapshot base weight (safe for quantized modules)
+        if weight is None:
+            bias = submodule.bias
+            original_bias = True
+            if isinstance(submodule, QModuleMixin):
+                weight = submodule.weight.view(submodule.weight.shape)
+            else:
+                weight = submodule.weight.clone()
+
+        base_dtype = weight.dtype
+        eps = 1e-8
+        W0 = weight.float()
+        g0 = torch.linalg.vector_norm(W0, dim=1, keepdim=True, dtype=torch.float32).clamp_min(eps)  # [out,1]
+
+        # Keep big mats in low precision
+        # Wc = W0 if W0.dtype == compute_dtype else W0.to(compute_dtype)
+        W0 /= g0
+        weight[...] = W0.to(base_dtype)
+        W0 = None
+
+        dir_update = None   # Σ s * ((B@A)/g0) in compute_dtype
+        g = None            # final magnitude: set absolute (ref_exact) or blended (blend)
+        bias_delta = None   # Σ s * diff_b
+
+        # Accumulate DoRA adapters only (g_abs != None)
+        for name in active_adapters:
+            data = adapters_data.get(name + "_GPU", None)
+            if data is None: continue
+            A, B, diff_b, g_abs, alpha = data
+            if g_abs is None: continue
+
+            s = self._get_lora_scaling(loras_scaling, model, name) * float(alpha)
+            if s == 0: continue
+
+            # Direction update in V-space with row-wise 1/g0
+            if (A is not None) and (B is not None):
+                dV = torch.mm(B, A)   # [out,in], compute_dtype
+                dV /= g0              # row-wise divide
+                dV.mul_(s)
+                dir_update = dV if dir_update is None else dir_update.add_(dV)
+
+
+            if dora_mode == "ref_exact":
+                # absolute magnitude (last one wins if multiple DoRAs present)
+                g = g_abs
+            elif dora_mode == "blend":
+                # blend towards absolute magnitude proportional to s
+                if g is None:
+                    g = g0.clone()
+                g.add_(g_abs.sub(g0), alpha=s)
+            else:
+                raise ValueError(f"Unknown dora_mode: {dora_mode}")
+
+            # Optional bias deltas (not in reference, but harmless if present)
+            if diff_b is not None:
+                db = diff_b.mul(s)
+                bias_delta = db if bias_delta is None else bias_delta.add_(db)
+                db = None
+
+        if g is None:
+            g = g0   # no magnitude provided -> keep original
+
+        # Re-normalize rows if we changed direction
+        if dir_update is not None:
+            weight.add_(dir_update)
+            V = weight.float()
+            Vn = torch.linalg.vector_norm(V, dim=1, keepdim=True, dtype=torch.float32).clamp_min(eps)
+            V /= Vn
+            V *= g
+            weight[...] = V.to(base_dtype)
+            V = None
+        else:
+            weight *= g
+        # Recompose adapted weight; cast back to module dtype
+
+        # Merge DoRA bias delta safely
+        if bias_delta is not None:
+            if bias is None:
+                bias = bias_delta
+            else:
+                bias = bias.clone() if original_bias else bias
+                bias.add_(bias_delta)
+
+        return weight, bias
+
+
+
     def _lora_linear_forward(self, model, submodule, loras_data, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
         weight = submodule.weight
+        bias = submodule.bias
         active_adapters = model._loras_active_adapters
         loras_scaling = model._loras_scaling
+        any_dora = loras_data.get("any_dora", False)
         training = False
 
         dtype = weight.dtype
-        if weight.shape[-1] < x.shape[-2]: # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
-
+        if weight.shape[-1] < x.shape[-2] or any_dora: # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
+            original_bias = True
             original_bias = True
         if len(active_adapters) > 0:
             if isinstance(submodule, QModuleMixin):
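The new `_dora_linear_forward` follows the standard DoRA recomposition: split the base weight into a per-row magnitude `g0` and a unit direction, add the scaled low-rank update in direction space, re-normalize the rows, then multiply by the target magnitude (the checkpoint value in `ref_exact` mode, or a blend towards it in `blend` mode). A self-contained sketch of that arithmetic with arbitrary shapes and no mmgp types, to make the tensor algebra easier to follow:

```python
# Self-contained illustration of the DoRA merge math used above; shapes and
# the blend factor are arbitrary, and no mmgp classes are involved.
import torch

out_f, in_f, rank = 8, 6, 2
W0 = torch.randn(out_f, in_f)            # base weight
A = torch.randn(rank, in_f)              # lora_A
B = torch.randn(out_f, rank)             # lora_B
g_abs = torch.rand(out_f, 1) + 0.5       # DoRA magnitude stored in the checkpoint
s = 1.0                                  # adapter scaling (strength * alpha)

eps = 1e-8
g0 = torch.linalg.vector_norm(W0, dim=1, keepdim=True).clamp_min(eps)
V = W0 / g0                              # unit-norm direction rows
V = V + s * (B @ A) / g0                 # direction update, as in _dora_linear_forward
V = V / torch.linalg.vector_norm(V, dim=1, keepdim=True).clamp_min(eps)

g = g0 + s * (g_abs - g0)                # "blend" mode; "ref_exact" would use g_abs directly
W_adapted = g * V                        # recomposed weight

print(W_adapted.shape)                   # torch.Size([8, 6])
```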
@@ -2172,9 +2294,9 @@ class offload:
                 data = loras_data.get(active_adapter + '_GPU', None)
                 if data == None:
                     continue
-                lora_A_weight, lora_B_weight, diff_b, alpha = data
+                lora_A_weight, lora_B_weight, diff_b, g_abs, alpha = data
                 scaling = self._get_lora_scaling(loras_scaling, model, active_adapter) * alpha
-                if scaling == 0:
+                if scaling == 0 or g_abs is not None:
                     continue
                 if lora_A_weight != None:
                     weight.addmm_(lora_B_weight, lora_A_weight, alpha= scaling )
@@ -2188,6 +2310,10 @@ class offload:
                         original_bias = False
                     bias.add_(diff_b, alpha=scaling)
             # base_weight += scaling * lora_B_weight @ lora_A_weight
+
+            if any_dora :
+                weight, bias = self._dora_linear_forward(model, submodule, loras_data, weight, bias, original_bias)
+
             if training:
                 pass
                 # result = torch.nn.functional.linear(dropout(x), base_weight, bias=submodule.bias)
@@ -2195,7 +2321,7 @@ class offload:
             result = torch.nn.functional.linear(x, weight, bias=bias)
 
         else:
-            result = torch.nn.functional.linear(x, weight, bias=
+            result = torch.nn.functional.linear(x, weight, bias=bias)
 
         if len(active_adapters) > 0:
             x = x.to(dtype)
@@ -2204,10 +2330,10 @@ class offload:
                 data = loras_data.get(active_adapter + '_GPU', None)
                 if data == None:
                     continue
-                lora_A, lora_B, diff_b, alpha = data
+                lora_A, lora_B, diff_b, g_abs, alpha = data
                 # dropout = self.lora_dropout[active_adapter]
                 scaling = self._get_lora_scaling(loras_scaling, model, active_adapter) * alpha
-                if scaling == 0:
+                if scaling == 0 or g_abs is not None:
                     continue
 
                 if lora_A == None:
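With these changes, adapters that carry a magnitude vector (`g_abs` is not `None`) are skipped by both additive passes above and are applied exclusively through the merged-weight branch, which is now forced whenever `any_dora` is set. A minimal illustration of that dispatch condition, with arbitrary shapes:

```python
# Illustration of the dispatch rule added above: DoRA adapters force the
# merged-weight path; plain LoRA can still be applied factor-by-factor when
# the input is small enough. Shapes are arbitrary.
import torch

weight = torch.randn(4096, 4096)     # stands in for submodule.weight
x = torch.randn(2, 8192, 4096)       # long sequence -> merging is cheaper
any_dora = False                     # becomes True when loras_data carries "any_dora"

merge_into_weight = weight.shape[-1] < x.shape[-2] or any_dora
print("merged-weight path" if merge_into_weight else "per-factor LoRA path")
```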
{mmgp-3.6.2 → mmgp-3.6.4/src/mmgp.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.6.2
+Version: 3.6.4
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.6.2 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.6.4 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 