mmgp 3.6.3.tar.gz → 3.6.4.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mmgp might be problematic.
- {mmgp-3.6.3/src/mmgp.egg-info → mmgp-3.6.4}/PKG-INFO +2 -2
- {mmgp-3.6.3 → mmgp-3.6.4}/README.md +1 -1
- {mmgp-3.6.3 → mmgp-3.6.4}/pyproject.toml +1 -1
- {mmgp-3.6.3 → mmgp-3.6.4}/src/mmgp/offload.py +142 -19
- {mmgp-3.6.3 → mmgp-3.6.4/src/mmgp.egg-info}/PKG-INFO +2 -2
- {mmgp-3.6.3 → mmgp-3.6.4}/LICENSE.md +0 -0
- {mmgp-3.6.3 → mmgp-3.6.4}/setup.cfg +0 -0
- {mmgp-3.6.3 → mmgp-3.6.4}/src/__init__.py +0 -0
- {mmgp-3.6.3 → mmgp-3.6.4}/src/mmgp/__init__.py +0 -0
- {mmgp-3.6.3 → mmgp-3.6.4}/src/mmgp/fp8_quanto_bridge.py +0 -0
- {mmgp-3.6.3 → mmgp-3.6.4}/src/mmgp/safetensors2.py +0 -0
- {mmgp-3.6.3 → mmgp-3.6.4}/src/mmgp.egg-info/SOURCES.txt +0 -0
- {mmgp-3.6.3 → mmgp-3.6.4}/src/mmgp.egg-info/dependency_links.txt +0 -0
- {mmgp-3.6.3 → mmgp-3.6.4}/src/mmgp.egg-info/requires.txt +0 -0
- {mmgp-3.6.3 → mmgp-3.6.4}/src/mmgp.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.6.3
+Version: 3.6.4
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.6.3 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.6.4 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
src/mmgp/offload.py
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.6.3 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.6.4 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -66,8 +66,6 @@ from accelerate import init_empty_weights
 
 import functools
 import types
-import torch
-
 
 from mmgp import safetensors2
 from mmgp import profile_type
@@ -87,6 +85,9 @@ class QEmbedding(QModuleMixin, torch.nn.Embedding):
 return torch.nn.functional.embedding( input, self.qweight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse )
 
 
+
+
+
 shared_state = {}
 
 def get_cache(cache_name):
@@ -688,7 +689,7 @@ def _welcome():
 if welcome_displayed:
 return
 welcome_displayed = True
-print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.3) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.4) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def change_dtype(model, new_dtype, exclude_buffers = False):
 for submodule_name, submodule in model.named_modules():
@@ -1049,7 +1050,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 
 if split_linear_modules_map != None:
 new_state_dict = dict()
-suffixes = [(".alpha", -2, False), (".lora_B.weight", -3, True), (".lora_A.weight", -3, False), (".lora_up.weight", -3, True), (".lora_down.weight", -3, False)]
+suffixes = [(".alpha", -2, False), (".lora_B.weight", -3, True), (".lora_A.weight", -3, False), (".lora_up.weight", -3, True), (".lora_down.weight", -3, False),(".dora_scale", -2, False),]
 for module_name, module_data in state_dict.items():
 name_parts = module_name.split(".")
 for suffix, pos, any_split in suffixes:
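For orientation: LoRA/DoRA checkpoints key their tensors by module path plus a suffix, and the split-module remapping above now recognizes ".dora_scale" alongside the usual LoRA suffixes. A hypothetical key set for a single adapted linear layer (the module path is illustrative, not taken from the package):

# Hypothetical state-dict keys for one adapted linear layer.
example_keys = [
    "blocks.0.attn.to_q.lora_A.weight",   # [rank, in]  low-rank down-projection
    "blocks.0.attn.to_q.lora_B.weight",   # [out, rank] low-rank up-projection
    "blocks.0.attn.to_q.alpha",           # scalar; effective scaling becomes alpha / rank
    "blocks.0.attn.to_q.dora_scale",      # per-output-row magnitude, present only for DoRAs
]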
@@ -1089,7 +1090,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 
 lora_alphas = {}
 for k in keys:
-if "alpha" in k:
+if k.endswith(".alpha"):
 alpha_value = state_dict.pop(k)
 if torch.is_tensor(alpha_value):
 alpha_value = float(alpha_value.item())
@@ -1100,13 +1101,16 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 new_state_dict = {}
 for k in list(state_dict.keys()):
 v = state_dict.pop(k)
-lora_A = lora_B = diff_b = diff = lora_key = None
+lora_A = lora_B = diff_b = diff = lora_key = dora_scale = None
 if k.endswith(".diff"):
 diff = v
 module_name = k[ : -5]
 elif k.endswith(".diff_b"):
 diff_b = v
 module_name = k[ : -7]
+elif k.endswith(".dora_scale"):
+dora_scale = v
+module_name = k[ : -11]
 else:
 pos = k.rfind(".lora_")
 if pos <=0:
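The slice offsets in the branches above simply strip the matched suffix; the lengths line up as follows (a trivial check, shown only to explain the magic numbers):

assert len(".diff") == 5          # k[ : -5]
assert len(".diff_b") == 7        # k[ : -7]
assert len(".dora_scale") == 11   # k[ : -11]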
@@ -1185,7 +1189,17 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 fail = True
 break
 v = diff_b = diff_b.to(module.weight.dtype)
-
+elif dora_scale != None:
+rank = dora_scale.shape[1]
+if module_shape[0] != v.shape[0]:
+if ignore_model_variations:
+skip = True
+else:
+msg = f"Lora '{path}': Dora Scale dimension is not compatible with model '{_get_module_name(model)}' (model = {module_shape[0]}, dora scale = {v.shape[0]}). It is likely this Dora has been made for another version of this model."
+error_msg = append(error_msg, msg)
+fail = True
+break
+v = dora_scale = dora_scale.to(module.weight.dtype)
 if not check_only:
 new_state_dict[k] = v
 v = None
@@ -1193,19 +1207,23 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 assert loras_module_data != None
 loras_adapter_data = loras_module_data.get(adapter_name, None)
 if loras_adapter_data == None:
-loras_adapter_data = [None, None, None, 1.]
+loras_adapter_data = [None, None, None, None, 1.]
+module.any_dora = False
 loras_module_data[adapter_name] = loras_adapter_data
 if lora_A != None:
 loras_adapter_data[0] = lora_A
 elif lora_B != None:
 loras_adapter_data[1] = lora_B
+elif dora_scale != None:
+loras_adapter_data[3] = dora_scale
+loras_module_data["any_dora"] = True
 else:
 loras_adapter_data[2] = diff_b
 if rank != None and lora_key is not None and "lora" in lora_key:
 alpha_key = k[:-len(lora_key)] + "alpha"
 alpha = lora_alphas.get(alpha_key, None)
-if alpha is not None: loras_adapter_data[3] = alpha / rank
-lora_A = lora_B = diff = diff_b = v = loras_module_data = loras_adapter_data = lora_alphas = None
+if alpha is not None: loras_adapter_data[4] = alpha / rank
+lora_A = lora_B = diff = diff_b = v = loras_module_data = loras_adapter_data = lora_alphas = dora_scale = None
 
 if len(invalid_keys) > 0:
 msg = f"Lora '{path}' contains non Lora keys '{trunc(invalid_keys,200)}'"
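As the hunk above shows, each adapter's per-module entry grows from four slots to five. Reading the surrounding code, the layout appears to be (lora_A, lora_B, diff_b, dora_scale, alpha-derived scaling), with dora_scale left as None for plain LoRAs. A small illustrative sketch of that layout (the named structure is an assumption for readability; the package itself stores a plain 5-element list defaulting to [None, None, None, None, 1.]):

from dataclasses import dataclass
from typing import Optional
import torch

@dataclass
class AdapterSlots:                            # illustrative stand-in for the 5-element list
    lora_A: Optional[torch.Tensor] = None      # [rank, in]  low-rank down-projection
    lora_B: Optional[torch.Tensor] = None      # [out, rank] low-rank up-projection
    diff_b: Optional[torch.Tensor] = None      # optional bias delta
    dora_scale: Optional[torch.Tensor] = None  # per-output-row magnitude; None means plain LoRA
    scaling: float = 1.0                       # alpha / rank multiplier applied at forward time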
@@ -2130,7 +2148,7 @@ class offload:
 data = loras_data.get(active_adapter + '_GPU', None)
 if data == None:
 continue
-diff_w , _ , diff_b, alpha = data
+diff_w , _ , diff_b, _, alpha = data
 scaling = self._get_lora_scaling( loras_scaling, model, active_adapter) * alpha
 if scaling == 0:
 continue
@@ -2156,15 +2174,116 @@ class offload:
 return ret
 
 
+def _dora_linear_forward(
+self,
+model,
+submodule,
+adapters_data, # dict: name+"_GPU" -> (A, B, diff_b, g_abs, alpha); g_abs=None means LoRA
+weight= None,
+bias = None,
+original_bias = True,
+dora_mode: str = "blend", # "ref_exact" | "blend"
+):
+active_adapters = getattr(model, "_loras_active_adapters", [])
+loras_scaling = getattr(model, "_loras_scaling", {})
+# Snapshot base weight (safe for quantized modules)
+if weight is None:
+bias = submodule.bias
+original_bias = True
+if isinstance(submodule, QModuleMixin):
+weight = submodule.weight.view(submodule.weight.shape)
+else:
+weight = submodule.weight.clone()
+
+base_dtype = weight.dtype
+eps = 1e-8
+W0 = weight.float()
+g0 = torch.linalg.vector_norm(W0, dim=1, keepdim=True, dtype=torch.float32).clamp_min(eps) # [out,1]
+
+# Keep big mats in low precision
+# Wc = W0 if W0.dtype == compute_dtype else W0.to(compute_dtype)
+W0 /= g0
+weight[...] = W0.to(base_dtype)
+W0 = None
+
+dir_update = None # Σ s * ((B@A)/g0) in compute_dtype
+g = None # final magnitude: set absolute (ref_exact) or blended (blend)
+bias_delta = None # Σ s * diff_b
+
+# Accumulate DoRA adapters only (g_abs != None)
+for name in active_adapters:
+data = adapters_data.get(name + "_GPU", None)
+if data is None: continue
+A, B, diff_b, g_abs, alpha = data
+if g_abs is None: continue
+
+s = self._get_lora_scaling(loras_scaling, model, name) * float(alpha)
+if s == 0: continue
+
+# Direction update in V-space with row-wise 1/g0
+if (A is not None) and (B is not None):
+dV = torch.mm(B, A) # [out,in], compute_dtype
+dV /= g0 # row-wise divide
+dV.mul_(s)
+dir_update = dV if dir_update is None else dir_update.add_(dV)
+
+
+if dora_mode == "ref_exact":
+# absolute magnitude (last one wins if multiple DoRAs present)
+g = g_abs
+elif dora_mode == "blend":
+# blend towards absolute magnitude proportional to s
+if g is None:
+g = g0.clone()
+g.add_(g_abs.sub(g0), alpha=s)
+else:
+raise ValueError(f"Unknown dora_mode: {dora_mode}")
+
+# Optional bias deltas (not in reference, but harmless if present)
+if diff_b is not None:
+db = diff_b.mul(s)
+bias_delta = db if bias_delta is None else bias_delta.add_(db)
+db = None
+
+if g is None:
+g = g0 # no magnitude provided -> keep original
+
+# Re-normalize rows if we changed direction
+if dir_update is not None:
+weight.add_(dir_update)
+V = weight.float()
+Vn = torch.linalg.vector_norm(V, dim=1, keepdim=True, dtype=torch.float32).clamp_min(eps)
+V /= Vn
+V *= g
+weight[...] = V.to(base_dtype)
+V = None
+else:
+weight *= g
+# Recompose adapted weight; cast back to module dtype
+
+# Merge DoRA bias delta safely
+if bias_delta is not None:
+if bias is None:
+bias = bias_delta
+else:
+bias = bias.clone() if original_bias else bias
+bias.add_(bias_delta)
+
+return weight, bias
+
+
+
 def _lora_linear_forward(self, model, submodule, loras_data, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
 weight = submodule.weight
+bias = submodule.bias
 active_adapters = model._loras_active_adapters
 loras_scaling = model._loras_scaling
+any_dora = loras_data.get("any_dora", False)
 training = False
 
 dtype = weight.dtype
-if weight.shape[-1] < x.shape[-2]: # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
-
+if weight.shape[-1] < x.shape[-2] or any_dora: # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
+original_bias = True
 original_bias = True
 if len(active_adapters) > 0:
 if isinstance(submodule, QModuleMixin):
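For readers unfamiliar with DoRA: the method splits a weight into a per-output-row magnitude and a direction, applies the low-rank update to the direction only, re-normalizes, and re-applies a stored magnitude, i.e. W' = m * (W0 + s*B@A) / ||W0 + s*B@A|| with row-wise norms for an [out, in] weight. A minimal, self-contained sketch of the recomposition the new _dora_linear_forward performs in its "ref_exact" mode (names and shapes are illustrative; the default "blend" mode instead interpolates the magnitude between the base row norms and the stored dora_scale):

import torch

def dora_recompose(W0, A, B, m, s=1.0, eps=1e-8):
    # W0: [out, in] base weight, A: [r, in], B: [out, r], m: [out, 1] stored magnitude (dora_scale)
    g0 = torch.linalg.vector_norm(W0.float(), dim=1, keepdim=True).clamp_min(eps)  # base row norms
    V = W0.float() / g0 + s * (B.float() @ A.float()) / g0                         # update the direction part
    V = V / torch.linalg.vector_norm(V, dim=1, keepdim=True).clamp_min(eps)        # re-normalize rows
    return (m.float() * V).to(W0.dtype)                                            # re-apply the magnitude

W0, A, B = torch.randn(4, 8), torch.randn(2, 8), torch.randn(4, 2)
m = torch.rand(4, 1) + 0.5
W_adapted = dora_recompose(W0, A, B, m)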
@@ -2175,9 +2294,9 @@ class offload:
 data = loras_data.get(active_adapter + '_GPU', None)
 if data == None:
 continue
-lora_A_weight, lora_B_weight, diff_b, alpha = data
+lora_A_weight, lora_B_weight, diff_b, g_abs, alpha = data
 scaling = self._get_lora_scaling(loras_scaling, model, active_adapter) * alpha
-if scaling == 0:
+if scaling == 0 or g_abs is not None:
 continue
 if lora_A_weight != None:
 weight.addmm_(lora_B_weight, lora_A_weight, alpha= scaling )
@@ -2191,6 +2310,10 @@ class offload:
 original_bias = False
 bias.add_(diff_b, alpha=scaling)
 # base_weight += scaling * lora_B_weight @ lora_A_weight
+
+if any_dora :
+weight, bias = self._dora_linear_forward(model, submodule, loras_data, weight, bias, original_bias)
+
 if training:
 pass
 # result = torch.nn.functional.linear(dropout(x), base_weight, bias=submodule.bias)
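Note that, unlike plain LoRA, a DoRA update is not additive in the module output: the row re-normalization couples the base weight and the low-rank delta, so it cannot be applied as an extra x @ (B @ A).T term on top of the base result. That appears to be why any_dora forces the merged-weight branch above. A toy check of the non-additivity (all names hypothetical):

import torch

W0, BA = torch.randn(3, 5), 0.1 * torch.randn(3, 5)
m, x = torch.rand(3, 1) + 0.5, torch.randn(2, 5)

def dora(W):  # rescale each row of W to the stored magnitude m
    return m * W / torch.linalg.vector_norm(W, dim=1, keepdim=True)

merged = x @ dora(W0 + BA).T          # what the merged-weight path computes
split = x @ dora(W0).T + x @ BA.T     # a naive "base output + LoRA delta" split
print(torch.allclose(merged, split))  # False in general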
@@ -2198,7 +2321,7 @@ class offload:
 result = torch.nn.functional.linear(x, weight, bias=bias)
 
 else:
-result = torch.nn.functional.linear(x, weight, bias=submodule.bias)
+result = torch.nn.functional.linear(x, weight, bias=bias)
 
 if len(active_adapters) > 0:
 x = x.to(dtype)
@@ -2207,10 +2330,10 @@ class offload:
 data = loras_data.get(active_adapter + '_GPU', None)
 if data == None:
 continue
-lora_A, lora_B, diff_b, alpha = data
+lora_A, lora_B, diff_b, g_abs, alpha = data
 # dropout = self.lora_dropout[active_adapter]
 scaling = self._get_lora_scaling(loras_scaling, model, active_adapter) * alpha
-if scaling == 0:
+if scaling == 0 or g_abs is not None:
 continue
 
 if lora_A == None:
src/mmgp.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.6.3
+Version: 3.6.4
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.6.3 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.6.4 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
The remaining files listed above with +0 -0 are unchanged.