mmgp 3.6.3__py3-none-any.whl → 3.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

mmgp/offload.py CHANGED

@@ -1,4 +1,4 @@
- # ------------------ Memory Management 3.6.3 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+ # ------------------ Memory Management 3.6.5 for the GPU Poor by DeepBeepMeep (mmgp)------------------
  #
  # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
  # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -66,8 +66,6 @@ from accelerate import init_empty_weights

  import functools
  import types
- import torch
-

  from mmgp import safetensors2
  from mmgp import profile_type
@@ -87,6 +85,9 @@ class QEmbedding(QModuleMixin, torch.nn.Embedding):
  return torch.nn.functional.embedding( input, self.qweight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse )


+
+
+
  shared_state = {}

  def get_cache(cache_name):
@@ -688,7 +689,7 @@ def _welcome():
  if welcome_displayed:
  return
  welcome_displayed = True
- print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.3) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+ print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.5) by DeepBeepMeep ************{ENDC}{UNBOLD}")

  def change_dtype(model, new_dtype, exclude_buffers = False):
  for submodule_name, submodule in model.named_modules():
@@ -1049,7 +1050,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora

  if split_linear_modules_map != None:
  new_state_dict = dict()
- suffixes = [(".alpha", -2, False), (".lora_B.weight", -3, True), (".lora_A.weight", -3, False), (".lora_up.weight", -3, True), (".lora_down.weight", -3, False)]
+ suffixes = [(".alpha", -2, False), (".lora_B.weight", -3, True), (".lora_A.weight", -3, False), (".lora_up.weight", -3, True), (".lora_down.weight", -3, False),(".dora_scale", -2, False),]
  for module_name, module_data in state_dict.items():
  name_parts = module_name.split(".")
  for suffix, pos, any_split in suffixes:
@@ -1089,7 +1090,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora

  lora_alphas = {}
  for k in keys:
- if "alpha" in k:
+ if k.endswith(".alpha"):
  alpha_value = state_dict.pop(k)
  if torch.is_tensor(alpha_value):
  alpha_value = float(alpha_value.item())
@@ -1100,13 +1101,16 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
  new_state_dict = {}
  for k in list(state_dict.keys()):
  v = state_dict.pop(k)
- lora_A = lora_B = diff_b = diff = lora_key = None
+ lora_A = lora_B = diff_b = diff = lora_key = dora_scale = None
  if k.endswith(".diff"):
  diff = v
  module_name = k[ : -5]
  elif k.endswith(".diff_b"):
  diff_b = v
  module_name = k[ : -7]
+ elif k.endswith(".dora_scale"):
+ dora_scale = v
+ module_name = k[ : -11]
  else:
  pos = k.rfind(".lora_")
  if pos <=0:
@@ -1185,7 +1189,17 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
  fail = True
  break
  v = diff_b = diff_b.to(module.weight.dtype)
-
+ elif dora_scale != None:
+ rank = dora_scale.shape[1]
+ if module_shape[0] != v.shape[0]:
+ if ignore_model_variations:
+ skip = True
+ else:
+ msg = f"Lora '{path}': Dora Scale dimension is not compatible with model '{_get_module_name(model)}' (model = {module_shape[0]}, dora scale = {v.shape[0]}). It is likely this Dora has been made for another version of this model."
+ error_msg = append(error_msg, msg)
+ fail = True
+ break
+ v = dora_scale = dora_scale.to(module.weight.dtype)
  if not check_only:
  new_state_dict[k] = v
  v = None
@@ -1193,19 +1207,23 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
  assert loras_module_data != None
  loras_adapter_data = loras_module_data.get(adapter_name, None)
  if loras_adapter_data == None:
- loras_adapter_data = [None, None, None, 1.]
+ loras_adapter_data = [None, None, None, None, 1.]
+ module.any_dora = False
  loras_module_data[adapter_name] = loras_adapter_data
  if lora_A != None:
  loras_adapter_data[0] = lora_A
  elif lora_B != None:
  loras_adapter_data[1] = lora_B
+ elif dora_scale != None:
+ loras_adapter_data[3] = dora_scale
+ loras_module_data["any_dora"] = True
  else:
  loras_adapter_data[2] = diff_b
  if rank != None and lora_key is not None and "lora" in lora_key:
  alpha_key = k[:-len(lora_key)] + "alpha"
  alpha = lora_alphas.get(alpha_key, None)
- if alpha is not None: loras_adapter_data[3] = alpha / rank
- lora_A = lora_B = diff = diff_b = v = loras_module_data = loras_adapter_data = lora_alphas = None
+ if alpha is not None: loras_adapter_data[4] = alpha / rank
+ lora_A = lora_B = diff = diff_b = v = loras_module_data = loras_adapter_data = lora_alphas = dora_scale = None

  if len(invalid_keys) > 0:
  msg = f"Lora '{path}' contains non Lora keys '{trunc(invalid_keys,200)}'"
@@ -1276,6 +1294,7 @@ def sync_models_loras(model, model2):

  def unload_loras_from_model(model):
  if model is None: return
+ if not hasattr(model, "_loras_model_data"): return
  for _, v in model._loras_model_data.items():
  v.clear()
  for _, v in model._loras_model_shortcuts.items():
@@ -1468,6 +1487,7 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
  for no, file in enumerate(file_path):
  quantization_map = None
  tied_weights_map = None
+ metadata = None
  if not (".safetensors" in file or ".sft" in file):
  if pinToMemory:
  raise Exception("Pinning to memory while loading only supported for safe tensors files")
@@ -1479,7 +1499,6 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
  basename = os.path.basename(file)

  if "-of-" in basename:
- metadata = None
  file_parts= basename.split("-")
  parts_max = int(file_parts[-1][:5])
  state_dict = {}
@@ -2130,7 +2149,7 @@ class offload:
  data = loras_data.get(active_adapter + '_GPU', None)
  if data == None:
  continue
- diff_w , _ , diff_b, alpha = data
+ diff_w , _ , diff_b, _, alpha = data
  scaling = self._get_lora_scaling( loras_scaling, model, active_adapter) * alpha
  if scaling == 0:
  continue
@@ -2156,15 +2175,116 @@ class offload:
  return ret


+ def _dora_linear_forward(
+ self,
+ model,
+ submodule,
+ adapters_data, # dict: name+"_GPU" -> (A, B, diff_b, g_abs, alpha); g_abs=None means LoRA
+ weight= None,
+ bias = None,
+ original_bias = True,
+ dora_mode: str = "blend", # "ref_exact" | "blend"
+ ):
+ active_adapters = getattr(model, "_loras_active_adapters", [])
+ loras_scaling = getattr(model, "_loras_scaling", {})
+ # Snapshot base weight (safe for quantized modules)
+ if weight is None:
+ bias = submodule.bias
+ original_bias = True
+ if isinstance(submodule, QModuleMixin):
+ weight = submodule.weight.view(submodule.weight.shape)
+ else:
+ weight = submodule.weight.clone()
+
+ base_dtype = weight.dtype
+ eps = 1e-8
+ W0 = weight.float()
+ g0 = torch.linalg.vector_norm(W0, dim=1, keepdim=True, dtype=torch.float32).clamp_min(eps) # [out,1]
+
+ # Keep big mats in low precision
+ # Wc = W0 if W0.dtype == compute_dtype else W0.to(compute_dtype)
+ W0 /= g0
+ weight[...] = W0.to(base_dtype)
+ W0 = None
+
+ dir_update = None # Σ s * ((B@A)/g0) in compute_dtype
+ g = None # final magnitude: set absolute (ref_exact) or blended (blend)
+ bias_delta = None # Σ s * diff_b
+
+ # Accumulate DoRA adapters only (g_abs != None)
+ for name in active_adapters:
+ data = adapters_data.get(name + "_GPU", None)
+ if data is None: continue
+ A, B, diff_b, g_abs, alpha = data
+ if g_abs is None: continue
+
+ s = self._get_lora_scaling(loras_scaling, model, name) * float(alpha)
+ if s == 0: continue
+
+ # Direction update in V-space with row-wise 1/g0
+ if (A is not None) and (B is not None):
+ dV = torch.mm(B, A) # [out,in], compute_dtype
+ dV /= g0 # row-wise divide
+ dV.mul_(s)
+ dir_update = dV if dir_update is None else dir_update.add_(dV)
+
+
+ if dora_mode == "ref_exact":
+ # absolute magnitude (last one wins if multiple DoRAs present)
+ g = g_abs
+ elif dora_mode == "blend":
+ # blend towards absolute magnitude proportional to s
+ if g is None:
+ g = g0.clone()
+ g.add_(g_abs.sub(g0), alpha=s)
+ else:
+ raise ValueError(f"Unknown dora_mode: {dora_mode}")
+
+ # Optional bias deltas (not in reference, but harmless if present)
+ if diff_b is not None:
+ db = diff_b.mul(s)
+ bias_delta = db if bias_delta is None else bias_delta.add_(db)
+ db = None
+
+ if g is None:
+ g = g0 # no magnitude provided -> keep original
+
+ # Re-normalize rows if we changed direction
+ if dir_update is not None:
+ weight.add_(dir_update)
+ V = weight.float()
+ Vn = torch.linalg.vector_norm(V, dim=1, keepdim=True, dtype=torch.float32).clamp_min(eps)
+ V /= Vn
+ V *= g
+ weight[...] = V.to(base_dtype)
+ V = None
+ else:
+ weight *= g
+ # Recompose adapted weight; cast back to module dtype
+
+ # Merge DoRA bias delta safely
+ if bias_delta is not None:
+ if bias is None:
+ bias = bias_delta
+ else:
+ bias = bias.clone() if original_bias else bias
+ bias.add_(bias_delta)
+
+ return weight, bias
+
+
+
  def _lora_linear_forward(self, model, submodule, loras_data, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
  weight = submodule.weight
+ bias = submodule.bias
  active_adapters = model._loras_active_adapters
  loras_scaling = model._loras_scaling
+ any_dora = loras_data.get("any_dora", False)
  training = False

  dtype = weight.dtype
- if weight.shape[-1] < x.shape[-2]: # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
- bias = submodule.bias
+ if weight.shape[-1] < x.shape[-2] or any_dora: # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
+ original_bias = True
  original_bias = True
  if len(active_adapters) > 0:
  if isinstance(submodule, QModuleMixin):
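The new `_dora_linear_forward` follows the usual DoRA recomposition: the base weight is split into per-row magnitudes and a unit direction, the low-rank update is applied to the direction, and the rows are rescaled by the adapter's magnitude vector (taken as-is in `ref_exact` mode, interpolated from the original norms in `blend` mode). A self-contained sketch of the `ref_exact`-style math, kept separate from the offload machinery and using hypothetical names:

```python
import torch

def dora_merge(W0, A, B, dora_scale, s=1.0, eps=1e-8):
    """Standalone DoRA recomposition sketch, not the package API.
    W0: [out, in] base weight, A: [rank, in], B: [out, rank],
    dora_scale: [out, 1] learned per-row magnitude, s: adapter scaling."""
    V = W0.float() + s * (B.float() @ A.float())                           # direction after the low-rank update
    V = V / torch.linalg.vector_norm(V, dim=1, keepdim=True).clamp_min(eps)  # unit rows
    return (dora_scale.float() * V).to(W0.dtype)                           # re-apply learned magnitudes row-wise

# Tiny usage example with random tensors:
out_f, in_f, rank = 8, 16, 4
W0 = torch.randn(out_f, in_f)
A, B = torch.randn(rank, in_f), torch.randn(out_f, rank)
dora_scale = torch.rand(out_f, 1) + 0.5
W_adapted = dora_merge(W0, A, B, dora_scale, s=0.8)
```

The package version shown in the hunk above works in place on a snapshot of the module weight (cloned, or viewed for quantized `QModuleMixin` modules) and also offers the `blend` mode, which moves the original row norms toward the learned magnitudes in proportion to the adapter scaling.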
@@ -2175,9 +2295,9 @@ class offload:
  data = loras_data.get(active_adapter + '_GPU', None)
  if data == None:
  continue
- lora_A_weight, lora_B_weight, diff_b, alpha = data
+ lora_A_weight, lora_B_weight, diff_b, g_abs, alpha = data
  scaling = self._get_lora_scaling(loras_scaling, model, active_adapter) * alpha
- if scaling == 0:
+ if scaling == 0 or g_abs is not None:
  continue
  if lora_A_weight != None:
  weight.addmm_(lora_B_weight, lora_A_weight, alpha= scaling )
@@ -2191,6 +2311,10 @@ class offload:
  original_bias = False
  bias.add_(diff_b, alpha=scaling)
  # base_weight += scaling * lora_B_weight @ lora_A_weight
+
+ if any_dora :
+ weight, bias = self._dora_linear_forward(model, submodule, loras_data, weight, bias, original_bias)
+
  if training:
  pass
  # result = torch.nn.functional.linear(dropout(x), base_weight, bias=submodule.bias)
@@ -2198,7 +2322,7 @@ class offload:
  result = torch.nn.functional.linear(x, weight, bias=bias)

  else:
- result = torch.nn.functional.linear(x, weight, bias=submodule.bias)
+ result = torch.nn.functional.linear(x, weight, bias=bias)

  if len(active_adapters) > 0:
  x = x.to(dtype)
@@ -2207,10 +2331,10 @@ class offload:
  data = loras_data.get(active_adapter + '_GPU', None)
  if data == None:
  continue
- lora_A, lora_B, diff_b, alpha = data
+ lora_A, lora_B, diff_b, g_abs, alpha = data
  # dropout = self.lora_dropout[active_adapter]
  scaling = self._get_lora_scaling(loras_scaling, model, active_adapter) * alpha
- if scaling == 0:
+ if scaling == 0 or g_abs is not None:
  continue

  if lora_A == None:
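Taken together, the forward changes make `any_dora` a hard switch: adapters carrying a `dora_scale` are skipped by the per-adapter low-rank paths (`scaling == 0 or g_abs is not None`) and are instead folded into a dense weight, which `_dora_linear_forward` rescales before a single `torch.nn.functional.linear` call. A condensed, simplified sketch of that dispatch; the names and structure below are illustrative, not the package API:

```python
import torch

def adapted_linear(x, weight, bias, adapters, any_dora, merge_dora):
    """Illustrative dispatch only; the real method also handles quantized modules,
    per-adapter scaling multipliers, and bias deltas that this sketch omits."""
    if any_dora or weight.shape[-1] < x.shape[-2]:
        # Dense path: fold adapters into one weight (always taken when DoRA is present).
        w = weight.clone()
        for lora_A, lora_B, diff_b, dora_scale, alpha in adapters:
            if dora_scale is None and lora_A is not None and lora_B is not None:
                w.addmm_(lora_B, lora_A, alpha=alpha)   # plain LoRA folds straight in
        if any_dora:
            w, bias = merge_dora(w, bias, adapters)     # DoRA adapters rescale the rows
        return torch.nn.functional.linear(x, w, bias)
    # Cheap path: base linear plus per-adapter low-rank corrections on the activations.
    y = torch.nn.functional.linear(x, weight, bias)
    for lora_A, lora_B, diff_b, dora_scale, alpha in adapters:
        if dora_scale is None and lora_A is not None and lora_B is not None:
            y = y + alpha * (x @ lora_A.T) @ lora_B.T
    return y
```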
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mmgp
- Version: 3.6.3
+ Version: 3.6.5
  Summary: Memory Management for the GPU Poor
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
  Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file


  <p align="center">
- <H2>Memory Management 3.6.3 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.6.5 for the GPU Poor by DeepBeepMeep</H2>
  </p>


@@ -0,0 +1,10 @@
+ __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+ mmgp/fp8_quanto_bridge.py,sha256=VtUaD6wzo7Yn9vGY0LMtbhwt6KMWRpSWLc65bU_sfZU,21155
+ mmgp/offload.py,sha256=GwM0o0rWUwY3tb_HDl_pO6S0XvMpVZsre3DzwFCLQh4,138988
+ mmgp/safetensors2.py,sha256=zYNMprt1KoxgVALbcz6DawxsQDNNRImvgO9cYRChUiY,19028
+ mmgp-3.6.5.dist-info/licenses/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+ mmgp-3.6.5.dist-info/METADATA,sha256=9wXPfJYiHExhfM-kSeA4mLoNh0laDRXJ3ZrBFSpQlUg,16309
+ mmgp-3.6.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ mmgp-3.6.5.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+ mmgp-3.6.5.dist-info/RECORD,,
@@ -1,10 +0,0 @@
- __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
- mmgp/fp8_quanto_bridge.py,sha256=VtUaD6wzo7Yn9vGY0LMtbhwt6KMWRpSWLc65bU_sfZU,21155
- mmgp/offload.py,sha256=lIxXrPyhUCOfrf2iKGPkP8LDpUr_iKmtm9jPYjSigqY,133693
- mmgp/safetensors2.py,sha256=zYNMprt1KoxgVALbcz6DawxsQDNNRImvgO9cYRChUiY,19028
- mmgp-3.6.3.dist-info/licenses/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
- mmgp-3.6.3.dist-info/METADATA,sha256=REF-3mSKg_D7epiYmeQgNXDMrRXkUO4s2UpeUjvHMSo,16309
- mmgp-3.6.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- mmgp-3.6.3.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
- mmgp-3.6.3.dist-info/RECORD,,