mmgp 3.6.3.tar.gz → 3.6.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mmgp might be problematic.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mmgp
- Version: 3.6.3
+ Version: 3.6.4
  Summary: Memory Management for the GPU Poor
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
  Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file


  <p align="center">
- <H2>Memory Management 3.6.3 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.6.4 for the GPU Poor by DeepBeepMeep</H2>
  </p>


@@ -1,6 +1,6 @@

  <p align="center">
- <H2>Memory Management 3.6.3 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.6.4 for the GPU Poor by DeepBeepMeep</H2>
  </p>


@@ -1,6 +1,6 @@
  [project]
  name = "mmgp"
- version = "3.6.3"
+ version = "3.6.4"
  authors = [
  { name = "deepbeepmeep", email = "deepbeepmeep@yahoo.com" },
  ]
@@ -1,4 +1,4 @@
- # ------------------ Memory Management 3.6.3 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+ # ------------------ Memory Management 3.6.4 for the GPU Poor by DeepBeepMeep (mmgp)------------------
  #
  # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
  # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -66,8 +66,6 @@ from accelerate import init_empty_weights

  import functools
  import types
- import torch
-

  from mmgp import safetensors2
  from mmgp import profile_type
@@ -87,6 +85,9 @@ class QEmbedding(QModuleMixin, torch.nn.Embedding):
  return torch.nn.functional.embedding( input, self.qweight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse )


+
+
+
  shared_state = {}

  def get_cache(cache_name):
@@ -688,7 +689,7 @@ def _welcome():
  if welcome_displayed:
  return
  welcome_displayed = True
- print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.3) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+ print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.4) by DeepBeepMeep ************{ENDC}{UNBOLD}")

  def change_dtype(model, new_dtype, exclude_buffers = False):
  for submodule_name, submodule in model.named_modules():
@@ -1049,7 +1050,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora

  if split_linear_modules_map != None:
  new_state_dict = dict()
- suffixes = [(".alpha", -2, False), (".lora_B.weight", -3, True), (".lora_A.weight", -3, False), (".lora_up.weight", -3, True), (".lora_down.weight", -3, False)]
+ suffixes = [(".alpha", -2, False), (".lora_B.weight", -3, True), (".lora_A.weight", -3, False), (".lora_up.weight", -3, True), (".lora_down.weight", -3, False),(".dora_scale", -2, False),]
  for module_name, module_data in state_dict.items():
  name_parts = module_name.split(".")
  for suffix, pos, any_split in suffixes:
@@ -1089,7 +1090,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora

  lora_alphas = {}
  for k in keys:
- if "alpha" in k:
+ if k.endswith(".alpha"):
  alpha_value = state_dict.pop(k)
  if torch.is_tensor(alpha_value):
  alpha_value = float(alpha_value.item())
@@ -1100,13 +1101,16 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
  new_state_dict = {}
  for k in list(state_dict.keys()):
  v = state_dict.pop(k)
- lora_A = lora_B = diff_b = diff = lora_key = None
+ lora_A = lora_B = diff_b = diff = lora_key = dora_scale = None
  if k.endswith(".diff"):
  diff = v
  module_name = k[ : -5]
  elif k.endswith(".diff_b"):
  diff_b = v
  module_name = k[ : -7]
+ elif k.endswith(".dora_scale"):
+ dora_scale = v
+ module_name = k[ : -11]
  else:
  pos = k.rfind(".lora_")
  if pos <=0:
@@ -1185,7 +1189,17 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
  fail = True
  break
  v = diff_b = diff_b.to(module.weight.dtype)
-
+ elif dora_scale != None:
+ rank = dora_scale.shape[1]
+ if module_shape[0] != v.shape[0]:
+ if ignore_model_variations:
+ skip = True
+ else:
+ msg = f"Lora '{path}': Dora Scale dimension is not compatible with model '{_get_module_name(model)}' (model = {module_shape[0]}, dora scale = {v.shape[0]}). It is likely this Dora has been made for another version of this model."
+ error_msg = append(error_msg, msg)
+ fail = True
+ break
+ v = dora_scale = dora_scale.to(module.weight.dtype)
  if not check_only:
  new_state_dict[k] = v
  v = None
@@ -1193,19 +1207,23 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
  assert loras_module_data != None
  loras_adapter_data = loras_module_data.get(adapter_name, None)
  if loras_adapter_data == None:
- loras_adapter_data = [None, None, None, 1.]
+ loras_adapter_data = [None, None, None, None, 1.]
+ module.any_dora = False
  loras_module_data[adapter_name] = loras_adapter_data
  if lora_A != None:
  loras_adapter_data[0] = lora_A
  elif lora_B != None:
  loras_adapter_data[1] = lora_B
+ elif dora_scale != None:
+ loras_adapter_data[3] = dora_scale
+ loras_module_data["any_dora"] = True
  else:
  loras_adapter_data[2] = diff_b
  if rank != None and lora_key is not None and "lora" in lora_key:
  alpha_key = k[:-len(lora_key)] + "alpha"
  alpha = lora_alphas.get(alpha_key, None)
- if alpha is not None: loras_adapter_data[3] = alpha / rank
- lora_A = lora_B = diff = diff_b = v = loras_module_data = loras_adapter_data = lora_alphas = None
+ if alpha is not None: loras_adapter_data[4] = alpha / rank
+ lora_A = lora_B = diff = diff_b = v = loras_module_data = loras_adapter_data = lora_alphas = dora_scale = None

  if len(invalid_keys) > 0:
  msg = f"Lora '{path}' contains non Lora keys '{trunc(invalid_keys,200)}'"
@@ -2130,7 +2148,7 @@ class offload:
  data = loras_data.get(active_adapter + '_GPU', None)
  if data == None:
  continue
- diff_w , _ , diff_b, alpha = data
+ diff_w , _ , diff_b, _, alpha = data
  scaling = self._get_lora_scaling( loras_scaling, model, active_adapter) * alpha
  if scaling == 0:
  continue
@@ -2156,15 +2174,116 @@
  return ret


+ def _dora_linear_forward(
+ self,
+ model,
+ submodule,
+ adapters_data, # dict: name+"_GPU" -> (A, B, diff_b, g_abs, alpha); g_abs=None means LoRA
+ weight= None,
+ bias = None,
+ original_bias = True,
+ dora_mode: str = "blend", # "ref_exact" | "blend"
+ ):
+ active_adapters = getattr(model, "_loras_active_adapters", [])
+ loras_scaling = getattr(model, "_loras_scaling", {})
+ # Snapshot base weight (safe for quantized modules)
+ if weight is None:
+ bias = submodule.bias
+ original_bias = True
+ if isinstance(submodule, QModuleMixin):
+ weight = submodule.weight.view(submodule.weight.shape)
+ else:
+ weight = submodule.weight.clone()
+
+ base_dtype = weight.dtype
+ eps = 1e-8
+ W0 = weight.float()
+ g0 = torch.linalg.vector_norm(W0, dim=1, keepdim=True, dtype=torch.float32).clamp_min(eps) # [out,1]
+
+ # Keep big mats in low precision
+ # Wc = W0 if W0.dtype == compute_dtype else W0.to(compute_dtype)
+ W0 /= g0
+ weight[...] = W0.to(base_dtype)
+ W0 = None
+
+ dir_update = None # Σ s * ((B@A)/g0) in compute_dtype
+ g = None # final magnitude: set absolute (ref_exact) or blended (blend)
+ bias_delta = None # Σ s * diff_b
+
+ # Accumulate DoRA adapters only (g_abs != None)
+ for name in active_adapters:
+ data = adapters_data.get(name + "_GPU", None)
+ if data is None: continue
+ A, B, diff_b, g_abs, alpha = data
+ if g_abs is None: continue
+
+ s = self._get_lora_scaling(loras_scaling, model, name) * float(alpha)
+ if s == 0: continue
+
+ # Direction update in V-space with row-wise 1/g0
+ if (A is not None) and (B is not None):
+ dV = torch.mm(B, A) # [out,in], compute_dtype
+ dV /= g0 # row-wise divide
+ dV.mul_(s)
+ dir_update = dV if dir_update is None else dir_update.add_(dV)
+
+
+ if dora_mode == "ref_exact":
+ # absolute magnitude (last one wins if multiple DoRAs present)
+ g = g_abs
+ elif dora_mode == "blend":
+ # blend towards absolute magnitude proportional to s
+ if g is None:
+ g = g0.clone()
+ g.add_(g_abs.sub(g0), alpha=s)
+ else:
+ raise ValueError(f"Unknown dora_mode: {dora_mode}")
+
+ # Optional bias deltas (not in reference, but harmless if present)
+ if diff_b is not None:
+ db = diff_b.mul(s)
+ bias_delta = db if bias_delta is None else bias_delta.add_(db)
+ db = None
+
+ if g is None:
+ g = g0 # no magnitude provided -> keep original
+
+ # Re-normalize rows if we changed direction
+ if dir_update is not None:
+ weight.add_(dir_update)
+ V = weight.float()
+ Vn = torch.linalg.vector_norm(V, dim=1, keepdim=True, dtype=torch.float32).clamp_min(eps)
+ V /= Vn
+ V *= g
+ weight[...] = V.to(base_dtype)
+ V = None
+ else:
+ weight *= g
+ # Recompose adapted weight; cast back to module dtype
+
+ # Merge DoRA bias delta safely
+ if bias_delta is not None:
+ if bias is None:
+ bias = bias_delta
+ else:
+ bias = bias.clone() if original_bias else bias
+ bias.add_(bias_delta)
+
+ return weight, bias
+
+
+
  def _lora_linear_forward(self, model, submodule, loras_data, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
  weight = submodule.weight
+ bias = submodule.bias
  active_adapters = model._loras_active_adapters
  loras_scaling = model._loras_scaling
+ any_dora = loras_data.get("any_dora", False)
  training = False

  dtype = weight.dtype
- if weight.shape[-1] < x.shape[-2]: # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
- bias = submodule.bias
+ if weight.shape[-1] < x.shape[-2] or any_dora: # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
+ original_bias = True
  original_bias = True
  if len(active_adapters) > 0:
  if isinstance(submodule, QModuleMixin):
@@ -2175,9 +2294,9 @@ class offload:
  data = loras_data.get(active_adapter + '_GPU', None)
  if data == None:
  continue
- lora_A_weight, lora_B_weight, diff_b, alpha = data
+ lora_A_weight, lora_B_weight, diff_b, g_abs, alpha = data
  scaling = self._get_lora_scaling(loras_scaling, model, active_adapter) * alpha
- if scaling == 0:
+ if scaling == 0 or g_abs is not None:
  continue
  if lora_A_weight != None:
  weight.addmm_(lora_B_weight, lora_A_weight, alpha= scaling )
@@ -2191,6 +2310,10 @@
  original_bias = False
  bias.add_(diff_b, alpha=scaling)
  # base_weight += scaling * lora_B_weight @ lora_A_weight
+
+ if any_dora :
+ weight, bias = self._dora_linear_forward(model, submodule, loras_data, weight, bias, original_bias)
+
  if training:
  pass
  # result = torch.nn.functional.linear(dropout(x), base_weight, bias=submodule.bias)
@@ -2198,7 +2321,7 @@
  result = torch.nn.functional.linear(x, weight, bias=bias)

  else:
- result = torch.nn.functional.linear(x, weight, bias=submodule.bias)
+ result = torch.nn.functional.linear(x, weight, bias=bias)

  if len(active_adapters) > 0:
  x = x.to(dtype)
@@ -2207,10 +2330,10 @@
  data = loras_data.get(active_adapter + '_GPU', None)
  if data == None:
  continue
- lora_A, lora_B, diff_b, alpha = data
+ lora_A, lora_B, diff_b, g_abs, alpha = data
  # dropout = self.lora_dropout[active_adapter]
  scaling = self._get_lora_scaling(loras_scaling, model, active_adapter) * alpha
- if scaling == 0:
+ if scaling == 0 or g_abs is not None:
  continue

  if lora_A == None:
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mmgp
- Version: 3.6.3
+ Version: 3.6.4
  Summary: Memory Management for the GPU Poor
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
  Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file


  <p align="center">
- <H2>Memory Management 3.6.3 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.6.4 for the GPU Poor by DeepBeepMeep</H2>
  </p>


7 files without changes
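
The substantive change in this release is DoRA support in the LoRA path: load_loras_into_model now recognizes ".dora_scale" keys, the per-adapter data grows from a 4-tuple to a 5-tuple (lora_A, lora_B, diff_b, dora_scale, alpha), and _lora_linear_forward hands off to the new _dora_linear_forward whenever a DoRA adapter is active. As a rough illustration of the weight recomposition that method performs in its "blend" mode, here is a minimal standalone sketch using plain PyTorch tensors; the helper name merge_dora and the single-adapter setup are assumptions for the example, not part of the mmgp API.

import torch

def merge_dora(W0, A, B, g_abs, s=1.0, eps=1e-8):
    # W0: base weight [out, in]; A: lora_A [rank, in]; B: lora_B [out, rank]
    # g_abs: stored .dora_scale magnitude [out, 1]; s: scaling * alpha
    # Row-wise magnitude of the base weight.
    g0 = torch.linalg.vector_norm(W0.float(), dim=1, keepdim=True).clamp_min(eps)
    # "blend" mode: move from the original magnitude towards the stored one, proportionally to s.
    g = g0 + s * (g_abs.float() - g0)
    # Apply the low-rank direction update, then re-normalize each row to unit length.
    V = W0.float() + s * (B.float() @ A.float())
    V = V / torch.linalg.vector_norm(V, dim=1, keepdim=True).clamp_min(eps)
    # Recompose: new magnitude times unit direction, cast back to the base dtype.
    return (g * V).to(W0.dtype)

# Tiny smoke test with random shapes (hypothetical values, for illustration only).
out_f, in_f, rank = 8, 16, 4
W0 = torch.randn(out_f, in_f)
A, B = torch.randn(rank, in_f), torch.randn(out_f, rank)
g_abs = torch.linalg.vector_norm(W0, dim=1, keepdim=True)  # stand-in for a .dora_scale tensor
print(merge_dora(W0, A, B, g_abs, s=1.0).shape)  # torch.Size([8, 16])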