mmgp 3.6.2.tar.gz → 3.6.4.tar.gz

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.

Potentially problematic release: this version of mmgp might be problematic.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
- Version: 3.6.2
+ Version: 3.6.4
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file


 <p align="center">
- <H2>Memory Management 3.6.2 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.6.4 for the GPU Poor by DeepBeepMeep</H2>
 </p>


@@ -1,6 +1,6 @@

 <p align="center">
- <H2>Memory Management 3.6.2 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.6.4 for the GPU Poor by DeepBeepMeep</H2>
 </p>


@@ -1,6 +1,6 @@
 [project]
 name = "mmgp"
- version = "3.6.2"
+ version = "3.6.4"
 authors = [
   { name = "deepbeepmeep", email = "deepbeepmeep@yahoo.com" },
 ]
@@ -168,7 +168,7 @@ def convert_scaled_fp8_to_quanto(
 require_scale: bool = False,
 allow_default_scale: bool = True,
 default_missing_scale: float = 1.0,
- scale_dtype: Union[str, torch.dtype] = "float32",
+ dtype: Union[str, torch.dtype] = "float32",
 add_activation_placeholders: bool = True,
 # dict mode options
 sd_metadata: Optional[Dict[str, str]] = None,
@@ -176,7 +176,7 @@ def convert_scaled_fp8_to_quanto(
 free_cuda_cache: bool = False,
 cuda_cache_interval: int = 32,
 ) -> ConvertResult:
- sd_scale_dtype = _normalize_scale_dtype(scale_dtype)
+ sd_scale_dtype = _normalize_scale_dtype(dtype)
 patch_needed = (sd_scale_dtype == torch.float32)

 acc, closer = _as_accessor(
@@ -186,6 +186,7 @@ def convert_scaled_fp8_to_quanto(
 free_cuda_cache=free_cuda_cache,
 cuda_cache_interval=cuda_cache_interval,
 )
+ if not acc.can_delete(): in_place = False
 try:
 meta = acc.metadata() or {}
 meta_scale_map = _maybe_parse_scale_map(meta) or {}
@@ -216,6 +217,7 @@ def convert_scaled_fp8_to_quanto(
 sk = scale_weight_map.get(wk)
 if sk is not None:
 s_t = acc.get_tensor(sk).to(torch.float32)
+ if in_place: acc.delete(s_t)
 if s_t.numel() == 1:
 return torch.full((out_ch,), float(s_t.item()), dtype=torch.float32)
 if s_t.numel() == out_ch:
@@ -231,49 +233,41 @@ def convert_scaled_fp8_to_quanto(
 return torch.full((out_ch,), float(meta_scale_map[alt]), dtype=torch.float32)
 return None

- # out dict: mutate original dict if in_place, else new dict
- out_sd: Dict[str, torch.Tensor] = acc.sd if isinstance(acc, DictAccessor) and in_place else {}
+ out_sd: Dict[str, torch.Tensor] = {}
 qmap: Dict[str, Dict] = {}

 # Single pass: rewrite FP8 weights, copy-through others
 for k in keys:
 # Drop source-only artifacts
- if k == "scaled_fp8" or k.endswith(".scale_weight"):
- if acc.can_delete(): acc.delete(k)
+ if k == "scaled_fp8" or k.endswith(".scale_weight") :
 continue

- if _is_weight_key(k):
- t = acc.get_tensor(k)
- if t.dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
- # Quantized: keep original FP8 tensor as _data
- out_sd[k + DATA_SUFFIX] = t
+ t = acc.get_tensor(k)
+ if in_place: acc.delete(k)
+ if _is_weight_key(k) and t.dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
+ # Quantized: keep original FP8 tensor as _data
+ out_sd[k + DATA_SUFFIX] = t

- out_ch = int(t.shape[0])
- s_vec = get_scale_vec_for_weight(k, out_ch)
- if s_vec is None:
- if require_scale and not allow_default_scale:
- raise KeyError(f"No scale found for '{k}' (looked for '.scale_weight' and metadata).")
- s_vec = torch.full((out_ch,), float(default_missing_scale), dtype=torch.float32)
+ out_ch = int(t.shape[0])
+ s_vec = get_scale_vec_for_weight(k, out_ch)
+ if s_vec is None:
+ if require_scale and not allow_default_scale:
+ raise KeyError(f"No scale found for '{k}' (looked for '.scale_weight' and metadata).")
+ s_vec = torch.full((out_ch,), float(default_missing_scale), dtype=torch.float32)

- s_grid = _per_channel_reshape(s_vec, t).to(sd_scale_dtype)
- out_sd[k + SCALE_SUFFIX] = s_grid
-
- if add_activation_placeholders:
- base = k[:-len(".weight")]
- out_sd[base + IN_SCALE] = torch.tensor([1], dtype=sd_scale_dtype)
- out_sd[base + OUT_SCALE] = torch.tensor([1], dtype=sd_scale_dtype)
+ s_grid = _per_channel_reshape(s_vec, t).to(sd_scale_dtype)
+ out_sd[k + SCALE_SUFFIX] = s_grid

+ if add_activation_placeholders:
 base = k[:-len(".weight")]
- qmap[base] = {"weights": _QTYPE_NAME[fmt], "activations": "none"}
-
- if acc.can_delete():
- acc.delete(k)
- continue # don't copy original .weight
-
- # Copy-through
- if not (isinstance(acc, DictAccessor) and in_place):
- out_sd[k] = acc.get_tensor(k)
-
+ out_sd[base + IN_SCALE] = torch.tensor([1], dtype=sd_scale_dtype)
+ out_sd[base + OUT_SCALE] = torch.tensor([1], dtype=sd_scale_dtype)
+
+ base = k[:-len(".weight")]
+ qmap[base] = {"weights": _QTYPE_NAME[fmt], "activations": "none"}
+ else:
+ out_sd[k] = t if t.dtype == dtype or t.dtype == torch.float32 else t.to(dtype)
+ t = None
 return ConvertResult(state_dict=out_sd, quant_map=qmap, fp8_format=fmt, patch_needed=patch_needed)
 finally:
 closer()
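The rewritten loop above emits, for every FP8 weight, the raw tensor plus a per-output-channel scale grid, and now routes all other tensors through the `else:` branch, casting them to the requested dtype when they are in neither that dtype nor float32. A self-contained illustration of the scale expansion in plain torch (mmgp's internal helpers and suffix constants are not reproduced here; names below are generic stand-ins):

import torch

out_ch, in_ch = 4, 8
w_fp8 = torch.randn(out_ch, in_ch).to(torch.float8_e4m3fn)  # stand-in for a scaled-FP8 weight
scalar_scale = 0.02                                          # stand-in for a ".scale_weight" value

# Expand the scalar to one scale per output channel, then to a grid broadcastable against the weight.
s_vec = torch.full((out_ch,), float(scalar_scale), dtype=torch.float32)
s_grid = s_vec.reshape(out_ch, *([1] * (w_fp8.dim() - 1)))   # shape [out_ch, 1]

# What a quanto-style consumer does with the (data, scale) pair at runtime:
dequantized = w_fp8.to(torch.float32) * s_grid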
@@ -481,7 +475,7 @@ def _cli():
 res = convert_scaled_fp8_to_quanto(
 args.in_path,
 fp8_format=args.fp8_format,
- scale_dtype=args.scale_dtype,
+ dtype=args.scale_dtype,
 add_activation_placeholders=not args.no_activation_placeholders,
 default_missing_scale=args.default_missing_scale,
 )
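The call sites above reflect the rename of the converter's `scale_dtype` keyword to `dtype` (the `in_place` behaviour used by `load_model_data` further down is new as well). A hedged sketch of a direct call with the 3.6.4 keywords; the import path and checkpoint name are assumptions, not taken from this diff:

import torch
# Hypothetical import: the diff does not show which mmgp module defines the converter.
from mmgp.convert_fp8 import convert_scaled_fp8_to_quanto  # assumed module name

res = convert_scaled_fp8_to_quanto(
    "model_scaled_fp8.safetensors",   # hypothetical input checkpoint
    dtype=torch.bfloat16,             # was `scale_dtype=` before 3.6.4
    add_activation_placeholders=True,
    default_missing_scale=1.0,
)
state_dict, quant_map = res["state_dict"], res["quant_map"]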
@@ -1,4 +1,4 @@
- # ------------------ Memory Management 3.6.2 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+ # ------------------ Memory Management 3.6.4 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -66,8 +66,6 @@ from accelerate import init_empty_weights

 import functools
 import types
- import torch
-

 from mmgp import safetensors2
 from mmgp import profile_type
@@ -87,6 +85,9 @@ class QEmbedding(QModuleMixin, torch.nn.Embedding):
 return torch.nn.functional.embedding( input, self.qweight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse )


+
+
+
 shared_state = {}

 def get_cache(cache_name):
@@ -688,7 +689,7 @@ def _welcome():
 if welcome_displayed:
 return
 welcome_displayed = True
- print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.2) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+ print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.4) by DeepBeepMeep ************{ENDC}{UNBOLD}")

 def change_dtype(model, new_dtype, exclude_buffers = False):
 for submodule_name, submodule in model.named_modules():
@@ -1049,7 +1050,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora

 if split_linear_modules_map != None:
 new_state_dict = dict()
- suffixes = [(".alpha", -2, False), (".lora_B.weight", -3, True), (".lora_A.weight", -3, False), (".lora_up.weight", -3, True), (".lora_down.weight", -3, False)]
+ suffixes = [(".alpha", -2, False), (".lora_B.weight", -3, True), (".lora_A.weight", -3, False), (".lora_up.weight", -3, True), (".lora_down.weight", -3, False),(".dora_scale", -2, False),]
 for module_name, module_data in state_dict.items():
 name_parts = module_name.split(".")
 for suffix, pos, any_split in suffixes:
@@ -1089,7 +1090,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora

 lora_alphas = {}
 for k in keys:
- if "alpha" in k:
+ if k.endswith(".alpha"):
 alpha_value = state_dict.pop(k)
 if torch.is_tensor(alpha_value):
 alpha_value = float(alpha_value.item())
@@ -1100,13 +1101,16 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 new_state_dict = {}
 for k in list(state_dict.keys()):
 v = state_dict.pop(k)
- lora_A = lora_B = diff_b = diff = lora_key = None
+ lora_A = lora_B = diff_b = diff = lora_key = dora_scale = None
 if k.endswith(".diff"):
 diff = v
 module_name = k[ : -5]
 elif k.endswith(".diff_b"):
 diff_b = v
 module_name = k[ : -7]
+ elif k.endswith(".dora_scale"):
+ dora_scale = v
+ module_name = k[ : -11]
 else:
 pos = k.rfind(".lora_")
 if pos <=0:
@@ -1185,7 +1189,17 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 fail = True
 break
 v = diff_b = diff_b.to(module.weight.dtype)
-
+ elif dora_scale != None:
+ rank = dora_scale.shape[1]
+ if module_shape[0] != v.shape[0]:
+ if ignore_model_variations:
+ skip = True
+ else:
+ msg = f"Lora '{path}': Dora Scale dimension is not compatible with model '{_get_module_name(model)}' (model = {module_shape[0]}, dora scale = {v.shape[0]}). It is likely this Dora has been made for another version of this model."
+ error_msg = append(error_msg, msg)
+ fail = True
+ break
+ v = dora_scale = dora_scale.to(module.weight.dtype)
 if not check_only:
 new_state_dict[k] = v
 v = None
@@ -1193,19 +1207,23 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
 assert loras_module_data != None
 loras_adapter_data = loras_module_data.get(adapter_name, None)
 if loras_adapter_data == None:
- loras_adapter_data = [None, None, None, 1.]
+ loras_adapter_data = [None, None, None, None, 1.]
+ module.any_dora = False
 loras_module_data[adapter_name] = loras_adapter_data
 if lora_A != None:
 loras_adapter_data[0] = lora_A
 elif lora_B != None:
 loras_adapter_data[1] = lora_B
+ elif dora_scale != None:
+ loras_adapter_data[3] = dora_scale
+ loras_module_data["any_dora"] = True
 else:
 loras_adapter_data[2] = diff_b
 if rank != None and lora_key is not None and "lora" in lora_key:
 alpha_key = k[:-len(lora_key)] + "alpha"
 alpha = lora_alphas.get(alpha_key, None)
- if alpha is not None: loras_adapter_data[3] = alpha / rank
- lora_A = lora_B = diff = diff_b = v = loras_module_data = loras_adapter_data = lora_alphas = None
+ if alpha is not None: loras_adapter_data[4] = alpha / rank
+ lora_A = lora_B = diff = diff_b = v = loras_module_data = loras_adapter_data = lora_alphas = dora_scale = None

 if len(invalid_keys) > 0:
 msg = f"Lora '{path}' contains non Lora keys '{trunc(invalid_keys,200)}'"
@@ -1413,7 +1431,7 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantiza



- def load_model_data(model, file_path, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, preprocess_sd = None, modules = None, return_shared_modules = None, default_dtype = torch.bfloat16, ignore_unused_weights = False, verboseLevel = -1):
+ def load_model_data(model, file_path, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, preprocess_sd = None, postprocess_sd = None, modules = None, return_shared_modules = None, default_dtype = torch.bfloat16, ignore_unused_weights = False, verboseLevel = -1):
 """
 Load a model, detect if it has been previously quantized using quanto and do the extra setup if necessary
 """
@@ -1489,38 +1507,41 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
 state_dict.update(sd)
 else:
 state_dict, metadata = _safetensors_load_file(file, writable_tensors =writable_tensors)
-
- if metadata != None:
- quantization_map = metadata.get("quantization_map", None)
- config = metadata.get("config", None)
- if config is not None:
- model._config = config
-
- tied_weights_map = metadata.get("tied_weights_map", None)
- if tied_weights_map != None:
- for name, tied_weights_list in tied_weights_map.items():
- mapped_weight = state_dict[name]
- for tied_weights in tied_weights_list:
- state_dict[tied_weights] = mapped_weight
-
- if quantization_map is None:
- detection_type = detect_safetensors_format(state_dict)
- if detection_type["kind"] in ['scaled_fp8','fp8']:
- conv_result = convert_scaled_fp8_to_quanto(state_dict, scale_dtype = default_dtype)
- state_dict = conv_result["state_dict"]
- quantization_map = conv_result["quant_map"]
- conv_result = None
- # enable_fp8_fp32_scale_support()
-
- if quantization_map is None:
- pos = str.rfind(file, ".")
- if pos > 0:
- quantization_map_path = file[:pos]
- quantization_map_path += "_map.json"
-
- if os.path.isfile(quantization_map_path):
- with open(quantization_map_path, 'r') as f:
- quantization_map = json.load(f)
+
+ if preprocess_sd != None:
+ state_dict = preprocess_sd(state_dict)
+
+ if metadata != None:
+ quantization_map = metadata.get("quantization_map", None)
+ config = metadata.get("config", None)
+ if config is not None:
+ model._config = config
+
+ tied_weights_map = metadata.get("tied_weights_map", None)
+ if tied_weights_map != None:
+ for name, tied_weights_list in tied_weights_map.items():
+ mapped_weight = state_dict[name]
+ for tied_weights in tied_weights_list:
+ state_dict[tied_weights] = mapped_weight
+
+ if quantization_map is None:
+ detection_type = detect_safetensors_format(state_dict)
+ if detection_type["kind"] in ['scaled_fp8','fp8']:
+ conv_result = convert_scaled_fp8_to_quanto(state_dict, dtype = default_dtype, in_place= True)
+ state_dict = conv_result["state_dict"]
+ quantization_map = conv_result["quant_map"]
+ conv_result = None
+ # enable_fp8_fp32_scale_support()
+
+ if quantization_map is None:
+ pos = str.rfind(file, ".")
+ if pos > 0:
+ quantization_map_path = file[:pos]
+ quantization_map_path += "_map.json"
+
+ if os.path.isfile(quantization_map_path):
+ with open(quantization_map_path, 'r') as f:
+ quantization_map = json.load(f)

 full_state_dict.update(state_dict)
 if quantization_map != None:
@@ -1539,8 +1560,8 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
 full_state_dict, full_quantization_map, full_tied_weights_map = None, None, None

 # deal if we are trying to load just a sub part of a larger model
- if preprocess_sd != None:
- state_dict, quantization_map = preprocess_sd(state_dict, quantization_map)
+ if postprocess_sd != None:
+ state_dict, quantization_map = postprocess_sd(state_dict, quantization_map)

 if modelPrefix != None:
 base_model_prefix = modelPrefix + "."
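`load_model_data` now distinguishes two hooks: `preprocess_sd` runs on the raw state dict of each loaded file, before FP8 detection and the quantization-map lookup, while the old two-argument callback keeps its contract under the new name `postprocess_sd` and runs once the maps are assembled. A hedged usage sketch; the callback bodies, model object, and checkpoint path are placeholders:

def strip_prefix(sd):
    # pre-hook: operates on raw tensors, before any quantization handling
    return {k.removeprefix("model."): v for k, v in sd.items()}

def keep_transformer_only(sd, quantization_map):
    # post-hook: may filter both the state dict and the quantization map
    sd = {k: v for k, v in sd.items() if k.startswith("transformer.")}
    if quantization_map is not None:
        quantization_map = {k: v for k, v in quantization_map.items()
                            if k.startswith("transformer.")}
    return sd, quantization_map

# Hypothetical call; `model` comes from the caller and the path is an example.
load_model_data(
    model,
    "checkpoint.safetensors",
    preprocess_sd=strip_prefix,
    postprocess_sd=keep_transformer_only,
)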
@@ -2127,7 +2148,7 @@ class offload:
 data = loras_data.get(active_adapter + '_GPU', None)
 if data == None:
 continue
- diff_w , _ , diff_b, alpha = data
+ diff_w , _ , diff_b, _, alpha = data
 scaling = self._get_lora_scaling( loras_scaling, model, active_adapter) * alpha
 if scaling == 0:
 continue
@@ -2153,15 +2174,116 @@ class offload:
 return ret


+ def _dora_linear_forward(
+ self,
+ model,
+ submodule,
+ adapters_data, # dict: name+"_GPU" -> (A, B, diff_b, g_abs, alpha); g_abs=None means LoRA
+ weight= None,
+ bias = None,
+ original_bias = True,
+ dora_mode: str = "blend", # "ref_exact" | "blend"
+ ):
+ active_adapters = getattr(model, "_loras_active_adapters", [])
+ loras_scaling = getattr(model, "_loras_scaling", {})
+ # Snapshot base weight (safe for quantized modules)
+ if weight is None:
+ bias = submodule.bias
+ original_bias = True
+ if isinstance(submodule, QModuleMixin):
+ weight = submodule.weight.view(submodule.weight.shape)
+ else:
+ weight = submodule.weight.clone()
+
+ base_dtype = weight.dtype
+ eps = 1e-8
+ W0 = weight.float()
+ g0 = torch.linalg.vector_norm(W0, dim=1, keepdim=True, dtype=torch.float32).clamp_min(eps) # [out,1]
+
+ # Keep big mats in low precision
+ # Wc = W0 if W0.dtype == compute_dtype else W0.to(compute_dtype)
+ W0 /= g0
+ weight[...] = W0.to(base_dtype)
+ W0 = None
+
+ dir_update = None # Σ s * ((B@A)/g0) in compute_dtype
+ g = None # final magnitude: set absolute (ref_exact) or blended (blend)
+ bias_delta = None # Σ s * diff_b
+
+ # Accumulate DoRA adapters only (g_abs != None)
+ for name in active_adapters:
+ data = adapters_data.get(name + "_GPU", None)
+ if data is None: continue
+ A, B, diff_b, g_abs, alpha = data
+ if g_abs is None: continue
+
+ s = self._get_lora_scaling(loras_scaling, model, name) * float(alpha)
+ if s == 0: continue
+
+ # Direction update in V-space with row-wise 1/g0
+ if (A is not None) and (B is not None):
+ dV = torch.mm(B, A) # [out,in], compute_dtype
+ dV /= g0 # row-wise divide
+ dV.mul_(s)
+ dir_update = dV if dir_update is None else dir_update.add_(dV)
+
+
+ if dora_mode == "ref_exact":
+ # absolute magnitude (last one wins if multiple DoRAs present)
+ g = g_abs
+ elif dora_mode == "blend":
+ # blend towards absolute magnitude proportional to s
+ if g is None:
+ g = g0.clone()
+ g.add_(g_abs.sub(g0), alpha=s)
+ else:
+ raise ValueError(f"Unknown dora_mode: {dora_mode}")
+
+ # Optional bias deltas (not in reference, but harmless if present)
+ if diff_b is not None:
+ db = diff_b.mul(s)
+ bias_delta = db if bias_delta is None else bias_delta.add_(db)
+ db = None
+
+ if g is None:
+ g = g0 # no magnitude provided -> keep original
+
+ # Re-normalize rows if we changed direction
+ if dir_update is not None:
+ weight.add_(dir_update)
+ V = weight.float()
+ Vn = torch.linalg.vector_norm(V, dim=1, keepdim=True, dtype=torch.float32).clamp_min(eps)
+ V /= Vn
+ V *= g
+ weight[...] = V.to(base_dtype)
+ V = None
+ else:
+ weight *= g
+ # Recompose adapted weight; cast back to module dtype
+
+ # Merge DoRA bias delta safely
+ if bias_delta is not None:
+ if bias is None:
+ bias = bias_delta
+ else:
+ bias = bias.clone() if original_bias else bias
+ bias.add_(bias_delta)
+
+ return weight, bias
+
+
+
 def _lora_linear_forward(self, model, submodule, loras_data, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
 weight = submodule.weight
+ bias = submodule.bias
 active_adapters = model._loras_active_adapters
 loras_scaling = model._loras_scaling
+ any_dora = loras_data.get("any_dora", False)
 training = False

 dtype = weight.dtype
- if weight.shape[-1] < x.shape[-2]: # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
- bias = submodule.bias
+ if weight.shape[-1] < x.shape[-2] or any_dora: # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
+ original_bias = True
 original_bias = True
 if len(active_adapters) > 0:
 if isinstance(submodule, QModuleMixin):
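For reference, the new `_dora_linear_forward` follows the standard DoRA decomposition: the base weight is split into per-row magnitudes `g0` and a unit direction, the accumulated LoRA product `B @ A` perturbs the direction, and the stored `dora_scale` supplies the target magnitude, blended in proportion to the adapter scaling in the default "blend" mode. A compact, self-contained restatement of that math in plain torch, not the mmgp code itself:

import torch

def dora_merge(W0, A, B, g_abs, s=1.0, eps=1e-8):
    # W0: [out, in] base weight, A: [rank, in], B: [out, rank],
    # g_abs: [out, 1] magnitude from ".dora_scale", s: alpha/rank times the user multiplier.
    g0 = torch.linalg.vector_norm(W0.float(), dim=1, keepdim=True).clamp_min(eps)  # original row norms
    V = W0.float() / g0 + s * (B.float() @ A.float()) / g0                          # perturbed direction
    V = V / torch.linalg.vector_norm(V, dim=1, keepdim=True).clamp_min(eps)         # re-normalised rows
    g = g0 + s * (g_abs.float() - g0)                                               # "blend" magnitude
    return (g * V).to(W0.dtype)

# Example shapes only:
W = torch.randn(8, 16, dtype=torch.bfloat16)
A, B = torch.randn(4, 16), torch.randn(8, 4)
g_abs = torch.rand(8, 1) + 0.5
W_adapted = dora_merge(W, A, B, g_abs, s=0.25)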
@@ -2172,9 +2294,9 @@ class offload:
 data = loras_data.get(active_adapter + '_GPU', None)
 if data == None:
 continue
- lora_A_weight, lora_B_weight, diff_b, alpha = data
+ lora_A_weight, lora_B_weight, diff_b, g_abs, alpha = data
 scaling = self._get_lora_scaling(loras_scaling, model, active_adapter) * alpha
- if scaling == 0:
+ if scaling == 0 or g_abs is not None:
 continue
 if lora_A_weight != None:
 weight.addmm_(lora_B_weight, lora_A_weight, alpha= scaling )
@@ -2188,6 +2310,10 @@ class offload:
 original_bias = False
 bias.add_(diff_b, alpha=scaling)
 # base_weight += scaling * lora_B_weight @ lora_A_weight
+
+ if any_dora :
+ weight, bias = self._dora_linear_forward(model, submodule, loras_data, weight, bias, original_bias)
+
 if training:
 pass
 # result = torch.nn.functional.linear(dropout(x), base_weight, bias=submodule.bias)
@@ -2195,7 +2321,7 @@ class offload:
 result = torch.nn.functional.linear(x, weight, bias=bias)

 else:
- result = torch.nn.functional.linear(x, weight, bias=submodule.bias)
+ result = torch.nn.functional.linear(x, weight, bias=bias)

 if len(active_adapters) > 0:
 x = x.to(dtype)
@@ -2204,10 +2330,10 @@ class offload:
 data = loras_data.get(active_adapter + '_GPU', None)
 if data == None:
 continue
- lora_A, lora_B, diff_b, alpha = data
+ lora_A, lora_B, diff_b, g_abs, alpha = data
 # dropout = self.lora_dropout[active_adapter]
 scaling = self._get_lora_scaling(loras_scaling, model, active_adapter) * alpha
- if scaling == 0:
+ if scaling == 0 or g_abs is not None:
 continue

 if lora_A == None:
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
- Version: 3.6.2
+ Version: 3.6.4
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file


 <p align="center">
- <H2>Memory Management 3.6.2 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.6.4 for the GPU Poor by DeepBeepMeep</H2>
 </p>


Six files without changes.