mmgp 3.2.6.tar.gz → 3.2.8.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mmgp might be problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: mmgp
-Version: 3.2.6
+Version: 3.2.8
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -13,11 +13,10 @@ Requires-Dist: optimum-quanto
 Requires-Dist: accelerate
 Requires-Dist: safetensors
 Requires-Dist: psutil
-Requires-Dist: peft


 <p align="center">
-<H2>Memory Management 3.2.6 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.2.8 for the GPU Poor by DeepBeepMeep</H2>
 </p>


@@ -1,6 +1,6 @@

 <p align="center">
-<H2>Memory Management 3.2.6 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.2.8 for the GPU Poor by DeepBeepMeep</H2>
 </p>


@@ -1,6 +1,6 @@
 [project]
 name = "mmgp"
-version = "3.2.6"
+version = "3.2.8"
 authors = [
   { name = "deepbeepmeep", email = "deepbeepmeep@yahoo.com" },
 ]
@@ -13,7 +13,6 @@ dependencies = [
     "optimum-quanto",
     "accelerate",
     "safetensors",
-    "psutil",
-    "peft"
+    "psutil"
 ]

@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.2.6 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.2.8 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -63,25 +63,11 @@ import json
 import psutil
 from accelerate import init_empty_weights

-try:
-
-    from peft.tuners.tuners_utils import BaseTuner
-
-    from diffusers.utils.peft_utils import set_weights_and_activate_adapters, get_peft_kwargs
-except:
-    set_weights_and_activate_adapters = None
-    get_peft_kwargs = None
-    pass
-try:
-    from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict
-except:
-    inject_adapter_in_model = None
-    pass

 from mmgp import safetensors2
 from mmgp import profile_type

-from optimum.quanto import freeze, qfloat8, qint4 , qint8, quantize, QModuleMixin, QTensor, quantize_module, register_qmodule
+from optimum.quanto import freeze, qfloat8, qint4 , qint8, quantize, QModuleMixin, QLinear, QTensor, quantize_module, register_qmodule

 # support for Embedding module quantization that is not supported by default by quanto
 @register_qmodule(torch.nn.Embedding)
@@ -302,13 +288,115 @@ def _get_tensor_ref(p):
     return p.data_ptr()


-def _pin_to_memory(model, model_id, partialPinning = False, pinnedLora = True, verboseLevel = 1):
+# BIG_TENSOR_MAX_SIZE = 2**28 # 256 MB
+BIG_TENSOR_MAX_SIZE = 2**27 # 128 MB
+
+def _extract_tie_weights_from_sd(sd , sd_name, verboseLevel =1):
+    tied_weights = {}
+    tied_weights_count = 0
+    tied_weights_total = 0
+    tied_weights_last = None
+    ref_cache = {}
+
+    for n, p in sd.items():
+        ref = _get_tensor_ref(p)
+        match = ref_cache.get(ref, None)
+        if match != None:
+            match_name, match_size = match
+            tied_weights_count += 1
+            tied_weights_total += match_size
+            if verboseLevel >=1:
+                tied_weights_last = f"{match_name} <-> {n}"
+            tied_weights[n] = match_name
+        else:
+            length = torch.numel(p.data) * p.data.element_size()
+            ref_cache[ref] = (n, length)
+
+    if verboseLevel >=1 and tied_weights_count > 0:
+        if tied_weights_count == 1:
+            print(f"Tied weights of {tied_weights_total/ONE_MB:0.2f} MB detected: {tied_weights_last}")
+        else:
+            print(f"Found {tied_weights_count} tied weights for a total of {tied_weights_total/ONE_MB:0.2f} MB, last : {tied_weights_last}")
+
+def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
+    current_big_tensor_size = 0
+    big_tensor_no = 0
+    big_tensors_sizes = []
+    tensor_map_indexes = []
+    total_tensor_bytes = 0
+
+    for n, p in sd.items():
+        if tied_weights == None or not n in tied_weights :
+            length = torch.numel(p.data) * p.data.element_size()
+
+            if current_big_tensor_size + length > gig_tensor_size :
+                big_tensors_sizes.append(current_big_tensor_size)
+                current_big_tensor_size = 0
+                big_tensor_no += 1
+
+            itemsize = p.data.dtype.itemsize
+            if current_big_tensor_size % itemsize:
+                current_big_tensor_size += itemsize - current_big_tensor_size % itemsize
+            tensor_map_indexes.append((big_tensor_no, current_big_tensor_size, length ))
+            current_big_tensor_size += length
+
+            total_tensor_bytes += length
+
+    big_tensors_sizes.append(current_big_tensor_size)
+
+    big_tensors = []
+    last_big_tensor = 0
+    total = 0
+
+    for size in big_tensors_sizes:
+        try:
+            current_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True, device="cpu")
+            big_tensors.append(current_big_tensor)
+        except:
+            print(f"Unable to pin more tensors for '{sd_name}' as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
+            break
+
+        last_big_tensor += 1
+        total += size
+
+
+    tensor_no = 0
+    # prev_big_tensor = 0
+    q_name = None
+    for n, p in sd.items():
+        if tied_weights != None:
+            q_name = tied_weights.get(n,None)
+        if q_name != None:
+            q = sd[q_name]
+            p.data = q.data
+            assert p.data.is_pinned()
+            q = None
+        else:
+            big_tensor_no, offset, length = tensor_map_indexes[tensor_no]
+
+            if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
+                current_big_tensor = big_tensors[big_tensor_no]
+                length = torch.numel(p.data) * p.data.element_size()
+                q = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
+                torch.utils.swap_tensors(p, q)
+                del q
+            tensor_no += 1
+        del p
+    # global total_pinned_bytes
+    # total_pinned_bytes += total
+    gc.collect()
+
+    if verboseLevel >=1:
+        print(f"'{sd_name}' was pinned entirely to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
+
+    return
+
+
+def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = True, gig_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
     if partialPinning:
         towers_names, _ = _detect_main_towers(model)


-    # BIG_TENSOR_MAX_SIZE = 2**28 # 256 MB
-    BIG_TENSOR_MAX_SIZE = 2**27 # 128 MB
     current_big_tensor_size = 0
     big_tensor_no = 0
     big_tensors_sizes = []
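
Note on the new pinning path: 3.2.8 factors the buffer-packing logic out into `_pin_sd_to_memory`, which lays the tensors of a state dict end to end (each offset aligned to its element size) inside a few large page-locked uint8 buffers, so later host-to-GPU copies can run asynchronously. A minimal sketch of that packing idea, assuming a plain PyTorch state dict; the `pin_state_dict` helper below is illustrative and not part of mmgp, which additionally handles tied weights and falls back gracefully when pinned memory runs out:

import torch

BLOCK_SIZE = 2**27  # same 128 MB cap as BIG_TENSOR_MAX_SIZE above

def pin_state_dict(sd, block_size=BLOCK_SIZE):
    # First pass: give every tensor a (buffer index, aligned offset, byte length).
    slots, sizes = {}, []
    buf_no, cursor = 0, 0
    for name, t in sd.items():
        nbytes = t.numel() * t.element_size()
        if cursor + nbytes > block_size:
            sizes.append(cursor)
            buf_no, cursor = buf_no + 1, 0
        if cursor % t.element_size():
            cursor += t.element_size() - cursor % t.element_size()
        slots[name] = (buf_no, cursor, nbytes)
        cursor += nbytes
    sizes.append(cursor)
    # One page-locked uint8 buffer per block.
    buffers = [torch.empty(s, dtype=torch.uint8, pin_memory=True) for s in sizes]
    # Second pass: copy each tensor into its slice and keep the pinned view.
    for name, t in sd.items():
        b, off, nbytes = slots[name]
        view = buffers[b][off:off + nbytes].view(t.dtype).reshape(t.shape)
        view.copy_(t)
        sd[name] = view  # pinned, so .to("cuda", non_blocking=True) is asynchronous
    return sd, buffers
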
@@ -320,7 +408,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedLora = True, v
         include = True
         if partialPinning:
             include = any(k.startswith(pre) for pre in towers_names) if partialPinning else True
-        if include and not pinnedLora and ".lora_" in k:
+        if include and not pinnedPEFTLora and ".lora_" in k:
             include = False

         if include:
@@ -368,7 +456,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedLora = True, v
             length = torch.numel(p.data) * p.data.element_size()

             ref_cache[ref] = (n, length)
-            if current_big_tensor_size + length > BIG_TENSOR_MAX_SIZE:
+            if current_big_tensor_size + length > gig_tensor_size :
                 big_tensors_sizes.append(current_big_tensor_size)
                 current_big_tensor_size = 0
                 big_tensor_no += 1
@@ -463,7 +551,6 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedLora = True, v
             else:
                 length = torch.numel(p.data) * p.data.element_size()
                 p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
-                p.aaaaa = n
             tensor_no += 1
         del p
     global total_pinned_bytes
@@ -488,7 +575,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.2.6) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.2.8) by DeepBeepMeep ************{ENDC}{UNBOLD}")

 def _extract_num_from_str(num_in_str):
     size = len(num_in_str)
@@ -771,167 +858,66 @@ def split_linear_modules(model, map ):

         delattr(parent_module, module_suffix)

-def _lora_linear_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
-    self._check_forward_args(x, *args, **kwargs)
-    adapter_names = kwargs.pop("adapter_names", None)
-    if self.disable_adapters:
-        if self.merged:
-            self.unmerge()
-        result = self.base_layer(x, *args, **kwargs)
-    elif adapter_names is not None:
-        result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs)
-    elif self.merged:
-        result = self.base_layer(x, *args, **kwargs)
-    else:
-        def get_scaling(active_adapter):
-            scaling_dict = shared_state.get("_lora_scaling", None)
-            if scaling_dict == None:
-                return self.scaling[active_adapter]
-            scaling_list = scaling_dict[active_adapter]
-            if isinstance(scaling_list, list):
-                step_no =shared_state.get("_lora_step_no", 0)
-                return scaling_list[step_no]
-            else:
-                return float(scaling_list)
-
-        base_weight = self.base_layer.weight
-        new_weights = not isinstance(self.base_layer, QModuleMixin)
-        if base_weight.shape[-1] < x.shape[-2] : # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
-            for active_adapter in self.active_adapters:
-                if active_adapter not in self.lora_A.keys():
-                    continue
-                if self.use_dora[active_adapter]:
-                    raise Exception("Dora not yet supported by mmgp")
-
-                lora_A = self.lora_A[active_adapter]
-                lora_B = self.lora_B[active_adapter]
-                dropout = self.lora_dropout[active_adapter]
-                scaling = get_scaling(active_adapter)
-                lora_A_weight = lora_A.weight
-                lora_B_weight = lora_B.weight
-                if new_weights or True:
-                    base_weight = torch.addmm(base_weight, lora_B_weight, lora_A_weight, alpha= scaling )
-                    # base_weight = base_weight + scaling * lora_B_weight @ lora_A_weight
-                else:
-                    base_weight.addmm_(lora_B_weight, lora_A_weight, alpha= scaling )
-                    # base_weight += scaling * lora_B_weight @ lora_A_weight
-                new_weights = False
-
-            if self.training:
-                result = torch.nn.functional.linear(dropout(x), base_weight, bias=self.base_layer.bias)
-            else:
-                result = torch.nn.functional.linear(x, base_weight, bias=self.base_layer.bias)
-            torch_result_dtype = result.dtype
-
-        else:
-            result = self.base_layer(x, *args, **kwargs)
-            torch_result_dtype = result.dtype
-            x = x.to(torch.bfloat16)
-
-            for active_adapter in self.active_adapters:
-                if active_adapter not in self.lora_A.keys():
-                    continue
-                lora_A = self.lora_A[active_adapter]
-                lora_B = self.lora_B[active_adapter]
-                dropout = self.lora_dropout[active_adapter]
-                scaling = get_scaling(active_adapter)
-                x = x.to(lora_A.weight.dtype)
-
-                if not self.use_dora[active_adapter]:
-                    if self.training:
-                        y = lora_A(dropout(x))
-                    else:
-                        y = lora_A(x)
-
-                    y = lora_B(y)
-                    y*= scaling
-                    result+= y
-                    del lora_A, lora_B, y
-                    # result = result + lora_B(lora_A(dropout(x))) * scaling
-                else:
-                    if isinstance(dropout, torch.nn.Identity) or not self.training:
-                        base_result = result
-                    else:
-                        x = dropout(x)
-                        base_result = None
-
-                    result = result + self.lora_magnitude_vector[active_adapter](
-                        x,
-                        lora_A=lora_A,
-                        lora_B=lora_B,
-                        scaling=scaling,
-                        base_layer=self.get_base_layer(),
-                        base_result=base_result,
-                    )
-
-        result = result.to(torch_result_dtype)
-        return result
-
-def _inject_adapter(
-    self, model: torch.nn.Module, adapter_name: str, autocast_adapter_dtype: bool = True, low_cpu_mem_usage: bool = False
-) -> None:
-
-    def _get_submodules(model, key):
-        parent = model.get_submodule(".".join(key.split(".")[:-1]))
-        target_name = key.split(".")[-1]
-        target = model.get_submodule(key)
-        return parent, target, target_name
-
-    peft_config = self.peft_config[adapter_name]
-    self._check_new_adapter_config(peft_config)
-
-    model_config = self.get_model_config(model)
-
-    peft_config = self._prepare_adapter_config(peft_config, model_config)

-    self._prepare_model(peft_config, model)
-
-    target_modules = peft_config.target_modules.copy()
-
-    # unexpected_modules = []
-    for key, target in model.named_modules():
-        if not key:
-            continue
-        if key in target_modules:
-            target_modules.remove(key)
-            self.targeted_module_names.append(key)
-            # pos = key.rfind(".")
-            # parent = key[:pos]
-            # target_name = key[pos+1:]
-            parent, target, target_name = _get_submodules(model, key)
-            with init_empty_weights():
-                self._create_and_replace(peft_config, adapter_name, target, target_name, parent, current_key=key)
-
-    self.set_adapter(self.active_adapters)
-    self._mark_only_adapters_as_trainable(model)
-
-    return target_modules
-
-def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, split_linear_modules_map = None, preprocess_sd = None, verboseLevel = -1,):
+def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, check_only = False, ignore_model_variations = False, pinnedLora = False, split_linear_modules_map = None, preprocess_sd = None, verboseLevel = -1,):
     verboseLevel = _compute_verbose_level(verboseLevel)
+    modules_dict = {k: v for k,v in model.named_modules()}
+
+    if not check_only:
+        loras_model_data = dict()
+        model._loras_model_data = loras_model_data
+        loras_active_adapters = set()
+        model._loras_active_adapters = loras_active_adapters
+        loras_scaling = dict()
+        model._loras_scaling = loras_scaling
+        loras_tied_weights = dict()
+        model._loras_tied_weights = loras_tied_weights

-    if inject_adapter_in_model == None or set_weights_and_activate_adapters == None or get_peft_kwargs == None:
-        raise Exception("Unable to load Lora, missing 'peft' and / or 'diffusers' modules")
-
-    from peft.tuners.lora import Linear
-    Linear.forward = _lora_linear_forward
-    BaseTuner.inject_adapter = _inject_adapter
+    CrLf = '\r\n'
+    error_msg = ""
+    def append(source, text ):
+        if len(source) == 0:
+            return text
+        else:
+            return source + CrLf + text
+
+    def trunc(text, sz):
+        if len(text) < sz:
+            return str(text)
+        else:
+            return str(text)[0:sz] + '...'

     if not isinstance(lora_path, list):
         lora_path = [lora_path]

     if lora_multi is None:
         lora_multi = [1. for _ in lora_path]
-
+    loras_nos = []
+    loras_multi = []
+    new_lora_path = []
+    errors = []
+    adapters = {}
+    adapter_no = 0
     for i, path in enumerate(lora_path):
-        adapter_name = str(i)
-
+        adapter_name = str(adapter_no)
+        error_msg = ""
+        if not os.path.isfile(path):
+            error_msg = f"Lora '{path}' was not found"
+            errors.append((path, error_msg))
+            print(error_msg)
+            continue
+        fail = False
+        skip = False
         state_dict = safetensors2.torch_load_file(path)
+
+
+
+
         if preprocess_sd != None:
             state_dict = preprocess_sd(state_dict)

         if split_linear_modules_map != None:
-            new_state_dict = {}
+            new_state_dict = dict()
             targets_A = { "."+k+".lora_A.weight" : k for k in split_linear_modules_map }
             targets_B = { "."+k+".lora_B.weight" : k for k in split_linear_modules_map }
             for module_name, module_data in state_dict.items():
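
The reworked loader no longer goes through peft: it gains `check_only`, `ignore_model_variations` and `pinnedLora` flags, validates every key against the model's own Linear modules, returns the list of lora files that were accepted and records failures on `model._loras_errors`. A hedged usage sketch (the pipeline and file names are placeholders):

from mmgp import offload

transformer = pipe.transformer   # assumed: a diffusers-style pipeline already managed by offload
accepted = offload.load_loras_into_model(
    transformer,
    ["loras/style_a.safetensors", "loras/style_b.safetensors"],   # placeholder files
    lora_multi=[1.0, 0.7],
    check_only=False,               # True only validates keys/shapes without attaching anything
    ignore_model_variations=False,  # True skips (rather than rejects) loras built for a model variant
    pinnedLora=False,               # True pins each accepted lora state dict to reserved RAM
)
print(accepted)                     # paths that passed validation
print(transformer._loras_errors)    # (path, message) pairs for the ones that did not
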
@@ -961,82 +947,158 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
                     new_state_dict[module_name] = module_data
             state_dict = new_state_dict
             del new_state_dict
+        # tied_weights = _extract_tie_weights_from_sd(state_dict, path) # to do
+
+        clean_up = False
+        first_key = next(iter(state_dict), None)
+        if first_key == None:
+            msg = f"Empty Lora '{path}'"
+            error_msg = append(error_msg, msg)
+            fail = True
+
+        if not fail:
+            pos = first_key.find(".")
+            prefix = first_key[0:pos]
+            if prefix not in ["diffusion_model", "transformer"]:
+                msg = f"No compatible weight was found in Lora file '{path}'. Please check that it is compatible with the Diffusers format."
+                error_msg = append(error_msg, msg)
+                fail = True
+
+        if not fail:
+
+            state_dict = { k[ len(prefix) + 1:]: v for k, v in state_dict.items() if k.startswith(prefix) }
+            clean_up = True
+
+            keys = list(state_dict.keys())
+
+            lora_alphas = {}
+            for k in keys:
+                if "alpha" in k:
+                    alpha_value = state_dict.pop(k)
+                    if torch.is_tensor(alpha_value):
+                        alpha_value = float(alpha_value.item())
+                    lora_alphas[k] = alpha_value
+
+            invalid_keys = []
+            unexpected_keys = []
+            for k, v in state_dict.items():
+                pos = k.rfind(".lora_")
+                if pos <=0:
+                    invalid_keys.append(k)
+                    continue
+                module_name = k[ : pos]
+                lora_key = k[ pos+1:]
+                lora_A = None
+                lora_B = None
+                if lora_key == "lora_A.weight":
+                    lora_A = v
+                elif lora_key == "lora_B.weight":
+                    lora_B = v
+                else:
+                    invalid_keys.append(k)
+                    continue

-        keys = list(state_dict.keys())
-        if len(keys) == 0:
-            raise Exception(f"Empty Lora '{path}'")
-
-        network_alphas = {}
-        for k in keys:
-            if "alpha" in k:
-                alpha_value = state_dict.pop(k)
-                if not ( (torch.is_tensor(alpha_value) and torch.is_floating_point(alpha_value)) or isinstance(
-                    alpha_value, float
-                )):
-                    network_alphas[k] = torch.tensor( float(alpha_value.item() ) )
-
-        pos = keys[0].find(".")
-        prefix = keys[0][0:pos]
-        if not any( prefix.startswith(some_prefix) for some_prefix in ["diffusion_model", "transformer"]):
-            msg = f"No compatible weight was found in Lora file '{path}'. Please check that it is compatible with the Diffusers format."
-            raise Exception(msg)
-
-        transformer = model
-
-        transformer_keys = [k for k in keys if k.startswith(prefix)]
-        state_dict = {
-            k.replace(f"{prefix}.", ""): v for k, v in state_dict.items() if k in transformer_keys
-        }
-
-        sd_keys = state_dict.keys()
-        if len(sd_keys) == 0:
-            print(f"No compatible weight was found in Lora file '{path}'. Please check that it is compatible with the Diffusers format.")
-            return
-
-        # is_correct_format = all("lora" in key for key in state_dict.keys())
-
-        # check with first key if is not in peft format
-        # first_key = next(iter(state_dict.keys()))
-        # if "lora_A" not in first_key:
-        #     state_dict = convert_unet_state_dict_to_peft(state_dict)
+                module = modules_dict.get(module_name, None)
+                if module == None:
+                    unexpected_keys.append(k)
+                    continue
+                if not isinstance(module, (QLinear, torch.nn.Linear)):
+                    msg = f"Lora '{path}' contains a non linear layer '{k}'"
+                    error_msg = append(error_msg, msg)
+                    fail = True
+                    break
+                module_shape = module.weight.shape
+                if lora_A != None:
+                    if module_shape[1] != v.shape[1]:
+                        if ignore_model_variations:
+                            skip = True
+                        else:
+                            msg = f"Lora '{path}': Lora A dimension is not compatible with model '{_get_module_name(model)}' (model = {module_shape[1]}, lora A = {v.shape[1]}). It is likely this Lora has been made for another version of this model."
+                            error_msg = append(error_msg, msg)
+                            fail = True
+                        break
+                if lora_B != None:
+                    if module_shape[0] != v.shape[0]:
+                        if ignore_model_variations:
+                            skip = True
+                        else:
+                            msg = f"Lora '{path}': Lora B dimension is not compatible with model '{_get_module_name(model)}' (model = {module_shape[0]}, lora B = {v.shape[0]}). It is likely this Lora has been made for another version of this model."
+                            error_msg = append(error_msg, msg)
+                            fail = True
+                        break
+                if not check_only:
+                    loras_module_data = loras_model_data.get(module, None)
+                    if loras_module_data == None:
+                        loras_module_data = dict()
+                        loras_model_data[module] = loras_module_data
+                    loras_adapter_data = loras_module_data.get(adapter_name, None)
+                    lora_A = None if lora_A == None else lora_A.to(torch.bfloat16)
+                    lora_B = None if lora_B == None else lora_B.to(torch.bfloat16)
+                    if loras_adapter_data == None:
+                        alpha = lora_alphas.get(k[:-len("lora_X.weight")] + "alpha", 1.)
+                        loras_adapter_data = [lora_A, lora_B, alpha]
+                        loras_module_data[adapter_name] = loras_adapter_data
+                    elif lora_A != None:
+                        loras_adapter_data[0] = lora_A
+                    else:
+                        loras_adapter_data[1] = lora_B
+                lora_A, lora_B, v, loras_module_data, loras_adapter_data = None, None, None, None, None
+            lora_alphas = None
+
+            if len(invalid_keys) > 0:
+                msg = "Lora '{path}' contains non Lora keys '{trunc(invalid_keys,200)}'"
+                error_msg = append(error_msg, msg)
+                fail = True
+            if len(unexpected_keys) > 0:
+                msg = f"Lora '{path}' contains unexpected module keys, it is likely that this Lora is for a different model : '{trunc(unexpected_keys,200)}'"
+                error_msg = append(error_msg, msg)
+                fail = True
+        if fail or skip:
+            if fail:
+                errors.append((path, error_msg))
+                print(error_msg)
+            if clean_up and not check_only:
+                for m,loras_module_data in loras_model_data.items():
+                    if adapter_name in loras_module_data:
+                        del loras_module_data[adapter_name]

-        if adapter_name in getattr(transformer, "peft_config", {}):
-            raise ValueError(
-                f"Adapter name {adapter_name} already in use in the transformer - please select a new adapter name."
-            )
+        else:
+            if not check_only:
+                # model._loras_tied_weights[adapter_name] = tied_weights
+                if pinnedLora:
+                    _pin_sd_to_memory(state_dict, path)

-        rank = {}
-        for key, val in state_dict.items():
-            if "lora_B" in key:
-                rank[key] = val.shape[1]
+            del state_dict

-        if network_alphas is not None and len(network_alphas) >= 1:
-            alpha_keys = [k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix]
-            network_alphas = {k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys}

-        lora_config_kwargs = get_peft_kwargs(rank, network_alpha_dict=network_alphas, peft_state_dict=state_dict)
-
-        lora_config = LoraConfig(**lora_config_kwargs)
-        peft_kwargs = {}
-        peft_kwargs["low_cpu_mem_usage"] = True
-        inject_adapter_in_model(lora_config, model, adapter_name=adapter_name, **peft_kwargs)
-
-        incompatible_keys = set_peft_model_state_dict(transformer, state_dict, adapter_name, **peft_kwargs)
-
-        warn_msg = ""
-        if incompatible_keys is not None:
-            # Check only for unexpected keys.
-            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
-            if unexpected_keys:
-                raise Exception(f"Lora '{path}' contains invalid keys '{unexpected_keys}'")
+            adapters[adapter_name] = path
+            loras_nos.append(adapter_name)
+            new_lora_path.append(path)
+            loras_multi.append(1.0 if i > (len(lora_multi) -1) else lora_multi[i])
+            pass
+            adapter_no += 1
+            if verboseLevel >=1:
+                if check_only:
+                    print(f"Lora '{path}' was found for model '{_get_module_name(model)}'")
+                else:
+                    print(f"Lora '{path}' was loaded in model '{_get_module_name(model)}'")

-        if verboseLevel >=1:
-            print(f"Lora '{path}' was loaded in model '{_get_module_name(model)}'")
+    model._loras_errors = errors
+    if not check_only:
+        model._loras_adapters = adapters
     if activate_all_loras:
-        set_weights_and_activate_adapters(model,[ str(i) for i in range(len(lora_multi))], lora_multi)
+        activate_loras(model, loras_nos, loras_multi)
+    return new_lora_path

-def set_step_no_for_lora(step_no):
-    shared_state["_lora_step_no"] = step_no
+def unload_loras_from_model(model):
+    model._loras_model_data = None
+    model._loras_errors = None
+    model._loras_adapters = None
+    model._loras_active_adapters = None
+    model._loras_scaling = None
+
+def set_step_no_for_lora(model, step_no):
+    model._lora_step_no = step_no


 def activate_loras(model, lora_nos, lora_multi = None ):
     if not isinstance(lora_nos, list):
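
Instead of injecting peft adapters, the loader now keeps its own bookkeeping directly on the model. The rough shape of that data, plus a hypothetical inspection helper (not part of mmgp), assuming a model loaded with load_loras_into_model:

# model._loras_model_data      : {linear_module: {adapter_name: [lora_A, lora_B, alpha], ...}, ...}
# model._loras_active_adapters : set of adapter names ("0", "1", ... in load order)
# model._loras_scaling         : {adapter_name: float or per-step list}
# model._loras_adapters        : {adapter_name: lora file path}
# model._loras_errors          : [(path, message), ...]

def describe_loras(model):
    # Hypothetical helper: list the adapters attached to each hooked Linear layer.
    for module, adapters in (getattr(model, "_loras_model_data", None) or {}).items():
        for name, data in adapters.items():
            if name.endswith("_GPU"):   # transient GPU copies cached while a block is loaded
                continue
            lora_A, lora_B, alpha = data
            if lora_A is not None:
                print(f"{module.__class__.__name__}: adapter {name}, rank {lora_A.shape[0]}, alpha {alpha}")
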
@@ -1046,15 +1108,13 @@ def activate_loras(model, lora_nos, lora_multi = None ):
     if lora_multi is None:
         lora_multi = [1. for _ in lora_nos]

-    lora_fake_scaling = [1. if isinstance(mult, list) else mult for mult in lora_multi ]
     lora_scaling_dict = {}
     for no, multi in zip(lora_nos, lora_multi):
         lora_scaling_dict[no] = multi

-    shared_state["_lora_scaling"] = lora_scaling_dict
-    shared_state["_lora_step_no"] = 0
-
-    set_weights_and_activate_adapters(model, lora_nos, lora_fake_scaling)
+    model._lora_step_no = 0
+    model._loras_active_adapters = set(lora_nos)
+    model._loras_scaling = lora_scaling_dict


 def move_loras_to_device(model, device="cpu" ):
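
Scaling is now resolved per step and per model: a multiplier may be a plain float or a list indexed by `model._lora_step_no`, which `set_step_no_for_lora` (taking the model rather than a shared state) updates between denoising steps. An illustrative sketch with placeholder names (`num_steps`, `denoise_step` and `latents` are not part of mmgp):

offload.activate_loras(transformer, ["0", "1"], [1.0, [1.0, 0.8, 0.5, 0.0]])  # adapter "1" fades out
for step_no in range(num_steps):
    offload.set_step_no_for_lora(transformer, step_no)
    latents = denoise_step(latents, step_no)
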
@@ -1399,12 +1459,12 @@ class offload:
         self.loaded_blocks = {}
         self.prev_blocks_names = {}
         self.next_blocks_names = {}
-        self.lora_parents = {}
         self.preloaded_blocks_per_model = {}
         self.default_stream = torch.cuda.default_stream(torch.device("cuda")) # torch.cuda.current_stream()
         self.transfer_stream = torch.cuda.Stream()
         self.async_transfers = False
         self.parameters_ref = {}
+
         global last_offload_obj
         last_offload_obj = self

@@ -1428,15 +1488,12 @@
             self.next_blocks_names[prev_entry_name] = entry_name
         bef = blocks_params_size

-        lora_name = None
-        if self.lora_parents.get(submodule, None) != None:
-            lora_name = str(submodule_name[ submodule_name.rfind(".") + 1: ] )
         for k,p in submodule.named_parameters(recurse=False):
             param_size = 0
             ref = _get_tensor_ref(p)
             tied_param = self.parameters_ref.get(ref, None)
             if isinstance(p, QTensor):
-                blocks_params.append( (submodule, k, p, False, tied_param, lora_name ) )
+                blocks_params.append( (submodule, k, p, False, tied_param ) )

                 if p._qtype == qint4:
                     if hasattr(p,"_scale_shift"):
@@ -1450,7 +1507,7 @@
                         param_size += torch.numel(p._scale) * p._scale.element_size()
                     param_size += torch.numel(p._data) * p._data.element_size()
             else:
-                blocks_params.append( (submodule, k, p, False, tied_param, lora_name) )
+                blocks_params.append( (submodule, k, p, False, tied_param) )
                 param_size += torch.numel(p.data) * p.data.element_size()


@@ -1459,7 +1516,7 @@
                 self.parameters_ref[ref] = (submodule, k)

         for k, p in submodule.named_buffers(recurse=False):
-            blocks_params.append( (submodule, k, p, True, None, lora_name) )
+            blocks_params.append( (submodule, k, p, True, None) )
             blocks_params_size += p.data.nbytes

         aft = blocks_params_size
@@ -1484,6 +1541,19 @@
                 return False
         return True

+    def _move_loras(self, loras_active_adapters, loras_modules, to_GPU):
+        for name, lora_module in loras_modules.items():
+            for adapter in loras_active_adapters:
+                lora_data = lora_module.get(adapter, None)
+                if lora_data == None:
+                    continue
+                lora_A, lora_B, alpha = lora_data
+                key = adapter + '_GPU'
+                if to_GPU:
+                    lora_module[key] = [lora_A.cuda(), lora_B.cuda(), alpha]
+                elif key in lora_module:
+                    del lora_module[key]
+
     @torch.compiler.disable()
     def gpu_load_blocks(self, model_id, blocks_name, preload = False):
         # cl = clock.start()
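
`_move_loras` mirrors block offloading for lora weights: when a block is loaded, the active adapters' A/B matrices are copied to the GPU and cached in the same per-module dict under an "<adapter>_GPU" key; unloading simply drops that key and keeps the CPU copy. Roughly, for a hooked Linear:

# After gpu_load_blocks() has processed a block, a hooked Linear's lora dict holds both copies:
#   {"0": [lora_A_cpu, lora_B_cpu, alpha], "0_GPU": [lora_A_cuda, lora_B_cuda, alpha]}
# gpu_unload_blocks() only removes the "*_GPU" entries, so the (possibly pinned) CPU tensors survive.
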
@@ -1492,12 +1562,17 @@
         entry_name = model_id if blocks_name is None else model_id + "/" + blocks_name

         def cpu_to_gpu(stream_to_use, blocks_params): #, record_for_stream = None
+            model = self.models[model_id]
+            loras_modules = {}
+            loras_active_adapters = getattr(model ,"_loras_active_adapters", None)
+            if loras_active_adapters == None or len(loras_active_adapters) == 0:
+                loras_model_data = None
+            else:
+                loras_model_data = getattr(model, "_loras_model_data", None)
+
             with torch.cuda.stream(stream_to_use):
                 for param in blocks_params:
-                    parent_module, n, p, is_buffer, tied_param, lora_name = param
-                    if lora_name != None:
-                        if not lora_name in self.lora_parents[parent_module].active_adapters:
-                            continue
+                    parent_module, n, p, is_buffer, tied_param = param

                     if tied_param != None:
                         tied_p = getattr( tied_param[0], tied_param[1])
@@ -1515,6 +1590,12 @@
                     if tied_param != None:
                         setattr( tied_param[0], tied_param[1], q)
                     del p, q
+                    if loras_model_data != None:
+                        lora_data = loras_model_data.get(parent_module, None)
+                        if lora_data != None:
+                            loras_modules[parent_module]= lora_data
+            if len(loras_modules) > 0:
+                self._move_loras(loras_active_adapters, loras_modules, True)

         loaded_block = self.loaded_blocks[model_id]

@@ -1575,14 +1656,31 @@
             print(f"Unloading model {blocks_name} ({model_name}) from GPU")

         blocks_params = self.blocks_of_modules[blocks_name]
+        model = self.models[model_id]
+        loras_modules = {}
+        loras_active_adapters = getattr(model ,"_loras_active_adapters", None)
+        if loras_active_adapters == None or len(loras_active_adapters) == 0 :
+            loras_model_data = None
+        else:
+            loras_model_data = getattr(model, "_loras_model_data", None)
+
         for param in blocks_params:
-            parent_module, n, p, is_buffer, _, _ = param
+            parent_module, n, p, is_buffer, _ = param
             if is_buffer:
                 q = torch.nn.Buffer(p)
             else:
                 q = torch.nn.Parameter(p , requires_grad=False)
             setattr(parent_module, n , q)
             del p, q
+
+            if loras_model_data != None:
+                lora_data = loras_model_data.get(parent_module, None)
+                if lora_data != None:
+                    loras_modules[parent_module]= lora_data
+
+        if len(loras_modules) > 0:
+            self._move_loras(loras_active_adapters, loras_modules, False)
+
         # cl.stop()
         # print(f"unload time: {cl.format_time_gap()}")

@@ -1670,6 +1768,92 @@

         return False

+    def _lora_linear_forward(self, model, submodule, loras_data, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+
+        def get_scaling(active_adapter):
+            scaling_list = loras_scaling[active_adapter]
+            if isinstance(scaling_list, list):
+                step_no =getattr(model, "_lora_step_no", 0)
+                return scaling_list[step_no]
+            else:
+                return float(scaling_list)
+
+        weight = submodule.weight
+
+        if loras_data == None:
+            return torch.nn.functional.linear(x, weight, bias=submodule.bias)
+
+        active_adapters = model._loras_active_adapters
+        loras_scaling = model._loras_scaling
+        training = False
+
+
+        if weight.shape[-1] < x.shape[-2] : # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
+            if len(active_adapters) > 0:
+                if isinstance(submodule, QModuleMixin):
+                    weight = weight.view(weight.shape) # get a persistent copy of the on the fly dequantized weights
+                else:
+                    weight = weight.clone()
+
+
+            for active_adapter in active_adapters:
+                data = loras_data.get(active_adapter + '_GPU', None)
+                if data == None:
+                    continue
+                lora_A_weight, lora_B_weight, alpha = data
+                scaling = get_scaling(active_adapter) * alpha
+                weight.addmm_(lora_B_weight, lora_A_weight, alpha= scaling )
+                # base_weight += scaling * lora_B_weight @ lora_A_weight
+
+            if training:
+                pass
+                # result = torch.nn.functional.linear(dropout(x), base_weight, bias=submodule.bias)
+            else:
+                result = torch.nn.functional.linear(x, weight, bias=submodule.bias)

+        else:
+            result = torch.nn.functional.linear(x, weight, bias=submodule.bias)
+
+            if len(active_adapters) > 0:
+                x = x.to(torch.bfloat16)
+
+            for active_adapter in active_adapters:
+                data = loras_data.get(active_adapter + '_GPU', None)
+                if data == None:
+                    continue
+                lora_A, lora_B, alpha = data
+                # dropout = self.lora_dropout[active_adapter]
+                scaling = get_scaling(active_adapter) * alpha
+                x = x.to(lora_A.dtype)
+
+                if training:
+                    pass
+                    # y = lora_A(dropout(x))
+                else:
+                    y = torch.nn.functional.linear(x, lora_A, bias=None)
+
+                y = torch.nn.functional.linear(y, lora_B, bias=None)
+                y*= scaling
+                result+= y
+                del y
+
+        return result
+
+
+    def hook_lora_linear(self, submodule, current_model, model_id, submodule_name):
+        old_forward = submodule.forward
+        def lora_linear_forward(module, *args, **kwargs):
+            loras_model_data = getattr(current_model, "_loras_model_data", None)
+            loras_data = None
+            if loras_model_data != None:
+                loras_data = loras_model_data.get(submodule, None)
+            if loras_data == None:
+                return old_forward(*args, **kwargs)
+            else:
+                return self._lora_linear_forward(current_model, submodule, loras_data, *args, **kwargs)
+
+        return functools.update_wrapper(functools.partial(lora_linear_forward, submodule), old_forward)
+
     def ensure_model_loaded(self, model_id):
         if model_id in self.active_models_ids:
             return
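
The new `_lora_linear_forward` picks between two mathematically equivalent paths: when the token dimension of the input exceeds the layer width it folds each adapter into a temporary copy of the base weight (one `addmm_` per adapter, W += s * B @ A) and runs a single matmul; otherwise it keeps the base weight untouched and adds the low-rank branch per adapter. A distilled sketch of just that decision; the real method also handles quantized weights, the GPU-cached copies and per-step scaling:

import torch

def lora_linear(x, weight, bias, adapters, scalings):
    # adapters: {name: (A, B, alpha)} with A of shape (r, in) and B of shape (out, r)
    if weight.shape[-1] < x.shape[-2]:
        # Large input: fold every adapter into one temporary weight, then a single matmul.
        w = weight.clone()
        for name, (A, B, alpha) in adapters.items():
            w.addmm_(B, A, alpha=scalings[name] * alpha)   # w += s * B @ A
        return torch.nn.functional.linear(x, w, bias)
    # Small input: leave the base weight untouched and add each low-rank branch.
    out = torch.nn.functional.linear(x, weight, bias)
    for name, (A, B, alpha) in adapters.items():
        y = torch.nn.functional.linear(torch.nn.functional.linear(x, A), B)
        out = out + scalings[name] * alpha * y
    return out
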
@@ -1851,6 +2035,8 @@

         for model_id, model in self.models.items():
             move_loras_to_device(model, "cpu")
+            if hasattr(model, "_loras_model_data"):
+                unload_loras_from_model(model)

         self.models = None

@@ -1860,7 +2046,7 @@



-def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedLora = False, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
+def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
     """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
     pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
     quantizeTransformer: set True by default will quantize on the fly the video / image model
@@ -1912,7 +2098,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedLora = False, quant
     _welcome()
     if coTenantsMap != None:
         self.cotenants_map = coTenantsMap
-
+    if loras != None and isinstance(loras, str):
+        loras = [loras]
     self.models = models

     extraModelsToQuantize = extraModelsToQuantize if extraModelsToQuantize is not None else []
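
`offload.all()` now accepts `loras` (a model id, or list of ids, whose `torch.nn.Linear` layers should receive the lora-aware forward hook even before any lora is loaded) and renames `pinnedLora` to `pinnedPEFTLora`. A hedged usage sketch with a placeholder pipeline and file name:

from mmgp import offload

offload.all(
    pipe,                       # placeholder: a diffusers-style pipeline
    pinnedMemory=True,
    pinnedPEFTLora=False,       # renamed from pinnedLora; controls pinning of peft ".lora_" weights
    loras="transformer",        # model id(s) whose Linear layers get the lora-aware forward hook
    quantizeTransformer=True,
)
offload.load_loras_into_model(pipe.transformer, "loras/style.safetensors")   # placeholder file
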
@@ -2059,12 +2246,12 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedLora = False, quant
                 if self.verboseLevel >=1:
                     print(f"Model '{model_id}' already pinned to reserved memory")
             else:
-                _pin_to_memory(current_model, model_id, partialPinning= partialPinning, pinnedLora = pinnedLora, verboseLevel=verboseLevel)
-
+                _pin_to_memory(current_model, model_id, partialPinning= partialPinning, pinnedPEFTLora = pinnedPEFTLora, verboseLevel=verboseLevel)
+
         current_budget = model_budgets[model_id]
         cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False
         self.loaded_blocks[model_id] = None
-
+        any_lora = loras !=None and model_id in loras or getattr(current_model, "_loras_model_data", False)
         for submodule_name, submodule in current_model.named_modules():
             # create a fake 'accelerate' parameter so that the _execution_device property returns always "cuda"
             # (it is queried in many pipelines even if offloading is not properly implemented)
@@ -2096,7 +2283,10 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedLora = False, quant


             if hasattr(submodule, "forward"):
-                submodule_method = getattr(submodule, "forward")
+                if any_lora and isinstance(submodule, torch.nn.Linear):
+                    submodule_method = self.hook_lora_linear(submodule, current_model, model_id, submodule_name)
+                else:
+                    submodule_method = getattr(submodule, "forward")
                 if callable(submodule_method):
                     if len(submodule_name.split("."))==1:
                         self.hook_change_module(submodule, current_model, model_id, submodule_name, submodule_method)
@@ -2107,13 +2297,6 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedLora = False, quant

             self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name, submodule_name)

-            if hasattr(submodule, "active_adapters"):
-                for dictmodule in ["lora_A","lora_B"]:
-                    ssubmod = getattr(submodule, dictmodule, None)
-                    if ssubmod !=None:
-                        for k, loramod in ssubmod._modules.items():
-                            self.lora_parents[loramod] = submodule
-

         self.tune_preloading(model_id, current_budget, towers_names)

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: mmgp
-Version: 3.2.6
+Version: 3.2.8
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -13,11 +13,10 @@ Requires-Dist: optimum-quanto
 Requires-Dist: accelerate
 Requires-Dist: safetensors
 Requires-Dist: psutil
-Requires-Dist: peft


 <p align="center">
-<H2>Memory Management 3.2.6 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.2.8 for the GPU Poor by DeepBeepMeep</H2>
 </p>


@@ -3,4 +3,3 @@ optimum-quanto
 accelerate
 safetensors
 psutil
-peft