mmgp 3.3.2.tar.gz → 3.3.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mmgp has been flagged; more details are available on the registry page.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mmgp
- Version: 3.3.2
+ Version: 3.3.4
  Summary: Memory Management for the GPU Poor
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
  License: GNU GENERAL PUBLIC LICENSE
@@ -17,7 +17,7 @@ Dynamic: license-file


  <p align="center">
- <H2>Memory Management 3.3.2 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.3.4 for the GPU Poor by DeepBeepMeep</H2>
  </p>

@@ -1,6 +1,6 @@

  <p align="center">
- <H2>Memory Management 3.3.2 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.3.4 for the GPU Poor by DeepBeepMeep</H2>
  </p>

@@ -1,6 +1,6 @@
  [project]
  name = "mmgp"
- version = "3.3.2"
+ version = "3.3.4"
  authors = [
    { name = "deepbeepmeep", email = "deepbeepmeep@yahoo.com" },
  ]
@@ -1,4 +1,4 @@
- # ------------------ Memory Management 3.3.2 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+ # ------------------ Memory Management 3.3.4 for the GPU Poor by DeepBeepMeep (mmgp)------------------
  #
  # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
  # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -401,7 +401,7 @@ def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TE
      return


- def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = True, gig_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
+ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = True, big_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):

      global max_pinnable_bytes, total_pinned_bytes
      if max_pinnable_bytes > 0 and max_pinnable_bytes >= max_pinnable_bytes:
@@ -474,7 +474,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
              length = torch.numel(p.data) * p.data.element_size()

              ref_cache[ref] = (n, length)
-             if current_big_tensor_size + length > gig_tensor_size :
+             if current_big_tensor_size + length > big_tensor_size and current_big_tensor_size !=0 :
                  big_tensors_sizes.append(current_big_tensor_size)
                  current_big_tensor_size = 0
                  big_tensor_no += 1
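For context: `_pin_to_memory` packs the model's tensors into a handful of large pinned-RAM blocks, and the new `current_big_tensor_size !=0` guard keeps an oversized tensor from first closing an empty block. A minimal, self-contained sketch of that greedy packing rule, with a made-up block limit and made-up tensor sizes (not the package's actual code):

    # Greedy packing of tensor byte sizes into "big tensor" blocks.
    # BLOCK_LIMIT and the sizes below are hypothetical, for illustration only.
    BLOCK_LIMIT = 128 * 1024 * 1024  # 128 MB per pinned block

    def plan_blocks(tensor_sizes, block_limit=BLOCK_LIMIT):
        blocks, current = [], 0
        for size in tensor_sizes:
            # Close the current block only if it is non-empty, so a tensor
            # larger than the limit gets a block of its own instead of
            # leaving an empty block behind (the 3.3.4 change).
            if current + size > block_limit and current != 0:
                blocks.append(current)
                current = 0
            current += size
        if current != 0:
            blocks.append(current)
        return blocks

    sizes_mb = [50, 60, 200, 30, 40]
    print([b >> 20 for b in plan_blocks([s << 20 for s in sizes_mb])])  # -> [110, 200, 70]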
@@ -498,28 +498,11 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
      big_tensors_sizes.append(current_big_tensor_size)

      big_tensors = []
-     last_big_tensor = 0
      total = 0


      failed_planned_allocation = False

-     # for size in big_tensors_sizes:
-     #     try:
-     #         # if total > 7000 * ONE_MB:
-     #         #     raise Exception ("test no more reserved RAM")
-     #         current_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True, device="cpu")
-     #         big_tensors.append(current_big_tensor)
-     #     except:
-     #         print(f"Unable to pin more tensors for this model as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
-     #         max_pinnable_bytes = total + total_pinned_bytes
-     #         failed_planned_allocation = True
-     #         break
-
-     #     last_big_tensor += 1
-     #     total += size
-
-
      gc.collect()

      last_allocated_big_tensor = -1
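The deleted lines were already commented out; 3.3.4 simply removes this dead pre-allocation loop and keeps the on-demand path that tracks `last_allocated_big_tensor`. As a rough illustration of the underlying operation (a sketch, not mmgp's code), one pinned CPU block can be reserved with the same `torch.empty(..., pin_memory=True)` call shape, with a fallback when the OS refuses to page-lock more memory:

    import torch

    def try_alloc_pinned_block(size_bytes):
        """Reserve one page-locked CPU block, falling back to pageable RAM."""
        try:
            # Same call shape as the removed pre-allocation loop above.
            return torch.empty(size_bytes, dtype=torch.uint8, pin_memory=True, device="cpu"), True
        except RuntimeError:
            # Pinning typically fails with a RuntimeError once too much
            # host memory is already page-locked.
            return torch.empty(size_bytes, dtype=torch.uint8, device="cpu"), False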
@@ -561,13 +544,6 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru

          total += size

-         # if big_tensor_no != prev_big_tensor:
-         #     gc.collect()
-         #     prev_big_tensor = big_tensor_no
-         # match_param, match_isbuffer = tied_weights.get(n, (None, False))
-         # if match_param != None:
-
-         # if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
          current_big_tensor = big_tensors[big_tensor_no]
          if is_buffer :
              _force_load_buffer(p) # otherwise potential memory leak
@@ -600,9 +576,9 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru

      if verboseLevel >=1:
          if partialPinning or failed_planned_allocation:
-             print(f"The model was partially pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
+             print(f"The model was partially pinned to reserved RAM: {last_allocated_big_tensor + 1} large blocks spread across {total/ONE_MB:.2f} MB")
          else:
-             print(f"The whole model was pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
+             print(f"The whole model was pinned to reserved RAM: {last_allocated_big_tensor + 1} large blocks spread across {total/ONE_MB:.2f} MB")

      model._already_pinned = True

@@ -615,7 +591,7 @@ def _welcome():
      if welcome_displayed:
          return
      welcome_displayed = True
-     print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.3.3) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+     print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.3.4) by DeepBeepMeep ************{ENDC}{UNBOLD}")

  def _extract_num_from_str(num_in_str):
      size = len(num_in_str)
@@ -901,17 +877,15 @@ def split_linear_modules(model, map ):

  def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, check_only = False, ignore_model_variations = False, pinnedLora = False, split_linear_modules_map = None, preprocess_sd = None, verboseLevel = -1,):
      verboseLevel = _compute_verbose_level(verboseLevel)
-     modules_dict = {k: v for k,v in model.named_modules()}

+     loras_model_data = getattr(model, "_loras_model_data", None)
+     if loras_model_data == None:
+         raise Exception(f"No Loras has been declared for this model while creating the corresponding offload object")
+
      if not check_only:
-         loras_model_data = dict()
-         model._loras_model_data = loras_model_data
-         loras_active_adapters = set()
-         model._loras_active_adapters = loras_active_adapters
-         loras_scaling = dict()
-         model._loras_scaling = loras_scaling
-         loras_tied_weights = dict()
-         model._loras_tied_weights = loras_tied_weights
+         unload_loras_from_model(model)
+
+     modules_dict = {k: v for k,v in model.named_modules()}

      CrLf = '\r\n'
      error_msg = ""
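In 3.3.4, `load_loras_into_model` no longer creates the per-model LoRA containers itself: it requires `_loras_model_data` to have been attached when the offload object was built (see the `@@ -2295` hunk further down) and raises otherwise, and a non-check-only call first resets previously loaded adapters through `unload_loras_from_model`. A self-contained toy mirror of that contract, with every name other than `_loras_model_data` invented for illustration:

    class Model:
        pass

    def attach_lora_support(model):
        # In mmgp this happens while the offload object is built.
        model._loras_model_data = {}

    def load_adapters(model, adapters):
        # Mirrors the new guard: fail fast if LoRA support was never declared.
        data = getattr(model, "_loras_model_data", None)
        if data is None:
            raise RuntimeError("declare LoRA support when creating the offload object")
        data.clear()          # reset previous adapters in place, like unload_loras_from_model
        data.update(adapters)

    m = Model()
    attach_lora_support(m)
    load_adapters(m, {"linear1": {"my_adapter": "weights would go here"}})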
@@ -949,10 +923,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
              continue
          fail = False
          skip = False
-         state_dict = safetensors2.torch_load_file(path)
-
-
-
+         state_dict = safetensors2.torch_load_file(path, writable_tensors= False)

          if preprocess_sd != None:
              state_dict = preprocess_sd(state_dict)
@@ -1069,9 +1040,10 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
                  break
          if not check_only:
              loras_module_data = loras_model_data.get(module, None)
-             if loras_module_data == None:
-                 loras_module_data = dict()
-                 loras_model_data[module] = loras_module_data
+             assert loras_module_data != None
+             # if loras_module_data == None:
+             #     loras_module_data = dict()
+             #     loras_model_data[module] = loras_module_data
              loras_adapter_data = loras_module_data.get(adapter_name, None)
              lora_A = None if lora_A == None else lora_A.to(torch.bfloat16)
              lora_B = None if lora_B == None else lora_B.to(torch.bfloat16)
@@ -1132,12 +1104,17 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
      return new_lora_path

  def unload_loras_from_model(model):
-     model._loras_model_data = None
+     for _, v in model._loras_model_data.items():
+         v.clear()
+
+     model._loras_active_adapters = set()
+     model._loras_scaling = dict()
+     model._loras_tied_weights = dict()
      model._loras_errors = None
      model._loras_adapters = None
-     model._loras_active_adapters = None
      model._loras_scaling = None

+
  def set_step_no_for_lora(model, step_no):
      model._lora_step_no = step_no

@@ -1881,14 +1858,14 @@ class offload:
              return result


-     def hook_lora_linear(self, submodule, current_model, model_id, submodule_name):
+     def hook_lora_linear(self, submodule, current_model, model_id, loras_model_data, submodule_name):
          old_forward = submodule.forward
+
+         loras_data = {}
+         loras_model_data[submodule] = loras_data
+
          def lora_linear_forward(module, *args, **kwargs):
-             loras_model_data = getattr(current_model, "_loras_model_data", None)
-             loras_data = None
-             if loras_model_data != None:
-                 loras_data = loras_model_data.get(submodule, None)
-             if loras_data == None:
+             if len(loras_data) == 0:
                  return old_forward(*args, **kwargs)
              else:
                  return self._lora_linear_forward(current_model, submodule, loras_data, *args, **kwargs)
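`hook_lora_linear` now creates the per-module dict once, at hook time, and the wrapped forward only checks whether that captured dict is empty instead of re-reading `_loras_model_data` on every call. This is also why `unload_loras_from_model` (earlier in this diff) clears each dict in place rather than rebinding the attribute: the closures keep referencing the original dict objects. A small self-contained sketch of the pattern, with names that are illustrative rather than mmgp's:

    import torch

    def hook_linear(linear, registry):
        """Wrap a Linear's forward; the LoRA dict is captured once by the closure."""
        old_forward = linear.forward
        lora_data = {}                  # captured below
        registry[linear] = lora_data    # loader/unloader mutate this same dict

        def forward(*args, **kwargs):
            if not lora_data:           # cheap emptiness check, no per-call lookups
                return old_forward(*args, **kwargs)
            # ... apply the adapters stored in lora_data here ...
            return old_forward(*args, **kwargs)

        linear.forward = forward

    registry = {}
    layer = torch.nn.Linear(4, 4)
    hook_linear(layer, registry)
    layer(torch.randn(1, 4))            # plain forward while the dict is empty
    registry[layer].clear()             # "unloading" in place stays visible to the closure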
@@ -2295,7 +2272,10 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
          current_budget = model_budgets[model_id]
          cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False
          self.loaded_blocks[model_id] = None
-         any_lora = loras !=None and model_id in loras or getattr(current_model, "_loras_model_data", False)
+         any_lora = loras !=None and model_id in loras
+         if any_lora:
+             loras_model_data = {}
+             current_model._loras_model_data = loras_model_data
          for submodule_name, submodule in current_model.named_modules():
              # create a fake 'accelerate' parameter so that the _execution_device property returns always "cuda"
              # (it is queried in many pipelines even if offloading is not properly implemented)
@@ -2328,7 +2308,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p

              if hasattr(submodule, "forward"):
                  if any_lora and isinstance(submodule, torch.nn.Linear):
-                     submodule_method = self.hook_lora_linear(submodule, current_model, model_id, submodule_name)
+                     submodule_method = self.hook_lora_linear(submodule, current_model, model_id, loras_model_data, submodule_name)
                  else:
                      submodule_method = getattr(submodule, "forward")
                  if callable(submodule_method):
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mmgp
- Version: 3.3.2
+ Version: 3.3.4
  Summary: Memory Management for the GPU Poor
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
  License: GNU GENERAL PUBLIC LICENSE
@@ -17,7 +17,7 @@ Dynamic: license-file


  <p align="center">
- <H2>Memory Management 3.3.2 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.3.4 for the GPU Poor by DeepBeepMeep</H2>
  </p>

6 files without changes