mmgp-3.2.2-py3-none-any.whl → mmgp-3.2.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mmgp might be problematic.

mmgp/offload.py CHANGED
@@ -1,4 +1,4 @@
- # ------------------ Memory Management 3.2.1 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+ # ------------------ Memory Management 3.2.4 for the GPU Poor by DeepBeepMeep (mmgp)------------------
  #
  # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
  # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -479,7 +479,7 @@ def _welcome():
  if welcome_displayed:
      return
  welcome_displayed = True
- print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.2.1) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+ print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.2.4) by DeepBeepMeep ************{ENDC}{UNBOLD}")

  def _extract_num_from_str(num_in_str):
      size = len(num_in_str)
@@ -858,7 +858,7 @@ def _lora_linear_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor
  result = result.to(torch_result_dtype)
  return result

- def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, split_linear_modules_map = None,verboseLevel = -1,):
+ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, split_linear_modules_map = None, preprocess_sd = None, verboseLevel = -1,):
  verboseLevel = _compute_verbose_level(verboseLevel)

  if inject_adapter_in_model == None or set_weights_and_activate_adapters == None or get_peft_kwargs == None:
@@ -877,7 +877,8 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
  adapter_name = str(i)

  state_dict = safetensors2.torch_load_file(path)
-
+ if preprocess_sd != None:
+     state_dict = preprocess_sd(state_dict)

  if split_linear_modules_map != None:
      new_state_dict = {}
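The two hunks above add a `preprocess_sd` hook to `load_loras_into_model`: when provided, the callable receives the raw LoRA state dict right after it is read from disk and can return a modified one before injection. A minimal usage sketch, not part of the diff; the key-renaming rule, the file name, and the list form of `lora_path` are assumptions for illustration:

```python
from mmgp import offload

def strip_prefix(state_dict):
    # Hypothetical preprocessing: drop a "diffusion_model." prefix from every key
    # so the LoRA keys line up with the target model's module names.
    prefix = "diffusion_model."
    return {k[len(prefix):] if k.startswith(prefix) else k: v for k, v in state_dict.items()}

# `transformer` is a torch.nn.Module built elsewhere; the LoRA file name is made up.
offload.load_loras_into_model(
    transformer,
    ["my_lora.safetensors"],
    lora_multi=[1.0],
    preprocess_sd=strip_prefix,   # new hook added in this release
    verboseLevel=1,
)
```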
@@ -977,7 +978,8 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
  # Check only for unexpected keys.
  unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
  if unexpected_keys:
-     pass
+     raise Exception(f"Lora '{path}' contains invalid keys '{unexpected_keys}'")
+
  if verboseLevel >=1:
      print(f"Lora '{path}' was loaded in model '{_get_module_name(model)}'")
  if activate_all_loras:
@@ -1015,7 +1017,7 @@ def move_loras_to_device(model, device="cpu" ):
  if ".lora_" in k:
      m.to(device)

- def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, modelClass=None, verboseLevel = -1):
+ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, modelClass=None, modelPrefix = None, verboseLevel = -1):
  """
  quick version of .LoadfromPretrained of the transformers library
  used to build a model and load the corresponding weights (quantized or not)
@@ -1096,13 +1098,13 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizat

  model._config = transformer_config

- load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, verboseLevel=verboseLevel )
+ load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, modelPrefix = modelPrefix, verboseLevel=verboseLevel )

  return model



- def load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, verboseLevel = -1):
+ def load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, verboseLevel = -1):
  """
  Load a model, detect if it has been previously quantized using quanto and do the extra setup if necessary
  """
@@ -1113,6 +1115,26 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
  verboseLevel = _compute_verbose_level(verboseLevel)

  model = _remove_model_wrapper(model)
+
+ def filter_state_dict(state_dict, base_model_prefix):
+     new_state_dict= {}
+     start = -1
+     for k,v in state_dict.items():
+         if k.startswith(base_model_prefix):
+
+             new_start = len(base_model_prefix)
+         else:
+             pos = k.find("." + base_model_prefix)
+             if pos < 0:
+                 continue
+             new_start = pos + len(base_model_prefix) +1
+         if start != -1 and start != new_start:
+             new_state_dict = state_dict
+             break
+         start = new_start
+         new_state_dict[k[ start:]] = v
+     return new_state_dict
+
  if not (".safetensors" in file_path or ".sft" in file_path):
      if pinToMemory:
          raise Exception("Pinning to memory while loading only supported for safe tensors files")
@@ -1151,6 +1173,11 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
  quantization_map = json.load(f)


+ # deal if we are trying to load just a sub part of a larger model
+ if modelPrefix != None:
+     base_model_prefix = modelPrefix + "."
+     state_dict = filter_state_dict(state_dict,base_model_prefix)
+     quantization_map = filter_state_dict(quantization_map,base_model_prefix)

  if quantization_map is None :
      if "quanto" in file_path and not do_quantize:
@@ -1160,32 +1187,12 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType

  missing_keys , unexpected_keys = model.load_state_dict(state_dict, False, assign = True )
  if len(missing_keys) > 0 :
-     # if there is a key mismatch maybe we forgot to remove some prefix or we are trying to load just a sub part of a larger model
-     if hasattr(model, "base_model_prefix"):
-         base_model_prefix = model.base_model_prefix + "."
-     else:
-         for k,v in state_dict.items():
-             if k.endswith(missing_keys[0]):
-                 base_model_prefix = k[:-len(missing_keys[0])]
-                 break
-
-     new_state_dict= {}
-     start = -1
+     # if there is a key mismatch maybe we forgot to remove some prefix
      for k,v in state_dict.items():
-         if k.startswith(base_model_prefix):
-             new_start = len(base_model_prefix)
-         else:
-             pos = k.find("." + base_model_prefix)
-             if pos < 0:
-                 continue
-             new_start = pos + len(base_model_prefix) +1
-         if start != -1 and start != new_start:
-             new_state_dict = state_dict
+         if k.endswith(missing_keys[0]):
+             base_model_prefix = k[:-len(missing_keys[0])]
              break
-         start = new_start
-         new_state_dict[k[ start:]] = v
-     state_dict = new_state_dict
-     del new_state_dict
+     state_dict = filter_state_dict(state_dict,base_model_prefix)
  missing_keys , unexpected_keys = model.load_state_dict(state_dict, False, assign = True )
  del state_dict

@@ -1354,6 +1361,8 @@ class offload:

  def add_module_to_blocks(self, model_id, blocks_name, submodule, prev_block_name, submodule_name):

+     if blocks_name!=None and ".lora_" in blocks_name:
+         blocks_name = None
      entry_name = model_id if blocks_name is None else model_id + "/" + blocks_name
      if entry_name in self.blocks_of_modules:
          blocks_params = self.blocks_of_modules[entry_name]
@@ -1372,7 +1381,6 @@ class offload:
  lora_name = None
  if self.lora_parents.get(submodule, None) != None:
      lora_name = str(submodule_name[ submodule_name.rfind(".") + 1: ] )
-
  for k,p in submodule.named_parameters(recurse=False):
      param_size = 0
      ref = _get_tensor_ref(p)
@@ -1457,11 +1465,10 @@ class offload:
  if tied_param != None:
      setattr( tied_param[0], tied_param[1], q)
  del p, q
- any_past_block = False

  loaded_block = self.loaded_blocks[model_id]
+
  if not preload and loaded_block != None:
-     any_past_block = True
      self.gpu_unload_blocks(model_id, loaded_block)
      if self.ready_to_check_mem():
          self.empty_cache_if_needed()
1475
1482
 
1476
1483
 
1477
1484
  if self.async_transfers and blocks_name != None:
1478
- first = self.prev_blocks_names[entry_name] == None or not any_past_block
1485
+ prev = self.prev_blocks_names[entry_name]
1486
+ first = prev == None or prev != loaded_block
1479
1487
  next_blocks_entry = self.next_blocks_names[entry_name] if entry_name in self.next_blocks_names else None
1480
1488
  if first:
1481
1489
  if self.verboseLevel >=2:
@@ -1497,7 +1505,6 @@ class offload:
  print(f"Loading model {entry_name} ({model_name}) in GPU")
  cpu_to_gpu(self.default_stream, self.blocks_of_modules[entry_name])
  torch.cuda.synchronize()
-
  if not preload:
      self.loaded_blocks[model_id] = blocks_name

@@ -1710,7 +1717,7 @@ class offload:
  current_budget -= base_size
  if current_budget <= 0:
      if self.verboseLevel >=1:
-         print(f"Async loading plan for model '{model_id}' : due to limited budget, beside the async shuttle only only base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
+         print(f"Async loading plan for model '{model_id}' : minimum budget management, beside the async shuttle only base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
      return

  towers = []
@@ -1732,7 +1739,7 @@ class offload:
  current_budget -= 2 * max_floor_size
  if current_budget <= 0:
      if self.verboseLevel >=1:
-         print(f"Async loading plan for model '{model_id}' : due to limited budget, beside the async shuttle only the base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
+         print(f"Async loading plan for model '{model_id}' : minimum budget management, beside the async shuttle only the base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
      return


@@ -1743,7 +1750,7 @@ class offload:
  max_blocks_fetch = max(max_floor_size, max_blocks_fetch)
  if preload_blocks_count <= 0:
      if self.verboseLevel >=1:
-         print(f"Async loading plan for model '{model_id}' : due to limited budget, beside the async shuttle only the base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
+         print(f"Async loading plan for model '{model_id}' : minimum budget management, beside the async shuttle only the base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
      return

  nb_blocks= len(floors)
@@ -1821,16 +1828,20 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru

  windows_os = os.name == 'nt'

+ def get_parsed_budget(b):
+     if isinstance(b , str) and b.endswith("%"):
+         return float(b[:-1]) * self.device_mem_capacity
+     else:
+         return b * ONE_MB
+
  budget = 0
  if not budgets is None:
      if isinstance(budgets , dict):
-         model_budgets = budgets
-         budget = budgets.get("*", 0) * ONE_MB
+         model_budgets = { k : get_parsed_budget(b) for k , b in budgets.items() }
+         budget = model_budgets.get("*", 0)
      else:
-         budget = int(budgets) * ONE_MB
+         budget = get_parsed_budget(budget)

- # if (budgets!= None or budget >0) :
- # self.async_transfers = True
  self.async_transfers = asyncTransfers


@@ -1938,18 +1949,19 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
  estimatesBytesToPin += current_model_size


- model_budget = model_budgets[model_id] * ONE_MB if model_id in model_budgets else budget
+ model_budget = model_budgets[model_id] if model_id in model_budgets else budget
  if workingVRAM != None:
      model_minimumVRAM = -1
      if isinstance(workingVRAM, dict):
          if model_id in workingVRAM:
-             model_minimumVRAM = workingVRAM[model_id]
+             model_minimumVRAM = get_parsed_budget(workingVRAM[model_id])
          elif "*" in model_id in workingVRAM:
-             model_minimumVRAM = workingVRAM["*"]
+             model_minimumVRAM = get_parsed_budget(workingVRAM["*"])
      else:
-         model_minimumVRAM = workingVRAM
+         model_minimumVRAM = get_parsed_budget(workingVRAM)
+
      if model_minimumVRAM > 0:
-         new_budget = self.device_mem_capacity - model_minimumVRAM * ONE_MB
+         new_budget = self.device_mem_capacity - model_minimumVRAM
          new_budget = 1 if new_budget < 0 else new_budget
          model_budget = new_budget if model_budget == 0 or new_budget < model_budget else model_budget
  if model_budget > 0 and model_budget > current_model_size:
mmgp-3.2.2.dist-info/METADATA → mmgp-3.2.4.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: mmgp
- Version: 3.2.2
+ Version: 3.2.4
  Summary: Memory Management for the GPU Poor
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
  License: GNU GENERAL PUBLIC LICENSE
@@ -17,7 +17,7 @@ Requires-Dist: peft


  <p align="center">
- <H2>Memory Management 3.2.1 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.2.4 for the GPU Poor by DeepBeepMeep</H2>
  </p>


@@ -119,9 +119,9 @@ For example:
  - pinnedMemory: Boolean (for all models) or List of models ids to pin to RAM. Every model pinned to RAM will load much faster (up to 2 times) but this requires more RAM
  - quantizeTransformer: boolean by default True. The 'transformer' model in the pipe contains usually the video or image generator is by defaut; quantized on the fly by default to 8 bits. If you want to save time on disk and reduce the loading time, you may want to load directly a prequantized model. If you don't want to quantize the image generator, you need to set the option *quantizeTransformer* to *False* to turn off on the fly quantization.
  - extraModelsToQuantize: list of additional modelids of models to quantize on the fly. If the corresponding model is already quantized, this option will be ignored.
- - budgets: either a number in mega bytes (for all models, if 0 unlimited budget) or a dictionary that maps model ids to mega bytes : define the approximate budget in mega bytes that is allocated in VRAM for a model. Try not to allocate all the available VRAM so that the rest can be used to process the data. To define the default value in the dictionary, you may add entry named "*".
+ - budgets: either a number in mega bytes, (for all models, if 0 unlimited budget) a string that is perecentage of the total VRAM or a dictionary that maps model ids to mega bytes : define the approximate budget in mega bytes that is allocated in VRAM for a model. Try not to allocate all the available VRAM so that the rest can be used to process the data. To define the default value in the dictionary, you may add entry named "*".
  The smaller this number, the more VRAM left for image data / longer video but also the slower because there will be lots of loading / unloading between the RAM and the VRAM. If model is too big to fit in a budget, it will be broken down in multiples parts that will be unloaded / loaded consequently. The speed of low budget can be increased (up to 2 times) by turning on the options pinnedMemory and asyncTransfers.
- - workingVRAM: either a number in mega bytes or a dictionary that maps a model ids to a number in mega bytes that corresponds to a minimum amount of VRAM that should be left for the data processed by the model. This number will prevail if it is in conflict with a too high budget defined for the same model.
+ - workingVRAM: either a number in mega bytes, a string that is perecentage of the total VRAM or a dictionary that maps a model ids to a number in mega bytes that corresponds to a minimum amount of VRAM that should be left for the data processed by the model. This number will prevail if it is in conflict with a too high budget defined for the same model.
  - asyncTransfers: boolean, load to the GPU the next model part while the current part is being processed. This requires twice the budget if any is defined. This may increase speed by 20% (mostly visible on fast modern GPUs).
  - verboseLevel: number between 0 and 2 (1 by default), provides various level of feedback of the different processes
  - compile: list of model ids to compile, may accelerate up x2 depending on the type of GPU. It makes sense to compile only the model that is frequently used such as the "transformer" model in the case of video or image generation. Compilation requires Triton to be installed. Triton is available out of the box on Linux or WSL but requires to be installed with Windows: https://github.com/woct0rdho/triton-windows
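Per the updated documentation above, budgets and workingVRAM now also accept percentage strings in addition to megabyte values. A minimal usage sketch, not taken from the package; the pipeline object, model ids, and numbers are illustrative only:

```python
from mmgp import offload

# `pipe` is a diffusers-style pipeline created elsewhere.
offload.all(
    pipe,
    pinnedMemory=True,
    asyncTransfers=True,
    budgets={"transformer": "70%", "*": 3000},  # per-model budget: percentage of VRAM or MB
    workingVRAM="20%",                          # minimum VRAM to keep for the data being processed
)
```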
mmgp-3.2.4.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+ __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+ mmgp/offload.py,sha256=vGxgCcWV8PQQ4JjSlYFOX57Mr9RLlvPBMOOj3f63qL4,96389
+ mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
+ mmgp-3.2.4.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+ mmgp-3.2.4.dist-info/METADATA,sha256=UGZ7ADvrhU5P0hS7gFgu8SHpEnzzpEgE3Ionk-I7ckw,16151
+ mmgp-3.2.4.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+ mmgp-3.2.4.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+ mmgp-3.2.4.dist-info/RECORD,,
mmgp-3.2.2.dist-info/RECORD REMOVED
@@ -1,9 +0,0 @@
- __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
- mmgp/offload.py,sha256=hzirru31j78E88OIT38GJ46iMvddEFM2c3_CCn4N4K4,95676
- mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
- mmgp-3.2.2.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
- mmgp-3.2.2.dist-info/METADATA,sha256=hTjAL-soDwYbUlnD1Om7kefG8D4vaXUTjsHoQDikVQA,16054
- mmgp-3.2.2.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
- mmgp-3.2.2.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
- mmgp-3.2.2.dist-info/RECORD,,