mmgp 3.5.5.tar.gz → 3.5.7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.5
+Version: 3.5.7
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.5.5 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.5.7 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
@@ -1,6 +1,6 @@
 
 <p align="center">
-<H2>Memory Management 3.5.5 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.5.7 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
@@ -1,6 +1,6 @@
 [project]
 name = "mmgp"
-version = "3.5.5"
+version = "3.5.7"
 authors = [
   { name = "deepbeepmeep", email = "deepbeepmeep@yahoo.com" },
 ]
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.5.5 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.5.7 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -122,8 +122,6 @@ class clock:
     def format_time_gap(self):
         return f"{self.stop_time - self.start_time:.2f}s"
 
-
-
 # useful functions to move a group of tensors (to design custom offload patches)
 def move_tensors(obj, device):
     if torch.is_tensor(obj):
@@ -668,7 +666,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.5) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.7) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def change_dtype(model, new_dtype, exclude_buffers = False):
     for submodule_name, submodule in model.named_modules():
@@ -1295,7 +1293,7 @@ def move_loras_to_device(model, device="cpu" ):
         if ".lora_" in k:
             m.to(device)
 
-def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, defaultConfigPath = None, modelClass=None, modelPrefix = None, writable_tensors = True, verboseLevel = -1, modules = None, return_shared_modules = None, configKwargs ={}):
+def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, forcedConfigPath = None, defaultConfigPath = None, modelClass=None, modelPrefix = None, writable_tensors = True, verboseLevel = -1, preprocess_sd = None, modules = None, return_shared_modules = None, configKwargs ={}):
     """
     quick version of .LoadfromPretrained of the transformers library
     used to build a model and load the corresponding weights (quantized or not)
@@ -1383,13 +1381,13 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantiza
 
     model._config = transformer_config
 
-    load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, modelPrefix = modelPrefix, writable_tensors =writable_tensors, modules = modules, return_shared_modules = return_shared_modules, verboseLevel=verboseLevel )
+    load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, modelPrefix = modelPrefix, writable_tensors =writable_tensors, preprocess_sd = preprocess_sd , modules = modules, return_shared_modules = return_shared_modules, verboseLevel=verboseLevel )
 
     return model
 
 
 
-def load_model_data(model, file_path, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, modules = None, return_shared_modules = None, verboseLevel = -1):
+def load_model_data(model, file_path, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, modelPrefix = None, writable_tensors = True, preprocess_sd = None, modules = None, return_shared_modules = None, verboseLevel = -1):
     """
     Load a model, detect if it has been previously quantized using quanto and do the extra setup if necessary
     """
@@ -1506,6 +1504,9 @@ def load_model_data(model, file_path, do_quantize = False, quantizationType = qi
     full_state_dict, full_quantization_map, full_tied_weights_map = None, None, None
 
     # deal if we are trying to load just a sub part of a larger model
+    if preprocess_sd != None:
+        state_dict, quantization_map = preprocess_sd(state_dict, quantization_map)
+
     if modelPrefix != None:
         base_model_prefix = modelPrefix + "."
         state_dict = filter_state_dict(state_dict,base_model_prefix)
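The hunk above pins down the contract for the new `preprocess_sd` argument: it is called before any prefix filtering as `state_dict, quantization_map = preprocess_sd(state_dict, quantization_map)` and must return both mappings, so it can rename or drop entries. A minimal sketch of a callback with that shape; the function name and the key-renaming rule are invented for illustration, only the two-in/two-out contract comes from the diff.

# Hypothetical preprocessing callback; only the
# (state_dict, quantization_map) -> (state_dict, quantization_map)
# contract matches what load_model_data calls in 3.5.7.
def strip_module_prefix(state_dict, quantization_map):
    def rename(key):
        return key[len("module."):] if key.startswith("module.") else key
    state_dict = {rename(k): v for k, v in state_dict.items()}
    if quantization_map is not None:
        quantization_map = {rename(k): v for k, v in quantization_map.items()}
    return state_dict, quantization_map

# Self-contained check of the callback itself (no mmgp import needed):
sd = {"module.blocks.0.weight": 0, "blocks.0.bias": 1}
qm = {"module.blocks.0.weight": {"qtype": "qint8"}}
print(strip_module_prefix(sd, qm))

# It would then be passed through the new keyword argument, e.g.
# offload.fast_load_transformers_model(model_path, preprocess_sd=strip_module_prefix)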
@@ -2095,24 +2096,23 @@ class offload:
             if scaling == 0:
                 continue
             if first_weight:
-                original_weight= weight.clone() if weight != None else None
+                original_weight= weight.clone() if weight is not None else None
                 first_weight = False
             if first_bias:
-                original_bias= bias.clone() if bias != None else None
+                original_bias= bias.clone() if bias is not None else None
                 first_bias = False
 
-            if diff_w != None:
+            if diff_w is not None:
                 weight.add_(diff_w, alpha= scaling)
                 diff_w = None
-            if diff_b != None:
+            if diff_b is not None:
                 bias.add_(diff_b, alpha= scaling)
                 diff_b = None
 
         ret = func(*args, **kwargs )
 
-        weight.data = original_weight if original_weight != None else None
-        if original_bias != None:
-            bias.data = original_bias
+        if original_weight is not None: weight.data = original_weight
+        if original_bias is not None: bias.data = original_bias
 
         return ret
 
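The `!= None` to `is not None` changes in this hunk are more than style: `!=` dispatches to the operand's `__ne__`, which tensor-like types overload to return an element-wise result whose truth value is ambiguous, while `is not None` is a plain identity check that cannot be overridden. A self-contained sketch of the failure mode, using an invented stand-in class rather than a real torch.Tensor:

# Stand-in for an array/tensor type with element-wise comparisons;
# not mmgp or torch code, just an illustration of the hazard.
class FakeTensor:
    def __ne__(self, other):
        return FakeTensor()   # element-wise result, not a plain bool
    def __bool__(self):
        raise ValueError("truth value of a multi-element tensor is ambiguous")

w = FakeTensor()

print(w is not None)          # True: identity check, always unambiguous
try:
    if w != None:             # dispatches to __ne__, then __bool__
        pass
except ValueError as err:
    print("ambiguous:", err)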
 
@@ -2277,7 +2277,7 @@ class offload:
         setattr(target_module, "forward", functools.update_wrapper(functools.partial(check_empty_cuda_cache, target_module), previous_method) )
 
 
-    def hook_change_module(self, target_module, model, model_id, module_id, previous_method):
+    def hook_change_module(self, target_module, model, model_id, module_id, previous_method, previous_method_name ):
         if hasattr(target_module, "_lock_dtype"):
             dtype = target_module._lock_dtype
         else:
@@ -2290,11 +2290,12 @@ class offload:
             args, kwargs = self.move_args_to_gpu(dtype, *args, **kwargs)
             return previous_method(*args, **kwargs)
 
-        if hasattr(target_module, "_mm_id"):
+        if hasattr(target_module, "_mm_" + previous_method_name):
             return
-        setattr(target_module, "_mm_id", model_id)
+        setattr(target_module, "_mm_Id", model_id)
+        setattr(target_module, "_mm_" + previous_method_name, previous_method)
 
-        setattr(target_module, "forward", functools.update_wrapper(functools.partial(check_change_module, target_module), previous_method) )
+        setattr(target_module, previous_method_name, functools.update_wrapper(functools.partial(check_change_module, target_module), previous_method) )
 
         if not self.verboseLevel >=1:
             return
@@ -2662,23 +2663,27 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                 cur_blocks_prefix, prev_blocks_name, cur_blocks_seq, is_mod_seq = pre, None, num, False
                 cur_blocks_name = submodule_name
                 # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
-
-
-            if hasattr(submodule, "forward"):
-                # if any_lora and isinstance(submodule, ( torch.nn.Linear, torch.nn.Conv3d, torch.nn.LayerNorm)):
-                if any_lora and hasattr(submodule,"weight"):
+            top_submodule = len(submodule_name.split("."))==1
+            offload_hooks = submodule._offload_hooks if hasattr(submodule, "_offload_hooks") else []
+            if len(offload_hooks) > 0:
+                pass
+            assert top_submodule or len(offload_hooks) == 0, "custom offload hooks can only be set at the of the module"
+            submodule_method_names = ["forward"] + offload_hooks
+            for submodule_method_name in submodule_method_names:
+                if not hasattr(submodule, submodule_method_name ): continue
+                if submodule_method_name == "forward" and any_lora and hasattr(submodule,"weight"):
                     submodule_method = self.hook_lora(submodule, current_model, model_id, loras_model_data, loras_model_shortcuts, submodule_name)
                 else:
-                    submodule_method = getattr(submodule, "forward")
-                if callable(submodule_method):
-                    if len(submodule_name.split("."))==1:
-                        self.hook_change_module(submodule, current_model, model_id, submodule_name, submodule_method)
+                    submodule_method = getattr(submodule, submodule_method_name)
+                if callable(submodule_method):
+                    if top_submodule and cur_blocks_name is None:
+                        self.hook_change_module(submodule, current_model, model_id, submodule_name, submodule_method, submodule_method_name)
                     elif compilationInThisOne and submodule in towers_modules:
                         self.hook_preload_blocks_for_compilation(submodule, model_id, cur_blocks_name, context = submodule_name )
                     else:
                         self.hook_check_empty_cache_needed(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
-
-            self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name, submodule_name)
+
+            self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name, submodule_name)
 
 
         self.tune_preloading(model_id, current_budget, towers_names)
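This hunk introduces the `_offload_hooks` mechanism: a top-level submodule can list extra method names (besides `forward`) that should be wrapped, and `hook_change_module` now stashes each original method on the module as `_mm_<method_name>` alongside `_mm_Id`. A hedged sketch of a module that would opt in, assuming the attribute is simply a list of method names as the diff reads it; the VAE-like model below is invented for illustration and has not been verified against 3.5.7 at runtime.

import torch

class TinyVae(torch.nn.Module):
    # Read by offload.all on top-level submodules only (see the assert above);
    # every name listed here must be a real callable on the module.
    _offload_hooks = ["encode", "decode"]

    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(8, 8)

    def forward(self, x):
        return self.proj(x)

    def encode(self, x):
        return self.proj(x) * 0.5

    def decode(self, z):
        return self.proj(z) * 2.0

vae = TinyVae()
print(vae._offload_hooks, vae.decode(vae.encode(torch.zeros(2, 8))).shape)

# Expected effect, as far as the diff shows: after the pipeline is wrapped
# (e.g. with offload.all(...)), "encode" and "decode" are replaced by wrappers
# that move their arguments to the GPU, and the originals are kept as
# _mm_encode / _mm_decode.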
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.5
+Version: 3.5.7
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.5.5 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.5.7 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
6 files without changes.