mmgp 3.5.11.tar.gz → 3.6.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mmgp might be problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.11
+Version: 3.6.0
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.5.11 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.6.0 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
@@ -1,6 +1,6 @@
 
 <p align="center">
-<H2>Memory Management 3.5.11 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.6.0 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
@@ -1,6 +1,6 @@
 [project]
 name = "mmgp"
-version = "3.5.11"
+version = "3.6.0"
 authors = [
   { name = "deepbeepmeep", email = "deepbeepmeep@yahoo.com" },
 ]
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.5.11 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.6.0 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -94,7 +94,7 @@ def get_cache(cache_name):
     if all_cache is None:
         all_cache = {}
         shared_state["_cache"]= all_cache
-    cache = shared_state.get(cache_name, None)
+    cache = all_cache.get(cache_name, None)
     if cache is None:
         cache = {}
         all_cache[cache_name] = cache
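The functional change in this hunk is where the per-name cache is looked up: 3.5.11 queried `shared_state` directly, so a cache stored inside `all_cache` was never found again on the next call; 3.6.0 queries the nested `all_cache` dict. A minimal standalone sketch of the corrected helper follows; the first and last lines of the function are inferred from context, only the middle mirrors the hunk.

```python
shared_state = {}

def get_cache(cache_name):
    # All named caches live under a single "_cache" entry of shared_state.
    all_cache = shared_state.get("_cache", None)
    if all_cache is None:
        all_cache = {}
        shared_state["_cache"] = all_cache
    # 3.6.0 fix: look the named cache up inside all_cache, not shared_state,
    # so a cache created below is actually found on the next call.
    cache = all_cache.get(cache_name, None)
    if cache is None:
        cache = {}
        all_cache[cache_name] = cache
    return cache

assert get_cache("demo") is get_cache("demo")   # same dict on repeated calls
```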
@@ -688,7 +688,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.11) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.6.0) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def change_dtype(model, new_dtype, exclude_buffers = False):
     for submodule_name, submodule in model.named_modules():
@@ -2287,9 +2287,10 @@ class offload:
         src = f"""
 def {fname}(module, *args, **kwargs):
     _ = __TYPE_CONST # anchor type as a constant to make code object unique per class
+    nada = "{fname}"
     mgr = module._mm_manager
     mgr._pre_check(module)
-    return module._mm_forward(*args, **kwargs)
+    return module._mm_forward(*args, **kwargs) #{fname}
 """
         ns = {"__TYPE_CONST": mod_cls}
         exec(src, ns) # compile a new function object/code object for this class
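For context, this hunk edits source that is generated as a string and compiled with `exec()`; 3.6.0 embeds `{fname}` both as a string literal and as a trailing comment, presumably to keep each class's generated wrapper distinguishable. Below is a simplified, standalone sketch of that pattern; the factory name is made up for illustration, and in offload.py the equivalent logic lives on the `offload` class.

```python
def make_wrapper_for_class(mod_cls):
    # Illustrative version of the generated-wrapper pattern shown in the hunk.
    fname = f"_mm_forward_wrapper_{mod_cls.__name__}"
    src = f"""
def {fname}(module, *args, **kwargs):
    _ = __TYPE_CONST  # anchor type as a constant to make code object unique per class
    nada = "{fname}"  # per-class string constant, as added in 3.6.0
    mgr = module._mm_manager
    mgr._pre_check(module)
    return module._mm_forward(*args, **kwargs) #{fname}
"""
    ns = {"__TYPE_CONST": mod_cls}
    exec(src, ns)  # compile a fresh function object for this class
    return ns[fname]
```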
@@ -2310,7 +2311,8 @@ def {fname}(module, *args, **kwargs):
         wrapper_fn = self._get_wrapper_for_type(type(target_module))
 
         # bind as a bound method (no partial/closures)
-        target_module.forward = types.MethodType(wrapper_fn, target_module)
+        # target_module.forward = types.MethodType(wrapper_fn, target_module)
+        target_module.forward = functools.update_wrapper(functools.partial(wrapper_fn, target_module), previous_method)
 
     def hook_check_load_into_GPU_if_needed_default(self, target_module, model, model_id, blocks_name, previous_method, context):
 
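The binding of the wrapper also changes: instead of attaching it as a bound method with `types.MethodType`, 3.6.0 attaches a `functools.partial` and copies the replaced forward's metadata onto it with `functools.update_wrapper`, so attributes such as `__wrapped__` point back to the original forward. A hedged sketch of the difference on a bare `Linear` layer (the wrapper body is a placeholder for the real pre-checks):

```python
import functools, types
import torch

lin = torch.nn.Linear(4, 4)
previous_method = lin.forward          # the forward being replaced

def wrapper_fn(module, *args, **kwargs):
    # placeholder for the pre-checks done by the real wrapper
    return previous_method(*args, **kwargs)

# 3.5.x style: a bound method, but it carries only wrapper_fn's metadata.
# lin.forward = types.MethodType(wrapper_fn, lin)

# 3.6.0 style: a partial bound to the module, with the original forward's
# metadata copied onto it by update_wrapper.
lin.forward = functools.update_wrapper(functools.partial(wrapper_fn, lin), previous_method)

print(lin.forward.__wrapped__ is previous_method)   # True
print(lin(torch.randn(2, 4)).shape)                  # wrapper still runs: torch.Size([2, 4])
```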
@@ -2345,12 +2347,12 @@ def {fname}(module, *args, **kwargs):
         if isinstance(target_module, torch.nn.Linear):
             def check_load_into_GPU_needed_linear(module, *args, **kwargs):
                 check_load_into_GPU_needed()
-                return previous_method(*args, **kwargs)
+                return previous_method(*args, **kwargs) # linear
             check_load_into_GPU_needed_module = check_load_into_GPU_needed_linear
         else:
             def check_load_into_GPU_needed_other(module, *args, **kwargs):
                 check_load_into_GPU_needed()
-                return previous_method(*args, **kwargs)
+                return previous_method(*args, **kwargs) # other
             check_load_into_GPU_needed_module = check_load_into_GPU_needed_other
 
         setattr(target_module, "_mm_id", model_id)
@@ -2498,7 +2500,7 @@ def {fname}(module, *args, **kwargs):
 
 
 
-    def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
+    def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, vram_safety_coefficient = 0.8, compile_mode ="default", verboseLevel = -1):
         """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
         pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
         quantizeTransformer: set True by default will quantize on the fly the video / image model
@@ -2507,6 +2509,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
         budgets: 0 by default (unlimited). If non 0, it corresponds to the maximum size in MB that every model will occupy at any moment
          (in fact the real usage is twice this number). It is very efficient to reduce VRAM consumption but this feature may be very slow
          if pinnedMemory is not enabled
+        vram_safety_coefficient: float between 0 and 1 (exclusive), default 0.8. Sets the maximum portion of VRAM that can be used for models.
+         Lower values provide more safety margin but may reduce performance.
         """
         self = offload()
         self.verboseLevel = verboseLevel
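For reference, a hedged usage sketch of the extended signature: per its docstring, offload.all() accepts either a pipeline object or a plain dict of modules, and the two new keyword arguments are shown with illustrative values. The toy `Sequential` below stands in for a real transformer, and actually running this end to end still assumes a CUDA-capable setup.

```python
import torch
from mmgp import offload

# A toy stand-in for a real pipeline: a dict of modules is also accepted.
modules = {"transformer": torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.GELU())}

offload.all(
    modules,
    quantizeTransformer = False,       # skip on-the-fly quantization for this toy model
    budgets = 0,                       # 0 = no per-model budget cap
    compile = False,
    compile_mode = "default",          # new in 3.6.0, forwarded to torch.compile when compile=True
    vram_safety_coefficient = 0.7,     # new in 3.6.0, must be a float strictly between 0 and 1
    verboseLevel = 1,
)
```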
@@ -2522,7 +2526,11 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                 return float(b[:-1]) * self.device_mem_capacity
             else:
                 return b * ONE_MB
-
+
+        # Validate vram_safety_coefficient
+        if not isinstance(vram_safety_coefficient, float) or vram_safety_coefficient <= 0 or vram_safety_coefficient >= 1:
+            raise ValueError("vram_safety_coefficient must be a float between 0 and 1 (exclusive)")
+
         budget = 0
         if not budgets is None:
             if isinstance(budgets , dict):
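Note that the new guard rejects anything that is not a `float`, so integers such as 1 or 0 fail the `isinstance` test before the range check even runs. A small illustrative reimplementation of just this guard (the helper name is made up):

```python
def _check_vram_safety_coefficient(value):
    # Mirrors the guard added in 3.6.0: a float strictly between 0 and 1.
    if not isinstance(value, float) or value <= 0 or value >= 1:
        raise ValueError("vram_safety_coefficient must be a float between 0 and 1 (exclusive)")

_check_vram_safety_coefficient(0.8)      # ok (the default)
_check_vram_safety_coefficient(0.95)     # ok
# _check_vram_safety_coefficient(1)      # would raise: an int, not a float
# _check_vram_safety_coefficient(1.0)    # would raise: not strictly below 1
```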
@@ -2667,14 +2675,14 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                 model_budget = new_budget if model_budget == 0 or new_budget < model_budget else model_budget
             if model_budget > 0 and model_budget > current_model_size:
                 model_budget = 0
-            coef =0.8
+            coef =vram_safety_coefficient
             if current_model_size > coef * self.device_mem_capacity and model_budget == 0 or model_budget > coef * self.device_mem_capacity:
                 if verboseLevel >= 1:
                     if model_budget == 0:
-                        print(f"Model '{model_id}' is too large ({current_model_size/ONE_MB:0.1f} MB) to fit entirely in {coef * 100}% of the VRAM (max capacity is {coef * self.device_mem_capacity/ONE_MB}) MB)")
+                        print(f"Model '{model_id}' is too large ({current_model_size/ONE_MB:0.1f} MB) to fit entirely in {coef * 100:.0f}% of the VRAM (max capacity is {coef * self.device_mem_capacity/ONE_MB:0.1f}) MB)")
                     else:
                         print(f"Budget ({budget/ONE_MB:0.1f} MB) for Model '{model_id}' is too important so that this model can fit in the VRAM (max capacity is {self.device_mem_capacity/ONE_MB}) MB)")
-                    print(f"Budget allocation for this model has been consequently reduced to the 80% of max GPU Memory ({coef * self.device_mem_capacity/ONE_MB:0.1f} MB). This may not leave enough working VRAM and you will probably need to define manually a lower budget for this model.")
+                    print(f"Budget allocation for this model has been consequently reduced to the {coef * 100:.0f}% of max GPU Memory ({coef * self.device_mem_capacity/ONE_MB:0.1f} MB). This may not leave enough working VRAM and you will probably need to define manually a lower budget for this model.")
                 model_budget = coef * self.device_mem_capacity
 
 
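To make the effect of the coefficient concrete, here is the cap worked out for an assumed 24 GB card; the numbers are illustrative, and in the real code `device_mem_capacity` is queried from the GPU while `ONE_MB` is taken to be one mebibyte.

```python
ONE_MB = 1024 * 1024
device_mem_capacity = 24 * 1024 * ONE_MB   # assume a 24 GB card
coef = 0.8                                 # the 3.6.0 default vram_safety_coefficient

cap = coef * device_mem_capacity
print(f"max resident model size: {cap / ONE_MB:0.1f} MB")   # 19660.8 MB

# A model larger than the cap gets its budget clamped, mirroring
# `model_budget = coef * self.device_mem_capacity` in the hunk above.
current_model_size = 22 * 1024 * ONE_MB
model_budget = cap if current_model_size > cap else 0
```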
@@ -2765,8 +2773,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                 elif compilationInThisOne and submodule in towers_modules:
                     self.hook_preload_blocks_for_compilation(submodule, model_id, cur_blocks_name, context = submodule_name )
                 else:
-                    if compilationInThisOne and False:
-                        self.hook_check_load_into_GPU_needed(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
+                    if compilationInThisOne: #and False
+                        self.hook_check_load_into_GPU_if_needed(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
                     else:
                         self.hook_check_load_into_GPU_if_needed_default(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
 
@@ -2783,7 +2791,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                 print(f"Pytorch compilation of model '{model_id}' is not yet supported.")
 
             for submodel in towers_modules:
-                submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode="default" ) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
+                submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode= compile_mode) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
                 #dynamic=True,
 
         self.tune_preloading(model_id, current_budget, towers_names)
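The new `compile_mode` argument is passed straight through as torch.compile's `mode`; the values listed in the trailing comment ("default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs") are the modes PyTorch documents. A minimal sketch of compiling a submodule's forward the same way, outside of mmgp (the toy block is illustrative):

```python
import torch

block = torch.nn.Sequential(torch.nn.Linear(128, 128), torch.nn.GELU())

compile_mode = "default"   # or "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"
block.forward = torch.compile(block.forward, backend="inductor", mode=compile_mode)

out = block(torch.randn(4, 128))   # first call triggers compilation
```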
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.11
+Version: 3.6.0
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.5.11 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.6.0 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
5 files without changes.