mmgp 3.5.11.tar.gz → 3.5.12.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.11
+Version: 3.5.12
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.5.11 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.5.12 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
@@ -1,6 +1,6 @@
 
 <p align="center">
-<H2>Memory Management 3.5.11 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.5.12 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
@@ -1,6 +1,6 @@
 [project]
 name = "mmgp"
-version = "3.5.11"
+version = "3.5.12"
 authors = [
     { name = "deepbeepmeep", email = "deepbeepmeep@yahoo.com" },
 ]
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.5.11 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.5.12 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -94,7 +94,7 @@ def get_cache(cache_name):
     if all_cache is None:
         all_cache = {}
         shared_state["_cache"]= all_cache
-    cache = shared_state.get(cache_name, None)
+    cache = all_cache.get(cache_name, None)
     if cache is None:
         cache = {}
         all_cache[cache_name] = cache
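
This is the functional bug fix in the release: the per-name cache lives inside the nested "_cache" dictionary, but the lookup previously went against shared_state itself, so an existing cache was never found and a fresh empty one was created and re-registered on every call. A minimal standalone sketch of the corrected pattern; the lines not shown in the hunk (the initial lookup and the return) are reconstructed here for context, not copied from mmgp:

# Standalone illustration of the fixed lookup; shared_state is reduced to a
# plain dict and the surrounding lines are reconstructed, not quoted from mmgp.
shared_state = {}

def get_cache(cache_name):
    all_cache = shared_state.get("_cache", None)
    if all_cache is None:
        all_cache = {}
        shared_state["_cache"] = all_cache
    # 3.5.12 fix: look the named cache up in all_cache (not shared_state),
    # so a cache created on an earlier call is actually reused.
    cache = all_cache.get(cache_name, None)
    if cache is None:
        cache = {}
        all_cache[cache_name] = cache
    return cache

assert get_cache("demo") is get_cache("demo")  # same dict across calls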
@@ -688,7 +688,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.11) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.12) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def change_dtype(model, new_dtype, exclude_buffers = False):
     for submodule_name, submodule in model.named_modules():
@@ -2498,7 +2498,7 @@ def {fname}(module, *args, **kwargs):
 
 
 
-    def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
+    def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, vram_safety_coefficient = 0.8, verboseLevel = -1):
         """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
         pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
         quantizeTransformer: set True by default will quantize on the fly the video / image model
@@ -2507,6 +2507,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
         budgets: 0 by default (unlimited). If non 0, it corresponds to the maximum size in MB that every model will occupy at any moment
          (in fact the real usage is twice this number). It is very efficient to reduce VRAM consumption but this feature may be very slow
          if pinnedMemory is not enabled
+        vram_safety_coefficient: float between 0 and 1 (exclusive), default 0.8. Sets the maximum portion of VRAM that can be used for models.
+         Lower values provide more safety margin but may reduce performance.
         """
         self = offload()
         self.verboseLevel = verboseLevel
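
For callers, the visible API change is the new vram_safety_coefficient keyword on offload.all. A hedged usage sketch, assuming the usual from mmgp import offload entry point and a diffusers-style pipeline object named pipe (both illustrative here); all other keywords come from the signature shown above:

# Illustrative call only: "pipe" stands for a pipeline already loaded on CPU.
from mmgp import offload

offload.all(
    pipe,
    pinnedMemory=True,             # pin weights in RAM to speed up CPU<->GPU transfers
    quantizeTransformer=True,      # default: quantize the main model on the fly
    budgets=0,                     # 0 = no explicit per-model budget
    vram_safety_coefficient=0.7,   # new in 3.5.12: cap models at 70% of VRAM (default 0.8)
)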
@@ -2522,7 +2524,11 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
                 return float(b[:-1]) * self.device_mem_capacity
             else:
                 return b * ONE_MB
-
+
+        # Validate vram_safety_coefficient
+        if not isinstance(vram_safety_coefficient, float) or vram_safety_coefficient <= 0 or vram_safety_coefficient >= 1:
+            raise ValueError("vram_safety_coefficient must be a float between 0 and 1 (exclusive)")
+
         budget = 0
         if not budgets is None:
             if isinstance(budgets , dict):
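
The guard requires a genuine float strictly between 0 and 1, so integer arguments (including 1) are rejected along with out-of-range floats. A quick check of what the condition accepts and rejects, with the guard copied verbatim into a throwaway helper (the helper name is illustrative):

# The condition is copied from the hunk above; only the wrapper is invented.
def _check(vram_safety_coefficient):
    if not isinstance(vram_safety_coefficient, float) or vram_safety_coefficient <= 0 or vram_safety_coefficient >= 1:
        raise ValueError("vram_safety_coefficient must be a float between 0 and 1 (exclusive)")

_check(0.7)    # accepted
# _check(1)    # rejected: an int, not a float
# _check(0.0)  # rejected: must be strictly greater than 0
# _check(1.0)  # rejected: must be strictly less than 1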
@@ -2667,14 +2673,14 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
             model_budget = new_budget if model_budget == 0 or new_budget < model_budget else model_budget
             if model_budget > 0 and model_budget > current_model_size:
                 model_budget = 0
-            coef =0.8
+            coef =vram_safety_coefficient
             if current_model_size > coef * self.device_mem_capacity and model_budget == 0 or model_budget > coef * self.device_mem_capacity:
                 if verboseLevel >= 1:
                     if model_budget == 0:
-                        print(f"Model '{model_id}' is too large ({current_model_size/ONE_MB:0.1f} MB) to fit entirely in {coef * 100}% of the VRAM (max capacity is {coef * self.device_mem_capacity/ONE_MB}) MB)")
+                        print(f"Model '{model_id}' is too large ({current_model_size/ONE_MB:0.1f} MB) to fit entirely in {coef * 100:.0f}% of the VRAM (max capacity is {coef * self.device_mem_capacity/ONE_MB:0.1f}) MB)")
                     else:
                         print(f"Budget ({budget/ONE_MB:0.1f} MB) for Model '{model_id}' is too important so that this model can fit in the VRAM (max capacity is {self.device_mem_capacity/ONE_MB}) MB)")
-                    print(f"Budget allocation for this model has been consequently reduced to the 80% of max GPU Memory ({coef * self.device_mem_capacity/ONE_MB:0.1f} MB). This may not leave enough working VRAM and you will probably need to define manually a lower budget for this model.")
+                    print(f"Budget allocation for this model has been consequently reduced to the {coef * 100:.0f}% of max GPU Memory ({coef * self.device_mem_capacity/ONE_MB:0.1f} MB). This may not leave enough working VRAM and you will probably need to define manually a lower budget for this model.")
                 model_budget = coef * self.device_mem_capacity
 
 
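The coefficient's practical effect is the clamp applied just above: any model (or requested budget) larger than coef * self.device_mem_capacity is reduced to that fraction of total VRAM. A back-of-the-envelope illustration for the 24 GB card mentioned in the module header, assuming device_mem_capacity reports 24576 MB (an assumed figure, not read from mmgp):

# Rough arithmetic only; ONE_MB and the 24576 MB capacity are assumptions.
ONE_MB = 1024 * 1024
device_mem_capacity = 24576 * ONE_MB  # ~24 GB card, in bytes

for coef in (0.8, 0.7, 0.5):
    cap_mb = coef * device_mem_capacity / ONE_MB
    print(f"vram_safety_coefficient={coef}: per-model budget clamped to {cap_mb:.1f} MB")

# vram_safety_coefficient=0.8: per-model budget clamped to 19660.8 MB  (the old hard-coded behaviour)
# vram_safety_coefficient=0.7: per-model budget clamped to 17203.2 MB
# vram_safety_coefficient=0.5: per-model budget clamped to 12288.0 MB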
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.5.11
+Version: 3.5.12
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file
 
 
 <p align="center">
-<H2>Memory Management 3.5.11 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.5.12 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
5 files without changes