mmgp-3.5.10-py3-none-any.whl → mmgp-3.5.12-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



mmgp/offload.py CHANGED
@@ -1,4 +1,4 @@
- # ------------------ Memory Management 3.5.10 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+ # ------------------ Memory Management 3.5.12 for the GPU Poor by DeepBeepMeep (mmgp)------------------
  #
  # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
  # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -66,7 +66,6 @@ from accelerate import init_empty_weights

  import functools
  import types
- from functools import lru_cache
  import torch


@@ -90,6 +89,23 @@ class QEmbedding(QModuleMixin, torch.nn.Embedding):

  shared_state = {}

+ def get_cache(cache_name):
+     all_cache = shared_state.get("_cache", None)
+     if all_cache is None:
+         all_cache = {}
+         shared_state["_cache"]= all_cache
+     cache = all_cache.get(cache_name, None)
+     if cache is None:
+         cache = {}
+         all_cache[cache_name] = cache
+     return cache
+
+ def clear_caches():
+     all_cache = shared_state.get("_cache", None)
+     if all_cache is not None:
+         all_cache.clear()
+
+
  mmm = safetensors2.mmm

  default_verboseLevel = 1
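The two helpers above give any caller a named scratch dictionary stored under shared_state["_cache"]. A minimal usage sketch, assuming mmgp 3.5.12 is installed and the module is importable as mmgp.offload (the cache name and the cached value are made-up examples):

    from mmgp import offload

    cache = offload.get_cache("freq_tables")         # "freq_tables" is a hypothetical cache name
    if "base" not in cache:
        cache["base"] = [i * 0.5 for i in range(8)]  # stands in for any expensive-to-compute value
    value = cache["base"]                            # later calls return the same dict, so this is a hit

    offload.clear_caches()                           # empties every named cache at once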
@@ -623,6 +639,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
  total += size

  current_big_tensor = big_tensors[big_tensor_no]
+
  if is_buffer :
      _force_load_buffer(p) # otherwise potential memory leak
  if isinstance(p, QTensor):
@@ -671,7 +688,7 @@ def _welcome():
  if welcome_displayed:
      return
  welcome_displayed = True
- print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.10) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+ print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.5.12) by DeepBeepMeep ************{ENDC}{UNBOLD}")

  def change_dtype(model, new_dtype, exclude_buffers = False):
      for submodule_name, submodule in model.named_modules():
@@ -1032,7 +1049,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora

  if split_linear_modules_map != None:
      new_state_dict = dict()
-     suffixes = [(".alpha", -2, False), (".lora_B.weight", -3, True), (".lora_A.weight", -3, False)]
+     suffixes = [(".alpha", -2, False), (".lora_B.weight", -3, True), (".lora_A.weight", -3, False), (".lora_up.weight", -3, True), (".lora_down.weight", -3, False)]
      for module_name, module_data in state_dict.items():
          name_parts = module_name.split(".")
          for suffix, pos, any_split in suffixes:
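The longer suffix table means LoRA state dicts that use the older lora_up.weight / lora_down.weight key naming (commonly produced by Kohya-style exports) are now handled with the same split flags as the PEFT-style lora_B / lora_A keys. A quick, purely illustrative check that both naming conventions are matched by the table above (the key names below are made up):

    suffixes = [(".alpha", -2, False), (".lora_B.weight", -3, True), (".lora_A.weight", -3, False),
                (".lora_up.weight", -3, True), (".lora_down.weight", -3, False)]
    for key in ("blocks.0.attn.qkv.lora_A.weight",     # PEFT-style naming (hypothetical key)
                "blocks.0.attn.qkv.lora_up.weight"):   # Kohya-style naming (hypothetical key)
        assert any(key.endswith(suffix) for suffix, _, _ in suffixes)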
@@ -1306,7 +1323,7 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantiza
  model_path = [model_path]


- if not builtins.all(file_name.endswith(".sft") or file_name.endswith(".safetensors") or file_name.endswith(".pt") for file_name in model_path):
+ if not builtins.all(file_name.endswith(".sft") or file_name.endswith(".safetensors") or file_name.endswith(".pt") or file_name.endswith(".ckpt") for file_name in model_path):
      raise Exception("full model path to file expected")

  model_path = [ _get_model(file) for file in model_path]
@@ -1314,7 +1331,7 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantiza
      raise Exception("Unable to find file")

  verboseLevel = _compute_verbose_level(verboseLevel)
- if model_path[-1].endswith(".pt"):
+ if model_path[-1].endswith(".pt") or model_path[-1].endswith(".ckpt"):
      metadata = None
  else:
      with safetensors2.safe_open(model_path[-1], writable_tensors =writable_tensors) as f:
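Taken together, the two hunks above let fast_load_transformers_model accept classic .ckpt checkpoints in addition to .sft, .safetensors and .pt files, treating them like .pt files (no safetensors metadata pass). A minimal sketch, assuming mmgp 3.5.12 and leaving every other argument at its default; the file path is a placeholder, and whether the load actually succeeds still depends on the checkpoint containing everything the loader expects (e.g. an embedded or separately supplied config):

    from mmgp import offload

    # .ckpt now passes the extension check; like .pt it is loaded without safetensors metadata
    model = offload.fast_load_transformers_model("checkpoints/text_encoder.ckpt")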
@@ -2481,7 +2498,7 @@ def {fname}(module, *args, **kwargs):



- def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
+ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, vram_safety_coefficient = 0.8, verboseLevel = -1):
      """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
      pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
      quantizeTransformer: set True by default will quantize on the fly the video / image model
@@ -2490,6 +2507,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
  budgets: 0 by default (unlimited). If non 0, it corresponds to the maximum size in MB that every model will occupy at any moment
  (in fact the real usage is twice this number). It is very efficient to reduce VRAM consumption but this feature may be very slow
  if pinnedMemory is not enabled
+ vram_safety_coefficient: float between 0 and 1 (exclusive), default 0.8. Sets the maximum portion of VRAM that can be used for models.
+ Lower values provide more safety margin but may reduce performance.
  """
  self = offload()
  self.verboseLevel = verboseLevel
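The new vram_safety_coefficient keyword exposes what used to be a hardcoded 0.8 cap on how much of the GPU's memory a single model may claim (see the hunk further down where coef is assigned). A usage sketch, with pipe standing in for any pipeline or dict of modules already prepared for offloading:

    from mmgp import offload

    # Keep each model within 70% of total VRAM instead of the default 80%;
    # on a 24 GB card that is roughly 0.7 * 24576 MB ≈ 17203 MB of per-model budget.
    offload.all(pipe, quantizeTransformer = True, vram_safety_coefficient = 0.7)

Per the validation added further down, the value must be a Python float strictly between 0 and 1, so 1.0 or 0.0 are rejected with a ValueError.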
@@ -2505,7 +2524,11 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
  return float(b[:-1]) * self.device_mem_capacity
  else:
      return b * ONE_MB
-
+
+ # Validate vram_safety_coefficient
+ if not isinstance(vram_safety_coefficient, float) or vram_safety_coefficient <= 0 or vram_safety_coefficient >= 1:
+     raise ValueError("vram_safety_coefficient must be a float between 0 and 1 (exclusive)")
+
  budget = 0
  if not budgets is None:
      if isinstance(budgets , dict):
@@ -2650,14 +2673,14 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
  model_budget = new_budget if model_budget == 0 or new_budget < model_budget else model_budget
  if model_budget > 0 and model_budget > current_model_size:
      model_budget = 0
- coef =0.8
+ coef =vram_safety_coefficient
  if current_model_size > coef * self.device_mem_capacity and model_budget == 0 or model_budget > coef * self.device_mem_capacity:
      if verboseLevel >= 1:
          if model_budget == 0:
-             print(f"Model '{model_id}' is too large ({current_model_size/ONE_MB:0.1f} MB) to fit entirely in {coef * 100}% of the VRAM (max capacity is {coef * self.device_mem_capacity/ONE_MB}) MB)")
+             print(f"Model '{model_id}' is too large ({current_model_size/ONE_MB:0.1f} MB) to fit entirely in {coef * 100:.0f}% of the VRAM (max capacity is {coef * self.device_mem_capacity/ONE_MB:0.1f}) MB)")
          else:
              print(f"Budget ({budget/ONE_MB:0.1f} MB) for Model '{model_id}' is too important so that this model can fit in the VRAM (max capacity is {self.device_mem_capacity/ONE_MB}) MB)")
-         print(f"Budget allocation for this model has been consequently reduced to the 80% of max GPU Memory ({coef * self.device_mem_capacity/ONE_MB:0.1f} MB). This may not leave enough working VRAM and you will probably need to define manually a lower budget for this model.")
+         print(f"Budget allocation for this model has been consequently reduced to the {coef * 100:.0f}% of max GPU Memory ({coef * self.device_mem_capacity/ONE_MB:0.1f} MB). This may not leave enough working VRAM and you will probably need to define manually a lower budget for this model.")
      model_budget = coef * self.device_mem_capacity


@@ -2681,6 +2704,21 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
  print(f"Model '{model_id}' already pinned to reserved memory")
  else:
      _pin_to_memory(current_model, model_id, partialPinning= partialPinning, pinnedPEFTLora = pinnedPEFTLora, perc_reserved_mem_max = perc_reserved_mem_max, verboseLevel=verboseLevel)
+ # empty_tensor = torch.empty((1,))
+ # for sub_module_name, sub_module in current_model.named_modules():
+ # for k, p in sub_module.named_parameters(recurse=False):
+ # if p is not None:
+ # if isinstance(p, QTensor):
+ # p._data.data = empty_tensor
+ # p._scale.data = empty_tensor
+ # else:
+ # p.data = empty_tensor
+ # del k
+ # for k, v in sub_module.named_buffers(recurse=False):
+ # del k
+ # sub_module = None
+ # v = None
+ # gc.collect()
  current_budget = model_budgets[model_id]
  cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False
  self.loaded_blocks[model_id] = None
mmgp/safetensors2.py CHANGED
@@ -46,7 +46,16 @@ class MmapTracker:
  file_path = os.path.join(*s)
  self.file_path = file_path # os.path.abspath(file_path)
  self.count = 0
- mmm[file_path] = self
+ key = file_path
+ i = 1
+ while True:
+     if key not in mmm:
+         mmm[key] = self
+         break
+     i +=1
+     key = key + "#" + str(i)
+ self.mmm_key = key
+ # print(f"MMAP Add: {file_path}: {mmm.keys()}")

  def register(self, mmap_obj, map_id, start, size):

@@ -61,7 +70,8 @@ class MmapTracker:

  print(f"MMap Manager of file '{self.file_path}' : MMap no {map_id} has been released" + text)
  if self.count == self._already_released:
-     del mmm[self.file_path]
+     # print(f"MMAP Del: {self.file_path}: {mmm.keys()}")
+     del mmm[self.mmm_key ]

  self._maps.pop(map_id, None)

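Previously the tracker keyed itself in mmm by file path alone, so opening the same file a second time would overwrite the first entry and a later del mmm[...] could target the wrong (or an already-removed) entry. The two hunks above give each tracker its own key and remember it in self.mmm_key for the release path. A standalone sketch of the key-generation loop, with mmm reduced to a plain dict (not the real safetensors2.mmm):

    mmm = {}

    def unique_key(file_path, tracker):
        key, i = file_path, 1
        while key in mmm:             # same exit condition as the `while True` loop above
            i += 1
            key = key + "#" + str(i)  # each retry appends to the previous candidate key
        mmm[key] = tracker
        return key

    print(unique_key("model.safetensors", object()))   # -> model.safetensors
    print(unique_key("model.safetensors", object()))   # -> model.safetensors#2
    print(unique_key("model.safetensors", object()))   # -> model.safetensors#2#3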
@@ -240,7 +250,7 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None, extr
  t = t.view(torch.uint16)
  elif dtype == torch.float8_e5m2 or dtype == torch.float8_e4m3fn:
      t = t.view(torch.uint8)
- buffer = t.numpy().tobytes()
+ buffer = t.cpu().numpy().tobytes()
  bytes_written = writer.write(buffer)
  assert bytes_written == size
  i+=1
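The small change above matters because torch.Tensor.numpy() only works on CPU tensors; for a CUDA tensor it raises a TypeError asking you to call .cpu() first. Moving the tensor to host memory before serialising keeps torch_write_file usable when the state dict still lives on the GPU. A small reproduction sketch (the CUDA branch is only exercised on a machine with a GPU):

    import torch

    t = torch.ones(4, device = "cuda" if torch.cuda.is_available() else "cpu")
    try:
        t.numpy()                       # raises TypeError when t is a CUDA tensor
    except TypeError:
        pass
    buffer = t.cpu().numpy().tobytes()  # the pattern now used by torch_write_file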
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mmgp
- Version: 3.5.10
+ Version: 3.5.12
  Summary: Memory Management for the GPU Poor
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
  Requires-Python: >=3.10
@@ -15,7 +15,7 @@ Dynamic: license-file


  <p align="center">
- <H2>Memory Management 3.5.10 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.5.12 for the GPU Poor by DeepBeepMeep</H2>
  </p>


@@ -0,0 +1,9 @@
+ __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+ mmgp/offload.py,sha256=qUr0KW3eGtkNLc2eZvgz8roK2jFh9T-KpPe6icSin7I,132211
+ mmgp/safetensors2.py,sha256=zYNMprt1KoxgVALbcz6DawxsQDNNRImvgO9cYRChUiY,19028
+ mmgp-3.5.12.dist-info/licenses/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+ mmgp-3.5.12.dist-info/METADATA,sha256=zbOHAwD5QciOmKHWdHt9zpMO3KtIyYadeVytReJ52lo,16311
+ mmgp-3.5.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ mmgp-3.5.12.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+ mmgp-3.5.12.dist-info/RECORD,,
@@ -1,2 +1,2 @@
- GNU GENERAL PUBLIC LICENSE
+ GNU GENERAL PUBLIC LICENSE
  Version 3, 29 June 2007
@@ -1,9 +0,0 @@
- __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
- mmgp/offload.py,sha256=trK86XTXP5sebBV2rn8TZ5q2kDcwyii1JsUTr3Fa_So,130319
- mmgp/safetensors2.py,sha256=4nKV13qCMabnNEB1TA_ueFbfGYYmiQ9racR_C6SsGug,18693
- mmgp-3.5.10.dist-info/licenses/LICENSE.md,sha256=DD-WIS0BkPoWJ_8hQO3J8hMP9K_1-dyrYv1YCbkxcDU,94
- mmgp-3.5.10.dist-info/METADATA,sha256=xkcp_PywTbsEmhKdi5d6hgQl0dqcQn-mV7V1S9G1GWo,16311
- mmgp-3.5.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- mmgp-3.5.10.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
- mmgp-3.5.10.dist-info/RECORD,,