mmgp-3.2.8-py3-none-any.whl → mmgp-3.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mmgp/offload.py CHANGED
@@ -1,4 +1,4 @@
- # ------------------ Memory Management 3.2.8 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+ # ------------------ Memory Management 3.3.1 for the GPU Poor by DeepBeepMeep (mmgp)------------------
  #
  # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
  # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -92,6 +92,8 @@ ONE_MB = 1048576
  sizeofbfloat16 = torch.bfloat16.itemsize
  sizeofint8 = torch.int8.itemsize
  total_pinned_bytes = 0
+ max_pinnable_bytes = 0
+
  physical_memory= psutil.virtual_memory().total

  HEADER = '\033[95m'
@@ -319,6 +321,13 @@ def _extract_tie_weights_from_sd(sd , sd_name, verboseLevel =1):
      print(f"Found {tied_weights_count} tied weights for a total of {tied_weights_total/ONE_MB:0.2f} MB, last : {tied_weights_last}")

  def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
+     global max_pinnable_bytes, total_pinned_bytes
+     if max_pinnable_bytes > 0 and total_pinned_bytes >= max_pinnable_bytes:
+
+         if verboseLevel>=1 :
+             print(f"Unable pin data of '{sd_name}' to reserved RAM as there is no reserved RAM left")
+         return
+
      current_big_tensor_size = 0
      big_tensor_no = 0
      big_tensors_sizes = []
@@ -393,10 +402,19 @@ def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TE


  def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = True, gig_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
+
+     global max_pinnable_bytes, total_pinned_bytes
+     if max_pinnable_bytes > 0 and total_pinned_bytes >= max_pinnable_bytes:
+
+         if verboseLevel>=1 :
+             print(f"Unable pin data of '{model_id}' to reserved RAM as there is no reserved RAM left")
+         return
+
      if partialPinning:
          towers_names, _ = _detect_main_towers(model)


+
      current_big_tensor_size = 0
      big_tensor_no = 0
      big_tensors_sizes = []
@@ -484,13 +502,18 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
      total = 0


+     failed_planned_allocation = False

      for size in big_tensors_sizes:
          try:
+             # if total > 7000 * ONE_MB:
+             # raise Exception ("test no more reserved RAM")
              current_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True, device="cpu")
              big_tensors.append(current_big_tensor)
          except:
              print(f"Unable to pin more tensors for this model as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
+             max_pinnable_bytes = total + total_pinned_bytes
+             failed_planned_allocation = True
              break

          last_big_tensor += 1
@@ -553,13 +576,13 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
              p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
              tensor_no += 1
              del p
-     global total_pinned_bytes
+     model._pinned_bytes = total
      total_pinned_bytes += total
      del params_dict
      gc.collect()

      if verboseLevel >=1:
-         if partialPinning:
+         if partialPinning or failed_planned_allocation:
              print(f"The model was partially pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
          else:
              print(f"The whole model was pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
@@ -575,7 +598,7 @@ def _welcome():
      if welcome_displayed:
          return
      welcome_displayed = True
-     print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.2.8) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+     print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.3.1) by DeepBeepMeep ************{ENDC}{UNBOLD}")

  def _extract_num_from_str(num_in_str):
      size = len(num_in_str)
@@ -882,10 +905,11 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
          return source + CrLf + text

      def trunc(text, sz):
+         text = str(text)
          if len(text) < sz:
-             return str(text)
+             return text
          else:
-             return str(text)[0:sz] + '...'
+             return text[0:sz] + '...'

      if not isinstance(lora_path, list):
          lora_path = [lora_path]
@@ -1408,7 +1432,9 @@ def extract_models(obj = None, prefix = None):
      elif prefix[ -1:] != "/":
          prefix + "/"

-     for name in dir(obj):
+     for name in dir(obj):
+         if name in ["_execution_device"]:
+             continue
          element = getattr(obj,name)
          if name in ("pipeline", "pipe"):
              pipeline = element
@@ -1550,7 +1576,7 @@ class offload:
              lora_A, lora_B, alpha = lora_data
              key = adapter + '_GPU'
              if to_GPU:
-                 lora_module[key] = [lora_A.cuda(), lora_B.cuda(), alpha]
+                 lora_module[key] = [lora_A.cuda(non_blocking=True), lora_B.cuda(non_blocking=True), alpha]
              elif key in lora_module:
                  del lora_module[key]

@@ -1594,8 +1620,8 @@ class offload:
              lora_data = loras_model_data.get(parent_module, None)
              if lora_data != None:
                  loras_modules[parent_module]= lora_data
-             if len(loras_modules) > 0:
-                 self._move_loras(loras_active_adapters, loras_modules, True)
+         if len(loras_modules) > 0:
+             self._move_loras(loras_active_adapters, loras_modules, True)

          loaded_block = self.loaded_blocks[model_id]

@@ -2019,7 +2045,7 @@ class offload:
          print(f"Async loading plan for model '{model_id}' : {(preload_total+base_size)/ONE_MB:0.2f} MB will be preloaded (base size of {base_size/ONE_MB:0.2f} MB + {preload_total/total_size*100:0.1f}% of recurrent layers data) with a {max_blocks_fetch/ONE_MB:0.2f} MB async" + (" circular" if len(towers) == 1 else "") + " shuttle")

      def release(self):
-         global last_offload_obj
+         global last_offload_obj, total_pinned_bytes

          if last_offload_obj == self:
              last_offload_obj = None
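Editor's note: together with the next hunk, release() now takes part in the pinned-RAM accounting. Each pinned model stores its footprint in model._pinned_bytes, and releasing the model returns that amount to total_pinned_bytes, so later pinning calls can proceed even after max_pinnable_bytes has been hit. Below is a schematic sketch of that counter logic only; reserve and free are hypothetical names for illustration, not mmgp APIs.

total_pinned_bytes = 0
max_pinnable_bytes = 0   # 0 means the ceiling has not been discovered yet

def can_pin_more():
    # Mirrors the early-return guard added to the pinning helpers in 3.3.1:
    # once a ceiling is known, only pin while still below it.
    return max_pinnable_bytes == 0 or total_pinned_bytes < max_pinnable_bytes

def reserve(model, nbytes):
    global total_pinned_bytes
    model["_pinned_bytes"] = nbytes
    total_pinned_bytes += nbytes

def free(model):
    global total_pinned_bytes
    total_pinned_bytes -= model.pop("_pinned_bytes", 0)

model = {}
reserve(model, 512)
free(model)              # what offload.release() now does for each pinned model
assert total_pinned_bytes == 0 and can_pin_more()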
@@ -2035,6 +2061,8 @@ class offload:

          for model_id, model in self.models.items():
              move_loras_to_device(model, "cpu")
+             if hasattr(model, "_pinned_bytes"):
+                 total_pinned_bytes -= model._pinned_bytes
              if hasattr(model, "_loras_model_data"):
                  unload_loras_from_model(model)

@@ -2046,7 +2074,7 @@ class offload:



- def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
+ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
      """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
      pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
      quantizeTransformer: set True by default will quantize on the fly the video / image model
@@ -2216,9 +2244,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, l

          model_budgets[model_id] = model_budget

-     partialPinning = False

-     if estimatesBytesToPin > 0 and estimatesBytesToPin >= (max_reservable_memory - total_pinned_bytes):
+     if not partialPinning and estimatesBytesToPin > 0 and estimatesBytesToPin >= (max_reservable_memory - total_pinned_bytes):
          if self.verboseLevel >=1:
              print(f"Switching to partial pinning since full requirements for pinned models is {estimatesBytesToPin/ONE_MB:0.1f} MB while estimated available reservable RAM is {(max_reservable_memory-total_pinned_bytes)/ONE_MB:0.1f} MB. You may increase the value of parameter 'perc_reserved_mem_max' to a value higher than {perc_reserved_mem_max:0.2f} to force full pinnning." )
          partialPinning = True
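Editor's note: with the signature change above, callers can request partial pinning explicitly instead of relying on the automatic downgrade when reservable RAM runs short. The sketch below is illustrative usage only, not taken from the package documentation: the keyword names follow the offload.all signature shown in this diff, while pipe is assumed to be a pipeline (or dict of modules) already loaded on CPU.

from mmgp import offload

# `pipe` is assumed to be an already-loaded pipeline or a dict of modules.
offload.all(
    pipe,
    pinnedMemory=True,      # reserve page-locked RAM for faster CPU<->GPU transfers
    partialPinning=True,    # new in 3.3.1: pin only part of each model up front
    verboseLevel=1,
)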
mmgp-3.2.8.dist-info/METADATA → mmgp-3.3.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: mmgp
- Version: 3.2.8
+ Version: 3.3.1
  Summary: Memory Management for the GPU Poor
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
  License: GNU GENERAL PUBLIC LICENSE
@@ -13,10 +13,11 @@ Requires-Dist: optimum-quanto
  Requires-Dist: accelerate
  Requires-Dist: safetensors
  Requires-Dist: psutil
+ Dynamic: license-file


  <p align="center">
- <H2>Memory Management 3.2.8 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.3.1 for the GPU Poor by DeepBeepMeep</H2>
  </p>


mmgp-3.3.1.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+ __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+ mmgp/offload.py,sha256=hG-gMFeHsRkjaPan_lwiTsQOctkXylJMiWhyL3KvGQA,106337
+ mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
+ mmgp-3.3.1.dist-info/licenses/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+ mmgp-3.3.1.dist-info/METADATA,sha256=SF0kLwi8zGHF1F53ZxFDZq5bDCWE39l-A24tYeyyhHo,16153
+ mmgp-3.3.1.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
+ mmgp-3.3.1.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+ mmgp-3.3.1.dist-info/RECORD,,
mmgp-3.2.8.dist-info/WHEEL → mmgp-3.3.1.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (76.1.0)
+ Generator: setuptools (77.0.3)
  Root-Is-Purelib: true
  Tag: py3-none-any

mmgp-3.2.8.dist-info/RECORD REMOVED
@@ -1,9 +0,0 @@
- __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
- mmgp/offload.py,sha256=Wwk3uV3ZJv3ApyX-vpzukOllkBOTkLwGm5qDadmqVqQ,105209
- mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
- mmgp-3.2.8.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
- mmgp-3.2.8.dist-info/METADATA,sha256=_3nE_8-UHpItfJsJsb4KUIs_WdROc68SCTNTP5lj_ho,16131
- mmgp-3.2.8.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
- mmgp-3.2.8.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
- mmgp-3.2.8.dist-info/RECORD,,