mmgp 3.3.0__py3-none-any.whl → 3.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mmgp/offload.py CHANGED
@@ -1,4 +1,4 @@
- # ------------------ Memory Management 3.3.0 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+ # ------------------ Memory Management 3.3.1 for the GPU Poor by DeepBeepMeep (mmgp)------------------
  #
  # This module contains multiple optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
  # This is a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -92,6 +92,8 @@ ONE_MB = 1048576
  sizeofbfloat16 = torch.bfloat16.itemsize
  sizeofint8 = torch.int8.itemsize
  total_pinned_bytes = 0
+ max_pinnable_bytes = 0
+
  physical_memory= psutil.virtual_memory().total

  HEADER = '\033[95m'
@@ -319,6 +321,13 @@ def _extract_tie_weights_from_sd(sd , sd_name, verboseLevel =1):
      print(f"Found {tied_weights_count} tied weights for a total of {tied_weights_total/ONE_MB:0.2f} MB, last : {tied_weights_last}")

  def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
+     global max_pinnable_bytes, total_pinned_bytes
+     if max_pinnable_bytes > 0 and total_pinned_bytes >= max_pinnable_bytes:
+
+         if verboseLevel>=1 :
+             print(f"Unable to pin data of '{sd_name}' to reserved RAM as there is no reserved RAM left")
+         return
+
      current_big_tensor_size = 0
      big_tensor_no = 0
      big_tensors_sizes = []
@@ -393,10 +402,19 @@ def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TE


  def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = True, gig_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
+
+     global max_pinnable_bytes, total_pinned_bytes
+     if max_pinnable_bytes > 0 and total_pinned_bytes >= max_pinnable_bytes:
+
+         if verboseLevel>=1 :
+             print(f"Unable to pin data of '{model_id}' to reserved RAM as there is no reserved RAM left")
+         return
+
      if partialPinning:
          towers_names, _ = _detect_main_towers(model)


+
      current_big_tensor_size = 0
      big_tensor_no = 0
      big_tensors_sizes = []
@@ -484,13 +502,18 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
      total = 0


+     failed_planned_allocation = False

      for size in big_tensors_sizes:
          try:
+             # if total > 7000 * ONE_MB:
+             #     raise Exception ("test no more reserved RAM")
              current_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True, device="cpu")
              big_tensors.append(current_big_tensor)
          except:
              print(f"Unable to pin more tensors for this model as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
+             max_pinnable_bytes = total + total_pinned_bytes
+             failed_planned_allocation = True
              break

          last_big_tensor += 1
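This try/except is where the new ceiling is learned: the first pinned allocation that fails records max_pinnable_bytes (what was reserved for this model so far plus what is already pinned), and the guards added at the top of _pin_sd_to_memory and _pin_to_memory then skip pinning entirely once that ceiling has been used up. As a standalone illustration of the same probe pattern, and not part of the package (the helper name and chunk size are invented, and page-locked allocation assumes a CUDA-capable PyTorch install), a sketch in plain PyTorch:

    import torch

    ONE_MB = 1048576

    def probe_pinned_ceiling(chunk_mb=256, max_mb=8192):
        # Allocate page-locked (pinned) CPU buffers until the allocator refuses,
        # mirroring the torch.empty(..., pin_memory=True) probe used above.
        buffers, total = [], 0
        try:
            while total < max_mb * ONE_MB:
                buffers.append(torch.empty(chunk_mb * ONE_MB, dtype=torch.uint8,
                                           pin_memory=True, device="cpu"))
                total += chunk_mb * ONE_MB
        except Exception:
            pass  # allocation refused: 'total' is the usable pinning ceiling
        del buffers  # release the probe buffers immediately
        return total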
@@ -553,13 +576,13 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
          p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
          tensor_no += 1
      del p
-     global total_pinned_bytes
+     model._pinned_bytes = total
      total_pinned_bytes += total
      del params_dict
      gc.collect()

      if verboseLevel >=1:
-         if partialPinning:
+         if partialPinning or failed_planned_allocation:
              print(f"The model was partially pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
          else:
              print(f"The whole model was pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
@@ -575,7 +598,7 @@ def _welcome():
      if welcome_displayed:
          return
      welcome_displayed = True
-     print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.2.8) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+     print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.3.1) by DeepBeepMeep ************{ENDC}{UNBOLD}")

  def _extract_num_from_str(num_in_str):
      size = len(num_in_str)
@@ -2022,7 +2045,7 @@ class offload:
          print(f"Async loading plan for model '{model_id}' : {(preload_total+base_size)/ONE_MB:0.2f} MB will be preloaded (base size of {base_size/ONE_MB:0.2f} MB + {preload_total/total_size*100:0.1f}% of recurrent layers data) with a {max_blocks_fetch/ONE_MB:0.2f} MB async" + (" circular" if len(towers) == 1 else "") + " shuttle")

      def release(self):
-         global last_offload_obj
+         global last_offload_obj, total_pinned_bytes

          if last_offload_obj == self:
              last_offload_obj = None
@@ -2038,6 +2061,8 @@ class offload:

          for model_id, model in self.models.items():
              move_loras_to_device(model, "cpu")
+             if hasattr(model, "_pinned_bytes"):
+                 total_pinned_bytes -= model._pinned_bytes
              if hasattr(model, "_loras_model_data"):
                  unload_loras_from_model(model)

@@ -2049,7 +2074,7 @@



- def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
+ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
      """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
      pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
      quantizeTransformer: set True by default will quantize on the fly the video / image model
@@ -2219,9 +2244,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, l

      model_budgets[model_id] = model_budget

-     partialPinning = False

-     if estimatesBytesToPin > 0 and estimatesBytesToPin >= (max_reservable_memory - total_pinned_bytes):
+     if not partialPinning and estimatesBytesToPin > 0 and estimatesBytesToPin >= (max_reservable_memory - total_pinned_bytes):
          if self.verboseLevel >=1:
              print(f"Switching to partial pinning since full requirements for pinned models is {estimatesBytesToPin/ONE_MB:0.1f} MB while estimated available reservable RAM is {(max_reservable_memory-total_pinned_bytes)/ONE_MB:0.1f} MB. You may increase the value of parameter 'perc_reserved_mem_max' to a value higher than {perc_reserved_mem_max:0.2f} to force full pinning." )
          partialPinning = True
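Taken together, the user-visible change in 3.3.1 is that partial pinning can now be requested up front through the new partialPinning argument of offload.all, rather than only being switched on when the reservable-RAM estimate falls short. A minimal usage sketch, assuming a Diffusers-style pipeline object named pipe (the pipeline itself is a placeholder, not part of this diff):

    from mmgp import offload

    # 'pipe' stands for any pipeline or dict of modules you would normally pass in.
    offload.all(
        pipe,
        pinnedMemory=True,    # pin model weights to reserved RAM
        partialPinning=True,  # new in 3.3.1: request partial pinning from the start
        verboseLevel=1,
    )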
mmgp-3.3.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mmgp
- Version: 3.3.0
+ Version: 3.3.1
  Summary: Memory Management for the GPU Poor
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
  License: GNU GENERAL PUBLIC LICENSE
@@ -17,7 +17,7 @@ Dynamic: license-file


  <p align="center">
- <H2>Memory Management 3.3.0 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.3.1 for the GPU Poor by DeepBeepMeep</H2>
  </p>


mmgp-3.3.1.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+ __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+ mmgp/offload.py,sha256=hG-gMFeHsRkjaPan_lwiTsQOctkXylJMiWhyL3KvGQA,106337
+ mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
+ mmgp-3.3.1.dist-info/licenses/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+ mmgp-3.3.1.dist-info/METADATA,sha256=SF0kLwi8zGHF1F53ZxFDZq5bDCWE39l-A24tYeyyhHo,16153
+ mmgp-3.3.1.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
+ mmgp-3.3.1.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+ mmgp-3.3.1.dist-info/RECORD,,
mmgp-3.3.0.dist-info/RECORD DELETED
@@ -1,9 +0,0 @@
- __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
- mmgp/offload.py,sha256=xdlYbB8nKUywAAMPcfCzJmCxYHvBB5vcZgv2wEQTtbE,105329
- mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
- mmgp-3.3.0.dist-info/licenses/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
- mmgp-3.3.0.dist-info/METADATA,sha256=33eB_YmC6PciTkzi_Z_gsWWzoz6RJgyLbEItFatVghk,16153
- mmgp-3.3.0.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
- mmgp-3.3.0.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
- mmgp-3.3.0.dist-info/RECORD,,
File without changes