mmgp 3.3.3__py3-none-any.whl → 3.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mmgp might be problematic.

mmgp/offload.py CHANGED
@@ -1,4 +1,4 @@
1
- # ------------------ Memory Management 3.3.3 for the GPU Poor by DeepBeepMeep (mmgp)------------------
1
+ # ------------------ Memory Management 3.4.0 for the GPU Poor by DeepBeepMeep (mmgp)------------------
2
2
  #
3
3
  # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
4
4
  # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -89,7 +89,7 @@ mmm = safetensors2.mmm
89
89
  default_verboseLevel = 1
90
90
 
91
91
  ONE_MB = 1048576
92
- sizeofbfloat16 = torch.bfloat16.itemsize
92
+ sizeofhalffloat = torch.bfloat16.itemsize
93
93
  sizeofint8 = torch.int8.itemsize
94
94
  total_pinned_bytes = 0
95
95
  max_pinnable_bytes = 0
@@ -149,10 +149,17 @@ def _compute_verbose_level(level):
149
149
  safetensors2.verboseLevel = level
150
150
  return level
151
151
 
152
- def _get_perc_reserved_mem_max(perc_reserved_mem_max):
153
- if perc_reserved_mem_max<=0:
152
+ def _get_perc_reserved_mem_max(perc_reserved_mem_max = 0):
153
+ if perc_reserved_mem_max <=0:
154
+ perc_reserved_mem_max = os.getenv("perc_reserved_mem_max", 0)
155
+
156
+ if perc_reserved_mem_max <= 0:
154
157
  perc_reserved_mem_max = 0.40 if os.name == 'nt' else 0.5
155
- return perc_reserved_mem_max
158
+ return perc_reserved_mem_max
159
+
160
+ def _get_max_reservable_memory(perc_reserved_mem_max = 0):
161
+ max_reservable_memory = perc_reserved_mem_max * physical_memory
162
+ return max_reservable_memory
156
163
 
157
164
  def _detect_main_towers(model, min_floors = 5):
158
165
  cur_blocks_prefix = None
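
The hunk above lets the reserved-RAM cap fall back to an OS environment variable of the same name when no positive value is passed in. A minimal sketch of both paths, assuming a `pipe` object built as in the package README; the 0.3 figure is illustrative:

```
from mmgp import offload

# Explicit keyword argument: a positive value takes precedence over the environment variable
# (otherwise the default of 0.40 on Windows / 0.50 elsewhere applies).
offload.all(pipe, perc_reserved_mem_max=0.3)

# New fallback added in this release: export perc_reserved_mem_max (e.g. "0.3") before the
# offload object is created and leave the keyword at its default of 0.
```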
@@ -214,7 +221,7 @@ def _get_model(model_path):
214
221
  _filename = _path[-1]
215
222
  _path = _path[:-1]
216
223
  if len(_path)<=1:
217
- raise("file not found")
224
+ raise Exception("file not found")
218
225
  else:
219
226
  try:
220
227
  from huggingface_hub import hf_hub_download #snapshot_download,
@@ -290,8 +297,9 @@ def _get_tensor_ref(p):
290
297
  return p.data_ptr()
291
298
 
292
299
 
293
- # BIG_TENSOR_MAX_SIZE = 2**28 # 256 MB
294
- BIG_TENSOR_MAX_SIZE = 2**27 # 128 MB
300
+ BIG_TENSOR_MAX_SIZE = 2**28 # 256 MB
301
+ BIG_TENSOR_MIN_SIZE = 2**26 # 64 MB
302
+ RESERVED_RAM_MIN_AVAILABLE = 2**27 # 128 MB
295
303
 
296
304
  def _extract_tie_weights_from_sd(sd , sd_name, verboseLevel =1):
297
305
  tied_weights = {}
@@ -322,7 +330,7 @@ def _extract_tie_weights_from_sd(sd , sd_name, verboseLevel =1):
322
330
 
323
331
  def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
324
332
  global max_pinnable_bytes, total_pinned_bytes
325
- if max_pinnable_bytes > 0 and max_pinnable_bytes >= max_pinnable_bytes:
333
+ if max_pinnable_bytes > 0 and total_pinned_bytes >= max_pinnable_bytes:
326
334
 
327
335
  if verboseLevel>=1 :
328
336
  print(f"Unable pin data of '{sd_name}' to reserved RAM as there is no reserved RAM left")
@@ -357,6 +365,12 @@ def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TE
357
365
  last_big_tensor = 0
358
366
  total = 0
359
367
 
368
+ try:
369
+ dummy_pinned_tensor = torch.empty( RESERVED_RAM_MIN_AVAILABLE, dtype= torch.uint8, pin_memory=True, device="cpu")
370
+ except:
371
+ print("There isn't any Reserved RAM left, you may need to choose a profile with a higher number that requires less Reserved RAM or set OS env 'perc_reserved_mem_max' to a value less 0.3")
372
+ return
373
+
360
374
  for size in big_tensors_sizes:
361
375
  try:
362
376
  current_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True, device="cpu")
@@ -367,6 +381,7 @@ def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TE
367
381
 
368
382
  last_big_tensor += 1
369
383
  total += size
384
+ del dummy_pinned_tensor
370
385
 
371
386
 
372
387
  tensor_no = 0
@@ -401,10 +416,10 @@ def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TE
401
416
  return
402
417
 
403
418
 
404
- def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = True, big_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
419
+ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = True, big_tensor_size = BIG_TENSOR_MAX_SIZE, perc_reserved_mem_max = 0,verboseLevel = 1):
405
420
 
406
421
  global max_pinnable_bytes, total_pinned_bytes
407
- if max_pinnable_bytes > 0 and max_pinnable_bytes >= max_pinnable_bytes:
422
+ if max_pinnable_bytes > 0 and total_pinned_bytes >= max_pinnable_bytes:
408
423
 
409
424
  if verboseLevel>=1 :
410
425
  print(f"Unable pin data of '{model_id}' to reserved RAM as there is no reserved RAM left")
@@ -414,6 +429,8 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
414
429
  towers_names, _ = _detect_main_towers(model)
415
430
 
416
431
 
432
+ perc_reserved_mem_max = _get_perc_reserved_mem_max(perc_reserved_mem_max)
433
+ max_reservable_memory = _get_max_reservable_memory(perc_reserved_mem_max)
417
434
 
418
435
  current_big_tensor_size = 0
419
436
  big_tensor_no = 0
@@ -502,8 +519,12 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
502
519
 
503
520
 
504
521
  failed_planned_allocation = False
505
-
506
522
  gc.collect()
523
+ try:
524
+ dummy_pinned_tensor = torch.empty( RESERVED_RAM_MIN_AVAILABLE, dtype= torch.uint8, pin_memory=True, device="cpu")
525
+ except:
526
+ print("There isn't any Reserved RAM left, you may need to choose a profile with a higher number that requires less Reserved RAM or set OS env 'perc_reserved_mem_max' to a value less than{perc_reserved_mem_max}")
527
+ return
507
528
 
508
529
  last_allocated_big_tensor = -1
509
530
  tensor_no = 0
@@ -530,16 +551,21 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
530
551
  big_tensor_no, offset, length = tensor_map_indexes[tensor_no]
531
552
  if last_allocated_big_tensor < big_tensor_no:
532
553
  last_allocated_big_tensor += 1
533
- size = big_tensors_sizes[last_allocated_big_tensor]
554
+ size = max(big_tensors_sizes[last_allocated_big_tensor], BIG_TENSOR_MIN_SIZE)
534
555
  try:
535
- # if total > 7000 * ONE_MB:
536
- # raise Exception ("test no more reserved RAM")
556
+ if max_reservable_memory > 0 and ( (total_pinned_bytes + total + size) >= max_reservable_memory):
557
+ dummy_pinned_tensor = None
558
+ failed_planned_allocation = True
559
+ max_pinnable_bytes = total_pinned_bytes + total
560
+ break
561
+
537
562
  current_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True, device="cpu")
538
563
  big_tensors.append(current_big_tensor)
539
564
  except:
540
- print(f"Unable to pin more tensors for this model as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
541
- max_pinnable_bytes = total + total_pinned_bytes
565
+ print(f"Unable to pin more tensors for this model as the maximum reservable memory has been reached ({total/ONE_MB:.2f}).")
566
+ dummy_pinned_tensor = None
542
567
  failed_planned_allocation = True
568
+ max_pinnable_bytes = total_pinned_bytes + total
543
569
  break
544
570
 
545
571
  total += size
@@ -569,6 +595,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
569
595
  p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
570
596
  tensor_no += 1
571
597
  del p
598
+ del dummy_pinned_tensor
572
599
  model._pinned_bytes = total
573
600
  total_pinned_bytes += total
574
601
  del params_dict
@@ -591,7 +618,7 @@ def _welcome():
591
618
  if welcome_displayed:
592
619
  return
593
620
  welcome_displayed = True
594
- print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.3.3) by DeepBeepMeep ************{ENDC}{UNBOLD}")
621
+ print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.4.0) by DeepBeepMeep ************{ENDC}{UNBOLD}")
595
622
 
596
623
  def _extract_num_from_str(num_in_str):
597
624
  size = len(num_in_str)
@@ -728,10 +755,10 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 2*
728
755
  tied_weights[submodule_name]= (n, ) + match
729
756
  else:
730
757
  cache_ref[ref] = (submodule_name, n)
731
- size += torch.numel(p.data) * sizeofbfloat16
758
+ size += torch.numel(p.data) * sizeofhalffloat
732
759
 
733
760
  for p in submodule.buffers(recurse=False):
734
- size += torch.numel(p.data) * sizeofbfloat16
761
+ size += torch.numel(p.data) * sizeofhalffloat
735
762
 
736
763
 
737
764
 
@@ -853,7 +880,7 @@ def split_linear_modules(model, map ):
853
880
  sub_bias = torch.split(bias, split_sizes, dim=0)
854
881
  for sub_name, _subdata, _subbias, _subscale in zip(mapped_modules, sub_data, sub_bias, sub_scale):
855
882
  with init_empty_weights():
856
- sub_module = QLinear(_subdata.shape[1], _subdata.shape[0], bias=bias != None, device ="cpu", dtype=torch.bfloat16)
883
+ sub_module = QLinear(_subdata.shape[1], _subdata.shape[0], bias=bias != None, device ="cpu", dtype=weight.dtype)
857
884
  sub_module.weight = torch.nn.Parameter(WeightQBytesTensor.create(weight.qtype, weight.axis, _subdata.size(), weight.stride(), _subdata, _subscale, activation_qtype=weight.activation_qtype, requires_grad=weight.requires_grad ))
858
885
  if bias != None:
859
886
  sub_module.bias = torch.nn.Parameter(_subbias)
@@ -866,7 +893,7 @@ def split_linear_modules(model, map ):
866
893
  sub_bias = torch.split(bias, split_sizes, dim=0)
867
894
  for sub_name, subdata, subbias in zip(mapped_modules, sub_data, sub_bias):
868
895
  with init_empty_weights():
869
- sub_module = torch.nn.Linear( subdata.shape[1], subdata.shape[0], bias=bias != None, device ="cpu", dtype=torch.bfloat16)
896
+ sub_module = torch.nn.Linear( subdata.shape[1], subdata.shape[0], bias=bias != None, device ="cpu", dtype=weight.dtype)
870
897
  sub_module.weight = torch.nn.Parameter(subdata , requires_grad=False)
871
898
  if bias != None:
872
899
  sub_module.bias = torch.nn.Parameter(subbias)
@@ -877,17 +904,15 @@ def split_linear_modules(model, map ):
877
904
 
878
905
  def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, check_only = False, ignore_model_variations = False, pinnedLora = False, split_linear_modules_map = None, preprocess_sd = None, verboseLevel = -1,):
879
906
  verboseLevel = _compute_verbose_level(verboseLevel)
880
- modules_dict = {k: v for k,v in model.named_modules()}
881
907
 
908
+ loras_model_data = getattr(model, "_loras_model_data", None)
909
+ if loras_model_data == None:
910
+ raise Exception(f"No Loras has been declared for this model while creating the corresponding offload object")
911
+
882
912
  if not check_only:
883
- loras_model_data = dict()
884
- model._loras_model_data = loras_model_data
885
- loras_active_adapters = set()
886
- model._loras_active_adapters = loras_active_adapters
887
- loras_scaling = dict()
888
- model._loras_scaling = loras_scaling
889
- loras_tied_weights = dict()
890
- model._loras_tied_weights = loras_tied_weights
913
+ unload_loras_from_model(model)
914
+
915
+ modules_dict = {k: v for k,v in model.named_modules()}
891
916
 
892
917
  CrLf = '\r\n'
893
918
  error_msg = ""
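
With this change `load_loras_into_model` no longer creates the per-module lora containers lazily; they must already exist, which happens when the model is declared through the `loras` argument of `offload.all` (or `offload.profile`, which forwards its extra keywords). A hedged sketch of the new call order; the model id, file name and weight are illustrative:

```
from mmgp import offload, profile_type

# Declare up front which models will receive loras ...
offload.profile(pipe, profile_type.HighRAM_LowVRAM, loras=["transformer"])

# ... then load lora files into that model (diffusers-format safetensors).
offload.load_loras_into_model(
    pipe["transformer"],                 # or pipe.transformer for a pipeline object
    ["my_style_lora.safetensors"],       # hypothetical file name
    lora_multi=[1.0],
)
```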
@@ -927,9 +952,6 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
927
952
  skip = False
928
953
  state_dict = safetensors2.torch_load_file(path, writable_tensors= False)
929
954
 
930
-
931
-
932
-
933
955
  if preprocess_sd != None:
934
956
  state_dict = preprocess_sd(state_dict)
935
957
 
@@ -1045,12 +1067,13 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
1045
1067
  break
1046
1068
  if not check_only:
1047
1069
  loras_module_data = loras_model_data.get(module, None)
1048
- if loras_module_data == None:
1049
- loras_module_data = dict()
1050
- loras_model_data[module] = loras_module_data
1070
+ assert loras_module_data != None
1071
+ # if loras_module_data == None:
1072
+ # loras_module_data = dict()
1073
+ # loras_model_data[module] = loras_module_data
1051
1074
  loras_adapter_data = loras_module_data.get(adapter_name, None)
1052
- lora_A = None if lora_A == None else lora_A.to(torch.bfloat16)
1053
- lora_B = None if lora_B == None else lora_B.to(torch.bfloat16)
1075
+ lora_A = None if lora_A == None else lora_A.to(module.weight.dtype)
1076
+ lora_B = None if lora_B == None else lora_B.to(module.weight.dtype)
1054
1077
  if loras_adapter_data == None:
1055
1078
  alpha = lora_alphas.get(k[:-len("lora_X.weight")] + "alpha", 1.)
1056
1079
  loras_adapter_data = [lora_A, lora_B, alpha]
@@ -1108,12 +1131,17 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
1108
1131
  return new_lora_path
1109
1132
 
1110
1133
  def unload_loras_from_model(model):
1111
- model._loras_model_data = None
1134
+ for _, v in model._loras_model_data.items():
1135
+ v.clear()
1136
+
1137
+ model._loras_active_adapters = set()
1138
+ model._loras_scaling = dict()
1139
+ model._loras_tied_weights = dict()
1112
1140
  model._loras_errors = None
1113
1141
  model._loras_adapters = None
1114
- model._loras_active_adapters = None
1115
1142
  model._loras_scaling = None
1116
1143
 
1144
+
1117
1145
  def set_step_no_for_lora(model, step_no):
1118
1146
  model._lora_step_no = step_no
1119
1147
 
@@ -1482,6 +1510,7 @@ class offload:
1482
1510
  self.transfer_stream = torch.cuda.Stream()
1483
1511
  self.async_transfers = False
1484
1512
  self.parameters_ref = {}
1513
+ self.max_reservable_memory = 0
1485
1514
 
1486
1515
  global last_offload_obj
1487
1516
  last_offload_obj = self
@@ -1728,22 +1757,22 @@ class offload:
1728
1757
  gc.collect()
1729
1758
  self.last_reserved_mem_check = time.time()
1730
1759
 
1731
- def move_args_to_gpu(self, *args, **kwargs):
1760
+ def move_args_to_gpu(self, dtype, *args, **kwargs):
1732
1761
  new_args= []
1733
1762
  new_kwargs={}
1763
+
1734
1764
  for arg in args:
1735
1765
  if torch.is_tensor(arg):
1736
1766
  if arg.dtype == torch.float32:
1737
- arg = arg.to(torch.bfloat16).cuda(non_blocking=True)
1767
+ arg = arg.to(dtype).cuda(non_blocking=True)
1738
1768
  elif not arg.is_cuda:
1739
1769
  arg = arg.cuda(non_blocking=True)
1740
1770
  new_args.append(arg)
1741
-
1742
1771
  for k in kwargs:
1743
1772
  arg = kwargs[k]
1744
1773
  if torch.is_tensor(arg):
1745
1774
  if arg.dtype == torch.float32:
1746
- arg = arg.to(torch.bfloat16).cuda(non_blocking=True)
1775
+ arg = arg.to(dtype).cuda(non_blocking=True)
1747
1776
  elif not arg.is_cuda:
1748
1777
  arg = arg.cuda(non_blocking=True)
1749
1778
  new_kwargs[k]= arg
@@ -1804,7 +1833,7 @@ class offload:
1804
1833
  loras_scaling = model._loras_scaling
1805
1834
  training = False
1806
1835
 
1807
-
1836
+ dtype = weight.dtype
1808
1837
  if weight.shape[-1] < x.shape[-2] : # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
1809
1838
  if len(active_adapters) > 0:
1810
1839
  if isinstance(submodule, QModuleMixin):
@@ -1832,7 +1861,7 @@ class offload:
1832
1861
  result = torch.nn.functional.linear(x, weight, bias=submodule.bias)
1833
1862
 
1834
1863
  if len(active_adapters) > 0:
1835
- x = x.to(torch.bfloat16)
1864
+ x = x.to(dtype)
1836
1865
 
1837
1866
  for active_adapter in active_adapters:
1838
1867
  data = loras_data.get(active_adapter + '_GPU', None)
@@ -1857,14 +1886,14 @@ class offload:
1857
1886
  return result
1858
1887
 
1859
1888
 
1860
- def hook_lora_linear(self, submodule, current_model, model_id, submodule_name):
1889
+ def hook_lora_linear(self, submodule, current_model, model_id, loras_model_data, submodule_name):
1861
1890
  old_forward = submodule.forward
1891
+
1892
+ loras_data = {}
1893
+ loras_model_data[submodule] = loras_data
1894
+
1862
1895
  def lora_linear_forward(module, *args, **kwargs):
1863
- loras_model_data = getattr(current_model, "_loras_model_data", None)
1864
- loras_data = None
1865
- if loras_model_data != None:
1866
- loras_data = loras_model_data.get(submodule, None)
1867
- if loras_data == None:
1896
+ if len(loras_data) == 0:
1868
1897
  return old_forward(*args, **kwargs)
1869
1898
  else:
1870
1899
  return self._lora_linear_forward(current_model, submodule, loras_data, *args, **kwargs)
@@ -1872,7 +1901,7 @@ class offload:
1872
1901
  return functools.update_wrapper(functools.partial(lora_linear_forward, submodule), old_forward)
1873
1902
 
1874
1903
  def ensure_model_loaded(self, model_id):
1875
- if model_id in self.active_models_ids:
1904
+ if model_id in self.active_models_ids:
1876
1905
  return
1877
1906
  # new_model_id = getattr(module, "_mm_id")
1878
1907
  # do not always unload existing models if it is more efficient to keep in them in the GPU
@@ -1894,8 +1923,9 @@ class offload:
1894
1923
  target_module.register_forward_pre_hook(preload_blocks_for_compile)
1895
1924
 
1896
1925
 
1897
- def hook_check_empty_cache_needed(self, target_module, model_id, blocks_name, previous_method, context):
1926
+ def hook_check_empty_cache_needed(self, target_module, model, model_id, blocks_name, previous_method, context):
1898
1927
 
1928
+ dtype = model._dtype
1899
1929
  qint4quantization = isinstance(target_module, QModuleMixin) and target_module.weight!= None and target_module.weight.qtype == qint4
1900
1930
  if qint4quantization:
1901
1931
  pass
@@ -1920,8 +1950,8 @@ class offload:
1920
1950
  self.empty_cache_if_needed()
1921
1951
  elif blocks_name != self.loaded_blocks[model_id] and blocks_name not in self.preloaded_blocks_per_model[model_id]:
1922
1952
  self.gpu_load_blocks(model_id, blocks_name)
1923
- if qint4quantization:
1924
- args, kwargs = self.move_args_to_gpu(*args, **kwargs)
1953
+ if qint4quantization and dtype !=None:
1954
+ args, kwargs = self.move_args_to_gpu(dtype, *args, **kwargs)
1925
1955
 
1926
1956
  return previous_method(*args, **kwargs)
1927
1957
 
@@ -1932,11 +1962,13 @@ class offload:
1932
1962
 
1933
1963
 
1934
1964
  def hook_change_module(self, target_module, model, model_id, module_id, previous_method):
1965
+ dtype = model._dtype
1935
1966
 
1936
1967
  def check_change_module(module, *args, **kwargs):
1937
1968
  self.ensure_model_loaded(model_id)
1938
1969
  # transfer leftovers inputs that were incorrectly created in the RAM (mostly due to some .device tests that returned incorrectly "cpu")
1939
- args, kwargs = self.move_args_to_gpu(*args, **kwargs)
1970
+ if dtype != None:
1971
+ args, kwargs = self.move_args_to_gpu(dtype, *args, **kwargs)
1940
1972
  return previous_method(*args, **kwargs)
1941
1973
 
1942
1974
  if hasattr(target_module, "_mm_id"):
@@ -2068,7 +2100,7 @@ class offload:
2068
2100
 
2069
2101
 
2070
2102
 
2071
- def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertFloatToBfloat16 = True, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
2103
+ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, partialPinning = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, convertWeightsFloatTo = torch.bfloat16, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
2072
2104
  """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
2073
2105
  pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
2074
2106
  quantizeTransformer: set True by default will quantize on the fly the video / image model
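
The boolean `convertFloatToBfloat16` switch is replaced by `convertWeightsFloatTo`, which takes the target dtype itself (bfloat16 remains the default, `None` disables the conversion). A minimal sketch, assuming a `pipe` built as in the README:

```
import torch
from mmgp import offload

# Leftover float32 weights are now converted to the dtype given here
# (the 3.3.x equivalent was convertFloatToBfloat16=True).
offload.all(pipe, convertWeightsFloatTo=torch.float16)
```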
@@ -2082,6 +2114,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
2082
2114
  self.verboseLevel = verboseLevel
2083
2115
  safetensors2.verboseLevel = verboseLevel
2084
2116
  self.modules_data = {}
2117
+
2085
2118
  model_budgets = {}
2086
2119
 
2087
2120
  windows_os = os.name == 'nt'
@@ -2158,9 +2191,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
2158
2191
  # torch._logging.set_logs(recompiles=True)
2159
2192
  # torch._inductor.config.realize_opcount_threshold = 100 # workaround bug "AssertionError: increase TRITON_MAX_BLOCK['X'] to 4096."
2160
2193
 
2161
-
2162
- perc_reserved_mem_max = _get_perc_reserved_mem_max(perc_reserved_mem_max)
2163
- max_reservable_memory = perc_reserved_mem_max * physical_memory
2194
+ perc_reserved_mem_max = _get_perc_reserved_mem_max(perc_reserved_mem_max)
2195
+ max_reservable_memory = _get_max_reservable_memory(perc_reserved_mem_max)
2164
2196
 
2165
2197
  estimatesBytesToPin = 0
2166
2198
  for model_id in models:
@@ -2175,12 +2207,10 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
2175
2207
  modelPinned = (pinAllModels or model_id in modelsToPin) and not hasattr(current_model,"_already_pinned")
2176
2208
 
2177
2209
  current_model_size = 0
2210
+ model_dtype = None
2178
2211
  for n, p in current_model.named_parameters():
2179
2212
  p.requires_grad = False
2180
2213
  if isinstance(p, QTensor):
2181
- # # fix quanto bug (seems to have been fixed)
2182
- # if not modelPinned and p._scale.dtype == torch.float32:
2183
- # p._scale = p._scale.to(torch.bfloat16)
2184
2214
  if p._qtype == qint4:
2185
2215
  if hasattr(p,"_scale_shift"):
2186
2216
  current_model_size += torch.numel(p._scale_shift) * p._scale_shift.element_size()
@@ -2192,13 +2222,20 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
2192
2222
  else:
2193
2223
  current_model_size += torch.numel(p._scale) * p._scale.element_size()
2194
2224
  current_model_size += torch.numel(p._data) * p._data.element_size()
2225
+ dtype = p._scale.dtype
2195
2226
 
2196
2227
  else:
2197
- if convertFloatToBfloat16 and p.data.dtype == torch.float32:
2198
- # convert any left overs float32 weight to bloat16 to divide by 2 the model memory footprint
2199
- p.data = p.data.to(torch.bfloat16)
2228
+ dtype = p.data.dtype
2229
+ if convertWeightsFloatTo != None and dtype == torch.float32:
2230
+ # convert any left overs float32 weight to bfloat16 / float16 to divide by 2 the model memory footprint
2231
+ dtype = convertWeightsFloatTo if model_dtype == None else model_dtype
2232
+ p.data = p.data.to(dtype)
2233
+ if model_dtype== None:
2234
+ model_dtype = dtype
2235
+ else:
2236
+ assert model_dtype == dtype
2200
2237
  current_model_size += torch.numel(p.data) * p.data.element_size()
2201
-
2238
+ current_model._dtype = model_dtype
2202
2239
  for b in current_model.buffers():
2203
2240
  # do not convert 32 bits float to 16 bits since buffers are few (and potential gain low) and usually they are needed for precision calculation (for instance Rope)
2204
2241
  current_model_size += torch.numel(b.data) * b.data.element_size()
@@ -2266,12 +2303,15 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
2266
2303
  if self.verboseLevel >=1:
2267
2304
  print(f"Model '{model_id}' already pinned to reserved memory")
2268
2305
  else:
2269
- _pin_to_memory(current_model, model_id, partialPinning= partialPinning, pinnedPEFTLora = pinnedPEFTLora, verboseLevel=verboseLevel)
2306
+ _pin_to_memory(current_model, model_id, partialPinning= partialPinning, pinnedPEFTLora = pinnedPEFTLora, perc_reserved_mem_max = perc_reserved_mem_max, verboseLevel=verboseLevel)
2270
2307
 
2271
2308
  current_budget = model_budgets[model_id]
2272
2309
  cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False
2273
2310
  self.loaded_blocks[model_id] = None
2274
- any_lora = loras !=None and model_id in loras or getattr(current_model, "_loras_model_data", False)
2311
+ any_lora = loras !=None and model_id in loras
2312
+ if any_lora:
2313
+ loras_model_data = {}
2314
+ current_model._loras_model_data = loras_model_data
2275
2315
  for submodule_name, submodule in current_model.named_modules():
2276
2316
  # create a fake 'accelerate' parameter so that the _execution_device property returns always "cuda"
2277
2317
  # (it is queried in many pipelines even if offloading is not properly implemented)
@@ -2304,7 +2344,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
2304
2344
 
2305
2345
  if hasattr(submodule, "forward"):
2306
2346
  if any_lora and isinstance(submodule, torch.nn.Linear):
2307
- submodule_method = self.hook_lora_linear(submodule, current_model, model_id, submodule_name)
2347
+ submodule_method = self.hook_lora_linear(submodule, current_model, model_id, loras_model_data, submodule_name)
2308
2348
  else:
2309
2349
  submodule_method = getattr(submodule, "forward")
2310
2350
  if callable(submodule_method):
@@ -2313,7 +2353,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
2313
2353
  elif compilationInThisOne and submodule in towers_modules:
2314
2354
  self.hook_preload_blocks_for_compilation(submodule, model_id, cur_blocks_name, context = submodule_name )
2315
2355
  else:
2316
- self.hook_check_empty_cache_needed(submodule, model_id, cur_blocks_name, submodule_method, context = submodule_name )
2356
+ self.hook_check_empty_cache_needed(submodule, current_model, model_id, cur_blocks_name, submodule_method, context = submodule_name )
2317
2357
 
2318
2358
  self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name, submodule_name)
2319
2359
 
mmgp-3.3.3.dist-info/METADATA → mmgp-3.4.0.dist-info/METADATA CHANGED
@@ -1,197 +1,197 @@
1
- Metadata-Version: 2.4
2
- Name: mmgp
3
- Version: 3.3.3
4
- Summary: Memory Management for the GPU Poor
5
- Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
6
- License: GNU GENERAL PUBLIC LICENSE
7
- Version 3, 29 June 2007
8
- Requires-Python: >=3.10
9
- Description-Content-Type: text/markdown
10
- License-File: LICENSE.md
11
- Requires-Dist: torch>=2.1.0
12
- Requires-Dist: optimum-quanto
13
- Requires-Dist: accelerate
14
- Requires-Dist: safetensors
15
- Requires-Dist: psutil
16
- Dynamic: license-file
17
-
18
-
19
- <p align="center">
20
- <H2>Memory Management 3.3.3 for the GPU Poor by DeepBeepMeep</H2>
21
- </p>
22
-
23
-
24
- This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 12 to 24 GB GPU limited card.
25
- This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
26
- times in a pipe (eg VAE).
27
-
28
- Requirements:
29
- - VRAM: minimum 6 GB, recommended 24 GB (RTX 3090/ RTX 4090)
30
- - RAM: minimum 24 GB, recommended 48 GB
31
-
32
- This module features 5 profiles in order to able to run the model at a decent speed on a low end consumer config (24 GB of RAM and 6 VRAM) and to run it at a very good speed (if not the best) on a high end consumer config (48 GB of RAM and 24 GB of VRAM).\
33
- These RAM requirements are for Linux systems. Due to different memory management Windows will require an extra 16 GB of RAM to run the corresponding profile.
34
-
35
- Each profile may use a combination of the following:
36
- - Low RAM consumption (thanks to a rewritten safetensors library) that allows low RAM on the fly quantization
37
- - Smart automated loading / unloading of models in the GPU to avoid unloading models that may be needed again soon
38
- - Smart slicing of models to reduce memory occupied by models in the VRAM
39
- - Ability to pin models to reserved RAM to accelerate transfers to VRAM
40
- - Async transfers to VRAM to avoid a pause when loading a new slice of a model
41
- - Automated on the fly quantization or ability to load pre quantized models
42
- - Pretrained Lora support with low RAM requirements
43
- - Support for pytorch compilation on Linux and WSL (supported on pure Windows but requires a complex Triton Installation).
44
-
45
- ## Sample applications that use mmgp
46
- It is recommended to have a look at these applications to see how mmgp was implemented in each of them:
47
- - Wan2GP: https://github.com/deepbeepmeep/Wan2GP :\
48
- An excellent text to video and image to video generator by Alibaba
49
-
50
- - Hunyuan3D-2GP: https://github.com/deepbeepmeep/Hunyuan3D-2GP :\
51
- A great image to 3D and text to 3D tool by the Tencent team. Thanks to mmgp it can run with less than 6 GB of VRAM
52
-
53
- - HuanyuanVideoGP: https://github.com/deepbeepmeep/HunyuanVideoGP :\
54
- One of the best open source Text to Video generator
55
-
56
- - FluxFillGP: https://github.com/deepbeepmeep/FluxFillGP :\
57
- One of the best inpainting / outpainting tools based on Flux that can run with less than 12 GB of VRAM.
58
-
59
- - Cosmos1GP: https://github.com/deepbeepmeep/Cosmos1GP :\
60
- This application include two models: a text to world generator and a image / video to world (probably the best open source image to video generator).
61
-
62
- - OminiControlGP: https://github.com/deepbeepmeep/OminiControlGP :\
63
- A Flux derived application very powerful that can be used to transfer an object of your choice in a prompted scene. With mmgp you can run it with only 6 GB of VRAM.
64
-
65
- - YuE GP: https://github.com/deepbeepmeep/YuEGP :\
66
- A great song generator (instruments + singer's voice) based on prompted Lyrics and a genre description. Thanks to mmgp you can run it with less than 10 GB of VRAM without waiting forever.
67
-
68
- ## Installation
69
- First you need to install the module in your current project with:
70
- ```shell
71
- pip install mmgp
72
- ```
73
-
74
-
75
- ## Usage
76
-
77
- It is almost plug and play and just needs to be invoked from the main app just after the model pipeline has been created.
78
- 1) First make sure that the pipeline explictly loads the models in the CPU device, for instance:
79
- ```
80
- pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16).to("cpu")
81
- ```
82
-
83
- 2) Once every potential Lora has been loaded and merged, add the following lines for a quick setup:
84
- ```
85
- from mmgp import offload, profile_type
86
- offload.profile(pipe, profile_type.HighRAM_LowVRAM_Fast)
87
- ```
88
-
89
- You can choose between 5 profiles depending on your hardware:
90
- - HighRAM_HighVRAM (1): at least 48 GB of RAM and 24 GB of VRAM : the fastest well suited for a RTX 3090 / RTX 4090 but consumes much more VRAM, adapted for fast shorter video or small batches of pictures
91
- - HighRAM_LowVRAM (2): at least 48 GB of RAM and 12 GB of VRAM : a bit slower, better suited for RTX 3070/3080/4070/4080 or for RTX 3090 / RTX 4090 with large pictures batches or long videos
92
- - LowRAM_HighVRAM (3): at least 32 GB of RAM and 24 GB of VRAM : adapted for RTX 3090 / RTX 4090 with limited RAM but at the cost of VRAM (shorter videos / fewer images)
93
- - LowRAM_LowVRAM (4): at least 32 GB of RAM and 12 GB of VRAM : if you have little VRAM or want to generate longer videos / more images
94
- - VerylowRAM_LowVRAM (5): at least 24 GB of RAM and 10 GB of VRAM : if you don't have much it won't be fast but maybe it will work
95
-
96
- Profile 2 (High RAM) and 4 (Low RAM) are the most recommended profiles since they are versatile (support for long videos for a slight performance cost).\
97
- If you use Flux derived applciation profile 1 and 3 will offer much faster generation times.
98
- In any case, a safe approach is to start from profile 5 (default profile) and then go down progressively to profile 4 and then to profile 2 as long as the app remains responsive or doesn't trigger any out of memory error.
99
-
100
- By default the model named 'transformer' will be quantized to 8 bits for all profiles. If you don't want that you may specify the optional parameter *quantizeTransformer = False*.
101
-
102
- Every parameter set automatically by a profile can be overridden with one or multiple parameters accepted by *offload.all* (see below):
103
- ```
104
- from mmgp import offload, profile_type
105
- offload.profile(pipe, profile_type.HighRAM_LowVRAM, budgets = 1000)
106
- ```
107
- If you want to know which parameter are set by one specific profile you can use the parameter *verboseLevel=2*
108
-
109
- **It is highly recommended to put the *from mmgp import offload, profile_type* at the top of your main python file (that is as the first import) so that all the existing safetensors calls are redirected to mmpg.**
110
-
111
-
112
- ## Alternatively you may want to create your own profile with specific parameters:
113
-
114
- For example:
115
- ```
116
- from mmgp import offload
117
- offload.all(pipe, pinnedMemory=True, ExtraModelsToQuantize = ["text_encoder_2"] )
118
- ```
119
- - pinnedMemory: Boolean (for all models) or List of models ids to pin to RAM. Every model pinned to RAM will load much faster (up to 2 times) but this requires more RAM
120
- - quantizeTransformer: boolean by default True. The 'transformer' model in the pipe contains usually the video or image generator is by defaut; quantized on the fly by default to 8 bits. If you want to save time on disk and reduce the loading time, you may want to load directly a prequantized model. If you don't want to quantize the image generator, you need to set the option *quantizeTransformer* to *False* to turn off on the fly quantization.
121
- - extraModelsToQuantize: list of additional modelids of models to quantize on the fly. If the corresponding model is already quantized, this option will be ignored.
122
- - budgets: either a number in mega bytes, (for all models, if 0 unlimited budget) a string that is perecentage of the total VRAM or a dictionary that maps model ids to mega bytes : define the approximate budget in mega bytes that is allocated in VRAM for a model. Try not to allocate all the available VRAM so that the rest can be used to process the data. To define the default value in the dictionary, you may add entry named "*".
123
- The smaller this number, the more VRAM left for image data / longer video but also the slower because there will be lots of loading / unloading between the RAM and the VRAM. If model is too big to fit in a budget, it will be broken down in multiples parts that will be unloaded / loaded consequently. The speed of low budget can be increased (up to 2 times) by turning on the options pinnedMemory and asyncTransfers.
124
- - workingVRAM: either a number in mega bytes, a string that is perecentage of the total VRAM or a dictionary that maps a model ids to a number in mega bytes that corresponds to a minimum amount of VRAM that should be left for the data processed by the model. This number will prevail if it is in conflict with a too high budget defined for the same model.
125
- - asyncTransfers: boolean, load to the GPU the next model part while the current part is being processed. This requires twice the budget if any is defined. This may increase speed by 20% (mostly visible on fast modern GPUs).
126
- - verboseLevel: number between 0 and 2 (1 by default), provides various level of feedback of the different processes
127
- - compile: list of model ids to compile, may accelerate up x2 depending on the type of GPU. It makes sense to compile only the model that is frequently used such as the "transformer" model in the case of video or image generation. Compilation requires Triton to be installed. Triton is available out of the box on Linux or WSL but requires to be installed with Windows: https://github.com/woct0rdho/triton-windows
128
- - coTenantsMap: a dictionary that maps a model id to a list of other models with which it accepts to share the VRAM at the same time. This is useful to avoid unefficient loading / unloading when two models processes are interleaved. For instance *coTenantsMap = { "text_encoder_2": ["text_encoder"] }* , here when *text_encoder_2* is loaded it won't unload *text_encoder*. Please note that the reverse is not true as these maps by design are not symetrical to allow tailored workflows. If you need to have as well *text_encoder* that won't unload *text_encoder_2* if it is already loaded *coTenantsMap = { "text_encoder_2": ["text_encoder"], "text_encoder": ["text_encoder_2"] }*
129
-
130
- If you are short on RAM and plan to work with quantized models, it is recommended to load pre-quantized models direclty rather than using on the fly quantization, it will be faster and consume slightly less RAM.
131
-
132
- ## Going further
133
-
134
- The module includes several tools to package a light version of your favorite video / image generator:
135
- - *extract_models(string prefix, obj to explore)*\
136
- This tool will try to detect for you models that are embedded in a pipeline or in some custom class. It will save you time by building a pipe dictionary required by *offload.all* or "offload.profile*. The prefix correponds to the text that will appear before the name of each model in the dictionary.
137
-
138
- - *load_loras_into_model(model, lora_path, lora_multi, activate_all_loras = True)*\
139
- Load in a model a list of Lora described by a list of path *lora_path* and a list of *weights coefficients*.
140
- The Lora file must be in the *diffusers* format. This function works also on non diffusers models. However if there is already an official Lora support for a model it is recommended to use the official diffusers functions. By default all the load loras will be activated or they can be activated later using *activate_loras*.
141
-
142
- -*activate_loras(model, lora_nos, lora_multi = None )*\
143
- Activate the loras whose nos are in the list of nos. Every lora that is not this list and that was activated previously will be disactivated.
144
-
145
- - *save_model(model, file_path, do_quantize = False, quantizationType = qint8 )*\
146
- Save tensors of a model already loaded in memory in a safetensor format (much faster to reload). You can save it in a quantized format (default qint8 quantization recommended).
147
- The resulting safetensor file will contain extra fields in its metadata such as the quantization map and its configuration, so you will be able to move the file around without files such as *config.json* or *file_map.json*.
148
- You will need *load_model_data* or *fast_load_transformers_model* to read the file again . You may also load it using the default *safetensor* librar however you will need to provide in the same directory any complementary file that are usually requested (for instance *config.json*)
149
-
150
- - *load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToRAM = False, partialPin = False)*\
151
- Load the tensors data of a model in RAM of a model already initialized with no data. Detect and handle quantized models saved previously with *save_model*.A model can also be quantized on the fly while being loaded. The model which is loaded can be pinned to RAM while it is loaded, this is more RAM efficient than pinning tensors later using *offline.all* or *offline.profile*
152
-
153
- - *fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToRAM = False, partialPin = False)*\
154
- Initialize (build the model hierarchy in memory) and fast load the corresponding tensors of a 'transformers' or 'diffusers' library model.
155
- The advantages over the original *from_pretrained* method is that a full model can fit into a single file with a filename of your choosing (thefore you can have multiple 'transformers' versions of the same model in the same directory) and prequantized models are processed in a transparent way.
156
- Last but not least, you can also on the fly pin to RAM the whole model or the most important part of it (partialPin = True) in a more efficient way (faster and requires less RAM) than if you did through *offload.all* or *offload.profile*.
157
-
158
-
159
- The typical workflow wil be:
160
- 1) temporarly insert the *save_model* function just after a model has been fully loaded to save a copy of the model / quantized model.
161
- 2) replace the full initalizing / loading logic with *fast_load_transformers_model* (if there is a *from_pretrained* call to a transformers object) or only the tensor loading functions (*torch.load_model_file* and *torch.load_state_dict*) with *load_model_data after* the initializing logic.
162
-
163
- ## Special cases
164
- Sometime there isn't an explicit pipe object as each submodel is loaded separately in the main app. If this is the case, you may try to use *extract_models* or create a dictionary that manually maps all the models.\
165
- For instance :
166
-
167
-
168
- - for flux derived models:
169
- ```
170
- pipe = { "text_encoder": clip, "text_encoder_2": t5, "transformer": model, "vae":ae }
171
- ```
172
- - for mochi:
173
- ```
174
- pipe = { "text_encoder": self.text_encoder, "transformer": self.dit, "vae":self.decoder }
175
- ```
176
-
177
-
178
- Please note it is recommended to have always one model whose Id is 'transformer' so that you can leverage predefined profiles. The 'transformer' corresponds to the main image / video model which usually needs to be quantized (this is done on the fly by default when loading the model).
179
-
180
- Be careful, lots of models use the T5 XXL as a text encoder. However, quite often their corresponding pipeline configurations point at the official Google T5 XXL repository
181
- where there is a huge 40GB model to download and load. It is cumbersorme as it is a 32 bits model and contains the decoder part of T5 that is not used.
182
- I suggest you use instead one of the 16 bits encoder only version available around, for instance:
183
- ```
184
- text_encoder_2 = T5EncoderModel.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="text_encoder_2", torch_dtype=torch.float16)
185
- ```
186
-
187
- Sometime just providing the pipe won't be sufficient as you will need to change the content of the core model:
188
- - For instance you may need to disable an existing CPU offload logic that already exists (such as manual calls to move tensors between cuda and the cpu)
189
- - mmpg to tries to fake the device as being "cuda" but sometimes some code won't be fooled and it will create tensors in the cpu device and this may cause some issues.
190
-
191
- You are free to use my module for non commercial use as long you give me proper credits. You may contact me on twitter @deepbeepmeep
192
-
193
- Thanks to
194
- ---------
195
- - Huggingface / accelerate for the hooking examples
196
- - Huggingface / quanto for their very useful quantizer
197
- - gau-nernst for his Pinnig RAM samples
1
+ Metadata-Version: 2.4
2
+ Name: mmgp
3
+ Version: 3.4.0
4
+ Summary: Memory Management for the GPU Poor
5
+ Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
6
+ License: GNU GENERAL PUBLIC LICENSE
7
+ Version 3, 29 June 2007
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE.md
11
+ Requires-Dist: torch>=2.1.0
12
+ Requires-Dist: optimum-quanto
13
+ Requires-Dist: accelerate
14
+ Requires-Dist: safetensors
15
+ Requires-Dist: psutil
16
+ Dynamic: license-file
17
+
18
+
19
+ <p align="center">
20
+ <H2>Memory Management 3.3.1 for the GPU Poor by DeepBeepMeep</H2>
21
+ </p>
22
+
23
+
24
+ This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 12 to 24 GB GPU limited card.
25
+ This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
26
+ times in a pipe (eg VAE).
27
+
28
+ Requirements:
29
+ - VRAM: minimum 6 GB, recommended 24 GB (RTX 3090/ RTX 4090)
30
+ - RAM: minimum 24 GB, recommended 48 GB
31
+
32
+ This module features 5 profiles in order to able to run the model at a decent speed on a low end consumer config (24 GB of RAM and 6 VRAM) and to run it at a very good speed (if not the best) on a high end consumer config (48 GB of RAM and 24 GB of VRAM).\
33
+ These RAM requirements are for Linux systems. Due to different memory management Windows will require an extra 16 GB of RAM to run the corresponding profile.
34
+
35
+ Each profile may use a combination of the following:
36
+ - Low RAM consumption (thanks to a rewritten safetensors library) that allows low RAM on the fly quantization
37
+ - Smart automated loading / unloading of models in the GPU to avoid unloading models that may be needed again soon
38
+ - Smart slicing of models to reduce memory occupied by models in the VRAM
39
+ - Ability to pin models to reserved RAM to accelerate transfers to VRAM
40
+ - Async transfers to VRAM to avoid a pause when loading a new slice of a model
41
+ - Automated on the fly quantization or ability to load pre quantized models
42
+ - Pretrained Lora support with low RAM requirements
43
+ - Support for pytorch compilation on Linux and WSL (supported on pure Windows but requires a complex Triton Installation).
44
+
45
+ ## Sample applications that use mmgp
46
+ It is recommended to have a look at these applications to see how mmgp was implemented in each of them:
47
+ - Wan2GP: https://github.com/deepbeepmeep/Wan2GP :\
48
+ An excellent text to video and image to video generator by Alibaba
49
+
50
+ - Hunyuan3D-2GP: https://github.com/deepbeepmeep/Hunyuan3D-2GP :\
51
+ A great image to 3D and text to 3D tool by the Tencent team. Thanks to mmgp it can run with less than 6 GB of VRAM
52
+
53
+ - HuanyuanVideoGP: https://github.com/deepbeepmeep/HunyuanVideoGP :\
54
+ One of the best open source Text to Video generator
55
+
56
+ - FluxFillGP: https://github.com/deepbeepmeep/FluxFillGP :\
57
+ One of the best inpainting / outpainting tools based on Flux that can run with less than 12 GB of VRAM.
58
+
59
+ - Cosmos1GP: https://github.com/deepbeepmeep/Cosmos1GP :\
60
+ This application include two models: a text to world generator and a image / video to world (probably the best open source image to video generator).
61
+
62
+ - OminiControlGP: https://github.com/deepbeepmeep/OminiControlGP :\
63
+ A Flux derived application very powerful that can be used to transfer an object of your choice in a prompted scene. With mmgp you can run it with only 6 GB of VRAM.
64
+
65
+ - YuE GP: https://github.com/deepbeepmeep/YuEGP :\
66
+ A great song generator (instruments + singer's voice) based on prompted Lyrics and a genre description. Thanks to mmgp you can run it with less than 10 GB of VRAM without waiting forever.
67
+
68
+ ## Installation
69
+ First you need to install the module in your current project with:
70
+ ```shell
71
+ pip install mmgp
72
+ ```
73
+
74
+
75
+ ## Usage
76
+
77
+ It is almost plug and play and just needs to be invoked from the main app just after the model pipeline has been created.
78
+ 1) First make sure that the pipeline explictly loads the models in the CPU device, for instance:
79
+ ```
80
+ pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16).to("cpu")
81
+ ```
82
+
83
+ 2) Once every potential Lora has been loaded and merged, add the following lines for a quick setup:
84
+ ```
85
+ from mmgp import offload, profile_type
86
+ offload.profile(pipe, profile_type.HighRAM_LowVRAM_Fast)
87
+ ```
88
+
89
+ You can choose between 5 profiles depending on your hardware:
90
+ - HighRAM_HighVRAM (1): at least 48 GB of RAM and 24 GB of VRAM : the fastest well suited for a RTX 3090 / RTX 4090 but consumes much more VRAM, adapted for fast shorter video or small batches of pictures
91
+ - HighRAM_LowVRAM (2): at least 48 GB of RAM and 12 GB of VRAM : a bit slower, better suited for RTX 3070/3080/4070/4080 or for RTX 3090 / RTX 4090 with large pictures batches or long videos
92
+ - LowRAM_HighVRAM (3): at least 32 GB of RAM and 24 GB of VRAM : adapted for RTX 3090 / RTX 4090 with limited RAM but at the cost of VRAM (shorter videos / fewer images)
93
+ - LowRAM_LowVRAM (4): at least 32 GB of RAM and 12 GB of VRAM : if you have little VRAM or want to generate longer videos / more images
94
+ - VerylowRAM_LowVRAM (5): at least 24 GB of RAM and 10 GB of VRAM : if you don't have much it won't be fast but maybe it will work
95
+
96
+ Profile 2 (High RAM) and 4 (Low RAM) are the most recommended profiles since they are versatile (support for long videos for a slight performance cost).\
97
+ If you use Flux derived applciation profile 1 and 3 will offer much faster generation times.
98
+ In any case, a safe approach is to start from profile 5 (default profile) and then go down progressively to profile 4 and then to profile 2 as long as the app remains responsive or doesn't trigger any out of memory error.
99
+
100
+ By default the model named 'transformer' will be quantized to 8 bits for all profiles. If you don't want that you may specify the optional parameter *quantizeTransformer = False*.
101
+
102
+ Every parameter set automatically by a profile can be overridden with one or multiple parameters accepted by *offload.all* (see below):
103
+ ```
104
+ from mmgp import offload, profile_type
105
+ offload.profile(pipe, profile_type.HighRAM_LowVRAM, budgets = 1000)
106
+ ```
107
+ If you want to know which parameter are set by one specific profile you can use the parameter *verboseLevel=2*
108
+
109
+ **It is highly recommended to put the *from mmgp import offload, profile_type* at the top of your main python file (that is as the first import) so that all the existing safetensors calls are redirected to mmpg.**
110
+
111
+
112
+ ## Alternatively you may want to create your own profile with specific parameters:
113
+
114
+ For example:
115
+ ```
116
+ from mmgp import offload
117
+ offload.all(pipe, pinnedMemory=True, ExtraModelsToQuantize = ["text_encoder_2"] )
118
+ ```
119
+ - pinnedMemory: Boolean (for all models) or List of models ids to pin to RAM. Every model pinned to RAM will load much faster (up to 2 times) but this requires more RAM
120
+ - quantizeTransformer: boolean by default True. The 'transformer' model in the pipe contains usually the video or image generator is by defaut; quantized on the fly by default to 8 bits. If you want to save time on disk and reduce the loading time, you may want to load directly a prequantized model. If you don't want to quantize the image generator, you need to set the option *quantizeTransformer* to *False* to turn off on the fly quantization.
121
+ - extraModelsToQuantize: list of additional modelids of models to quantize on the fly. If the corresponding model is already quantized, this option will be ignored.
122
+ - budgets: either a number in mega bytes, (for all models, if 0 unlimited budget) a string that is perecentage of the total VRAM or a dictionary that maps model ids to mega bytes : define the approximate budget in mega bytes that is allocated in VRAM for a model. Try not to allocate all the available VRAM so that the rest can be used to process the data. To define the default value in the dictionary, you may add entry named "*".
123
+ The smaller this number, the more VRAM left for image data / longer video but also the slower because there will be lots of loading / unloading between the RAM and the VRAM. If model is too big to fit in a budget, it will be broken down in multiples parts that will be unloaded / loaded consequently. The speed of low budget can be increased (up to 2 times) by turning on the options pinnedMemory and asyncTransfers.
124
+ - workingVRAM: either a number in mega bytes, a string that is perecentage of the total VRAM or a dictionary that maps a model ids to a number in mega bytes that corresponds to a minimum amount of VRAM that should be left for the data processed by the model. This number will prevail if it is in conflict with a too high budget defined for the same model.
125
+ - asyncTransfers: boolean, load to the GPU the next model part while the current part is being processed. This requires twice the budget if any is defined. This may increase speed by 20% (mostly visible on fast modern GPUs).
126
+ - verboseLevel: number between 0 and 2 (1 by default), provides various level of feedback of the different processes
127
+ - compile: list of model ids to compile, may accelerate up x2 depending on the type of GPU. It makes sense to compile only the model that is frequently used such as the "transformer" model in the case of video or image generation. Compilation requires Triton to be installed. Triton is available out of the box on Linux or WSL but requires to be installed with Windows: https://github.com/woct0rdho/triton-windows
128
+ - coTenantsMap: a dictionary that maps a model id to a list of other models with which it accepts to share the VRAM at the same time. This is useful to avoid unefficient loading / unloading when two models processes are interleaved. For instance *coTenantsMap = { "text_encoder_2": ["text_encoder"] }* , here when *text_encoder_2* is loaded it won't unload *text_encoder*. Please note that the reverse is not true as these maps by design are not symetrical to allow tailored workflows. If you need to have as well *text_encoder* that won't unload *text_encoder_2* if it is already loaded *coTenantsMap = { "text_encoder_2": ["text_encoder"], "text_encoder": ["text_encoder_2"] }*
129
+
130
+ If you are short on RAM and plan to work with quantized models, it is recommended to load pre-quantized models direclty rather than using on the fly quantization, it will be faster and consume slightly less RAM.
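
Putting the options listed above together, a hedged sketch; the model ids and megabyte figures are purely illustrative:

```
from mmgp import offload

offload.all(
    pipe,
    pinnedMemory=["transformer"],               # pin only the main model to reserved RAM
    extraModelsToQuantize=["text_encoder_2"],   # quantize an extra model on the fly
    budgets={"transformer": 3000, "*": 1000},   # per-model VRAM budgets in MB, "*" = default
    workingVRAM={"transformer": 2000},          # MB kept free for the data being processed
    asyncTransfers=True,
    coTenantsMap={"text_encoder_2": ["text_encoder"]},
    verboseLevel=2,
)
```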
131
+
132
+ ## Going further
133
+
134
+ The module includes several tools to package a light version of your favorite video / image generator:
135
+ - *extract_models(string prefix, obj to explore)*\
136
+ This tool will try to detect for you models that are embedded in a pipeline or in some custom class. It will save you time by building a pipe dictionary required by *offload.all* or "offload.profile*. The prefix correponds to the text that will appear before the name of each model in the dictionary.
137
+
138
+ - *load_loras_into_model(model, lora_path, lora_multi, activate_all_loras = True)*\
139
+ Load in a model a list of Lora described by a list of path *lora_path* and a list of *weights coefficients*.
140
+ The Lora file must be in the *diffusers* format. This function works also on non diffusers models. However if there is already an official Lora support for a model it is recommended to use the official diffusers functions. By default all the load loras will be activated or they can be activated later using *activate_loras*.
141
+
142
+ -*activate_loras(model, lora_nos, lora_multi = None )*\
143
+ Activate the loras whose nos are in the list of nos. Every lora that is not this list and that was activated previously will be disactivated.
144
+
145
+ - *save_model(model, file_path, do_quantize = False, quantizationType = qint8 )*\
146
+ Save tensors of a model already loaded in memory in a safetensor format (much faster to reload). You can save it in a quantized format (default qint8 quantization recommended).
147
+ The resulting safetensor file will contain extra fields in its metadata such as the quantization map and its configuration, so you will be able to move the file around without files such as *config.json* or *file_map.json*.
148
+ You will need *load_model_data* or *fast_load_transformers_model* to read the file again . You may also load it using the default *safetensor* librar however you will need to provide in the same directory any complementary file that are usually requested (for instance *config.json*)
149
+
150
+ - *load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToRAM = False, partialPin = False)*\
151
+ Load the tensors data of a model in RAM of a model already initialized with no data. Detect and handle quantized models saved previously with *save_model*.A model can also be quantized on the fly while being loaded. The model which is loaded can be pinned to RAM while it is loaded, this is more RAM efficient than pinning tensors later using *offline.all* or *offline.profile*
152
+
153
+ - *fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToRAM = False, partialPin = False)*\
154
+ Initialize (build the model hierarchy in memory) and fast load the corresponding tensors of a 'transformers' or 'diffusers' library model.
155
+ The advantages over the original *from_pretrained* method is that a full model can fit into a single file with a filename of your choosing (thefore you can have multiple 'transformers' versions of the same model in the same directory) and prequantized models are processed in a transparent way.
156
+ Last but not least, you can also on the fly pin to RAM the whole model or the most important part of it (partialPin = True) in a more efficient way (faster and requires less RAM) than if you did through *offload.all* or *offload.profile*.
157
+
158
+
159
+ The typical workflow wil be:
160
+ 1) temporarly insert the *save_model* function just after a model has been fully loaded to save a copy of the model / quantized model.
161
+ 2) replace the full initalizing / loading logic with *fast_load_transformers_model* (if there is a *from_pretrained* call to a transformers object) or only the tensor loading functions (*torch.load_model_file* and *torch.load_state_dict*) with *load_model_data after* the initializing logic.
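
A hedged sketch of that two-step workflow using the tools above; the file name is illustrative, `transformer` stands for the already-loaded main model, and the calls assume the tools are reached through the `offload` module as in the earlier examples:

```
from optimum.quanto import qint8   # optimum-quanto is already a declared dependency
from mmgp import offload

# Step 1, run once: save a quantized copy just after the original model is fully loaded.
offload.save_model(transformer, "transformer_int8.safetensors",
                   do_quantize=True, quantizationType=qint8)

# Step 2, on every later run: rebuild and load it in one call, pinned to reserved RAM.
transformer = offload.fast_load_transformers_model(
    "transformer_int8.safetensors", pinToRAM=True)
```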
162
+
163
+ ## Special cases
164
+ Sometime there isn't an explicit pipe object as each submodel is loaded separately in the main app. If this is the case, you may try to use *extract_models* or create a dictionary that manually maps all the models.\
165
+ For instance :
166
+
167
+
168
+ - for flux derived models:
169
+ ```
170
+ pipe = { "text_encoder": clip, "text_encoder_2": t5, "transformer": model, "vae":ae }
171
+ ```
172
+ - for mochi:
173
+ ```
174
+ pipe = { "text_encoder": self.text_encoder, "transformer": self.dit, "vae":self.decoder }
175
+ ```
176
+
177
+
178
+ Please note it is recommended to have always one model whose Id is 'transformer' so that you can leverage predefined profiles. The 'transformer' corresponds to the main image / video model which usually needs to be quantized (this is done on the fly by default when loading the model).
179
+
180
+ Be careful, lots of models use the T5 XXL as a text encoder. However, quite often their corresponding pipeline configurations point at the official Google T5 XXL repository
181
+ where there is a huge 40GB model to download and load. It is cumbersorme as it is a 32 bits model and contains the decoder part of T5 that is not used.
182
+ I suggest you use instead one of the 16 bits encoder only version available around, for instance:
183
+ ```
184
+ text_encoder_2 = T5EncoderModel.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="text_encoder_2", torch_dtype=torch.float16)
185
+ ```
186
+
187
+ Sometime just providing the pipe won't be sufficient as you will need to change the content of the core model:
188
+ - For instance you may need to disable an existing CPU offload logic that already exists (such as manual calls to move tensors between cuda and the cpu)
189
+ - mmpg to tries to fake the device as being "cuda" but sometimes some code won't be fooled and it will create tensors in the cpu device and this may cause some issues.
190
+
191
+ You are free to use my module for non commercial use as long you give me proper credits. You may contact me on twitter @deepbeepmeep
192
+
193
+ Thanks to
194
+ ---------
195
+ - Huggingface / accelerate for the hooking examples
196
+ - Huggingface / quanto for their very useful quantizer
197
+ - gau-nernst for his Pinnig RAM samples
mmgp-3.4.0.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
1
+ __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
3
+ mmgp/offload.py,sha256=LOx2y6wTxktpbhkPmXx8oVOCq2uMjR4BfIGGEoWY12A,108582
4
+ mmgp/safetensors2.py,sha256=rmUbBmK3Dra5prUTTRSVi6-XUFAa9Mj6B5CNPgzt9To,17333
5
+ mmgp-3.4.0.dist-info/licenses/LICENSE.md,sha256=DD-WIS0BkPoWJ_8hQO3J8hMP9K_1-dyrYv1YCbkxcDU,94
6
+ mmgp-3.4.0.dist-info/METADATA,sha256=Nv5a1uTGAse-G6FpoWChJv_gScHwMcSt_NMd7c99JQA,16350
7
+ mmgp-3.4.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
8
+ mmgp-3.4.0.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
9
+ mmgp-3.4.0.dist-info/RECORD,,
mmgp-3.3.3.dist-info/WHEEL → mmgp-3.4.0.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (78.0.2)
2
+ Generator: setuptools (78.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
mmgp-3.3.3.dist-info/licenses/LICENSE.md → mmgp-3.4.0.dist-info/licenses/LICENSE.md CHANGED
@@ -1,2 +1,2 @@
1
- GNU GENERAL PUBLIC LICENSE
1
+ GNU GENERAL PUBLIC LICENSE
2
2
  Version 3, 29 June 2007
mmgp-3.3.3.dist-info/RECORD DELETED
@@ -1,9 +0,0 @@
1
- __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
3
- mmgp/offload.py,sha256=R0UbOXEGAFKd_6090o8v5CkVmJiWmHDQsww7A3-LZEU,106550
4
- mmgp/safetensors2.py,sha256=rmUbBmK3Dra5prUTTRSVi6-XUFAa9Mj6B5CNPgzt9To,17333
5
- mmgp-3.3.3.dist-info/licenses/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
6
- mmgp-3.3.3.dist-info/METADATA,sha256=xcODp7uhIfvy7Il1xEp8ed2VYmH1Eln-EnLy3MM4VGM,16153
7
- mmgp-3.3.3.dist-info/WHEEL,sha256=DK49LOLCYiurdXXOXwGJm6U4DkHkg4lcxjhqwRa0CP4,91
8
- mmgp-3.3.3.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
9
- mmgp-3.3.3.dist-info/RECORD,,