mmgp-3.1.4.post15-py3-none-any.whl → mmgp-3.1.4.post151-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



mmgp/offload.py CHANGED
@@ -1,4 +1,4 @@
1
- # ------------------ Memory Management 3.1.4 for the GPU Poor by DeepBeepMeep (mmgp)------------------
1
+ # ------------------ Memory Management 3.1.4-1591 for the GPU Poor by DeepBeepMeep (mmgp)------------------
2
2
  #
3
3
  # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
4
4
  # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -76,7 +76,18 @@ except:
76
76
  from mmgp import safetensors2
77
77
  from mmgp import profile_type
78
78
 
79
- from optimum.quanto import freeze, qfloat8, qint4 , qint8, quantize, QModuleMixin, QTensor, quantize_module
79
+ from optimum.quanto import freeze, qfloat8, qint4 , qint8, quantize, QModuleMixin, QTensor, quantize_module, register_qmodule
80
+
81
+ # support for Embedding module quantization that is not supported by default by quanto
82
+ @register_qmodule(torch.nn.Embedding)
83
+ class QEmbedding(QModuleMixin, torch.nn.Embedding):
84
+ @classmethod
85
+ def qcreate(cls, module, weights, activations = None, optimizer = None, device = None):
86
+ module.bias = None
87
+ return cls( module.num_embeddings, module.embedding_dim, module.padding_idx , module.max_norm, module.norm_type, module.scale_grad_by_freq, module.sparse, dtype=module.weight.dtype, device=device, weights=weights,
88
+ activations=activations, optimizer=optimizer, quantize_input=True)
89
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
90
+ return torch.nn.functional.embedding( input, self.qweight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse )
80
91
 
81
92
 
82
93
  shared_state = {}
@@ -96,11 +107,6 @@ ENDC = '\033[0m'
96
107
  BOLD ='\033[1m'
97
108
  UNBOLD ='\033[0m'
98
109
 
99
- cotenants_map = {
100
- "text_encoder": ["vae", "text_encoder_2"],
101
- "text_encoder_2": ["vae", "text_encoder"],
102
- }
103
-
104
110
  class clock:
105
111
  def __init__(self):
106
112
  self.start_time = 0
@@ -216,15 +222,17 @@ def _get_model(model_path):
216
222
  if len(_path)<=1:
217
223
  raise("file not found")
218
224
  else:
219
- from huggingface_hub import hf_hub_download #snapshot_download,
220
- repoId= os.path.join(*_path[0:2] ).replace("\\", "/")
221
-
222
- if len(_path) > 2:
223
- _subfolder = os.path.join(*_path[2:] )
224
- model_path = hf_hub_download(repo_id=repoId, filename=_filename, subfolder=_subfolder)
225
- else:
226
- model_path = hf_hub_download(repo_id=repoId, filename=_filename)
225
+ try:
226
+ from huggingface_hub import hf_hub_download #snapshot_download,
227
+ repoId= os.path.join(*_path[0:2] ).replace("\\", "/")
227
228
 
229
+ if len(_path) > 2:
230
+ _subfolder = os.path.join(*_path[2:] )
231
+ model_path = hf_hub_download(repo_id=repoId, filename=_filename, subfolder=_subfolder)
232
+ else:
233
+ model_path = hf_hub_download(repo_id=repoId, filename=_filename)
234
+ except:
235
+ model_path = None
228
236
  return model_path
229
237
 
230
238
 
@@ -278,9 +286,17 @@ def _force_load_parameter(p):
278
286
  torch.utils.swap_tensors(p, q)
279
287
  del q
280
288
 
281
- def _pin_to_memory(model, model_id, partialPinning = False, verboseLevel = 1):
289
+ def _get_tensor_ref(p):
290
+ if isinstance(p, QTensor):
291
+ if p._qtype == qint4:
292
+ return p._data._data.data_ptr()
293
+ else:
294
+ return p._data.data_ptr()
295
+ else:
296
+ return p.data_ptr()
282
297
 
283
298
 
299
+ def _pin_to_memory(model, model_id, partialPinning = False, verboseLevel = 1):
284
300
  if partialPinning:
285
301
  towers_names, _ = _detect_main_towers(model)
286
302
 
@@ -292,56 +308,63 @@ def _pin_to_memory(model, model_id, partialPinning = False, verboseLevel = 1):
292
308
  tensor_map_indexes = []
293
309
  total_tensor_bytes = 0
294
310
 
295
- params_list = []
311
+ params_dict = {} # OrderedDict
296
312
  for k, sub_module in model.named_modules():
297
313
  include = True
298
314
  if partialPinning:
299
315
  include = any(k.startswith(pre) for pre in towers_names) if partialPinning else True
300
316
  if include:
301
- params_list = params_list + [ (k + '.' + n, p, False) for n, p in sub_module.named_parameters(recurse=False)] + [ (k + '.' + n, p, True) for n, p in sub_module.named_buffers(recurse=False)]
302
-
317
+ params_dict.update( { k + '.' + n : (p, False) for n, p in sub_module.named_parameters(recurse=False) } )
318
+ params_dict.update( { k + '.' + n : (b, True) for n, b in sub_module.named_buffers(recurse=False) } )
303
319
 
304
320
  if verboseLevel>=1 :
305
321
  if partialPinning:
306
- if len(params_list) == 0:
322
+ if len(params_dict) == 0:
307
323
  print(f"Unable to apply Partial of '{model_id}' as no isolated main structures were found")
308
324
  else:
309
325
  print(f"Partial pinning of data of '{model_id}' to reserved RAM")
310
326
  else:
311
327
  print(f"Pinning data of '{model_id}' to reserved RAM")
312
328
 
313
- if partialPinning and len(params_list) == 0:
329
+ if partialPinning and len(params_dict) == 0:
314
330
  return
315
331
 
316
-
317
-
318
- for n, p, _ in params_list:
319
- if isinstance(p, QTensor):
320
- if p._qtype == qint4:
321
- if hasattr(p,"_scale_shift"):
322
- length = torch.numel(p._data._data) * p._data._data.element_size() + torch.numel(p._scale_shift) * p._scale_shift.element_size()
332
+ ref_cache = {}
333
+ tied_weights = {}
334
+ for n, (p, _) in params_dict.items():
335
+ ref = _get_tensor_ref(p)
336
+ match = ref_cache.get(ref, None)
337
+ if match != None:
338
+ match_name, match_size = match
339
+ if verboseLevel >=1:
340
+ print(f"Tied weights of {match_size/ONE_MB:0.2f} MB detected: {match_name} <-> {n}")
341
+ tied_weights[n] = match_name
342
+ else:
343
+ if isinstance(p, QTensor):
344
+ if p._qtype == qint4:
345
+ if hasattr(p,"_scale_shift"):
346
+ length = torch.numel(p._data._data) * p._data._data.element_size() + torch.numel(p._scale_shift) * p._scale_shift.element_size()
347
+ else:
348
+ length = torch.numel(p._data._data) * p._data._data.element_size() + torch.numel(p._scale) * p._scale.element_size() + torch.numel(p._shift) * p._shift.element_size()
323
349
  else:
324
- length = torch.numel(p._data._data) * p._data._data.element_size() + torch.numel(p._scale) * p._scale.element_size() + torch.numel(p._shift) * p._shift.element_size()
350
+ length = torch.numel(p._data) * p._data.element_size() + torch.numel(p._scale) * p._scale.element_size()
325
351
  else:
326
- length = torch.numel(p._data) * p._data.element_size() + torch.numel(p._scale) * p._scale.element_size()
327
- else:
328
- length = torch.numel(p.data) * p.data.element_size()
329
-
330
-
331
- if current_big_tensor_size + length > BIG_TENSOR_MAX_SIZE:
332
- big_tensors_sizes.append(current_big_tensor_size)
333
- current_big_tensor_size = 0
334
- big_tensor_no += 1
352
+ length = torch.numel(p.data) * p.data.element_size()
335
353
 
354
+ ref_cache[ref] = (n, length)
355
+ if current_big_tensor_size + length > BIG_TENSOR_MAX_SIZE:
356
+ big_tensors_sizes.append(current_big_tensor_size)
357
+ current_big_tensor_size = 0
358
+ big_tensor_no += 1
336
359
 
337
- itemsize = p.data.dtype.itemsize
338
- if current_big_tensor_size % itemsize:
339
- current_big_tensor_size += itemsize - current_big_tensor_size % itemsize
340
- tensor_map_indexes.append((big_tensor_no, current_big_tensor_size, length ))
341
- current_big_tensor_size += length
342
360
 
343
- total_tensor_bytes += length
361
+ itemsize = p.data.dtype.itemsize
362
+ if current_big_tensor_size % itemsize:
363
+ current_big_tensor_size += itemsize - current_big_tensor_size % itemsize
364
+ tensor_map_indexes.append((big_tensor_no, current_big_tensor_size, length ))
365
+ current_big_tensor_size += length
344
366
 
367
+ total_tensor_bytes += length
345
368
 
346
369
  big_tensors_sizes.append(current_big_tensor_size)
347
370
 
@@ -368,39 +391,53 @@ def _pin_to_memory(model, model_id, partialPinning = False, verboseLevel = 1):
368
391
 
369
392
  tensor_no = 0
370
393
  # prev_big_tensor = 0
371
- for n, p, is_buffer in params_list:
372
- big_tensor_no, offset, length = tensor_map_indexes[tensor_no]
373
- # if big_tensor_no != prev_big_tensor:
374
- # gc.collect()
375
- # prev_big_tensor = big_tensor_no
376
- if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
377
- current_big_tensor = big_tensors[big_tensor_no]
378
- if is_buffer :
379
- _force_load_buffer(p) # otherwise potential memory leak
394
+ for n, (p, is_buffer) in params_dict.items():
395
+ if n in tied_weights:
380
396
  if isinstance(p, QTensor):
381
- if p._qtype == qint4:
382
- length1 = torch.numel(p._data._data) * p._data._data.element_size()
383
- p._data._data = _move_to_pinned_tensor(p._data._data, current_big_tensor, offset, length1)
384
- if hasattr(p,"_scale_shift"):
385
- length2 = torch.numel(p._scale_shift) * p._scale_shift.element_size()
386
- p._scale_shift = _move_to_pinned_tensor(p._scale_shift, current_big_tensor, offset + length1, length2)
397
+ if p._qtype == qint4:
398
+ assert p._data._data.data.is_pinned()
399
+ else:
400
+ assert p._data.is_pinned()
401
+ else:
402
+ assert p.data.is_pinned()
403
+ else:
404
+ big_tensor_no, offset, length = tensor_map_indexes[tensor_no]
405
+ # if big_tensor_no != prev_big_tensor:
406
+ # gc.collect()
407
+ # prev_big_tensor = big_tensor_no
408
+ # match_param, match_isbuffer = tied_weights.get(n, (None, False))
409
+ # if match_param != None:
410
+
411
+ if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
412
+ current_big_tensor = big_tensors[big_tensor_no]
413
+ if is_buffer :
414
+ _force_load_buffer(p) # otherwise potential memory leak
415
+ if isinstance(p, QTensor):
416
+ if p._qtype == qint4:
417
+ length1 = torch.numel(p._data._data) * p._data._data.element_size()
418
+ p._data._data = _move_to_pinned_tensor(p._data._data, current_big_tensor, offset, length1)
419
+ if hasattr(p,"_scale_shift"):
420
+ length2 = torch.numel(p._scale_shift) * p._scale_shift.element_size()
421
+ p._scale_shift = _move_to_pinned_tensor(p._scale_shift, current_big_tensor, offset + length1, length2)
422
+ else:
423
+ length2 = torch.numel(p._scale) * p._scale.element_size()
424
+ p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
425
+ length3 = torch.numel(p._shift) * p._shift.element_size()
426
+ p._shift = _move_to_pinned_tensor(p._shift, current_big_tensor, offset + length1 + length2, length3)
387
427
  else:
428
+ length1 = torch.numel(p._data) * p._data.element_size()
429
+ p._data = _move_to_pinned_tensor(p._data, current_big_tensor, offset, length1)
388
430
  length2 = torch.numel(p._scale) * p._scale.element_size()
389
431
  p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
390
- length3 = torch.numel(p._shift) * p._shift.element_size()
391
- p._shift = _move_to_pinned_tensor(p._shift, current_big_tensor, offset + length1 + length2, length3)
392
432
  else:
393
- length1 = torch.numel(p._data) * p._data.element_size()
394
- p._data = _move_to_pinned_tensor(p._data, current_big_tensor, offset, length1)
395
- length2 = torch.numel(p._scale) * p._scale.element_size()
396
- p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
397
- else:
398
- length = torch.numel(p.data) * p.data.element_size()
399
- p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
433
+ length = torch.numel(p.data) * p.data.element_size()
434
+ p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
400
435
 
401
- tensor_no += 1
436
+ tensor_no += 1
437
+ del p
402
438
  global total_pinned_bytes
403
439
  total_pinned_bytes += total
440
+ del params_dict
404
441
  gc.collect()
405
442
 
406
443
  if verboseLevel >=1:
@@ -420,7 +457,7 @@ def _welcome():
420
457
  if welcome_displayed:
421
458
  return
422
459
  welcome_displayed = True
423
- print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.1.4-15) by DeepBeepMeep ************{ENDC}{UNBOLD}")
460
+ print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.1.4-151) by DeepBeepMeep ************{ENDC}{UNBOLD}")
424
461
 
425
462
  def _extract_num_from_str(num_in_str):
426
463
  size = len(num_in_str)
@@ -518,16 +555,6 @@ def _requantize(model: torch.nn.Module, state_dict: dict, quantization_map: dict
518
555
 
519
556
  def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 1000000000, model_id = 'Unknown'):
520
557
 
521
- def compute_submodule_size(submodule):
522
- size = 0
523
- for p in submodule.parameters(recurse=False):
524
- size += torch.numel(p.data) * sizeofbfloat16
525
-
526
- for p in submodule.buffers(recurse=False):
527
- size += torch.numel(p.data) * sizeofbfloat16
528
-
529
- return size
530
-
531
558
  total_size =0
532
559
  total_excluded = 0
533
560
  exclude_list = []
@@ -549,16 +576,31 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
549
576
  tower_names ,_ = _detect_main_towers(model_to_quantize)
550
577
  tower_names = [ n[:-1] for n in tower_names]
551
578
 
579
+
580
+ cache_ref = {}
581
+ tied_weights= {}
582
+
552
583
  for submodule_name, submodule in model_to_quantize.named_modules():
553
584
  if isinstance(submodule, QModuleMixin):
554
585
  if verboseLevel>=1:
555
586
  print("No quantization to do as model is already quantized")
556
587
  return False
557
588
 
558
- if submodule_name=='':
559
- continue
589
+ size = 0
590
+ for n, p in submodule.named_parameters(recurse = False):
591
+ ref = _get_tensor_ref(p)
592
+ match = cache_ref.get(ref, None)
593
+ if match != None:
594
+ tied_weights[submodule_name]= (n, ) + match
595
+ else:
596
+ cache_ref[ref] = (submodule_name, n)
597
+ size += torch.numel(p.data) * sizeofbfloat16
598
+
599
+ for p in submodule.buffers(recurse=False):
600
+ size += torch.numel(p.data) * sizeofbfloat16
601
+
602
+
560
603
 
561
- size = compute_submodule_size(submodule)
562
604
  if not any(submodule_name.startswith(pre) for pre in tower_names):
563
605
  flush = False
564
606
  if isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
@@ -590,12 +632,13 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
590
632
  submodule_names.append(submodule_name)
591
633
  total_size += size
592
634
 
593
- if submodule_size > 0 and submodule_size <= threshold:
635
+ if submodule_size >0 and submodule_size <= threshold :
594
636
  exclude_list += submodule_names
595
637
  if verboseLevel >=2:
596
638
  print(f"Excluded size {submodule_size/ONE_MB:.1f} MB: {prev_blocks_prefix} : {submodule_names}")
597
639
  total_excluded += submodule_size
598
640
 
641
+
599
642
  perc_excluded =total_excluded/ total_size if total_size >0 else 1
600
643
  if verboseLevel >=2:
601
644
  if total_excluded == 0:
@@ -608,7 +651,10 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
608
651
  exclude_list = None
609
652
 
610
653
 
611
- quantize(model_to_quantize,weights, exclude= exclude_list)
654
+ exclude_list += list(tied_weights)
655
+ quantize(model_to_quantize, weights= weights, exclude= exclude_list)
656
+
657
+
612
658
  # quantize(model_to_quantize,weights, include= [ "*1.block.attn.to_out*"]) #"
613
659
 
614
660
  # for name, m in model_to_quantize.named_modules():
@@ -618,24 +664,40 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
618
664
 
619
665
  # force to read non quantized parameters so that their lazy tensors and corresponding mmap are released
620
666
  # otherwise we may end up keeping in memory both the quantized and the non quantize model
621
- for n,m in model_to_quantize.named_modules():
667
+ named_modules = {n:m for n,m in model_to_quantize.named_modules()}
668
+ for module_name, module in named_modules.items():
622
669
  # do not read quantized weights (detected them directly or behind an adapter)
623
- if isinstance(m, QModuleMixin) or hasattr(m, "base_layer") and isinstance(m.base_layer, QModuleMixin):
624
- if hasattr(m, "bias") and m.bias is not None:
625
- _force_load_parameter(m.bias)
670
+ if isinstance(module, QModuleMixin) or hasattr(module, "base_layer") and isinstance(module.base_layer, QModuleMixin):
671
+ if hasattr(module, "bias") and module.bias is not None:
672
+ _force_load_parameter(module.bias)
626
673
  else:
627
- for p in m.parameters(recurse = False):
628
- _force_load_parameter(p)
629
-
630
- for b in m.buffers(recurse = False):
674
+ tied_w = tied_weights.get(module_name, None)
675
+ for n, p in module.named_parameters(recurse = False):
676
+ if tied_w != None and n == tied_w[0]:
677
+ if isinstance( named_modules[tied_w[1]], QModuleMixin) :
678
+ setattr(module, n, None) # release refs of tied weights if source is going to be quantized
679
+ # otherwise don't force load as it will be loaded in the source anyway
680
+ else:
681
+ _force_load_parameter(p)
682
+ del p # del p if not it will still contain a ref to a tensor when leaving the loop
683
+ for b in module.buffers(recurse = False):
631
684
  _force_load_buffer(b)
632
-
685
+ del b
633
686
 
634
687
 
635
688
  freeze(model_to_quantize)
636
689
  torch.cuda.empty_cache()
637
- gc.collect()
690
+ gc.collect()
691
+
692
+ for tied_module, (tied_weight, src_module, src_weight) in tied_weights.items():
693
+ p = getattr(named_modules[src_module], src_weight)
694
+ if isinstance(p, QTensor):
695
+ setattr(named_modules[tied_module], tied_weight, p ) # copy refs to quantized sources
696
+
697
+ del named_modules
698
+
638
699
  quantization_map = _quantization_map(model_to_quantize)
700
+
639
701
  model_to_quantize._quanto_map = quantization_map
640
702
 
641
703
  if hasattr(model_to_quantize, "_already_pinned"):
@@ -647,12 +709,81 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
647
709
 
648
710
  return True
649
711
 
712
+ def _lora_linear_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
713
+ self._check_forward_args(x, *args, **kwargs)
714
+ adapter_names = kwargs.pop("adapter_names", None)
715
+ if self.disable_adapters:
716
+ if self.merged:
717
+ self.unmerge()
718
+ result = self.base_layer(x, *args, **kwargs)
719
+ elif adapter_names is not None:
720
+ result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs)
721
+ elif self.merged:
722
+ result = self.base_layer(x, *args, **kwargs)
723
+ else:
724
+ base_weight = self.base_layer.weight
725
+ if base_weight.shape[-1] < x.shape[-2]: # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
726
+ for active_adapter in self.active_adapters:
727
+ lora_A = self.lora_A[active_adapter]
728
+ lora_B = self.lora_B[active_adapter]
729
+ scaling = self.scaling[active_adapter]
730
+ lora_A_weight = lora_A.weight
731
+ lora_B_weight = lora_B.weight
732
+ lora_BA = lora_B_weight @ lora_A_weight
733
+ base_weight += scaling * lora_BA
734
+
735
+ result = torch.nn.functional.linear(x, base_weight, bias=self.base_layer.bias)
736
+ torch_result_dtype = result.dtype
737
+
738
+ else:
739
+ result = self.base_layer(x, *args, **kwargs)
740
+ torch_result_dtype = result.dtype
741
+ x = x.to(torch.bfloat16)
742
+
743
+ for active_adapter in self.active_adapters:
744
+ if active_adapter not in self.lora_A.keys():
745
+ continue
746
+ lora_A = self.lora_A[active_adapter]
747
+ lora_B = self.lora_B[active_adapter]
748
+ dropout = self.lora_dropout[active_adapter]
749
+ scaling = self.scaling[active_adapter]
750
+ x = x.to(lora_A.weight.dtype)
751
+
752
+ if not self.use_dora[active_adapter]:
753
+ y = lora_A(x)
754
+ y = lora_B(y)
755
+ y*= scaling
756
+ result+= y
757
+ del lora_A, lora_B, y
758
+ # result = result + lora_B(lora_A(dropout(x))) * scaling
759
+ else:
760
+ if isinstance(dropout, nn.Identity) or not self.training:
761
+ base_result = result
762
+ else:
763
+ x = dropout(x)
764
+ base_result = None
765
+
766
+ result = result + self.lora_magnitude_vector[active_adapter](
767
+ x,
768
+ lora_A=lora_A,
769
+ lora_B=lora_B,
770
+ scaling=scaling,
771
+ base_layer=self.get_base_layer(),
772
+ base_result=base_result,
773
+ )
774
+
775
+ result = result.to(torch_result_dtype)
776
+ return result
777
+
650
778
  def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, verboseLevel = -1,):
651
779
  verboseLevel = _compute_verbose_level(verboseLevel)
652
780
 
653
781
  if inject_adapter_in_model == None or set_weights_and_activate_adapters == None or get_peft_kwargs == None:
654
782
  raise Exception("Unable to load Lora, missing 'peft' and / or 'diffusers' modules")
655
-
783
+
784
+ from peft.tuners.lora import Linear
785
+ Linear.forward = _lora_linear_forward
786
+
656
787
  if not isinstance(lora_path, list):
657
788
  lora_path = [lora_path]
658
789
 
@@ -662,6 +793,9 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
662
793
  for i, path in enumerate(lora_path):
663
794
  adapter_name = str(i)
664
795
 
796
+
797
+
798
+
665
799
  state_dict = safetensors2.torch_load_file(path)
666
800
 
667
801
  keys = list(state_dict.keys())
@@ -843,7 +977,6 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
843
977
  verboseLevel = _compute_verbose_level(verboseLevel)
844
978
 
845
979
  model = _remove_model_wrapper(model)
846
-
847
980
  if not (".safetensors" in file_path or ".sft" in file_path):
848
981
  if pinToMemory:
849
982
  raise Exception("Pinning to memory while loading only supported for safe tensors files")
@@ -855,12 +988,20 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
855
988
 
856
989
  if metadata is None:
857
990
  quantization_map = None
991
+ tied_weights_map = None
858
992
  else:
859
993
  quantization_map = metadata.get("quantization_map", None)
860
994
  config = metadata.get("config", None)
861
995
  if config is not None:
862
996
  model._config = config
863
997
 
998
+ tied_weights_map = metadata.get("tied_weights_map", None)
999
+ if tied_weights_map != None:
1000
+ for name, tied_weights_list in tied_weights_map.items():
1001
+ mapped_weight = state_dict[name]
1002
+ for tied_weights in tied_weights_list:
1003
+ state_dict[tied_weights] = mapped_weight
1004
+
864
1005
 
865
1006
 
866
1007
  if quantization_map is None:
@@ -915,6 +1056,7 @@ def save_model(model, file_path, do_quantize = False, quantizationType = qint8,
915
1056
  """
916
1057
 
917
1058
  config = None
1059
+ extra_meta = None
918
1060
  verboseLevel = _compute_verbose_level(verboseLevel)
919
1061
  if config_file_path !=None:
920
1062
  with open(config_file_path, "r", encoding="utf-8") as reader:
@@ -928,8 +1070,10 @@ def save_model(model, file_path, do_quantize = False, quantizationType = qint8,
928
1070
  config_path = getattr(config_obj,"_name_or_path", None)
929
1071
  if config_path != None:
930
1072
  config_fullpath = os.path.join(config_path, "config.json")
931
- if not os.path.isfile(config_fullpath):
932
- config_fullpath = None
1073
+ config_fullpath = _get_model(config_fullpath)
1074
+
1075
+ # if not os.path.isfile(config_fullpath):
1076
+ # config_fullpath = None
933
1077
  if config_fullpath is None:
934
1078
  config_fullpath = os.path.join(os.path.dirname(file_path), "config.json")
935
1079
  if os.path.isfile(config_fullpath):
@@ -942,15 +1086,50 @@ def save_model(model, file_path, do_quantize = False, quantizationType = qint8,
942
1086
 
943
1087
  quantization_map = getattr(model, "_quanto_map", None)
944
1088
 
1089
+ from collections import OrderedDict
1090
+
1091
+ cache_ref = {}
1092
+ tied_weights_map = {}
1093
+ sd = model.state_dict()
1094
+ out_sd = OrderedDict()
1095
+
1096
+
1097
+ for name, weight in sd.items():
1098
+ ref = _get_tensor_ref(weight)
1099
+ match = cache_ref.get(ref, None)
1100
+ if match != None:
1101
+ tied_list = tied_weights_map.get(match, [])
1102
+ tied_list.append(name)
1103
+ tied_weights_map[match] = tied_list
1104
+ else:
1105
+ out_sd[name] = weight
1106
+ cache_ref[ref] = name
1107
+
1108
+ if len(tied_weights_map) > 0:
1109
+ extra_meta = { "tied_weights_map" : tied_weights_map }
1110
+
945
1111
  if verboseLevel >=1:
946
1112
  print(f"Saving file '{file_path}")
947
- safetensors2.torch_write_file(model.state_dict(), file_path , quantization_map = quantization_map, config = config)
1113
+
1114
+ safetensors2.torch_write_file(out_sd, file_path , quantization_map = quantization_map, config = config, extra_meta= extra_meta)
948
1115
  if verboseLevel >=1:
949
1116
  print(f"File '{file_path}' saved")
950
1117
 
951
1118
 
952
- def extract_models(prefix, obj):
1119
+ def extract_models(obj = None, prefix = None):
1120
+ if isinstance(obj, str): # for compatibility as the two args were switched
1121
+ bkp = prefix
1122
+ prefix = obj
1123
+ obj = bkp
1124
+
953
1125
  pipe = {}
1126
+ if obj == None:
1127
+ raise Exception("an object to analyze must be provided")
1128
+ if prefix==None or len(prefix)==0:
1129
+ prefix = ""
1130
+ elif prefix[ -1:] != "/":
1131
+ prefix + "/"
1132
+
954
1133
  for name in dir(obj):
955
1134
  element = getattr(obj,name)
956
1135
  if name in ("pipeline", "pipe"):
@@ -958,16 +1137,16 @@ def extract_models(prefix, obj):
958
1137
  if hasattr(pipeline , "components") and isinstance(pipeline.components, dict):
959
1138
  for k, model in pipeline.components.items():
960
1139
  if model != None:
961
- pipe[prefix + "/" + k ] = model
962
- elif isinstance(element, torch.nn.Module):
963
- if prefix + "/" + name in pipe:
964
- pipe[prefix + "/_" + name ] = element
1140
+ pipe[prefix + k ] = model
1141
+ elif isinstance(element, torch.nn.Module) and name!="base_model":
1142
+ if prefix + name in pipe:
1143
+ pipe[prefix + "_" + name ] = element
965
1144
  else:
966
- pipe[prefix + "/" + name ] = element
1145
+ pipe[prefix + name ] = element
967
1146
  elif isinstance(element, dict):
968
1147
  for k, element in element.items():
969
1148
  if hasattr(element , "pipeline"):
970
- pipe.update( extract_models(prefix + "/" + k,element ))
1149
+ pipe.update( extract_models(prefix + k,element ))
971
1150
 
972
1151
 
973
1152
  return pipe
@@ -989,6 +1168,10 @@ class offload:
989
1168
  self.active_models_ids = []
990
1169
  self.active_subcaches = {}
991
1170
  self.models = {}
1171
+ self.cotenants_map = {
1172
+ "text_encoder": ["vae", "text_encoder_2"],
1173
+ "text_encoder_2": ["vae", "text_encoder"],
1174
+ }
992
1175
  self.verboseLevel = 0
993
1176
  self.blocks_of_modules = {}
994
1177
  self.blocks_of_modules_sizes = {}
@@ -1002,14 +1185,13 @@ class offload:
1002
1185
  self.default_stream = torch.cuda.default_stream(torch.device("cuda")) # torch.cuda.current_stream()
1003
1186
  self.transfer_stream = torch.cuda.Stream()
1004
1187
  self.async_transfers = False
1188
+ self.parameters_ref = {}
1005
1189
  global last_offload_obj
1006
1190
  last_offload_obj = self
1007
1191
 
1008
1192
 
1009
- def add_module_to_blocks(self, model_id, blocks_name, submodule, prev_block_name):
1193
+ def add_module_to_blocks(self, model_id, blocks_name, submodule, prev_block_name, submodule_name):
1010
1194
 
1011
- if blocks_name is None:
1012
- pass
1013
1195
  entry_name = model_id if blocks_name is None else model_id + "/" + blocks_name
1014
1196
  if entry_name in self.blocks_of_modules:
1015
1197
  blocks_params = self.blocks_of_modules[entry_name]
@@ -1023,39 +1205,54 @@ class offload:
1023
1205
  self.prev_blocks_names[entry_name] = prev_entry_name
1024
1206
  if not prev_block_name == None:
1025
1207
  self.next_blocks_names[prev_entry_name] = entry_name
1026
-
1208
+ bef = blocks_params_size
1027
1209
  for k,p in submodule.named_parameters(recurse=False):
1210
+ param_size = 0
1211
+ ref = _get_tensor_ref(p)
1212
+ tied_param = self.parameters_ref.get(ref, None)
1028
1213
 
1029
1214
  if isinstance(p, QTensor):
1030
- blocks_params.append( (submodule, k, p, False ) )
1215
+ blocks_params.append( (submodule, k, p, False, tied_param ) )
1031
1216
 
1032
1217
  if p._qtype == qint4:
1033
1218
  if hasattr(p,"_scale_shift"):
1034
- blocks_params_size += torch.numel(p._scale_shift) * p._scale_shift.element_size()
1035
- blocks_params_size += torch.numel(p._data._data) * p._data._data.element_size()
1219
+ param_size += torch.numel(p._scale_shift) * p._scale_shift.element_size()
1220
+ param_size += torch.numel(p._data._data) * p._data._data.element_size()
1036
1221
  else:
1037
- blocks_params_size += torch.numel(p._scale) * p._scale.element_size()
1038
- blocks_params_size += torch.numel(p._shift) * p._shift.element_size()
1039
- blocks_params_size += torch.numel(p._data._data) * p._data._data.element_size()
1222
+ param_size += torch.numel(p._scale) * p._scale.element_size()
1223
+ param_size += torch.numel(p._shift) * p._shift.element_size()
1224
+ param_size += torch.numel(p._data._data) * p._data._data.element_size()
1040
1225
  else:
1041
- blocks_params_size += torch.numel(p._scale) * p._scale.element_size()
1042
- blocks_params_size += torch.numel(p._data) * p._data.element_size()
1226
+ param_size += torch.numel(p._scale) * p._scale.element_size()
1227
+ param_size += torch.numel(p._data) * p._data.element_size()
1043
1228
  else:
1044
- blocks_params.append( (submodule, k, p, False) )
1045
- blocks_params_size += torch.numel(p.data) * p.data.element_size()
1229
+ blocks_params.append( (submodule, k, p, False, tied_param) )
1230
+ param_size += torch.numel(p.data) * p.data.element_size()
1231
+
1232
+
1233
+ if tied_param == None:
1234
+ blocks_params_size += param_size
1235
+ self.parameters_ref[ref] = (submodule, k)
1046
1236
 
1047
1237
  for k, p in submodule.named_buffers(recurse=False):
1048
- blocks_params.append( (submodule, k, p, True) )
1238
+ blocks_params.append( (submodule, k, p, True, None) )
1049
1239
  blocks_params_size += p.data.nbytes
1050
1240
 
1241
+ aft = blocks_params_size
1242
+
1243
+ # if blocks_name is None:
1244
+ # print(f"Default: {model_id}/{submodule_name} : {(aft-bef)/ONE_MB:0.2f} MB")
1245
+ # pass
1246
+
1051
1247
 
1052
1248
  self.blocks_of_modules_sizes[entry_name] = blocks_params_size
1053
1249
 
1250
+
1054
1251
  return blocks_params_size
1055
1252
 
1056
1253
 
1057
1254
  def can_model_be_cotenant(self, model_id):
1058
- potential_cotenants= cotenants_map.get(model_id, None)
1255
+ potential_cotenants= self.cotenants_map.get(model_id, None)
1059
1256
  if potential_cotenants is None:
1060
1257
  return False
1061
1258
  for existing_cotenant in self.active_models_ids:
@@ -1073,20 +1270,23 @@ class offload:
1073
1270
  def cpu_to_gpu(stream_to_use, blocks_params): #, record_for_stream = None
1074
1271
  with torch.cuda.stream(stream_to_use):
1075
1272
  for param in blocks_params:
1076
- parent_module, n, p, is_buffer = param
1273
+ parent_module, n, p, is_buffer, tied_param = param
1274
+ if tied_param != None:
1275
+ tied_p = getattr( tied_param[0], tied_param[1])
1276
+ if tied_p.is_cuda:
1277
+ setattr(parent_module, n , tied_p)
1278
+ continue
1279
+
1077
1280
  q = p.to("cuda", non_blocking=True)
1078
1281
  if is_buffer:
1079
1282
  q = torch.nn.Buffer(q)
1080
1283
  else:
1081
1284
  q = torch.nn.Parameter(q , requires_grad=False)
1082
1285
  setattr(parent_module, n , q)
1083
- # if record_for_stream != None:
1084
- # if isinstance(p, QTensor):
1085
- # q._data.record_stream(record_for_stream)
1086
- # q._scale.record_stream(record_for_stream)
1087
- # else:
1088
- # p.data.record_stream(record_for_stream)
1089
1286
 
1287
+ if tied_param != None:
1288
+ setattr( tied_param[0], tied_param[1], q)
1289
+ del p, q
1090
1290
  any_past_block = False
1091
1291
 
1092
1292
  loaded_block = self.loaded_blocks[model_id]
@@ -1108,24 +1308,24 @@ class offload:
1108
1308
  first = self.prev_blocks_names[entry_name] == None or not any_past_block
1109
1309
  next_blocks_entry = self.next_blocks_names[entry_name] if entry_name in self.next_blocks_names else None
1110
1310
  if first:
1111
- cpu_to_gpu(torch.cuda.current_stream(), self.blocks_of_modules[entry_name])
1112
1311
  if self.verboseLevel >=2:
1113
1312
  if preload:
1114
1313
  print(f"Preloading model {entry_name} ({model_name}) in GPU")
1115
1314
  else:
1116
1315
  print(f"Loading model {entry_name} ({model_name}) in GPU")
1316
+ cpu_to_gpu(torch.cuda.current_stream(), self.blocks_of_modules[entry_name])
1117
1317
 
1118
1318
  torch.cuda.synchronize()
1119
1319
 
1120
1320
  if next_blocks_entry != None:
1121
- cpu_to_gpu(self.transfer_stream, self.blocks_of_modules[next_blocks_entry]) #, self.default_stream
1122
1321
  if self.verboseLevel >=2:
1123
1322
  print(f"Prefetching model {next_blocks_entry} ({model_name}) in GPU")
1323
+ cpu_to_gpu(self.transfer_stream, self.blocks_of_modules[next_blocks_entry]) #, self.default_stream
1124
1324
 
1125
1325
  else:
1126
- cpu_to_gpu(self.default_stream, self.blocks_of_modules[entry_name])
1127
1326
  if self.verboseLevel >=2:
1128
1327
  print(f"Loading model {entry_name} ({model_name}) in GPU")
1328
+ cpu_to_gpu(self.default_stream, self.blocks_of_modules[entry_name])
1129
1329
  torch.cuda.synchronize()
1130
1330
 
1131
1331
  if not preload:
@@ -1149,12 +1349,13 @@ class offload:
1149
1349
 
1150
1350
  blocks_params = self.blocks_of_modules[blocks_name]
1151
1351
  for param in blocks_params:
1152
- parent_module, n, p, is_buffer = param
1352
+ parent_module, n, p, is_buffer, _ = param
1153
1353
  if is_buffer:
1154
1354
  q = torch.nn.Buffer(p)
1155
1355
  else:
1156
1356
  q = torch.nn.Parameter(p , requires_grad=False)
1157
1357
  setattr(parent_module, n , q)
1358
+ del p, q
1158
1359
  # cl.stop()
1159
1360
  # print(f"unload time: {cl.format_time_gap()}")
1160
1361
 
@@ -1168,9 +1369,6 @@ class offload:
1168
1369
  for block_name in self.preloaded_blocks_per_model[model_id]:
1169
1370
  self.gpu_load_blocks(model_id, block_name, True)
1170
1371
 
1171
-
1172
- # torch.cuda.current_stream().synchronize()
1173
-
1174
1372
  def unload_all(self):
1175
1373
  for model_id in self.active_models_ids:
1176
1374
  self.gpu_unload_blocks(model_id, None)
@@ -1246,6 +1444,16 @@ class offload:
1246
1444
 
1247
1445
  return False
1248
1446
 
1447
+ def ensure_model_loaded(self, model_id):
1448
+ if model_id in self.active_models_ids:
1449
+ return
1450
+ # new_model_id = getattr(module, "_mm_id")
1451
+ # do not always unload existing models if it is more efficient to keep in them in the GPU
1452
+ # (e.g: small modules whose calls are text encoders)
1453
+ if not self.can_model_be_cotenant(model_id) :
1454
+ self.unload_all()
1455
+ self.gpu_load(model_id)
1456
+
1249
1457
  def hook_preload_blocks_for_compilation(self, target_module, model_id,blocks_name, context):
1250
1458
 
1251
1459
  # @torch.compiler.disable()
@@ -1259,16 +1467,27 @@ class offload:
1259
1467
  target_module.register_forward_pre_hook(preload_blocks_for_compile)
1260
1468
 
1261
1469
 
1262
- def hook_check_empty_cache_needed(self, target_module, model_id,blocks_name, previous_method, context):
1470
+ def hook_check_empty_cache_needed(self, target_module, model_id, blocks_name, previous_method, context):
1263
1471
 
1264
1472
  qint4quantization = isinstance(target_module, QModuleMixin) and target_module.weight!= None and target_module.weight.qtype == qint4
1265
1473
  if qint4quantization:
1266
1474
  pass
1267
1475
 
1268
- def check_empty_cuda_cache(module, *args, **kwargs):
1269
- # if self.ready_to_check_mem():
1270
- # self.empty_cache_if_needed()
1476
+ if hasattr(target_module, "_mm_id"):
1477
+ # no hook for a shared module with no weights (otherwise this will cause models loading / unloading for nothing)
1478
+ orig_model_id = getattr(target_module, "_mm_id")
1479
+ if self.verboseLevel >=2:
1480
+ print(f"Model '{model_id}' shares module '{target_module._get_name()}' with module(s) '{orig_model_id}' ")
1481
+ assert not self.any_param_or_buffer(target_module)
1482
+ if not isinstance(orig_model_id, list):
1483
+ orig_model_id = [orig_model_id]
1484
+ orig_model_id.append(model_id)
1485
+ setattr(target_module, "_mm_id", orig_model_id)
1486
+ target_module.forward = target_module._mm_forward
1487
+ return
1271
1488
 
1489
+ def check_empty_cuda_cache(module, *args, **kwargs):
1490
+ self.ensure_model_loaded(model_id)
1272
1491
  if blocks_name == None:
1273
1492
  if self.ready_to_check_mem():
1274
1493
  self.empty_cache_if_needed()
@@ -1279,34 +1498,18 @@ class offload:
1279
1498
 
1280
1499
  return previous_method(*args, **kwargs)
1281
1500
 
1282
-
1283
- if hasattr(target_module, "_mm_id"):
1284
- orig_model_id = getattr(target_module, "_mm_id")
1285
- if self.verboseLevel >=2:
1286
- print(f"Model '{model_id}' shares module '{target_module._get_name()}' with module '{orig_model_id}' ")
1287
- assert not self.any_param_or_buffer(target_module)
1288
-
1289
- return
1290
1501
  setattr(target_module, "_mm_id", model_id)
1502
+ setattr(target_module, "_mm_forward", previous_method)
1503
+
1291
1504
  setattr(target_module, "forward", functools.update_wrapper(functools.partial(check_empty_cuda_cache, target_module), previous_method) )
1292
1505
 
1293
1506
 
1294
1507
  def hook_change_module(self, target_module, model, model_id, module_id, previous_method):
1295
- def check_change_module(module, *args, **kwargs):
1296
- performEmptyCacheTest = False
1297
- if not model_id in self.active_models_ids:
1298
- new_model_id = getattr(module, "_mm_id")
1299
- # do not always unload existing models if it is more efficient to keep in them in the GPU
1300
- # (e.g: small modules whose calls are text encoders)
1301
- if not self.can_model_be_cotenant(new_model_id) :
1302
- self.unload_all()
1303
- performEmptyCacheTest = False
1304
- self.gpu_load(new_model_id)
1508
+
1509
+ def check_change_module(module, *args, **kwargs):
1510
+ self.ensure_model_loaded(model_id)
1305
1511
  # transfer leftovers inputs that were incorrectly created in the RAM (mostly due to some .device tests that returned incorrectly "cpu")
1306
1512
  args, kwargs = self.move_args_to_gpu(*args, **kwargs)
1307
- if performEmptyCacheTest:
1308
- self.empty_cache_if_needed()
1309
-
1310
1513
  return previous_method(*args, **kwargs)
1311
1514
 
1312
1515
  if hasattr(target_module, "_mm_id"):
@@ -1337,6 +1540,8 @@ class offload:
1337
1540
  base_size = self.blocks_of_modules_sizes[model_id]
1338
1541
  current_budget -= base_size
1339
1542
  if current_budget <= 0:
1543
+ if self.verboseLevel >=1:
1544
+ print(f"Async loading plan for model '{model_id}' : due to limited budget, beside the async shuttle only only base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
1340
1545
  return
1341
1546
 
1342
1547
  towers = []
@@ -1357,6 +1562,8 @@ class offload:
1357
1562
  total_size += tower_size
1358
1563
  current_budget -= 2 * max_floor_size
1359
1564
  if current_budget <= 0:
1565
+ if self.verboseLevel >=1:
1566
+ print(f"Async loading plan for model '{model_id}' : due to limited budget, beside the async shuttle only the base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
1360
1567
  return
1361
1568
 
1362
1569
 
@@ -1366,6 +1573,8 @@ class offload:
1366
1573
  preload_total += preload_blocks_count * max_floor_size
1367
1574
  max_blocks_fetch = max(max_floor_size, max_blocks_fetch)
1368
1575
  if preload_blocks_count <= 0:
1576
+ if self.verboseLevel >=1:
1577
+ print(f"Async loading plan for model '{model_id}' : due to limited budget, beside the async shuttle only the base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
1369
1578
  return
1370
1579
 
1371
1580
  nb_blocks= len(floors)
@@ -1396,11 +1605,11 @@ class offload:
1396
1605
 
1397
1606
  self.preloaded_blocks_per_model[model_id] = preloaded_blocks
1398
1607
 
1399
- if self.verboseLevel >=2:
1400
- print(f"Async loading plan for model '{model_id}' : {preload_total/ONE_MB:0.2f} MB will be preloaded ({preload_total/total_size*100:0.1f}% of recurrent layers data) with a {max_blocks_fetch/ONE_MB:0.2f} MB async shuttle")
1608
+ if self.verboseLevel >=1:
1609
+ print(f"Async loading plan for model '{model_id}' : {(preload_total+base_size)/ONE_MB:0.2f} MB will be preloaded (base size of {base_size/ONE_MB:0.2f} MB + {preload_total/total_size*100:0.1f}% of recurrent layers data) with a {max_blocks_fetch/ONE_MB:0.2f} MB async" + (" circular" if len(towers) == 1 else "") + " shuttle")
1401
1610
 
1402
1611
 
1403
- def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, verboseLevel = -1):
1612
+ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
1404
1613
  """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
1405
1614
  pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
1406
1615
  quantizeTransformer: set True by default will quantize on the fly the video / image model
@@ -1417,9 +1626,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1417
1626
  model_budgets = {}
1418
1627
 
1419
1628
  windows_os = os.name == 'nt'
1420
- global total_pinned_bytes
1421
1629
 
1422
-
1423
1630
  budget = 0
1424
1631
  if not budgets is None:
1425
1632
  if isinstance(budgets , dict):
@@ -1448,6 +1655,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1448
1655
  verboseLevel = _compute_verbose_level(verboseLevel)
1449
1656
 
1450
1657
  _welcome()
1658
+ if coTenantsMap != None:
1659
+ self.cotenants_map = coTenantsMap
1451
1660
 
1452
1661
  self.models = models
1453
1662
 
@@ -1528,9 +1737,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1528
1737
  current_model_size += torch.numel(p.data) * p.data.element_size()
1529
1738
 
1530
1739
  for b in current_model.buffers():
1531
- if b.data.dtype == torch.float32:
1532
- # convert any left overs float32 weight to bloat16 to divide by 2 the model memory footprint
1533
- b.data = b.data.to(torch.bfloat16)
1740
+ # do not convert 32 bits float to 16 bits since buffers are few (and potential gain low) and usually they are needed for precision calculation (for instance Rope)
1534
1741
  current_model_size += torch.numel(b.data) * b.data.element_size()
1535
1742
 
1536
1743
  if modelPinned:
@@ -1538,17 +1745,39 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1538
1745
 
1539
1746
 
1540
1747
  model_budget = model_budgets[model_id] * ONE_MB if model_id in model_budgets else budget
1541
-
1748
+ if workingVRAM != None:
1749
+ model_minimumVRAM = -1
1750
+ if isinstance(workingVRAM, dict):
1751
+ if model_id in workingVRAM:
1752
+ model_minimumVRAM = workingVRAM[model_id]
1753
+ elif "*" in model_id in workingVRAM:
1754
+ model_minimumVRAM = workingVRAM["*"]
1755
+ else:
1756
+ model_minimumVRAM = workingVRAM
1757
+ if model_minimumVRAM > 0:
1758
+ new_budget = self.device_mem_capacity - model_minimumVRAM * ONE_MB
1759
+ new_budget = 1 if new_budget < 0 else new_budget
1760
+ model_budget = new_budget if model_budget == 0 or new_budget < model_budget else model_budget
1542
1761
  if model_budget > 0 and model_budget > current_model_size:
1543
1762
  model_budget = 0
1763
+ coef =0.8
1764
+ if current_model_size > coef * self.device_mem_capacity and model_budget == 0 or model_budget > coef * self.device_mem_capacity:
1765
+ if verboseLevel >= 1:
1766
+ if model_budget == 0:
1767
+ print(f"Model '{model_id}' is too large ({current_model_size/ONE_MB:0.1f} MB) to fit entirely in {coef * 100}% of the VRAM (max capacity is {coef * self.device_mem_capacity/ONE_MB}) MB)")
1768
+ else:
1769
+ print(f"Budget ({budget/ONE_MB:0.1f} MB) for Model '{model_id}' is too important so that this model can fit in the VRAM (max capacity is {self.device_mem_capacity/ONE_MB}) MB)")
1770
+ print(f"Budget allocation for this model has been consequently reduced to the 80% of max GPU Memory ({coef * self.device_mem_capacity/ONE_MB:0.1f} MB). This may not leave enough working VRAM and you will probably need to define manually a lower budget for this model.")
1771
+ model_budget = coef * self.device_mem_capacity
1772
+
1544
1773
 
1545
- model_budgets[model_id] = model_budget #/ 2 if asyncTransfers else model_budget
1774
+ model_budgets[model_id] = model_budget
1546
1775
 
1547
1776
  partialPinning = False
1548
1777
 
1549
1778
  if estimatesBytesToPin > 0 and estimatesBytesToPin >= (max_reservable_memory - total_pinned_bytes):
1550
1779
  if self.verboseLevel >=1:
1551
- print(f"Switching to partial pinning since full requirements for pinned models is {estimatesBytesToPin/ONE_MB:0.1f} MB while estimated reservable RAM is {max_reservable_memory/ONE_MB:0.1f} MB. You may increase the value of parameter 'perc_reserved_mem_max' to a value higher than {perc_reserved_mem_max:0.2f} to force full pinnning." )
1780
+ print(f"Switching to partial pinning since full requirements for pinned models is {estimatesBytesToPin/ONE_MB:0.1f} MB while estimated available reservable RAM is {(max_reservable_memory-total_pinned_bytes)/ONE_MB:0.1f} MB. You may increase the value of parameter 'perc_reserved_mem_max' to a value higher than {perc_reserved_mem_max:0.2f} to force full pinnning." )
1552
1781
  partialPinning = True
1553
1782
 
1554
1783
  # Hook forward methods of modules
@@ -1577,15 +1806,14 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1577
1806
  _pin_to_memory(current_model, model_id, partialPinning= partialPinning, verboseLevel=verboseLevel)
1578
1807
 
1579
1808
  current_budget = model_budgets[model_id]
1580
- cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq = None, None, None, -1
1809
+ cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False
1581
1810
  self.loaded_blocks[model_id] = None
1582
1811
 
1583
1812
  for submodule_name, submodule in current_model.named_modules():
1584
1813
  # create a fake 'accelerate' parameter so that the _execution_device property returns always "cuda"
1585
1814
  # (it is queried in many pipelines even if offloading is not properly implemented)
1586
- if not hasattr(submodule, "_hf_hook"):
1815
+ if not hasattr(submodule, "_hf_hook"):
1587
1816
  setattr(submodule, "_hf_hook", HfHook())
1588
-
1589
1817
  if current_budget > 0 and len(submodule_name) > 0:
1590
1818
  if cur_blocks_prefix != None:
1591
1819
  if submodule_name.startswith(cur_blocks_prefix):
@@ -1593,20 +1821,20 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1593
1821
  depth_name = submodule_name.split(".")
1594
1822
  level = depth_name[len(depth_prefix)-1]
1595
1823
  pre , num = _extract_num_from_str(level)
1596
- if num != cur_blocks_seq: #and (cur_blocks_seq == -1 or current_size > current_budget)
1824
+ if num != cur_blocks_seq and not (is_mod_seq and cur_blocks_seq>=0):
1597
1825
  prev_blocks_name = cur_blocks_name
1598
1826
  cur_blocks_name = cur_blocks_prefix + str(num)
1599
1827
  # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
1600
1828
  cur_blocks_seq = num
1601
1829
  else:
1602
- cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq = None, None, None, -1
1830
+ cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False
1603
1831
 
1604
1832
  if cur_blocks_prefix == None:
1605
1833
  pre , num = _extract_num_from_str(submodule_name)
1606
1834
  if isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
1607
- cur_blocks_prefix, prev_blocks_name, cur_blocks_seq = pre + ".", None, -1
1835
+ cur_blocks_prefix, prev_blocks_name, cur_blocks_seq, is_mod_seq = pre + ".", None, -1, isinstance(submodule, torch.nn.Sequential)
1608
1836
  elif num >=0:
1609
- cur_blocks_prefix, prev_blocks_name, cur_blocks_seq = pre, None, num
1837
+ cur_blocks_prefix, prev_blocks_name, cur_blocks_seq, is_mod_seq = pre, None, num, False
1610
1838
  cur_blocks_name = submodule_name
1611
1839
  # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
1612
1840
 
@@ -1621,7 +1849,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1621
1849
  else:
1622
1850
  self.hook_check_empty_cache_needed(submodule, model_id, cur_blocks_name, submodule_method, context = submodule_name )
1623
1851
 
1624
- self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name)
1852
+ self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name, submodule_name)
1625
1853
 
1626
1854
  self.tune_preloading(model_id, current_budget, towers_names)
1627
1855
 
@@ -1635,9 +1863,10 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1635
1863
  elif prev_num - start_num <=1:
1636
1864
  print(f"Size of submodel '{n+ str(start_num)}': {prev_size/ONE_MB:.1f} MB")
1637
1865
  else:
1638
- print(f"Size of submodel '{n+ str(start_num) +'-'+ str(prev_num)}': {prev_size/ONE_MB:.1f} MB")
1866
+ print(f"Size of submodel '{n+ str(start_num) +'-'+ str(prev_num)}': {(prev_num-start_num+1)*prev_size/ONE_MB:.1f} MB ({prev_size/ONE_MB:.1f} MB x {prev_num-start_num+1})")
1639
1867
 
1640
1868
  for n, size in self.blocks_of_modules_sizes.items():
1869
+ size = int(size / 10000)* 10000
1641
1870
  pre, num = _extract_num_from_str(n) if "/" in n else (n, -1)
1642
1871
  if prev_pre == None :
1643
1872
  start_num = num
@@ -1709,21 +1938,21 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
1709
1938
  if profile_no == profile_type.HighRAM_HighVRAM:
1710
1939
  pinnedMemory= True
1711
1940
  budgets = None
1712
- info = "You have chosen a profile that requires at least 48 GB of RAM and 24 GB of VRAM. Some VRAM is consumed just to make the model runs faster."
1941
+ info = "You have chosen a profile that may require 48 GB of RAM and up to 24 GB of VRAM on some applications."
1713
1942
  elif profile_no == profile_type.HighRAM_LowVRAM:
1714
1943
  pinnedMemory= True
1715
1944
  budgets["*"] = 3000
1716
- info = "You have chosen a profile that requires at least 48 GB of RAM and 12 GB of VRAM. Some RAM is consumed to reduce VRAM consumption."
1945
+ info = "You have chosen a profile that may require 48 GB of RAM and up to 12 GB of VRAM on some applications."
1717
1946
  elif profile_no == profile_type.LowRAM_HighVRAM:
1718
1947
  pinnedMemory= "transformer"
1719
1948
  extraModelsToQuantize = default_extraModelsToQuantize
1720
1949
  budgets = None
1721
- info = "You have chosen a Medium speed profile that requires at least 32 GB of RAM and 24 GB of VRAM. Some VRAM is consuming just to make the model runs faster"
1950
+ info = "You have chosen a Medium speed profile that may require 32 GB of RAM and up to 24 GB of VRAM on some applications."
1722
1951
  elif profile_no == profile_type.LowRAM_LowVRAM:
1723
1952
  pinnedMemory= "transformer"
1724
1953
  extraModelsToQuantize = default_extraModelsToQuantize
1725
1954
  budgets["*"] = 3000
1726
- info = "You have chosen a profile that requires at least 32 GB of RAM and 12 GB of VRAM. Some RAM is consumed to reduce VRAM consumption. "
1955
+ info = "You have chosen a profile that usually may require 32 GB of RAM and up to 12 GB of VRAM on some applications."
1727
1956
  elif profile_no == profile_type.VerylowRAM_LowVRAM:
1728
1957
  pinnedMemory= False
1729
1958
  extraModelsToQuantize = default_extraModelsToQuantize
@@ -1731,9 +1960,10 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
1731
1960
  if "transformer" in modules:
1732
1961
  budgets["transformer"] = 400
1733
1962
  #asyncTransfers = False
1734
- info = "You have chosen the slowest profile that requires at least 24 GB of RAM and 10 GB of VRAM."
1963
+ info = "You have chosen the slowest profile that may require 24 GB of RAM and up to 10 GB of VRAM on some applications."
1735
1964
  else:
1736
1965
  raise Exception("Unknown profile")
1966
+ info += " Actual requirements may varry depending on the application or on the tuning done to the profile."
1737
1967
 
1738
1968
  if budgets != None and len(budgets) == 0:
1739
1969
  budgets = None
mmgp/safetensors2.py CHANGED
@@ -146,7 +146,7 @@ def _read_safetensors_header(path, file):
146
146
  return catalog, metadata, length_of_header + 8
147
147
 
148
148
 
149
- def torch_write_file(sd, file_path, quantization_map = None, config = None):
149
+ def torch_write_file(sd, file_path, quantization_map = None, config = None, extra_meta = None):
150
150
  from collections import OrderedDict
151
151
  sf_sd = OrderedDict()
152
152
 
@@ -189,6 +189,14 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):
189
189
  if not config is None:
190
190
  metadata["config_base64"] = base64.b64encode(json.dumps(config, ensure_ascii=False).encode('utf8')).decode('utf8')
191
191
 
192
+ if not extra_meta is None:
193
+ for n , m in extra_meta.items():
194
+ if isinstance(m, str):
195
+ metadata[n] = m
196
+ else:
197
+ metadata[n + "_base64"] = base64.b64encode(json.dumps(m, ensure_ascii=False).encode('utf8')).decode('utf8')
198
+
199
+
192
200
  if len(metadata) > 0:
193
201
  sf_sd["__metadata__"] = metadata
194
202
 
@@ -443,6 +451,4 @@ try:
443
451
  transformers.modeling_utils.safe_open = safe_open
444
452
  transformers.modeling_utils.safe_load_file = torch_load_file
445
453
  except:
446
- pass
447
-
448
-
454
+ pass
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: mmgp
3
- Version: 3.1.4.post15
3
+ Version: 3.1.4.post151
4
4
  Summary: Memory Management for the GPU Poor
5
5
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
6
6
  License: GNU GENERAL PUBLIC LICENSE
@@ -17,7 +17,7 @@ Requires-Dist: peft
17
17
 
18
18
 
19
19
  <p align="center">
20
- <H2>Memory Management 3.1.4 for the GPU Poor by DeepBeepMeep</H2>
20
+ <H2>Memory Management 3.1.4-151 for the GPU Poor by DeepBeepMeep</H2>
21
21
  </p>
22
22
 
23
23
 
@@ -44,21 +44,23 @@ Each profile may use a combination of the following:
44
44
 
45
45
  ## Sample applications that use mmgp
46
46
  It is recommended to have a look at these applications to see how mmgp was implemented in each of them:
47
- - Hunyuan3D-2GP: https://github.com/deepbeepmeep/Hunyuan3D-2GP\
47
+ - Hunyuan3D-2GP: https://github.com/deepbeepmeep/Hunyuan3D-2GP :\
48
48
  A great image to 3D and text to 3D tool by the Tencent team. Thanks to mmgp it can run with less than 6 GB of VRAM
49
49
 
50
- - HuanyuanVideoGP: https://github.com/deepbeepmeep/HunyuanVideoGP\
50
+ - HuanyuanVideoGP: https://github.com/deepbeepmeep/HunyuanVideoGP :\
51
51
  One of the best open source Text to Video generator
52
52
 
53
- - FluxFillGP: https://github.com/deepbeepmeep/FluxFillGP\
53
+ - FluxFillGP: https://github.com/deepbeepmeep/FluxFillGP :\
54
54
  One of the best inpainting / outpainting tools based on Flux that can run with less than 12 GB of VRAM.
55
55
 
56
- - Cosmos1GP: https://github.com/deepbeepmeep/Cosmos1GP\
56
+ - Cosmos1GP: https://github.com/deepbeepmeep/Cosmos1GP :\
57
57
  This application includes two models: a text to world generator and an image / video to world generator (probably the best open source image to video generator).
58
58
 
59
- - OminiControlGP: https://github.com/deepbeepmeep/OminiControlGP\
59
+ - OminiControlGP: https://github.com/deepbeepmeep/OminiControlGP :\
60
60
  A very powerful Flux derived application that can be used to transfer an object of your choice into a prompted scene. With mmgp you can run it with only 6 GB of VRAM.
61
61
 
62
+ - YuE GP: https://github.com/deepbeepmeep/YuEGP :\
63
+ A great song generator (instruments + singer's voice) based on prompted Lyrics and a genre description. Thanks to mmgp you can run it with less than 10 GB of VRAM without waiting forever.
62
64
 
63
65
  ## Installation
64
66
  First you need to install the module in your current project with:
@@ -88,7 +90,7 @@ You can choose between 5 profiles depending on your hardware:
88
90
  - LowRAM_LowVRAM (4): at least 32 GB of RAM and 12 GB of VRAM : if you have little VRAM or want to generate longer videos / more images
89
91
  - VerylowRAM_LowVRAM (5): at least 24 GB of RAM and 10 GB of VRAM : if you don't have much hardware it won't be fast, but maybe it will work
90
92
 
91
- Profile 2 (High RAM) and 4 (Low RAM)are the most recommended profiles since they are versatile (support for long videos for a slight performance cost).\
93
+ Profiles 2 (High RAM) and 4 (Low RAM) are the most recommended since they are versatile (they support long videos at a slight performance cost).\
92
94
  If you use a Flux derived application, profiles 1 and 3 will offer much faster generation times.
93
95
  In any case, a safe approach is to start from profile 5 (default profile) and then go down progressively to profile 4 and then to profile 2 as long as the app remains responsive or doesn't trigger any out of memory error.
94
96
 
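A minimal sketch of this safe-first approach, assuming *pipe* is the dictionary of models your application has already built (see *extract_models* further down) and that the profile names listed above are exposed as constants on *profile_type*:

```python
from mmgp import offload, profile_type

# Start with the most conservative profile (5, the default)...
offload.profile(pipe, profile_type.VerylowRAM_LowVRAM)

# ...then, on a later run, go down to profile 4 (and eventually profile 2)
# as long as the app stays responsive and no out of memory error occurs:
# offload.profile(pipe, profile_type.LowRAM_LowVRAM)
```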
@@ -114,11 +116,13 @@ For example:
114
116
  - pinnedMemory: Boolean (for all models) or list of model ids to pin to RAM. Every model pinned to RAM will load much faster (up to 2 times) but this requires more RAM
115
117
  - quantizeTransformer: boolean, True by default. The 'transformer' model in the pipe usually contains the video or image generator and is quantized on the fly to 8 bits by default. If you want to save disk space and reduce the loading time, you may want to load a prequantized model directly. If you don't want to quantize the image generator, set the option *quantizeTransformer* to *False* to turn off on the fly quantization.
116
118
  - extraModelsToQuantize: list of ids of additional models to quantize on the fly. If the corresponding model is already quantized, this option will be ignored.
117
- - budgets: either a number in mega bytes (for all models, if 0 unlimited budget) or a dictionary that maps model ids to mega bytes : define the budget in VRAM (in fact the real number is 1.5 this number or 2.5 if asyncTransfers are also enabled) that is allocated in VRAM for each model.
119
+ - budgets: either a number in megabytes (for all models, 0 meaning unlimited budget) or a dictionary that maps model ids to megabytes: defines the approximate budget in megabytes allocated in VRAM for a model. Try not to allocate all the available VRAM so that the rest can be used to process the data. To define the default value in the dictionary, you may add an entry named "*".
118
120
  The smaller this number, the more VRAM is left for image data / longer videos, but also the slower the generation because there will be lots of loading / unloading between the RAM and the VRAM. If a model is too big to fit in its budget, it will be broken down into multiple parts that will be loaded / unloaded accordingly. The speed of a low budget can be increased (up to 2 times) by turning on the options pinnedMemory and asyncTransfers.
121
+ - workingVRAM: either a number in megabytes or a dictionary that maps model ids to a number in megabytes corresponding to the minimum amount of VRAM that should be left for the data processed by the model. This number will prevail if it conflicts with a budget set too high for the same model.
119
122
  - asyncTransfers: boolean, load the next model part to the GPU while the current part is being processed. This requires twice the budget if any is defined. This may increase speed by 20% (mostly visible on fast modern GPUs).
120
123
  - verboseLevel: number between 0 and 2 (1 by default), provides various levels of feedback on the different processes
121
124
  - compile: list of model ids to compile, may accelerate generation up to 2x depending on the type of GPU. It makes sense to compile only the model that is frequently used, such as the "transformer" model in the case of video or image generation. Compilation requires Triton to be installed. Triton is available out of the box on Linux or WSL but needs to be installed separately on Windows: https://github.com/woct0rdho/triton-windows
125
+ - coTenantsMap: a dictionary that maps a model id to a list of other models with which it accepts to share the VRAM at the same time. This is useful to avoid inefficient loading / unloading when the processes of two models are interleaved. For instance with *coTenantsMap = { "text_encoder_2": ["text_encoder"] }*, loading *text_encoder_2* won't unload *text_encoder*. Please note that the reverse is not true, as these maps are by design not symmetrical to allow tailored workflows. If you also need *text_encoder* not to unload *text_encoder_2* when the latter is already loaded, use *coTenantsMap = { "text_encoder_2": ["text_encoder"], "text_encoder": ["text_encoder_2"] }* (see the configuration sketch below).
122
126
 
123
127
  If you are short on RAM and plan to work with quantized models, it is recommended to load pre-quantized models directly rather than using on the fly quantization: it will be faster and consume slightly less RAM.
124
128
 
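Putting the options above together, here is a configuration sketch. The model ids, budget values and the *pipe* dictionary are placeholders, and it is assumed that *offload.profile* accepts these options as keyword arguments; adapt to your own pipeline and hardware.

```python
from mmgp import offload, profile_type

# `pipe` maps model ids to already instantiated models, e.g. built by
# extract_models(...) or assembled by hand (placeholder names here).
offload.profile(
    pipe,
    profile_type.LowRAM_LowVRAM,
    pinnedMemory=["transformer"],                  # pin the main generator to RAM (faster reloads)
    quantizeTransformer=True,                      # quantize the image / video generator on the fly
    extraModelsToQuantize=["text_encoder_2"],      # also quantize this model on the fly
    budgets={"*": 3000, "transformer": 6000},      # VRAM budgets in megabytes, "*" is the default entry
    workingVRAM={"transformer": 4000},             # keep at least this much VRAM for the data itself
    asyncTransfers=True,                           # preload the next model part while computing
    compile=["transformer"],                       # requires Triton
    coTenantsMap={"text_encoder_2": ["text_encoder"]},  # text_encoder_2 won't evict text_encoder
    verboseLevel=1,
)
```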
@@ -126,11 +130,14 @@ If you are short on RAM and plan to work with quantized models, it is recommende
126
130
 
127
131
  The module includes several tools to package a light version of your favorite video / image generator:
128
132
  - *extract_models(string prefix, obj to explore)*\
129
- This tool will try to detect for you models that are embedded in a pipeline or in some custom class. It will save you time by building a pipe dictionary required par *offload.all* or "offload.profile*. The prefix correponds to the text that will appear before the name of each model in the dictionary.
133
+ This tool will try to detect models embedded in a pipeline or in some custom class for you. It will save you time by building the pipe dictionary required by *offload.all* or *offload.profile*. The prefix corresponds to the text that will appear before the name of each model in the dictionary.
130
134
 
131
- - *load_loras_into_model(model, lora_path, lora_multi)*\
135
+ - *load_loras_into_model(model, lora_path, lora_multi, activate_all_loras = True)*\
132
136
  Load into a model a list of Loras described by a list of paths *lora_path* and a list of *weight coefficients*.
133
- The Lora file must be in the *diffusers* format. This function works also on non diffusers models. However if there is already an official Lora support for a model it is recommended to use the official diffusers functions.
137
+ The Lora file must be in the *diffusers* format. This function also works on non diffusers models. However, if there is already official Lora support for a model, it is recommended to use the official diffusers functions. By default all the loaded loras will be activated, or they can be activated later using *activate_loras*.
138
+
139
+ - *activate_loras(model, lora_nos, lora_multi = None )*\
140
+ Activate the loras whose numbers are in the *lora_nos* list. Every lora that is not in this list and that was previously activated will be deactivated.
134
141
 
135
142
  - *save_model(model, file_path, do_quantize = False, quantizationType = qint8 )*\
136
143
  Save the tensors of a model already loaded in memory in the safetensors format (much faster to reload). You can save it in a quantized format (the default qint8 quantization is recommended).
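
A short usage sketch of the helpers above, based only on the signatures listed in this README and assuming they are exposed on *mmgp.offload*; the file paths, multipliers and the *model* object are hypothetical.

```python
from mmgp import offload
from optimum.quanto import qint8

# Load two (hypothetical) diffusers-format loras but only activate the first one for now.
offload.load_loras_into_model(
    model,
    ["loras/style_a.safetensors", "loras/style_b.safetensors"],
    [1.0, 0.8],
    activate_all_loras=False,
)
offload.activate_loras(model, [0])        # activate lora no 0; any other active lora is deactivated

# Later, switch to the second lora with a new multiplier:
offload.activate_loras(model, [1], [0.5])

# Save the (optionally quantized) model for faster reloads next time.
offload.save_model(model, "model_quanto_int8.safetensors", do_quantize=True, quantizationType=qint8)
```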
@@ -0,0 +1,9 @@
1
+ __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
3
+ mmgp/offload.py,sha256=4gGj7ibuilYEk7N4_x9M_fx6tYO2OUU8jeSq5YsF_0E,85992
4
+ mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
5
+ mmgp-3.1.4.post151.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
6
+ mmgp-3.1.4.post151.dist-info/METADATA,sha256=qBS_HYUidog3kLKr25x0YJ7EyCCCcHbghXrFJcYoUZE,15946
7
+ mmgp-3.1.4.post151.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
8
+ mmgp-3.1.4.post151.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
9
+ mmgp-3.1.4.post151.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
3
- mmgp/offload.py,sha256=DEGTt5RPoLx9JK-d7Ld_B_rIuQrmhblQJw3V5CL9Lo8,74519
4
- mmgp/safetensors2.py,sha256=OkJAvENfWeb-PL0FcxS1-eYeHLbemTaNXYvNxURrzIs,16154
5
- mmgp-3.1.4.post15.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
6
- mmgp-3.1.4.post15.dist-info/METADATA,sha256=IMmhK6xAu0A96mLlpby9V2H-K8RYIqRpORaBngvtC0U,14278
7
- mmgp-3.1.4.post15.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
8
- mmgp-3.1.4.post15.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
9
- mmgp-3.1.4.post15.dist-info/RECORD,,