mmgp 3.1.4.post15__py3-none-any.whl → 3.1.4.post1519__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


mmgp/offload.py CHANGED
@@ -1,4 +1,4 @@
- # ------------------ Memory Management 3.1.4 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+ # ------------------ Memory Management 3.1.4-1519 for the GPU Poor by DeepBeepMeep (mmgp)------------------
  #
  # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
  # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -76,7 +76,18 @@ except:
  from mmgp import safetensors2
  from mmgp import profile_type
 
- from optimum.quanto import freeze, qfloat8, qint4 , qint8, quantize, QModuleMixin, QTensor, quantize_module
+ from optimum.quanto import freeze, qfloat8, qint4 , qint8, quantize, QModuleMixin, QTensor, quantize_module, register_qmodule
+
+ # support for Embedding module quantization that is not supported by default by quanto
+ @register_qmodule(torch.nn.Embedding)
+ class QEmbedding(QModuleMixin, torch.nn.Embedding):
+ @classmethod
+ def qcreate(cls, module, weights, activations = None, optimizer = None, device = None):
+ module.bias = None
+ return cls( module.num_embeddings, module.embedding_dim, module.padding_idx , module.max_norm, module.norm_type, module.scale_grad_by_freq, module.sparse, dtype=module.weight.dtype, device=device, weights=weights,
+ activations=activations, optimizer=optimizer, quantize_input=True)
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
+ return torch.nn.functional.embedding( input, self.qweight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse )
 
 
  shared_state = {}
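
The hunk above registers an Embedding qmodule with optimum-quanto, which has no default support for quantizing `torch.nn.Embedding`. As a rough illustration of the effect (a minimal sketch with a toy model, not code from the package), a later `quantize()` call can then cover embedding layers as well:

```python
import torch
from mmgp import offload  # importing mmgp.offload runs the QEmbedding registration above
from optimum.quanto import freeze, qint8, quantize

# Toy model used only for illustration.
model = torch.nn.Sequential(
    torch.nn.Embedding(1000, 64),
    torch.nn.Linear(64, 64),
)

# With the Embedding qmodule registered, quantize() replaces the Embedding
# as well as the Linear layer with quantized modules.
quantize(model, weights=qint8)
freeze(model)
```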
@@ -96,11 +107,6 @@ ENDC = '\033[0m'
96
107
  BOLD ='\033[1m'
97
108
  UNBOLD ='\033[0m'
98
109
 
99
- cotenants_map = {
100
- "text_encoder": ["vae", "text_encoder_2"],
101
- "text_encoder_2": ["vae", "text_encoder"],
102
- }
103
-
104
110
  class clock:
105
111
  def __init__(self):
106
112
  self.start_time = 0
@@ -216,15 +222,17 @@ def _get_model(model_path):
216
222
  if len(_path)<=1:
217
223
  raise("file not found")
218
224
  else:
219
- from huggingface_hub import hf_hub_download #snapshot_download,
220
- repoId= os.path.join(*_path[0:2] ).replace("\\", "/")
221
-
222
- if len(_path) > 2:
223
- _subfolder = os.path.join(*_path[2:] )
224
- model_path = hf_hub_download(repo_id=repoId, filename=_filename, subfolder=_subfolder)
225
- else:
226
- model_path = hf_hub_download(repo_id=repoId, filename=_filename)
225
+ try:
226
+ from huggingface_hub import hf_hub_download #snapshot_download,
227
+ repoId= os.path.join(*_path[0:2] ).replace("\\", "/")
227
228
 
229
+ if len(_path) > 2:
230
+ _subfolder = os.path.join(*_path[2:] )
231
+ model_path = hf_hub_download(repo_id=repoId, filename=_filename, subfolder=_subfolder)
232
+ else:
233
+ model_path = hf_hub_download(repo_id=repoId, filename=_filename)
234
+ except:
235
+ model_path = None
228
236
  return model_path
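
The change above wraps the Hugging Face Hub lookup in a try/except so that a failed download yields `None` instead of raising. A self-contained sketch of that fallback pattern (repository and file names are made up):

```python
from huggingface_hub import hf_hub_download

def resolve_model_file(repo_id, filename, subfolder=None):
    """Return a local path to the requested file, or None if it cannot be fetched."""
    try:
        return hf_hub_download(repo_id=repo_id, filename=filename, subfolder=subfolder)
    except Exception:
        return None

# Hypothetical example: returns None when offline or when the file does not exist.
path = resolve_model_file("some-org/some-model", "model.safetensors")
```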
229
237
 
230
238
 
@@ -278,9 +286,17 @@ def _force_load_parameter(p):
278
286
  torch.utils.swap_tensors(p, q)
279
287
  del q
280
288
 
281
- def _pin_to_memory(model, model_id, partialPinning = False, verboseLevel = 1):
289
+ def _get_tensor_ref(p):
290
+ if isinstance(p, QTensor):
291
+ if p._qtype == qint4:
292
+ return p._data._data.data_ptr()
293
+ else:
294
+ return p._data.data_ptr()
295
+ else:
296
+ return p.data_ptr()
282
297
 
283
298
 
299
+ def _pin_to_memory(model, model_id, partialPinning = False, verboseLevel = 1):
284
300
  if partialPinning:
285
301
  towers_names, _ = _detect_main_towers(model)
286
302
 
@@ -292,56 +308,63 @@ def _pin_to_memory(model, model_id, partialPinning = False, verboseLevel = 1):
292
308
  tensor_map_indexes = []
293
309
  total_tensor_bytes = 0
294
310
 
295
- params_list = []
311
+ params_dict = {} # OrderedDict
296
312
  for k, sub_module in model.named_modules():
297
313
  include = True
298
314
  if partialPinning:
299
315
  include = any(k.startswith(pre) for pre in towers_names) if partialPinning else True
300
316
  if include:
301
- params_list = params_list + [ (k + '.' + n, p, False) for n, p in sub_module.named_parameters(recurse=False)] + [ (k + '.' + n, p, True) for n, p in sub_module.named_buffers(recurse=False)]
302
-
317
+ params_dict.update( { k + '.' + n : (p, False) for n, p in sub_module.named_parameters(recurse=False) } )
318
+ params_dict.update( { k + '.' + n : (b, True) for n, b in sub_module.named_buffers(recurse=False) } )
303
319
 
304
320
  if verboseLevel>=1 :
305
321
  if partialPinning:
306
- if len(params_list) == 0:
322
+ if len(params_dict) == 0:
307
323
  print(f"Unable to apply Partial of '{model_id}' as no isolated main structures were found")
308
324
  else:
309
325
  print(f"Partial pinning of data of '{model_id}' to reserved RAM")
310
326
  else:
311
327
  print(f"Pinning data of '{model_id}' to reserved RAM")
312
328
 
313
- if partialPinning and len(params_list) == 0:
329
+ if partialPinning and len(params_dict) == 0:
314
330
  return
315
331
 
316
-
317
-
318
- for n, p, _ in params_list:
319
- if isinstance(p, QTensor):
320
- if p._qtype == qint4:
321
- if hasattr(p,"_scale_shift"):
322
- length = torch.numel(p._data._data) * p._data._data.element_size() + torch.numel(p._scale_shift) * p._scale_shift.element_size()
332
+ ref_cache = {}
333
+ tied_weights = {}
334
+ for n, (p, _) in params_dict.items():
335
+ ref = _get_tensor_ref(p)
336
+ match = ref_cache.get(ref, None)
337
+ if match != None:
338
+ match_name, match_size = match
339
+ if verboseLevel >=1:
340
+ print(f"Tied weights of {match_size/ONE_MB:0.2f} MB detected: {match_name} <-> {n}")
341
+ tied_weights[n] = match_name
342
+ else:
343
+ if isinstance(p, QTensor):
344
+ if p._qtype == qint4:
345
+ if hasattr(p,"_scale_shift"):
346
+ length = torch.numel(p._data._data) * p._data._data.element_size() + torch.numel(p._scale_shift) * p._scale_shift.element_size()
347
+ else:
348
+ length = torch.numel(p._data._data) * p._data._data.element_size() + torch.numel(p._scale) * p._scale.element_size() + torch.numel(p._shift) * p._shift.element_size()
323
349
  else:
324
- length = torch.numel(p._data._data) * p._data._data.element_size() + torch.numel(p._scale) * p._scale.element_size() + torch.numel(p._shift) * p._shift.element_size()
350
+ length = torch.numel(p._data) * p._data.element_size() + torch.numel(p._scale) * p._scale.element_size()
325
351
  else:
326
- length = torch.numel(p._data) * p._data.element_size() + torch.numel(p._scale) * p._scale.element_size()
327
- else:
328
- length = torch.numel(p.data) * p.data.element_size()
329
-
330
-
331
- if current_big_tensor_size + length > BIG_TENSOR_MAX_SIZE:
332
- big_tensors_sizes.append(current_big_tensor_size)
333
- current_big_tensor_size = 0
334
- big_tensor_no += 1
352
+ length = torch.numel(p.data) * p.data.element_size()
335
353
 
354
+ ref_cache[ref] = (n, length)
355
+ if current_big_tensor_size + length > BIG_TENSOR_MAX_SIZE:
356
+ big_tensors_sizes.append(current_big_tensor_size)
357
+ current_big_tensor_size = 0
358
+ big_tensor_no += 1
336
359
 
337
- itemsize = p.data.dtype.itemsize
338
- if current_big_tensor_size % itemsize:
339
- current_big_tensor_size += itemsize - current_big_tensor_size % itemsize
340
- tensor_map_indexes.append((big_tensor_no, current_big_tensor_size, length ))
341
- current_big_tensor_size += length
342
360
 
343
- total_tensor_bytes += length
361
+ itemsize = p.data.dtype.itemsize
362
+ if current_big_tensor_size % itemsize:
363
+ current_big_tensor_size += itemsize - current_big_tensor_size % itemsize
364
+ tensor_map_indexes.append((big_tensor_no, current_big_tensor_size, length ))
365
+ current_big_tensor_size += length
344
366
 
367
+ total_tensor_bytes += length
345
368
 
346
369
  big_tensors_sizes.append(current_big_tensor_size)
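
The `_pin_to_memory` changes above detect tied weights (several parameter names backed by one storage) by comparing data pointers via `_get_tensor_ref`, so a shared tensor is reserved and pinned only once. A minimal sketch of the detection idea for plain, non-quantized tensors, using nothing beyond PyTorch:

```python
import torch

def find_tied_weights(model: torch.nn.Module) -> dict:
    """Map each alias parameter name to the first name that owns the same storage."""
    seen, tied = {}, {}
    for name, p in model.named_parameters():
        ref = p.data_ptr()  # tied tensors report the same storage address
        if ref in seen:
            tied[name] = seen[ref]
        else:
            seen[ref] = name
    return tied

# A model whose output head reuses its input embedding would typically yield
# something like {"lm_head.weight": "embed_tokens.weight"} (names are illustrative).
```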
347
370
 
@@ -368,39 +391,53 @@ def _pin_to_memory(model, model_id, partialPinning = False, verboseLevel = 1):
368
391
 
369
392
  tensor_no = 0
370
393
  # prev_big_tensor = 0
371
- for n, p, is_buffer in params_list:
372
- big_tensor_no, offset, length = tensor_map_indexes[tensor_no]
373
- # if big_tensor_no != prev_big_tensor:
374
- # gc.collect()
375
- # prev_big_tensor = big_tensor_no
376
- if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
377
- current_big_tensor = big_tensors[big_tensor_no]
378
- if is_buffer :
379
- _force_load_buffer(p) # otherwise potential memory leak
394
+ for n, (p, is_buffer) in params_dict.items():
395
+ if n in tied_weights:
380
396
  if isinstance(p, QTensor):
381
- if p._qtype == qint4:
382
- length1 = torch.numel(p._data._data) * p._data._data.element_size()
383
- p._data._data = _move_to_pinned_tensor(p._data._data, current_big_tensor, offset, length1)
384
- if hasattr(p,"_scale_shift"):
385
- length2 = torch.numel(p._scale_shift) * p._scale_shift.element_size()
386
- p._scale_shift = _move_to_pinned_tensor(p._scale_shift, current_big_tensor, offset + length1, length2)
397
+ if p._qtype == qint4:
398
+ assert p._data._data.data.is_pinned()
399
+ else:
400
+ assert p._data.is_pinned()
401
+ else:
402
+ assert p.data.is_pinned()
403
+ else:
404
+ big_tensor_no, offset, length = tensor_map_indexes[tensor_no]
405
+ # if big_tensor_no != prev_big_tensor:
406
+ # gc.collect()
407
+ # prev_big_tensor = big_tensor_no
408
+ # match_param, match_isbuffer = tied_weights.get(n, (None, False))
409
+ # if match_param != None:
410
+
411
+ if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
412
+ current_big_tensor = big_tensors[big_tensor_no]
413
+ if is_buffer :
414
+ _force_load_buffer(p) # otherwise potential memory leak
415
+ if isinstance(p, QTensor):
416
+ if p._qtype == qint4:
417
+ length1 = torch.numel(p._data._data) * p._data._data.element_size()
418
+ p._data._data = _move_to_pinned_tensor(p._data._data, current_big_tensor, offset, length1)
419
+ if hasattr(p,"_scale_shift"):
420
+ length2 = torch.numel(p._scale_shift) * p._scale_shift.element_size()
421
+ p._scale_shift = _move_to_pinned_tensor(p._scale_shift, current_big_tensor, offset + length1, length2)
422
+ else:
423
+ length2 = torch.numel(p._scale) * p._scale.element_size()
424
+ p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
425
+ length3 = torch.numel(p._shift) * p._shift.element_size()
426
+ p._shift = _move_to_pinned_tensor(p._shift, current_big_tensor, offset + length1 + length2, length3)
387
427
  else:
428
+ length1 = torch.numel(p._data) * p._data.element_size()
429
+ p._data = _move_to_pinned_tensor(p._data, current_big_tensor, offset, length1)
388
430
  length2 = torch.numel(p._scale) * p._scale.element_size()
389
431
  p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
390
- length3 = torch.numel(p._shift) * p._shift.element_size()
391
- p._shift = _move_to_pinned_tensor(p._shift, current_big_tensor, offset + length1 + length2, length3)
392
432
  else:
393
- length1 = torch.numel(p._data) * p._data.element_size()
394
- p._data = _move_to_pinned_tensor(p._data, current_big_tensor, offset, length1)
395
- length2 = torch.numel(p._scale) * p._scale.element_size()
396
- p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
397
- else:
398
- length = torch.numel(p.data) * p.data.element_size()
399
- p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
433
+ length = torch.numel(p.data) * p.data.element_size()
434
+ p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
400
435
 
401
- tensor_no += 1
436
+ tensor_no += 1
437
+ del p
402
438
  global total_pinned_bytes
403
439
  total_pinned_bytes += total
440
+ del params_dict
404
441
  gc.collect()
405
442
 
406
443
  if verboseLevel >=1:
@@ -420,7 +457,7 @@ def _welcome():
420
457
  if welcome_displayed:
421
458
  return
422
459
  welcome_displayed = True
423
- print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.1.4-15) by DeepBeepMeep ************{ENDC}{UNBOLD}")
460
+ print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.1.4-151) by DeepBeepMeep ************{ENDC}{UNBOLD}")
424
461
 
425
462
  def _extract_num_from_str(num_in_str):
426
463
  size = len(num_in_str)
@@ -518,16 +555,6 @@ def _requantize(model: torch.nn.Module, state_dict: dict, quantization_map: dict
518
555
 
519
556
  def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 1000000000, model_id = 'Unknown'):
520
557
 
521
- def compute_submodule_size(submodule):
522
- size = 0
523
- for p in submodule.parameters(recurse=False):
524
- size += torch.numel(p.data) * sizeofbfloat16
525
-
526
- for p in submodule.buffers(recurse=False):
527
- size += torch.numel(p.data) * sizeofbfloat16
528
-
529
- return size
530
-
531
558
  total_size =0
532
559
  total_excluded = 0
533
560
  exclude_list = []
@@ -549,16 +576,31 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
549
576
  tower_names ,_ = _detect_main_towers(model_to_quantize)
550
577
  tower_names = [ n[:-1] for n in tower_names]
551
578
 
579
+
580
+ cache_ref = {}
581
+ tied_weights= {}
582
+
552
583
  for submodule_name, submodule in model_to_quantize.named_modules():
553
584
  if isinstance(submodule, QModuleMixin):
554
585
  if verboseLevel>=1:
555
586
  print("No quantization to do as model is already quantized")
556
587
  return False
557
588
 
558
- if submodule_name=='':
559
- continue
589
+ size = 0
590
+ for n, p in submodule.named_parameters(recurse = False):
591
+ ref = _get_tensor_ref(p)
592
+ match = cache_ref.get(ref, None)
593
+ if match != None:
594
+ tied_weights[submodule_name]= (n, ) + match
595
+ else:
596
+ cache_ref[ref] = (submodule_name, n)
597
+ size += torch.numel(p.data) * sizeofbfloat16
598
+
599
+ for p in submodule.buffers(recurse=False):
600
+ size += torch.numel(p.data) * sizeofbfloat16
601
+
602
+
560
603
 
561
- size = compute_submodule_size(submodule)
562
604
  if not any(submodule_name.startswith(pre) for pre in tower_names):
563
605
  flush = False
564
606
  if isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
@@ -590,12 +632,13 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
590
632
  submodule_names.append(submodule_name)
591
633
  total_size += size
592
634
 
593
- if submodule_size > 0 and submodule_size <= threshold:
635
+ if submodule_size >0 and submodule_size <= threshold :
594
636
  exclude_list += submodule_names
595
637
  if verboseLevel >=2:
596
638
  print(f"Excluded size {submodule_size/ONE_MB:.1f} MB: {prev_blocks_prefix} : {submodule_names}")
597
639
  total_excluded += submodule_size
598
640
 
641
+
599
642
  perc_excluded =total_excluded/ total_size if total_size >0 else 1
600
643
  if verboseLevel >=2:
601
644
  if total_excluded == 0:
@@ -608,7 +651,10 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
608
651
  exclude_list = None
609
652
 
610
653
 
611
- quantize(model_to_quantize,weights, exclude= exclude_list)
654
+ exclude_list += list(tied_weights)
655
+ quantize(model_to_quantize, weights= weights, exclude= exclude_list)
656
+
657
+
612
658
  # quantize(model_to_quantize,weights, include= [ "*1.block.attn.to_out*"]) #"
613
659
 
614
660
  # for name, m in model_to_quantize.named_modules():
@@ -618,24 +664,40 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
618
664
 
619
665
  # force to read non quantized parameters so that their lazy tensors and corresponding mmap are released
620
666
  # otherwise we may end up keeping in memory both the quantized and the non quantize model
621
- for n,m in model_to_quantize.named_modules():
667
+ named_modules = {n:m for n,m in model_to_quantize.named_modules()}
668
+ for module_name, module in named_modules.items():
622
669
  # do not read quantized weights (detected them directly or behind an adapter)
623
- if isinstance(m, QModuleMixin) or hasattr(m, "base_layer") and isinstance(m.base_layer, QModuleMixin):
624
- if hasattr(m, "bias") and m.bias is not None:
625
- _force_load_parameter(m.bias)
670
+ if isinstance(module, QModuleMixin) or hasattr(module, "base_layer") and isinstance(module.base_layer, QModuleMixin):
671
+ if hasattr(module, "bias") and module.bias is not None:
672
+ _force_load_parameter(module.bias)
626
673
  else:
627
- for p in m.parameters(recurse = False):
628
- _force_load_parameter(p)
629
-
630
- for b in m.buffers(recurse = False):
674
+ tied_w = tied_weights.get(module_name, None)
675
+ for n, p in module.named_parameters(recurse = False):
676
+ if tied_w != None and n == tied_w[0]:
677
+ if isinstance( named_modules[tied_w[1]], QModuleMixin) :
678
+ setattr(module, n, None) # release refs of tied weights if source is going to be quantized
679
+ # otherwise don't force load as it will be loaded in the source anyway
680
+ else:
681
+ _force_load_parameter(p)
682
+ del p # del p if not it will still contain a ref to a tensor when leaving the loop
683
+ for b in module.buffers(recurse = False):
631
684
  _force_load_buffer(b)
632
-
685
+ del b
633
686
 
634
687
 
635
688
  freeze(model_to_quantize)
636
689
  torch.cuda.empty_cache()
637
- gc.collect()
690
+ gc.collect()
691
+
692
+ for tied_module, (tied_weight, src_module, src_weight) in tied_weights.items():
693
+ p = getattr(named_modules[src_module], src_weight)
694
+ if isinstance(p, QTensor):
695
+ setattr(named_modules[tied_module], tied_weight, p ) # copy refs to quantized sources
696
+
697
+ del named_modules
698
+
638
699
  quantization_map = _quantization_map(model_to_quantize)
700
+
639
701
  model_to_quantize._quanto_map = quantization_map
640
702
 
641
703
  if hasattr(model_to_quantize, "_already_pinned"):
@@ -647,12 +709,85 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
647
709
 
648
710
  return True
649
711
 
712
+ def _lora_linear_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
713
+ self._check_forward_args(x, *args, **kwargs)
714
+ adapter_names = kwargs.pop("adapter_names", None)
715
+ if self.disable_adapters:
716
+ if self.merged:
717
+ self.unmerge()
718
+ result = self.base_layer(x, *args, **kwargs)
719
+ elif adapter_names is not None:
720
+ result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs)
721
+ elif self.merged:
722
+ result = self.base_layer(x, *args, **kwargs)
723
+ else:
724
+ base_weight = self.base_layer.weight
725
+ if base_weight.shape[-1] < x.shape[-2]: # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
726
+ for active_adapter in self.active_adapters:
727
+ if active_adapter not in self.lora_A.keys():
728
+ continue
729
+ if self.use_dora[active_adapter]:
730
+ raise Exception("Dora not yet supported by mmgp")
731
+ lora_A = self.lora_A[active_adapter]
732
+ lora_B = self.lora_B[active_adapter]
733
+ scaling = self.scaling[active_adapter]
734
+ lora_A_weight = lora_A.weight
735
+ lora_B_weight = lora_B.weight
736
+ lora_BA = lora_B_weight @ lora_A_weight
737
+ base_weight += scaling * lora_BA
738
+
739
+ result = torch.nn.functional.linear(x, base_weight, bias=self.base_layer.bias)
740
+ torch_result_dtype = result.dtype
741
+
742
+ else:
743
+ result = self.base_layer(x, *args, **kwargs)
744
+ torch_result_dtype = result.dtype
745
+ x = x.to(torch.bfloat16)
746
+
747
+ for active_adapter in self.active_adapters:
748
+ if active_adapter not in self.lora_A.keys():
749
+ continue
750
+ lora_A = self.lora_A[active_adapter]
751
+ lora_B = self.lora_B[active_adapter]
752
+ dropout = self.lora_dropout[active_adapter]
753
+ scaling = self.scaling[active_adapter]
754
+ x = x.to(lora_A.weight.dtype)
755
+
756
+ if not self.use_dora[active_adapter]:
757
+ y = lora_A(x)
758
+ y = lora_B(y)
759
+ y*= scaling
760
+ result+= y
761
+ del lora_A, lora_B, y
762
+ # result = result + lora_B(lora_A(dropout(x))) * scaling
763
+ else:
764
+ if isinstance(dropout, nn.Identity) or not self.training:
765
+ base_result = result
766
+ else:
767
+ x = dropout(x)
768
+ base_result = None
769
+
770
+ result = result + self.lora_magnitude_vector[active_adapter](
771
+ x,
772
+ lora_A=lora_A,
773
+ lora_B=lora_B,
774
+ scaling=scaling,
775
+ base_layer=self.get_base_layer(),
776
+ base_result=base_result,
777
+ )
778
+
779
+ result = result.to(torch_result_dtype)
780
+ return result
781
+
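+
+ # The large-input branch above folds each active adapter into the base weight
+ # (W + scaling * B @ A) and runs a single linear, instead of pushing the full
+ # input through every low-rank pair; this saves VRAM and compute when the
+ # number of tokens exceeds the weight's input features. A small equivalence
+ # check with made-up shapes (a sketch, not the package's tensors):
+ #
+ #   out_f, in_f, rank, tokens = 64, 32, 4, 4096
+ #   W = torch.randn(out_f, in_f)   # base_layer.weight
+ #   A = torch.randn(rank, in_f)    # lora_A.weight
+ #   B = torch.randn(out_f, rank)   # lora_B.weight
+ #   s, x = 0.5, torch.randn(tokens, in_f)
+ #   per_adapter = x @ W.T + s * ((x @ A.T) @ B.T)  # usual LoRA path
+ #   merged = x @ (W + s * (B @ A)).T               # merged-weight path used above
+ #   assert torch.allclose(per_adapter, merged, atol=1e-3)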
650
782
  def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, verboseLevel = -1,):
651
783
  verboseLevel = _compute_verbose_level(verboseLevel)
652
784
 
653
785
  if inject_adapter_in_model == None or set_weights_and_activate_adapters == None or get_peft_kwargs == None:
654
786
  raise Exception("Unable to load Lora, missing 'peft' and / or 'diffusers' modules")
655
-
787
+
788
+ from peft.tuners.lora import Linear
789
+ Linear.forward = _lora_linear_forward
790
+
656
791
  if not isinstance(lora_path, list):
657
792
  lora_path = [lora_path]
658
793
 
@@ -662,6 +797,9 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
662
797
  for i, path in enumerate(lora_path):
663
798
  adapter_name = str(i)
664
799
 
800
+
801
+
802
+
665
803
  state_dict = safetensors2.torch_load_file(path)
666
804
 
667
805
  keys = list(state_dict.keys())
@@ -843,7 +981,6 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
843
981
  verboseLevel = _compute_verbose_level(verboseLevel)
844
982
 
845
983
  model = _remove_model_wrapper(model)
846
-
847
984
  if not (".safetensors" in file_path or ".sft" in file_path):
848
985
  if pinToMemory:
849
986
  raise Exception("Pinning to memory while loading only supported for safe tensors files")
@@ -855,12 +992,20 @@ def load_model_data(model, file_path: str, do_quantize = False, quantizationType
855
992
 
856
993
  if metadata is None:
857
994
  quantization_map = None
995
+ tied_weights_map = None
858
996
  else:
859
997
  quantization_map = metadata.get("quantization_map", None)
860
998
  config = metadata.get("config", None)
861
999
  if config is not None:
862
1000
  model._config = config
863
1001
 
1002
+ tied_weights_map = metadata.get("tied_weights_map", None)
1003
+ if tied_weights_map != None:
1004
+ for name, tied_weights_list in tied_weights_map.items():
1005
+ mapped_weight = state_dict[name]
1006
+ for tied_weights in tied_weights_list:
1007
+ state_dict[tied_weights] = mapped_weight
1008
+
864
1009
 
865
1010
 
866
1011
  if quantization_map is None:
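
The new `tied_weights_map` metadata lets `save_model` write a shared tensor once while the loading code above re-points every alias at the stored copy. A hedged illustration of the round-trip (the key names are hypothetical):

```python
import torch

# Hypothetical saved state dict plus the metadata written by save_model:
state_dict = {"embed_tokens.weight": torch.zeros(10, 4)}
tied_weights_map = {"embed_tokens.weight": ["lm_head.weight"]}  # stored name -> skipped aliases

# What the loader above effectively does with it:
for stored_name, aliases in tied_weights_map.items():
    shared = state_dict[stored_name]
    for alias in aliases:
        state_dict[alias] = shared  # every alias reuses the same tensor object
```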
@@ -915,6 +1060,7 @@ def save_model(model, file_path, do_quantize = False, quantizationType = qint8,
915
1060
  """
916
1061
 
917
1062
  config = None
1063
+ extra_meta = None
918
1064
  verboseLevel = _compute_verbose_level(verboseLevel)
919
1065
  if config_file_path !=None:
920
1066
  with open(config_file_path, "r", encoding="utf-8") as reader:
@@ -928,8 +1074,10 @@ def save_model(model, file_path, do_quantize = False, quantizationType = qint8,
928
1074
  config_path = getattr(config_obj,"_name_or_path", None)
929
1075
  if config_path != None:
930
1076
  config_fullpath = os.path.join(config_path, "config.json")
931
- if not os.path.isfile(config_fullpath):
932
- config_fullpath = None
1077
+ config_fullpath = _get_model(config_fullpath)
1078
+
1079
+ # if not os.path.isfile(config_fullpath):
1080
+ # config_fullpath = None
933
1081
  if config_fullpath is None:
934
1082
  config_fullpath = os.path.join(os.path.dirname(file_path), "config.json")
935
1083
  if os.path.isfile(config_fullpath):
@@ -942,15 +1090,50 @@ def save_model(model, file_path, do_quantize = False, quantizationType = qint8,
942
1090
 
943
1091
  quantization_map = getattr(model, "_quanto_map", None)
944
1092
 
1093
+ from collections import OrderedDict
1094
+
1095
+ cache_ref = {}
1096
+ tied_weights_map = {}
1097
+ sd = model.state_dict()
1098
+ out_sd = OrderedDict()
1099
+
1100
+
1101
+ for name, weight in sd.items():
1102
+ ref = _get_tensor_ref(weight)
1103
+ match = cache_ref.get(ref, None)
1104
+ if match != None:
1105
+ tied_list = tied_weights_map.get(match, [])
1106
+ tied_list.append(name)
1107
+ tied_weights_map[match] = tied_list
1108
+ else:
1109
+ out_sd[name] = weight
1110
+ cache_ref[ref] = name
1111
+
1112
+ if len(tied_weights_map) > 0:
1113
+ extra_meta = { "tied_weights_map" : tied_weights_map }
1114
+
945
1115
  if verboseLevel >=1:
946
1116
  print(f"Saving file '{file_path}")
947
- safetensors2.torch_write_file(model.state_dict(), file_path , quantization_map = quantization_map, config = config)
1117
+
1118
+ safetensors2.torch_write_file(out_sd, file_path , quantization_map = quantization_map, config = config, extra_meta= extra_meta)
948
1119
  if verboseLevel >=1:
949
1120
  print(f"File '{file_path}' saved")
950
1121
 
951
1122
 
952
- def extract_models(prefix, obj):
1123
+ def extract_models(obj = None, prefix = None):
1124
+ if isinstance(obj, str): # for compatibility as the two args were switched
1125
+ bkp = prefix
1126
+ prefix = obj
1127
+ obj = bkp
1128
+
953
1129
  pipe = {}
1130
+ if obj == None:
1131
+ raise Exception("an object to analyze must be provided")
1132
+ if prefix==None or len(prefix)==0:
1133
+ prefix = ""
1134
+ elif prefix[ -1:] != "/":
1135
+ prefix + "/"
1136
+
954
1137
  for name in dir(obj):
955
1138
  element = getattr(obj,name)
956
1139
  if name in ("pipeline", "pipe"):
@@ -958,16 +1141,16 @@ def extract_models(prefix, obj):
958
1141
  if hasattr(pipeline , "components") and isinstance(pipeline.components, dict):
959
1142
  for k, model in pipeline.components.items():
960
1143
  if model != None:
961
- pipe[prefix + "/" + k ] = model
962
- elif isinstance(element, torch.nn.Module):
963
- if prefix + "/" + name in pipe:
964
- pipe[prefix + "/_" + name ] = element
1144
+ pipe[prefix + k ] = model
1145
+ elif isinstance(element, torch.nn.Module) and name!="base_model":
1146
+ if prefix + name in pipe:
1147
+ pipe[prefix + "_" + name ] = element
965
1148
  else:
966
- pipe[prefix + "/" + name ] = element
1149
+ pipe[prefix + name ] = element
967
1150
  elif isinstance(element, dict):
968
1151
  for k, element in element.items():
969
1152
  if hasattr(element , "pipeline"):
970
- pipe.update( extract_models(prefix + "/" + k,element ))
1153
+ pipe.update( extract_models(prefix + k,element ))
971
1154
 
972
1155
 
973
1156
  return pipe
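
Because the two arguments of `extract_models` were historically passed the other way around, the shim above accepts both orders. A sketch of the two call styles (the wrapper object and prefix are illustrative, and the import path is an assumption to adjust to however your code exposes the function):

```python
import torch
from mmgp.offload import extract_models  # import path assumed

class Wrapper:  # hypothetical container holding a model attribute
    def __init__(self, model):
        self.transformer = model

trainer = Wrapper(torch.nn.Linear(8, 8))

# New order: the object to scan comes first, the prefix is optional.
models = extract_models(trainer, prefix="flux")
# Legacy order (prefix first) still works thanks to the isinstance(obj, str) check.
models = extract_models("flux", trainer)
```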
@@ -989,6 +1172,10 @@ class offload:
989
1172
  self.active_models_ids = []
990
1173
  self.active_subcaches = {}
991
1174
  self.models = {}
1175
+ self.cotenants_map = {
1176
+ "text_encoder": ["vae", "text_encoder_2"],
1177
+ "text_encoder_2": ["vae", "text_encoder"],
1178
+ }
992
1179
  self.verboseLevel = 0
993
1180
  self.blocks_of_modules = {}
994
1181
  self.blocks_of_modules_sizes = {}
@@ -1002,14 +1189,13 @@ class offload:
1002
1189
  self.default_stream = torch.cuda.default_stream(torch.device("cuda")) # torch.cuda.current_stream()
1003
1190
  self.transfer_stream = torch.cuda.Stream()
1004
1191
  self.async_transfers = False
1192
+ self.parameters_ref = {}
1005
1193
  global last_offload_obj
1006
1194
  last_offload_obj = self
1007
1195
 
1008
1196
 
1009
- def add_module_to_blocks(self, model_id, blocks_name, submodule, prev_block_name):
1197
+ def add_module_to_blocks(self, model_id, blocks_name, submodule, prev_block_name, submodule_name):
1010
1198
 
1011
- if blocks_name is None:
1012
- pass
1013
1199
  entry_name = model_id if blocks_name is None else model_id + "/" + blocks_name
1014
1200
  if entry_name in self.blocks_of_modules:
1015
1201
  blocks_params = self.blocks_of_modules[entry_name]
@@ -1023,39 +1209,54 @@ class offload:
1023
1209
  self.prev_blocks_names[entry_name] = prev_entry_name
1024
1210
  if not prev_block_name == None:
1025
1211
  self.next_blocks_names[prev_entry_name] = entry_name
1026
-
1212
+ bef = blocks_params_size
1027
1213
  for k,p in submodule.named_parameters(recurse=False):
1214
+ param_size = 0
1215
+ ref = _get_tensor_ref(p)
1216
+ tied_param = self.parameters_ref.get(ref, None)
1028
1217
 
1029
1218
  if isinstance(p, QTensor):
1030
- blocks_params.append( (submodule, k, p, False ) )
1219
+ blocks_params.append( (submodule, k, p, False, tied_param ) )
1031
1220
 
1032
1221
  if p._qtype == qint4:
1033
1222
  if hasattr(p,"_scale_shift"):
1034
- blocks_params_size += torch.numel(p._scale_shift) * p._scale_shift.element_size()
1035
- blocks_params_size += torch.numel(p._data._data) * p._data._data.element_size()
1223
+ param_size += torch.numel(p._scale_shift) * p._scale_shift.element_size()
1224
+ param_size += torch.numel(p._data._data) * p._data._data.element_size()
1036
1225
  else:
1037
- blocks_params_size += torch.numel(p._scale) * p._scale.element_size()
1038
- blocks_params_size += torch.numel(p._shift) * p._shift.element_size()
1039
- blocks_params_size += torch.numel(p._data._data) * p._data._data.element_size()
1226
+ param_size += torch.numel(p._scale) * p._scale.element_size()
1227
+ param_size += torch.numel(p._shift) * p._shift.element_size()
1228
+ param_size += torch.numel(p._data._data) * p._data._data.element_size()
1040
1229
  else:
1041
- blocks_params_size += torch.numel(p._scale) * p._scale.element_size()
1042
- blocks_params_size += torch.numel(p._data) * p._data.element_size()
1230
+ param_size += torch.numel(p._scale) * p._scale.element_size()
1231
+ param_size += torch.numel(p._data) * p._data.element_size()
1043
1232
  else:
1044
- blocks_params.append( (submodule, k, p, False) )
1045
- blocks_params_size += torch.numel(p.data) * p.data.element_size()
1233
+ blocks_params.append( (submodule, k, p, False, tied_param) )
1234
+ param_size += torch.numel(p.data) * p.data.element_size()
1235
+
1236
+
1237
+ if tied_param == None:
1238
+ blocks_params_size += param_size
1239
+ self.parameters_ref[ref] = (submodule, k)
1046
1240
 
1047
1241
  for k, p in submodule.named_buffers(recurse=False):
1048
- blocks_params.append( (submodule, k, p, True) )
1242
+ blocks_params.append( (submodule, k, p, True, None) )
1049
1243
  blocks_params_size += p.data.nbytes
1050
1244
 
1245
+ aft = blocks_params_size
1246
+
1247
+ # if blocks_name is None:
1248
+ # print(f"Default: {model_id}/{submodule_name} : {(aft-bef)/ONE_MB:0.2f} MB")
1249
+ # pass
1250
+
1051
1251
 
1052
1252
  self.blocks_of_modules_sizes[entry_name] = blocks_params_size
1053
1253
 
1254
+
1054
1255
  return blocks_params_size
1055
1256
 
1056
1257
 
1057
1258
  def can_model_be_cotenant(self, model_id):
1058
- potential_cotenants= cotenants_map.get(model_id, None)
1259
+ potential_cotenants= self.cotenants_map.get(model_id, None)
1059
1260
  if potential_cotenants is None:
1060
1261
  return False
1061
1262
  for existing_cotenant in self.active_models_ids:
@@ -1073,20 +1274,23 @@ class offload:
1073
1274
  def cpu_to_gpu(stream_to_use, blocks_params): #, record_for_stream = None
1074
1275
  with torch.cuda.stream(stream_to_use):
1075
1276
  for param in blocks_params:
1076
- parent_module, n, p, is_buffer = param
1277
+ parent_module, n, p, is_buffer, tied_param = param
1278
+ if tied_param != None:
1279
+ tied_p = getattr( tied_param[0], tied_param[1])
1280
+ if tied_p.is_cuda:
1281
+ setattr(parent_module, n , tied_p)
1282
+ continue
1283
+
1077
1284
  q = p.to("cuda", non_blocking=True)
1078
1285
  if is_buffer:
1079
1286
  q = torch.nn.Buffer(q)
1080
1287
  else:
1081
1288
  q = torch.nn.Parameter(q , requires_grad=False)
1082
1289
  setattr(parent_module, n , q)
1083
- # if record_for_stream != None:
1084
- # if isinstance(p, QTensor):
1085
- # q._data.record_stream(record_for_stream)
1086
- # q._scale.record_stream(record_for_stream)
1087
- # else:
1088
- # p.data.record_stream(record_for_stream)
1089
1290
 
1291
+ if tied_param != None:
1292
+ setattr( tied_param[0], tied_param[1], q)
1293
+ del p, q
1090
1294
  any_past_block = False
1091
1295
 
1092
1296
  loaded_block = self.loaded_blocks[model_id]
@@ -1108,24 +1312,24 @@ class offload:
1108
1312
  first = self.prev_blocks_names[entry_name] == None or not any_past_block
1109
1313
  next_blocks_entry = self.next_blocks_names[entry_name] if entry_name in self.next_blocks_names else None
1110
1314
  if first:
1111
- cpu_to_gpu(torch.cuda.current_stream(), self.blocks_of_modules[entry_name])
1112
1315
  if self.verboseLevel >=2:
1113
1316
  if preload:
1114
1317
  print(f"Preloading model {entry_name} ({model_name}) in GPU")
1115
1318
  else:
1116
1319
  print(f"Loading model {entry_name} ({model_name}) in GPU")
1320
+ cpu_to_gpu(torch.cuda.current_stream(), self.blocks_of_modules[entry_name])
1117
1321
 
1118
1322
  torch.cuda.synchronize()
1119
1323
 
1120
1324
  if next_blocks_entry != None:
1121
- cpu_to_gpu(self.transfer_stream, self.blocks_of_modules[next_blocks_entry]) #, self.default_stream
1122
1325
  if self.verboseLevel >=2:
1123
1326
  print(f"Prefetching model {next_blocks_entry} ({model_name}) in GPU")
1327
+ cpu_to_gpu(self.transfer_stream, self.blocks_of_modules[next_blocks_entry]) #, self.default_stream
1124
1328
 
1125
1329
  else:
1126
- cpu_to_gpu(self.default_stream, self.blocks_of_modules[entry_name])
1127
1330
  if self.verboseLevel >=2:
1128
1331
  print(f"Loading model {entry_name} ({model_name}) in GPU")
1332
+ cpu_to_gpu(self.default_stream, self.blocks_of_modules[entry_name])
1129
1333
  torch.cuda.synchronize()
1130
1334
 
1131
1335
  if not preload:
@@ -1149,12 +1353,13 @@ class offload:
1149
1353
 
1150
1354
  blocks_params = self.blocks_of_modules[blocks_name]
1151
1355
  for param in blocks_params:
1152
- parent_module, n, p, is_buffer = param
1356
+ parent_module, n, p, is_buffer, _ = param
1153
1357
  if is_buffer:
1154
1358
  q = torch.nn.Buffer(p)
1155
1359
  else:
1156
1360
  q = torch.nn.Parameter(p , requires_grad=False)
1157
1361
  setattr(parent_module, n , q)
1362
+ del p, q
1158
1363
  # cl.stop()
1159
1364
  # print(f"unload time: {cl.format_time_gap()}")
1160
1365
 
@@ -1168,9 +1373,6 @@ class offload:
1168
1373
  for block_name in self.preloaded_blocks_per_model[model_id]:
1169
1374
  self.gpu_load_blocks(model_id, block_name, True)
1170
1375
 
1171
-
1172
- # torch.cuda.current_stream().synchronize()
1173
-
1174
1376
  def unload_all(self):
1175
1377
  for model_id in self.active_models_ids:
1176
1378
  self.gpu_unload_blocks(model_id, None)
@@ -1246,6 +1448,16 @@ class offload:
1246
1448
 
1247
1449
  return False
1248
1450
 
1451
+ def ensure_model_loaded(self, model_id):
1452
+ if model_id in self.active_models_ids:
1453
+ return
1454
+ # new_model_id = getattr(module, "_mm_id")
1455
+ # do not always unload existing models if it is more efficient to keep in them in the GPU
1456
+ # (e.g: small modules whose calls are text encoders)
1457
+ if not self.can_model_be_cotenant(model_id) :
1458
+ self.unload_all()
1459
+ self.gpu_load(model_id)
1460
+
1249
1461
  def hook_preload_blocks_for_compilation(self, target_module, model_id,blocks_name, context):
1250
1462
 
1251
1463
  # @torch.compiler.disable()
@@ -1259,16 +1471,27 @@ class offload:
1259
1471
  target_module.register_forward_pre_hook(preload_blocks_for_compile)
1260
1472
 
1261
1473
 
1262
- def hook_check_empty_cache_needed(self, target_module, model_id,blocks_name, previous_method, context):
1474
+ def hook_check_empty_cache_needed(self, target_module, model_id, blocks_name, previous_method, context):
1263
1475
 
1264
1476
  qint4quantization = isinstance(target_module, QModuleMixin) and target_module.weight!= None and target_module.weight.qtype == qint4
1265
1477
  if qint4quantization:
1266
1478
  pass
1267
1479
 
1268
- def check_empty_cuda_cache(module, *args, **kwargs):
1269
- # if self.ready_to_check_mem():
1270
- # self.empty_cache_if_needed()
1480
+ if hasattr(target_module, "_mm_id"):
1481
+ # no hook for a shared module with no weights (otherwise this will cause models loading / unloading for nothing)
1482
+ orig_model_id = getattr(target_module, "_mm_id")
1483
+ if self.verboseLevel >=2:
1484
+ print(f"Model '{model_id}' shares module '{target_module._get_name()}' with module(s) '{orig_model_id}' ")
1485
+ assert not self.any_param_or_buffer(target_module)
1486
+ if not isinstance(orig_model_id, list):
1487
+ orig_model_id = [orig_model_id]
1488
+ orig_model_id.append(model_id)
1489
+ setattr(target_module, "_mm_id", orig_model_id)
1490
+ target_module.forward = target_module._mm_forward
1491
+ return
1271
1492
 
1493
+ def check_empty_cuda_cache(module, *args, **kwargs):
1494
+ self.ensure_model_loaded(model_id)
1272
1495
  if blocks_name == None:
1273
1496
  if self.ready_to_check_mem():
1274
1497
  self.empty_cache_if_needed()
@@ -1279,34 +1502,18 @@ class offload:
1279
1502
 
1280
1503
  return previous_method(*args, **kwargs)
1281
1504
 
1282
-
1283
- if hasattr(target_module, "_mm_id"):
1284
- orig_model_id = getattr(target_module, "_mm_id")
1285
- if self.verboseLevel >=2:
1286
- print(f"Model '{model_id}' shares module '{target_module._get_name()}' with module '{orig_model_id}' ")
1287
- assert not self.any_param_or_buffer(target_module)
1288
-
1289
- return
1290
1505
  setattr(target_module, "_mm_id", model_id)
1506
+ setattr(target_module, "_mm_forward", previous_method)
1507
+
1291
1508
  setattr(target_module, "forward", functools.update_wrapper(functools.partial(check_empty_cuda_cache, target_module), previous_method) )
1292
1509
 
1293
1510
 
1294
1511
  def hook_change_module(self, target_module, model, model_id, module_id, previous_method):
1295
- def check_change_module(module, *args, **kwargs):
1296
- performEmptyCacheTest = False
1297
- if not model_id in self.active_models_ids:
1298
- new_model_id = getattr(module, "_mm_id")
1299
- # do not always unload existing models if it is more efficient to keep in them in the GPU
1300
- # (e.g: small modules whose calls are text encoders)
1301
- if not self.can_model_be_cotenant(new_model_id) :
1302
- self.unload_all()
1303
- performEmptyCacheTest = False
1304
- self.gpu_load(new_model_id)
1512
+
1513
+ def check_change_module(module, *args, **kwargs):
1514
+ self.ensure_model_loaded(model_id)
1305
1515
  # transfer leftovers inputs that were incorrectly created in the RAM (mostly due to some .device tests that returned incorrectly "cpu")
1306
1516
  args, kwargs = self.move_args_to_gpu(*args, **kwargs)
1307
- if performEmptyCacheTest:
1308
- self.empty_cache_if_needed()
1309
-
1310
1517
  return previous_method(*args, **kwargs)
1311
1518
 
1312
1519
  if hasattr(target_module, "_mm_id"):
@@ -1337,6 +1544,8 @@ class offload:
1337
1544
  base_size = self.blocks_of_modules_sizes[model_id]
1338
1545
  current_budget -= base_size
1339
1546
  if current_budget <= 0:
1547
+ if self.verboseLevel >=1:
1548
+ print(f"Async loading plan for model '{model_id}' : due to limited budget, beside the async shuttle only only base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
1340
1549
  return
1341
1550
 
1342
1551
  towers = []
@@ -1357,6 +1566,8 @@ class offload:
1357
1566
  total_size += tower_size
1358
1567
  current_budget -= 2 * max_floor_size
1359
1568
  if current_budget <= 0:
1569
+ if self.verboseLevel >=1:
1570
+ print(f"Async loading plan for model '{model_id}' : due to limited budget, beside the async shuttle only the base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
1360
1571
  return
1361
1572
 
1362
1573
 
@@ -1366,6 +1577,8 @@ class offload:
1366
1577
  preload_total += preload_blocks_count * max_floor_size
1367
1578
  max_blocks_fetch = max(max_floor_size, max_blocks_fetch)
1368
1579
  if preload_blocks_count <= 0:
1580
+ if self.verboseLevel >=1:
1581
+ print(f"Async loading plan for model '{model_id}' : due to limited budget, beside the async shuttle only the base model ({(base_size)/ONE_MB:0.2f} MB) will be preloaded")
1369
1582
  return
1370
1583
 
1371
1584
  nb_blocks= len(floors)
@@ -1396,11 +1609,11 @@ class offload:
1396
1609
 
1397
1610
  self.preloaded_blocks_per_model[model_id] = preloaded_blocks
1398
1611
 
1399
- if self.verboseLevel >=2:
1400
- print(f"Async loading plan for model '{model_id}' : {preload_total/ONE_MB:0.2f} MB will be preloaded ({preload_total/total_size*100:0.1f}% of recurrent layers data) with a {max_blocks_fetch/ONE_MB:0.2f} MB async shuttle")
1612
+ if self.verboseLevel >=1:
1613
+ print(f"Async loading plan for model '{model_id}' : {(preload_total+base_size)/ONE_MB:0.2f} MB will be preloaded (base size of {base_size/ONE_MB:0.2f} MB + {preload_total/total_size*100:0.1f}% of recurrent layers data) with a {max_blocks_fetch/ONE_MB:0.2f} MB async" + (" circular" if len(towers) == 1 else "") + " shuttle")
1401
1614
 
1402
1615
 
1403
- def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, verboseLevel = -1):
1616
+ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
1404
1617
  """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
1405
1618
  pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
1406
1619
  quantizeTransformer: set True by default will quantize on the fly the video / image model
@@ -1417,9 +1630,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1417
1630
  model_budgets = {}
1418
1631
 
1419
1632
  windows_os = os.name == 'nt'
1420
- global total_pinned_bytes
1421
1633
 
1422
-
1423
1634
  budget = 0
1424
1635
  if not budgets is None:
1425
1636
  if isinstance(budgets , dict):
@@ -1448,6 +1659,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1448
1659
  verboseLevel = _compute_verbose_level(verboseLevel)
1449
1660
 
1450
1661
  _welcome()
1662
+ if coTenantsMap != None:
1663
+ self.cotenants_map = coTenantsMap
1451
1664
 
1452
1665
  self.models = models
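
The new `coTenantsMap` argument replaces the former module-level `cotenants_map`, so callers can declare which models are allowed to stay loaded in VRAM together. A hedged example that mirrors the built-in default (the ids must match the keys of your pipe, and `pipe` is assumed to be already built):

```python
from mmgp import offload

offload.all(
    pipe,
    coTenantsMap={
        "text_encoder": ["vae", "text_encoder_2"],
        "text_encoder_2": ["vae", "text_encoder"],
    },
)
```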
1453
1666
 
@@ -1528,9 +1741,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1528
1741
  current_model_size += torch.numel(p.data) * p.data.element_size()
1529
1742
 
1530
1743
  for b in current_model.buffers():
1531
- if b.data.dtype == torch.float32:
1532
- # convert any left overs float32 weight to bloat16 to divide by 2 the model memory footprint
1533
- b.data = b.data.to(torch.bfloat16)
1744
+ # do not convert 32 bits float to 16 bits since buffers are few (and potential gain low) and usually they are needed for precision calculation (for instance Rope)
1534
1745
  current_model_size += torch.numel(b.data) * b.data.element_size()
1535
1746
 
1536
1747
  if modelPinned:
@@ -1538,17 +1749,39 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1538
1749
 
1539
1750
 
1540
1751
  model_budget = model_budgets[model_id] * ONE_MB if model_id in model_budgets else budget
1541
-
1752
+ if workingVRAM != None:
1753
+ model_minimumVRAM = -1
1754
+ if isinstance(workingVRAM, dict):
1755
+ if model_id in workingVRAM:
1756
+ model_minimumVRAM = workingVRAM[model_id]
1757
+ elif "*" in model_id in workingVRAM:
1758
+ model_minimumVRAM = workingVRAM["*"]
1759
+ else:
1760
+ model_minimumVRAM = workingVRAM
1761
+ if model_minimumVRAM > 0:
1762
+ new_budget = self.device_mem_capacity - model_minimumVRAM * ONE_MB
1763
+ new_budget = 1 if new_budget < 0 else new_budget
1764
+ model_budget = new_budget if model_budget == 0 or new_budget < model_budget else model_budget
1542
1765
  if model_budget > 0 and model_budget > current_model_size:
1543
1766
  model_budget = 0
1767
+ coef =0.8
1768
+ if current_model_size > coef * self.device_mem_capacity and model_budget == 0 or model_budget > coef * self.device_mem_capacity:
1769
+ if verboseLevel >= 1:
1770
+ if model_budget == 0:
1771
+ print(f"Model '{model_id}' is too large ({current_model_size/ONE_MB:0.1f} MB) to fit entirely in {coef * 100}% of the VRAM (max capacity is {coef * self.device_mem_capacity/ONE_MB}) MB)")
1772
+ else:
1773
+ print(f"Budget ({budget/ONE_MB:0.1f} MB) for Model '{model_id}' is too important so that this model can fit in the VRAM (max capacity is {self.device_mem_capacity/ONE_MB}) MB)")
1774
+ print(f"Budget allocation for this model has been consequently reduced to the 80% of max GPU Memory ({coef * self.device_mem_capacity/ONE_MB:0.1f} MB). This may not leave enough working VRAM and you will probably need to define manually a lower budget for this model.")
1775
+ model_budget = coef * self.device_mem_capacity
1776
+
1544
1777
 
1545
- model_budgets[model_id] = model_budget #/ 2 if asyncTransfers else model_budget
1778
+ model_budgets[model_id] = model_budget
1546
1779
 
1547
1780
  partialPinning = False
1548
1781
 
1549
1782
  if estimatesBytesToPin > 0 and estimatesBytesToPin >= (max_reservable_memory - total_pinned_bytes):
1550
1783
  if self.verboseLevel >=1:
1551
- print(f"Switching to partial pinning since full requirements for pinned models is {estimatesBytesToPin/ONE_MB:0.1f} MB while estimated reservable RAM is {max_reservable_memory/ONE_MB:0.1f} MB. You may increase the value of parameter 'perc_reserved_mem_max' to a value higher than {perc_reserved_mem_max:0.2f} to force full pinnning." )
1784
+ print(f"Switching to partial pinning since full requirements for pinned models is {estimatesBytesToPin/ONE_MB:0.1f} MB while estimated available reservable RAM is {(max_reservable_memory-total_pinned_bytes)/ONE_MB:0.1f} MB. You may increase the value of parameter 'perc_reserved_mem_max' to a value higher than {perc_reserved_mem_max:0.2f} to force full pinnning." )
1552
1785
  partialPinning = True
1553
1786
 
1554
1787
  # Hook forward methods of modules
@@ -1577,15 +1810,14 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1577
1810
  _pin_to_memory(current_model, model_id, partialPinning= partialPinning, verboseLevel=verboseLevel)
1578
1811
 
1579
1812
  current_budget = model_budgets[model_id]
1580
- cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq = None, None, None, -1
1813
+ cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False
1581
1814
  self.loaded_blocks[model_id] = None
1582
1815
 
1583
1816
  for submodule_name, submodule in current_model.named_modules():
1584
1817
  # create a fake 'accelerate' parameter so that the _execution_device property returns always "cuda"
1585
1818
  # (it is queried in many pipelines even if offloading is not properly implemented)
1586
- if not hasattr(submodule, "_hf_hook"):
1819
+ if not hasattr(submodule, "_hf_hook"):
1587
1820
  setattr(submodule, "_hf_hook", HfHook())
1588
-
1589
1821
  if current_budget > 0 and len(submodule_name) > 0:
1590
1822
  if cur_blocks_prefix != None:
1591
1823
  if submodule_name.startswith(cur_blocks_prefix):
@@ -1593,20 +1825,20 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1593
1825
  depth_name = submodule_name.split(".")
1594
1826
  level = depth_name[len(depth_prefix)-1]
1595
1827
  pre , num = _extract_num_from_str(level)
1596
- if num != cur_blocks_seq: #and (cur_blocks_seq == -1 or current_size > current_budget)
1828
+ if num != cur_blocks_seq and not (is_mod_seq and cur_blocks_seq>=0):
1597
1829
  prev_blocks_name = cur_blocks_name
1598
1830
  cur_blocks_name = cur_blocks_prefix + str(num)
1599
1831
  # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
1600
1832
  cur_blocks_seq = num
1601
1833
  else:
1602
- cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq = None, None, None, -1
1834
+ cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False
1603
1835
 
1604
1836
  if cur_blocks_prefix == None:
1605
1837
  pre , num = _extract_num_from_str(submodule_name)
1606
1838
  if isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)):
1607
- cur_blocks_prefix, prev_blocks_name, cur_blocks_seq = pre + ".", None, -1
1839
+ cur_blocks_prefix, prev_blocks_name, cur_blocks_seq, is_mod_seq = pre + ".", None, -1, isinstance(submodule, torch.nn.Sequential)
1608
1840
  elif num >=0:
1609
- cur_blocks_prefix, prev_blocks_name, cur_blocks_seq = pre, None, num
1841
+ cur_blocks_prefix, prev_blocks_name, cur_blocks_seq, is_mod_seq = pre, None, num, False
1610
1842
  cur_blocks_name = submodule_name
1611
1843
  # print(f"new block: {model_id}/{cur_blocks_name} - {submodule_name}")
1612
1844
 
@@ -1621,7 +1853,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1621
1853
  else:
1622
1854
  self.hook_check_empty_cache_needed(submodule, model_id, cur_blocks_name, submodule_method, context = submodule_name )
1623
1855
 
1624
- self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name)
1856
+ self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name, submodule_name)
1625
1857
 
1626
1858
  self.tune_preloading(model_id, current_budget, towers_names)
1627
1859
 
@@ -1635,9 +1867,10 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1635
1867
  elif prev_num - start_num <=1:
1636
1868
  print(f"Size of submodel '{n+ str(start_num)}': {prev_size/ONE_MB:.1f} MB")
1637
1869
  else:
1638
- print(f"Size of submodel '{n+ str(start_num) +'-'+ str(prev_num)}': {prev_size/ONE_MB:.1f} MB")
1870
+ print(f"Size of submodel '{n+ str(start_num) +'-'+ str(prev_num)}': {(prev_num-start_num+1)*prev_size/ONE_MB:.1f} MB ({prev_size/ONE_MB:.1f} MB x {prev_num-start_num+1})")
1639
1871
 
1640
1872
  for n, size in self.blocks_of_modules_sizes.items():
1873
+ size = int(size / 10000)* 10000
1641
1874
  pre, num = _extract_num_from_str(n) if "/" in n else (n, -1)
1642
1875
  if prev_pre == None :
1643
1876
  start_num = num
@@ -1709,21 +1942,21 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
1709
1942
  if profile_no == profile_type.HighRAM_HighVRAM:
1710
1943
  pinnedMemory= True
1711
1944
  budgets = None
1712
- info = "You have chosen a profile that requires at least 48 GB of RAM and 24 GB of VRAM. Some VRAM is consumed just to make the model runs faster."
1945
+ info = "You have chosen a profile that may require 48 GB of RAM and up to 24 GB of VRAM on some applications."
1713
1946
  elif profile_no == profile_type.HighRAM_LowVRAM:
1714
1947
  pinnedMemory= True
1715
1948
  budgets["*"] = 3000
1716
- info = "You have chosen a profile that requires at least 48 GB of RAM and 12 GB of VRAM. Some RAM is consumed to reduce VRAM consumption."
1949
+ info = "You have chosen a profile that may require 48 GB of RAM and up to 12 GB of VRAM on some applications."
1717
1950
  elif profile_no == profile_type.LowRAM_HighVRAM:
1718
1951
  pinnedMemory= "transformer"
1719
1952
  extraModelsToQuantize = default_extraModelsToQuantize
1720
1953
  budgets = None
1721
- info = "You have chosen a Medium speed profile that requires at least 32 GB of RAM and 24 GB of VRAM. Some VRAM is consuming just to make the model runs faster"
1954
+ info = "You have chosen a Medium speed profile that may require 32 GB of RAM and up to 24 GB of VRAM on some applications."
1722
1955
  elif profile_no == profile_type.LowRAM_LowVRAM:
1723
1956
  pinnedMemory= "transformer"
1724
1957
  extraModelsToQuantize = default_extraModelsToQuantize
1725
1958
  budgets["*"] = 3000
1726
- info = "You have chosen a profile that requires at least 32 GB of RAM and 12 GB of VRAM. Some RAM is consumed to reduce VRAM consumption. "
1959
+ info = "You have chosen a profile that usually may require 32 GB of RAM and up to 12 GB of VRAM on some applications."
1727
1960
  elif profile_no == profile_type.VerylowRAM_LowVRAM:
1728
1961
  pinnedMemory= False
1729
1962
  extraModelsToQuantize = default_extraModelsToQuantize
@@ -1731,9 +1964,10 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
1731
1964
  if "transformer" in modules:
1732
1965
  budgets["transformer"] = 400
1733
1966
  #asyncTransfers = False
1734
- info = "You have chosen the slowest profile that requires at least 24 GB of RAM and 10 GB of VRAM."
1967
+ info = "You have chosen the slowest profile that may require 24 GB of RAM and up to 10 GB of VRAM on some applications."
1735
1968
  else:
1736
1969
  raise Exception("Unknown profile")
1970
+ info += " Actual requirements may varry depending on the application or on the tuning done to the profile."
1737
1971
 
1738
1972
  if budgets != None and len(budgets) == 0:
1739
1973
  budgets = None
mmgp/safetensors2.py CHANGED
@@ -146,7 +146,7 @@ def _read_safetensors_header(path, file):
  return catalog, metadata, length_of_header + 8
 
 
- def torch_write_file(sd, file_path, quantization_map = None, config = None):
+ def torch_write_file(sd, file_path, quantization_map = None, config = None, extra_meta = None):
  from collections import OrderedDict
  sf_sd = OrderedDict()
 
@@ -189,6 +189,14 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):
  if not config is None:
  metadata["config_base64"] = base64.b64encode(json.dumps(config, ensure_ascii=False).encode('utf8')).decode('utf8')
 
+ if not extra_meta is None:
+ for n , m in extra_meta.items():
+ if isinstance(m, str):
+ metadata[n] = m
+ else:
+ metadata[n + "_base64"] = base64.b64encode(json.dumps(m, ensure_ascii=False).encode('utf8')).decode('utf8')
+
+
  if len(metadata) > 0:
  sf_sd["__metadata__"] = metadata
 
@@ -443,6 +451,4 @@ try:
  transformers.modeling_utils.safe_open = safe_open
  transformers.modeling_utils.safe_load_file = torch_load_file
  except:
- pass
-
-
+ pass
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: mmgp
- Version: 3.1.4.post15
+ Version: 3.1.4.post1519
  Summary: Memory Management for the GPU Poor
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
  License: GNU GENERAL PUBLIC LICENSE
@@ -17,7 +17,7 @@ Requires-Dist: peft
 
 
  <p align="center">
- <H2>Memory Management 3.1.4 for the GPU Poor by DeepBeepMeep</H2>
+ <H2>Memory Management 3.1.4-151 for the GPU Poor by DeepBeepMeep</H2>
  </p>
 
 
@@ -44,21 +44,23 @@ Each profile may use a combination of the following:
 
  ## Sample applications that use mmgp
  It is recommended to have a look at these applications to see how mmgp was implemented in each of them:
- - Hunyuan3D-2GP: https://github.com/deepbeepmeep/Hunyuan3D-2GP\
+ - Hunyuan3D-2GP: https://github.com/deepbeepmeep/Hunyuan3D-2GP :\
  A great image to 3D and text to 3D tool by the Tencent team. Thanks to mmgp it can run with less than 6 GB of VRAM
 
- - HuanyuanVideoGP: https://github.com/deepbeepmeep/HunyuanVideoGP\
+ - HuanyuanVideoGP: https://github.com/deepbeepmeep/HunyuanVideoGP :\
  One of the best open source Text to Video generator
 
- - FluxFillGP: https://github.com/deepbeepmeep/FluxFillGP\
+ - FluxFillGP: https://github.com/deepbeepmeep/FluxFillGP :\
  One of the best inpainting / outpainting tools based on Flux that can run with less than 12 GB of VRAM.
 
- - Cosmos1GP: https://github.com/deepbeepmeep/Cosmos1GP\
+ - Cosmos1GP: https://github.com/deepbeepmeep/Cosmos1GP :\
  This application include two models: a text to world generator and a image / video to world (probably the best open source image to video generator).
 
- - OminiControlGP: https://github.com/deepbeepmeep/OminiControlGP\
+ - OminiControlGP: https://github.com/deepbeepmeep/OminiControlGP :\
  A Flux derived application very powerful that can be used to transfer an object of your choice in a prompted scene. With mmgp you can run it with only 6 GB of VRAM.
 
+ - YuE GP: https://github.com/deepbeepmeep/YuEGP :\
+ A great song generator (instruments + singer's voice) based on prompted Lyrics and a genre description. Thanks to mmgp you can run it with less than 10 GB of VRAM without waiting forever.
 
  ## Installation
  First you need to install the module in your current project with:
@@ -88,7 +90,7 @@ You can choose between 5 profiles depending on your hardware:
88
90
  - LowRAM_LowVRAM (4): at least 32 GB of RAM and 12 GB of VRAM : if you have little VRAM or want to generate longer videos / more images
89
91
  - VerylowRAM_LowVRAM (5): at least 24 GB of RAM and 10 GB of VRAM : if you don't have much hardware, it won't be fast but it may work
90
92
 
91
- Profile 2 (High RAM) and 4 (Low RAM)are the most recommended profiles since they are versatile (support for long videos for a slight performance cost).\
93
+ Profiles 2 (High RAM) and 4 (Low RAM) are the most recommended since they are versatile (they support long videos at a slight performance cost).\
92
94
  If you use a Flux-derived application, profiles 1 and 3 will offer much faster generation times.
93
95
  In any case, a safe approach is to start from profile 5 (the default profile) and then go down progressively to profile 4 and then to profile 2, as long as the app remains responsive and doesn't trigger any out-of-memory error (see the sketch below).
94
96
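To make the profile choice above concrete, here is a minimal sketch of applying a profile to a pipeline. It assumes a diffusers-style pipeline; the model id, prompt and output file are placeholders, and `offload.profile` / `profile_type` are the mmgp entry points referred to later in this README, so adapt the details to your own application.

```python
# Minimal sketch: apply an mmgp profile to a diffusers-style pipeline.
# The model id, prompt and file name are placeholders, not recommendations.
import torch
from diffusers import FluxPipeline
from mmgp import offload, profile_type

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
)

# Start from the safest profile (5) and only move to profile 4, then 2,
# while the app stays responsive and no out-of-memory error is raised.
offload.profile(pipe, profile_type.VerylowRAM_LowVRAM)

image = pipe("a cat wearing a space suit", num_inference_steps=20).images[0]
image.save("cat.png")
```

Switching to profile 4 or 2 is then a one-line change of the `profile_type` argument.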
 
@@ -114,11 +116,13 @@ For example:
114
116
  - pinnedMemory: Boolean (for all models) or list of model ids to pin to RAM. Every model pinned to RAM will load much faster (up to 2 times faster), but this requires more RAM
115
117
  - quantizeTransformer: boolean, True by default. The 'transformer' model in the pipe usually contains the video or image generator and is quantized on the fly to 8 bits by default. If you want to save disk space and reduce loading time, you may want to load a prequantized model directly. If you don't want to quantize the image generator, set the option *quantizeTransformer* to *False* to turn off on-the-fly quantization.
116
118
  - extraModelsToQuantize: list of additional model ids to quantize on the fly. If the corresponding model is already quantized, this option is ignored.
117
- - budgets: either a number in mega bytes (for all models, if 0 unlimited budget) or a dictionary that maps model ids to mega bytes : define the budget in VRAM (in fact the real number is 1.5 this number or 2.5 if asyncTransfers are also enabled) that is allocated in VRAM for each model.
119
+ - budgets: either a number in megabytes (for all models; 0 means no limit) or a dictionary that maps model ids to megabytes: defines the approximate VRAM budget allocated to each model. Try not to allocate all the available VRAM so that the rest can be used to process the data. To define the default value in the dictionary, you may add an entry named "*".
118
120
  The smaller this number, the more VRAM is left for image data / longer videos, but also the slower the generation, since there will be lots of loading / unloading between RAM and VRAM. If a model is too big to fit in its budget, it will be broken down into multiple parts that are loaded / unloaded in turn. The speed of a low budget can be increased (up to 2 times) by turning on the pinnedMemory and asyncTransfers options.
121
+ - workingVRAM: either a number in megabytes or a dictionary that maps model ids to a number in megabytes corresponding to the minimum amount of VRAM that should be left free for the data processed by the model. This number prevails if it conflicts with a budget that is set too high for the same model.
119
122
  - asyncTransfers: boolean; load the next model part to the GPU while the current part is being processed. This requires twice the budget if any is defined. It may increase speed by 20% (mostly visible on fast modern GPUs).
120
123
  - verboseLevel: number between 0 and 2 (1 by default); provides various levels of feedback on the different processes
121
124
  - compile: list of model ids to compile; may accelerate generation up to 2x depending on the type of GPU. It makes sense to compile only the model that is used frequently, such as the "transformer" model in the case of video or image generation. Compilation requires Triton, which is available out of the box on Linux or WSL but needs to be installed manually on Windows: https://github.com/woct0rdho/triton-windows
125
+ - coTenantsMap: a dictionary that maps a model id to a list of other models with which it accepts to share the VRAM at the same time. This is useful to avoid inefficient loading / unloading when the processing of two models is interleaved. For instance, with *coTenantsMap = { "text_encoder_2": ["text_encoder"] }*, loading *text_encoder_2* won't unload *text_encoder*. Please note that the reverse is not true, as these maps are by design not symmetrical, to allow tailored workflows. If you also need *text_encoder* not to unload *text_encoder_2* when the latter is already loaded, use *coTenantsMap = { "text_encoder_2": ["text_encoder"], "text_encoder": ["text_encoder_2"] }* (a combined usage sketch follows this list).
122
126
 
123
127
  If you are short on RAM and plan to work with quantized models, it is recommended to load prequantized models directly rather than using on-the-fly quantization: it will be faster and consume slightly less RAM.
124
128
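The sketch below pulls the options above together. The keyword names come from the list itself and `offload.all` is the entry point mentioned in the next section; the model ids and megabyte values are illustrative placeholders to adapt to your own pipeline and hardware, so treat this as a sketch rather than a definitive configuration.

```python
# Minimal sketch combining the options listed above. "pipe" is assumed to be
# the pipeline prepared as in the previous sketch, with models named
# "transformer", "text_encoder", "text_encoder_2", "vae", ...
# All numbers are illustrative; adapt them to your hardware.
from mmgp import offload

offload.all(
    pipe,
    pinnedMemory=["transformer"],               # pin the big model to RAM for faster reloads
    quantizeTransformer=True,                   # quantize the "transformer" on the fly to 8 bits
    extraModelsToQuantize=["text_encoder_2"],   # also quantize this model on the fly
    budgets={"*": 3000, "transformer": 8000},   # per-model VRAM budgets in MB ("*" = default)
    workingVRAM={"transformer": 4000},          # keep at least 4 GB of VRAM for the processed data
    asyncTransfers=True,                        # prefetch the next model part during processing
    compile=["transformer"],                    # requires Triton
    coTenantsMap={"text_encoder_2": ["text_encoder"]},  # both text encoders may share VRAM
    verboseLevel=1,
)
```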
 
@@ -126,11 +130,14 @@ If you are short on RAM and plan to work with quantized models, it is recommende
126
130
 
127
131
  The module includes several tools to package a light version of your favorite video / image generator:
128
132
  - *extract_models(string prefix, obj to explore)*\
129
- This tool will try to detect for you models that are embedded in a pipeline or in some custom class. It will save you time by building a pipe dictionary required par *offload.all* or "offload.profile*. The prefix correponds to the text that will appear before the name of each model in the dictionary.
133
+ This tool will try to detect for you the models that are embedded in a pipeline or in some custom class. It will save you time by building the pipe dictionary required by *offload.all* or *offload.profile*. The prefix corresponds to the text that will appear before the name of each model in the dictionary.
130
134
 
131
- - *load_loras_into_model(model, lora_path, lora_multi)*\
135
+ - *load_loras_into_model(model, lora_path, lora_multi, activate_all_loras = True)*\
132
136
  Load into a model a list of Loras described by a list of paths *lora_path* and a list of *weight coefficients*.
133
- The Lora file must be in the *diffusers* format. This function works also on non diffusers models. However if there is already an official Lora support for a model it is recommended to use the official diffusers functions.
137
+ The Lora files must be in the *diffusers* format. This function also works on non-diffusers models. However, if there is already official Lora support for a model, it is recommended to use the official diffusers functions. By default all the loaded Loras are activated, or they can be activated later using *activate_loras*.
138
+
139
+ - *activate_loras(model, lora_nos, lora_multi = None)*\
140
+ Activate the Loras whose numbers appear in the *lora_nos* list. Every Lora that is not in this list and that was previously activated will be deactivated.
134
141
 
135
142
  - *save_model(model, file_path, do_quantize = False, quantizationType = qint8 )*\
136
143
  Save the tensors of a model already loaded in memory in safetensors format (much faster to reload). You can save them in a quantized format (the default qint8 quantization is recommended). A short usage sketch follows.
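A minimal sketch of how these helpers fit together, with signatures taken from this README. It assumes `pipe` is the pipeline prepared in the earlier sketches; the Lora paths, weight coefficients and output file name are placeholders, and `qint8` is assumed to be the optimum.quanto type used elsewhere by mmgp.

```python
# Minimal sketch of the helpers above. "pipe" is assumed to be the pipeline
# prepared earlier; the Lora paths, weights and output file are placeholders.
from mmgp import offload
from optimum.quanto import qint8

# Load two Loras (diffusers format) with their weight coefficients;
# with activate_all_loras=True (the default) they are active immediately.
offload.load_loras_into_model(
    pipe.transformer,
    ["loras/style_a.safetensors", "loras/style_b.safetensors"],
    [1.0, 0.7],
    activate_all_loras=True,
)

# Later, keep only Lora no. 0 active; the other previously activated Lora
# is deactivated.
offload.activate_loras(pipe.transformer, [0], [1.0])

# Save a quantized copy of the model for much faster reloads.
offload.save_model(
    pipe.transformer,
    "transformer_quanto_int8.safetensors",
    do_quantize=True,
    quantizationType=qint8,
)
```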
@@ -0,0 +1,9 @@
1
+ __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
3
+ mmgp/offload.py,sha256=dfmplgTm19DPJ8AKqOf8McaY2f63cz3Dqim_-Hvpcqo,86202
4
+ mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
5
+ mmgp-3.1.4.post1519.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
6
+ mmgp-3.1.4.post1519.dist-info/METADATA,sha256=x0gpYN-KkoW7aNwLNK-3IOV1B7pljr3eW9y5_8w7W6c,15947
7
+ mmgp-3.1.4.post1519.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
8
+ mmgp-3.1.4.post1519.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
9
+ mmgp-3.1.4.post1519.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
3
- mmgp/offload.py,sha256=DEGTt5RPoLx9JK-d7Ld_B_rIuQrmhblQJw3V5CL9Lo8,74519
4
- mmgp/safetensors2.py,sha256=OkJAvENfWeb-PL0FcxS1-eYeHLbemTaNXYvNxURrzIs,16154
5
- mmgp-3.1.4.post15.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
6
- mmgp-3.1.4.post15.dist-info/METADATA,sha256=IMmhK6xAu0A96mLlpby9V2H-K8RYIqRpORaBngvtC0U,14278
7
- mmgp-3.1.4.post15.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
8
- mmgp-3.1.4.post15.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
9
- mmgp-3.1.4.post15.dist-info/RECORD,,