mmgp 3.0.1__py3-none-any.whl → 3.0.9__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package and reflects the changes between them as they appear in their public registry.

Potentially problematic release: this version of mmgp has been flagged as potentially problematic.

mmgp/offload.py CHANGED
@@ -61,10 +61,22 @@ import sys
61
61
  import os
62
62
  import json
63
63
  import psutil
64
+ try:
65
+ from diffusers.utils.peft_utils import set_weights_and_activate_adapters, get_peft_kwargs
66
+ except:
67
+ set_weights_and_activate_adapters = None
68
+ get_peft_kwargs = None
69
+ pass
70
+ try:
71
+ from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict
72
+ except:
73
+ inject_adapter_in_model = None
74
+ pass
75
+
64
76
  from mmgp import safetensors2
65
77
  from mmgp import profile_type
66
78
 
67
- from optimum.quanto import freeze, qfloat8, qint8, quantize, QModuleMixin, QTensor, WeightQBytesTensor, quantize_module
79
+ from optimum.quanto import freeze, qfloat8, qint4 , qint8, quantize, QModuleMixin, QTensor, quantize_module
68
80
 
69
81
 
70
82
 
@@ -127,6 +139,9 @@ def move_tensors(obj, device):
127
139
  return _list
128
140
  else:
129
141
  raise TypeError("Tensor or list / dict of tensors expected")
142
+ def _get_module_name(v):
143
+ return v.__module__.lower()
144
+
130
145
 
131
146
  def _compute_verbose_level(level):
132
147
  if level <0:
@@ -260,9 +275,16 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma
260
275
  if include:
261
276
  params_list = params_list + list(sub_module.buffers(recurse=False)) + list(sub_module.parameters(recurse=False))
262
277
 
278
+ # print(f"num params to pin {model_id}: {len(params_list)}")
263
279
  for p in params_list:
264
280
  if isinstance(p, QTensor):
265
- length = torch.numel(p._data) * p._data.element_size() + torch.numel(p._scale) * p._scale.element_size()
281
+ if p._qtype == qint4:
282
+ if hasattr(p,"_scale_shift"):
283
+ length = torch.numel(p._data._data) * p._data._data.element_size() + torch.numel(p._scale_shift) * p._scale_shift.element_size()
284
+ else:
285
+ length = torch.numel(p._data._data) * p._data._data.element_size() + torch.numel(p._scale) * p._scale.element_size() + torch.numel(p._shift) * p._shift.element_size()
286
+ else:
287
+ length = torch.numel(p._data) * p._data.element_size() + torch.numel(p._scale) * p._scale.element_size()
266
288
  else:
267
289
  length = torch.numel(p.data) * p.data.element_size()
268
290
 
@@ -305,10 +327,22 @@ def _pin_to_memory(model, model_id, partialPinning = False, perc_reserved_mem_ma
305
327
  if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
306
328
  current_big_tensor = big_tensors[big_tensor_no]
307
329
  if isinstance(p, QTensor):
308
- length1 = torch.numel(p._data) * p._data.element_size()
309
- p._data = _move_to_pinned_tensor(p._data, current_big_tensor, offset, length1)
310
- length2 = torch.numel(p._scale) * p._scale.element_size()
311
- p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
330
+ if p._qtype == qint4:
331
+ length1 = torch.numel(p._data._data) * p._data._data.element_size()
332
+ p._data._data = _move_to_pinned_tensor(p._data._data, current_big_tensor, offset, length1)
333
+ if hasattr(p,"_scale_shift"):
334
+ length2 = torch.numel(p._scale_shift) * p._scale_shift.element_size()
335
+ p._scale_shift = _move_to_pinned_tensor(p._scale_shift, current_big_tensor, offset + length1, length2)
336
+ else:
337
+ length2 = torch.numel(p._scale) * p._scale.element_size()
338
+ p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
339
+ length3 = torch.numel(p._shift) * p._shift.element_size()
340
+ p._shift = _move_to_pinned_tensor(p._shift, current_big_tensor, offset + length1 + length2, length3)
341
+ else:
342
+ length1 = torch.numel(p._data) * p._data.element_size()
343
+ p._data = _move_to_pinned_tensor(p._data, current_big_tensor, offset, length1)
344
+ length2 = torch.numel(p._scale) * p._scale.element_size()
345
+ p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
312
346
  else:
313
347
  length = torch.numel(p.data) * p.data.element_size()
314
348
  p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
@@ -338,98 +372,6 @@ def _welcome():
338
372
  print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.0) by DeepBeepMeep ************{ENDC}{UNBOLD}")
339
373
 
340
374
 
341
- # def _pin_to_memory_sd(model, sd, model_id, partialPinning = False, perc_reserved_mem_max = 0, verboseLevel = 1):
342
- # if verboseLevel>=1 :
343
- # if partialPinning:
344
- # print(f"Partial pinning to reserved RAM of data of file '{model_id}' while loading it")
345
- # else:
346
- # print(f"Pinning data to reserved RAM of file '{model_id}' while loading it")
347
-
348
- # max_reservable_memory = _get_max_reservable_memory(perc_reserved_mem_max)
349
- # if partialPinning:
350
- # towers_names, _ = _detect_main_towers(model)
351
- # towers_names = [n +"." for n in towers_names]
352
-
353
- # BIG_TENSOR_MAX_SIZE = 2**28 # 256 MB
354
- # current_big_tensor_size = 0
355
- # big_tensor_no = 0
356
- # big_tensors_sizes = []
357
- # tensor_map_indexes = []
358
- # total_tensor_bytes = 0
359
-
360
- # for k,t in sd.items():
361
- # include = True
362
- # # if isinstance(p, QTensor):
363
- # # length = torch.numel(p._data) * p._data.element_size() + torch.numel(p._scale) * p._scale.element_size()
364
- # # else:
365
- # # length = torch.numel(p.data) * p.data.element_size()
366
- # length = torch.numel(t) * t.data.element_size()
367
-
368
- # if partialPinning:
369
- # include = any(k.startswith(pre) for pre in towers_names) if partialPinning else True
370
-
371
- # if include:
372
- # if current_big_tensor_size + length > BIG_TENSOR_MAX_SIZE:
373
- # big_tensors_sizes.append(current_big_tensor_size)
374
- # current_big_tensor_size = 0
375
- # big_tensor_no += 1
376
- # tensor_map_indexes.append((big_tensor_no, current_big_tensor_size, length ))
377
- # current_big_tensor_size += length
378
- # else:
379
- # tensor_map_indexes.append((-1, 0, 0 ))
380
- # total_tensor_bytes += length
381
-
382
- # big_tensors_sizes.append(current_big_tensor_size)
383
-
384
- # big_tensors = []
385
- # last_big_tensor = 0
386
- # total = 0
387
-
388
-
389
- # for size in big_tensors_sizes:
390
- # try:
391
- # currrent_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True)
392
- # big_tensors.append(currrent_big_tensor)
393
- # except:
394
- # print(f"Unable to pin more tensors for this model as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
395
- # break
396
-
397
- # last_big_tensor += 1
398
- # total += size
399
-
400
-
401
- # tensor_no = 0
402
- # for k,t in sd.items():
403
- # big_tensor_no, offset, length = tensor_map_indexes[tensor_no]
404
- # if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
405
- # current_big_tensor = big_tensors[big_tensor_no]
406
- # # if isinstance(p, QTensor):
407
- # # length1 = torch.numel(p._data) * p._data.element_size()
408
- # # p._data = _move_to_pinned_tensor(p._data, current_big_tensor, offset, length1)
409
- # # length2 = torch.numel(p._scale) * p._scale.element_size()
410
- # # p._scale = _move_to_pinned_tensor(p._scale, current_big_tensor, offset + length1, length2)
411
- # # else:
412
- # # length = torch.numel(p.data) * p.data.element_size()
413
- # # p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
414
- # length = torch.numel(t) * t.data.element_size()
415
- # t = _move_to_pinned_tensor(t, current_big_tensor, offset, length)
416
- # sd[k] = t
417
- # tensor_no += 1
418
-
419
- # global total_pinned_bytes
420
- # total_pinned_bytes += total
421
-
422
- # if verboseLevel >=1:
423
- # if total_tensor_bytes == total:
424
- # print(f"The whole model was pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
425
- # else:
426
- # print(f"{total/ONE_MB:.2f} MB were pinned to reserved RAM out of {total_tensor_bytes/ONE_MB:.2f} MB")
427
-
428
- # model._already_pinned = True
429
-
430
-
431
- # return
432
-
433
375
  def _quantize_dirty_hack(model):
434
376
  # dirty hack: add a hook on state_dict() to return a fake non quantized state_dict if called by Lora Diffusers initialization functions
435
377
  setattr( model, "_real_state_dict", model.state_dict)
@@ -535,10 +477,14 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
535
477
  prev_blocks_prefix = None
536
478
 
537
479
  if hasattr(model_to_quantize, "_quanto_map"):
480
+ for k, entry in model_to_quantize._quanto_map.items():
481
+ weights = entry["weights"]
482
+ print(f"Model '{model_id}' is already quantized to format '{weights}'")
483
+ return False
538
484
  print(f"Model '{model_id}' is already quantized")
539
485
  return False
540
-
541
- print(f"Quantization of model '{model_id}' started")
486
+
487
+ print(f"Quantization of model '{model_id}' started to format '{weights}'")
542
488
 
543
489
  for submodule_name, submodule in model_to_quantize.named_modules():
544
490
  if isinstance(submodule, QModuleMixin):
@@ -593,38 +539,42 @@ def _quantize(model_to_quantize, weights=qint8, verboseLevel = 1, threshold = 10
593
539
  if verboseLevel >=2:
594
540
  print(f"Total Excluded {total_excluded/ONE_MB:.1f} MB oF {total_size/ONE_MB:.1f} that is {perc_excluded*100:.2f}%")
595
541
  if perc_excluded >= 0.10:
596
- print(f"Too many many modules are excluded, there is something wrong with the selection, switch back to full quantization.")
542
+ print(f"Too many modules are excluded, there is something wrong with the selection, switch back to full quantization.")
597
543
  exclude_list = None
598
544
 
599
545
 
600
546
  #quantize(model_to_quantize,weights, exclude= exclude_list)
601
- pass
547
+
602
548
  for name, m in model_to_quantize.named_modules():
603
549
  if exclude_list is None or not any( name == module_name for module_name in exclude_list):
604
550
  _quantize_submodule(model_to_quantize, name, m, weights=weights, activations=None, optimizer=None)
605
551
 
606
- # force read non quantized parameters so that their lazy tensors and corresponding mmap are released
607
- # otherwise we may end up to keep in memory both the quantized and the non quantize model
608
-
609
-
610
- for name, m in model_to_quantize.named_modules():
552
+ # force to read non quantized parameters so that their lazy tensors and corresponding mmap are released
553
+ # otherwise we may end up keeping in memory both the quantized and the non quantize model
554
+ for m in model_to_quantize.modules():
611
555
  # do not read quantized weights (detected them directly or behind an adapter)
612
- if isinstance(m, QModuleMixin) or hasattr(m, "base_layer") and isinstance(m.base_layer, QModuleMixin):
613
- pass
556
+ if isinstance(m, QModuleMixin) or hasattr(m, "base_layer") and isinstance(m.base_layer, QModuleMixin):
557
+ if hasattr(m, "bias") and m.bias is not None:
558
+ m.bias.data = m.bias.data + 0
614
559
  else:
615
- if hasattr(m, "weight") and m.weight is not None:
616
- m.weight.data = m.weight.data + 0
560
+ for n, p in m.named_parameters(recurse = False):
561
+ data = getattr(m, n)
562
+ setattr(m,n, torch.nn.Parameter(data + 0 ) )
617
563
 
618
- if hasattr(m, "bias") and m.bias is not None:
619
- m.bias.data = m.bias.data + 0
564
+ for b in m.buffers(recurse = False):
565
+ b.data = b.data + 0
620
566
 
621
567
 
568
+
622
569
  freeze(model_to_quantize)
623
570
  torch.cuda.empty_cache()
624
571
  gc.collect()
625
572
  quantization_map = _quantization_map(model_to_quantize)
626
573
  model_to_quantize._quanto_map = quantization_map
627
574
 
575
+ if hasattr(model_to_quantize, "_already_pinned"):
576
+ delattr(model_to_quantize, "_already_pinned")
577
+
628
578
  _quantize_dirty_hack(model_to_quantize)
629
579
 
630
580
  print(f"Quantization of model '{model_id}' done")
@@ -683,15 +633,25 @@ class offload:
683
633
 
684
634
  for k,p in submodule.named_parameters(recurse=False):
685
635
  if isinstance(p, QTensor):
686
- blocks_params.append( (submodule, k, p._data, p._scale) )
687
- blocks_params_size += p._data.nbytes
688
- blocks_params_size += p._scale.nbytes
636
+ blocks_params.append( (submodule, k, p ) )
637
+
638
+ if p._qtype == qint4:
639
+ if hasattr(p,"_scale_shift"):
640
+ blocks_params_size += torch.numel(p._scale_shift) * p._scale_shift.element_size()
641
+ blocks_params_size += torch.numel(p._data._data) * p._data._data.element_size()
642
+ else:
643
+ blocks_params_size += torch.numel(p._scale) * p._scale.element_size()
644
+ blocks_params_size += torch.numel(p._shift) * p._shift.element_size()
645
+ blocks_params_size += torch.numel(p._data._data) * p._data._data.element_size()
646
+ else:
647
+ blocks_params_size += torch.numel(p._scale) * p._scale.element_size()
648
+ blocks_params_size += torch.numel(p._data) * p._data.element_size()
689
649
  else:
690
- blocks_params.append( (submodule, k, p.data, None) )
691
- blocks_params_size += p.data.nbytes
650
+ blocks_params.append( (submodule, k, p ) )
651
+ blocks_params_size += torch.numel(p.data) * p.data.element_size()
692
652
 
693
653
  for k, p in submodule.named_buffers(recurse=False):
694
- blocks_params.append( (submodule, k, p.data, None) )
654
+ blocks_params.append( (submodule, k, p) )
695
655
  blocks_params_size += p.data.nbytes
696
656
 
697
657
 
@@ -709,34 +669,28 @@ class offload:
709
669
  return False
710
670
  return True
711
671
 
712
- def gpu_load_blocks(self, model_id, blocks_name, async_load = False):
672
+
673
+ def gpu_load_blocks(self, model_id, blocks_name):
713
674
  # cl = clock.start()
714
675
 
715
676
  if blocks_name != None:
716
677
  self.loaded_blocks[model_id] = blocks_name
717
678
 
718
679
  entry_name = model_id if blocks_name is None else model_id + "/" + blocks_name
719
-
720
- def cpu_to_gpu(stream_to_use, blocks_params, record_for_stream = None):
680
+
681
+ def cpu_to_gpu(stream_to_use, blocks_params): #, record_for_stream = None
721
682
  with torch.cuda.stream(stream_to_use):
722
683
  for param in blocks_params:
723
- parent_module, n, data, scale = param
724
- p = getattr(parent_module, n)
725
- if isinstance(p, QTensor):
726
- q = WeightQBytesTensor.create(p.qtype, p.axis, p.size(), p.stride(), data.cuda(non_blocking=True), scale.cuda(non_blocking=True), activation_qtype=p.activation_qtype, requires_grad=p.requires_grad )
727
- #q = p.to("cuda", non_blocking=True)
728
- q = torch.nn.Parameter(q , requires_grad=False)
729
- setattr(parent_module, n , q)
730
- del p
731
- else:
732
- p.data = p.data.cuda(non_blocking=True)
733
-
734
- if record_for_stream != None:
735
- if isinstance(p, QTensor):
736
- q._data.record_stream(record_for_stream)
737
- q._scale.record_stream(record_for_stream)
738
- else:
739
- p.data.record_stream(record_for_stream)
684
+ parent_module, n, p = param
685
+ q = p.to("cuda", non_blocking=True)
686
+ q = torch.nn.Parameter(q , requires_grad=False)
687
+ setattr(parent_module, n , q)
688
+ # if record_for_stream != None:
689
+ # if isinstance(p, QTensor):
690
+ # q._data.record_stream(record_for_stream)
691
+ # q._scale.record_stream(record_for_stream)
692
+ # else:
693
+ # p.data.record_stream(record_for_stream)
740
694
 
741
695
 
742
696
  if self.verboseLevel >=2:
@@ -775,19 +729,10 @@ class offload:
775
729
  print(f"Unloading model {blocks_name} ({model_name}) from GPU")
776
730
 
777
731
  blocks_params = self.blocks_of_modules[blocks_name]
778
-
779
732
  for param in blocks_params:
780
- parent_module, n, data, scale = param
781
- p = getattr(parent_module, n)
782
- if isinstance(p, QTensor):
783
- # need to change the parameter directly from the module as it can't be swapped in place due to a memory leak in the pytorch compiler
784
- q = WeightQBytesTensor.create(p.qtype, p.axis, p.size(), p.stride(), data, scale, activation_qtype=p.activation_qtype, requires_grad=p.requires_grad )
785
- q = torch.nn.Parameter(q , requires_grad=False)
786
- setattr(parent_module, n , q)
787
- del p
788
- else:
789
- p.data = data
790
-
733
+ parent_module, n, p = param
734
+ q = torch.nn.Parameter(p , requires_grad=False)
735
+ setattr(parent_module, n , q)
791
736
  # cl.stop()
792
737
  # print(f"unload time: {cl.format_time_gap()}")
793
738
 
@@ -823,8 +768,8 @@ class offload:
823
768
  if torch.is_tensor(arg):
824
769
  if arg.dtype == torch.float32:
825
770
  arg = arg.to(torch.bfloat16).cuda(non_blocking=True)
826
- else:
827
- arg = arg.cuda(non_blocking=True)
771
+ elif not arg.is_cuda:
772
+ arg = arg.cuda(non_blocking=True)
828
773
  new_args.append(arg)
829
774
 
830
775
  for k in kwargs:
@@ -832,7 +777,7 @@ class offload:
832
777
  if torch.is_tensor(arg):
833
778
  if arg.dtype == torch.float32:
834
779
  arg = arg.to(torch.bfloat16).cuda(non_blocking=True)
835
- else:
780
+ elif not arg.is_cuda:
836
781
  arg = arg.cuda(non_blocking=True)
837
782
  new_kwargs[k]= arg
838
783
 
@@ -896,6 +841,10 @@ class offload:
896
841
 
897
842
  def hook_check_empty_cache_needed(self, target_module, model_id,blocks_name, previous_method, context):
898
843
 
844
+ qint4quantization = isinstance(target_module, QModuleMixin) and target_module.weight!= None and target_module.weight.qtype == qint4
845
+ if qint4quantization:
846
+ pass
847
+
899
848
  def check_empty_cuda_cache(module, *args, **kwargs):
900
849
  # if self.ready_to_check_mem():
901
850
  # self.empty_cache_if_needed()
@@ -911,6 +860,8 @@ class offload:
911
860
  self.empty_cache_if_needed()
912
861
  self.loaded_blocks[model_id] = blocks_name
913
862
  self.gpu_load_blocks(model_id, blocks_name)
863
+ if qint4quantization:
864
+ args, kwargs = self.move_args_to_gpu(*args, **kwargs)
914
865
 
915
866
  return previous_method(*args, **kwargs)
916
867
 
@@ -969,11 +920,111 @@ class offload:
969
920
  # for module in parent_module.components.items():
970
921
  # self.unhook_module(module)
971
922
 
972
- def fast_load_transformers_model(model_path: str, do_quantize = False, quantization_type = qint8, pinToMemory = False, partialPinning = False, verboseLevel = -1):
923
+ import torch
924
+
925
+
926
+
927
+
928
+ def load_loras_into_model(model, lora_path, lora_multi = None, verboseLevel = -1):
929
+ verboseLevel = _compute_verbose_level(verboseLevel)
930
+
931
+ if inject_adapter_in_model == None or set_weights_and_activate_adapters == None or get_peft_kwargs == None:
932
+ raise Exception("Unable to load Lora, missing 'peft' and / or 'diffusers' modules")
933
+
934
+ if not isinstance(lora_path, list):
935
+ lora_path = [lora_path]
936
+
937
+ if lora_multi is None:
938
+ lora_multi = [1. for _ in lora_path]
939
+
940
+ for i, path in enumerate(lora_path):
941
+ adapter_name = str(i)
942
+
943
+ state_dict = safetensors2.torch_load_file(path)
944
+
945
+ keys = list(state_dict.keys())
946
+ if len(keys) == 0:
947
+ raise Exception(f"Empty Lora '{path}'")
948
+
949
+
950
+ network_alphas = {}
951
+ for k in keys:
952
+ if "alpha" in k:
953
+ alpha_value = state_dict.pop(k)
954
+ if not ( (torch.is_tensor(alpha_value) and torch.is_floating_point(alpha_value)) or isinstance(
955
+ alpha_value, float
956
+ )):
957
+ network_alphas[k] = torch.tensor( float(alpha_value.item() ) )
958
+
959
+ pos = keys[0].find(".")
960
+ prefix = keys[0][0:pos]
961
+ if not any( prefix.startswith(some_prefix) for some_prefix in ["diffusion_model", "transformer"]):
962
+ msg = f"No compatible weight was found in Lora file '{path}'. Please check that it is compatible with the Diffusers format."
963
+ raise Exception(msg)
964
+
965
+ transformer = model
966
+
967
+ transformer_keys = [k for k in keys if k.startswith(prefix)]
968
+ state_dict = {
969
+ k.replace(f"{prefix}.", ""): v for k, v in state_dict.items() if k in transformer_keys
970
+ }
971
+
972
+ sd_keys = state_dict.keys()
973
+ if len(sd_keys) == 0:
974
+ print(f"No compatible weight was found in Lora file '{path}'. Please check that it is compatible with the Diffusers format.")
975
+ return
976
+
977
+ # is_correct_format = all("lora" in key for key in state_dict.keys())
978
+
979
+
980
+
981
+
982
+ # check with first key if is not in peft format
983
+ # first_key = next(iter(state_dict.keys()))
984
+ # if "lora_A" not in first_key:
985
+ # state_dict = convert_unet_state_dict_to_peft(state_dict)
986
+
987
+ if adapter_name in getattr(transformer, "peft_config", {}):
988
+ raise ValueError(
989
+ f"Adapter name {adapter_name} already in use in the transformer - please select a new adapter name."
990
+ )
991
+
992
+ rank = {}
993
+ for key, val in state_dict.items():
994
+ if "lora_B" in key:
995
+ rank[key] = val.shape[1]
996
+
997
+ if network_alphas is not None and len(network_alphas) >= 1:
998
+ alpha_keys = [k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix]
999
+ network_alphas = {k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys}
1000
+
1001
+ lora_config_kwargs = get_peft_kwargs(rank, network_alpha_dict=network_alphas, peft_state_dict=state_dict)
1002
+
1003
+ lora_config = LoraConfig(**lora_config_kwargs)
1004
+ peft_kwargs = {}
1005
+ peft_kwargs["low_cpu_mem_usage"] = True
1006
+ inject_adapter_in_model(lora_config, model, adapter_name=adapter_name, **peft_kwargs)
1007
+
1008
+ incompatible_keys = set_peft_model_state_dict(transformer, state_dict, adapter_name, **peft_kwargs)
1009
+
1010
+ warn_msg = ""
1011
+ if incompatible_keys is not None:
1012
+ # Check only for unexpected keys.
1013
+ unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
1014
+ if unexpected_keys:
1015
+ pass
1016
+ if verboseLevel >=1:
1017
+ print(f"Lora '{path}' was loaded in model '{_get_module_name(model)}'")
1018
+ set_weights_and_activate_adapters(model,[ str(i) for i in range(len(lora_multi))], lora_multi)
1019
+
1020
+
1021
+ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, verboseLevel = -1):
973
1022
  """
974
1023
  quick version of .LoadfromPretrained of the transformers library
975
1024
  used to build a model and load the corresponding weights (quantized or not)
976
1025
  """
1026
+
1027
+
977
1028
  import os.path
978
1029
  from accelerate import init_empty_weights
979
1030
 
@@ -1037,13 +1088,13 @@ def fast_load_transformers_model(model_path: str, do_quantize = False, quantizat
1037
1088
 
1038
1089
  model._config = transformer_config
1039
1090
 
1040
- load_model_data(model,model_path, do_quantize = do_quantize, quantization_type = quantization_type, pinToMemory= pinToMemory, partialPinning= partialPinning, verboseLevel=verboseLevel )
1091
+ load_model_data(model,model_path, do_quantize = do_quantize, quantizationType = quantizationType, pinToMemory= pinToMemory, partialPinning= partialPinning, verboseLevel=verboseLevel )
1041
1092
 
1042
1093
  return model
1043
1094
 
1044
1095
 
1045
1096
 
1046
- def load_model_data(model, file_path: str, do_quantize = False, quantization_type = qint8, pinToMemory = False, partialPinning = False, verboseLevel = -1):
1097
+ def load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToMemory = False, partialPinning = False, verboseLevel = -1):
1047
1098
  """
1048
1099
  Load a model, detect if it has been previously quantized using quanto and do the extra setup if necessary
1049
1100
  """
@@ -1064,14 +1115,6 @@ def load_model_data(model, file_path: str, do_quantize = False, quantization_typ
1064
1115
  state_dict = state_dict["module"]
1065
1116
  else:
1066
1117
  state_dict, metadata = _safetensors_load_file(file_path)
1067
-
1068
-
1069
- # if pinToMemory:
1070
- # _pin_to_memory_sd(model,state_dict, file_path, partialPinning = partialPinning, perc_reserved_mem_max = perc_reserved_mem_max, verboseLevel = verboseLevel)
1071
-
1072
- # with safetensors2.safe_open(file_path) as f:
1073
- # metadata = f.metadata()
1074
-
1075
1118
 
1076
1119
  if metadata is None:
1077
1120
  quantization_map = None
@@ -1106,7 +1149,7 @@ def load_model_data(model, file_path: str, do_quantize = False, quantization_typ
1106
1149
 
1107
1150
  if do_quantize:
1108
1151
  if quantization_map is None:
1109
- if _quantize(model, quantization_type, verboseLevel=verboseLevel, model_id=file_path):
1152
+ if _quantize(model, quantizationType, verboseLevel=verboseLevel, model_id=file_path):
1110
1153
  quantization_map = model._quanto_map
1111
1154
  else:
1112
1155
  if verboseLevel >=1:
@@ -1117,7 +1160,7 @@ def load_model_data(model, file_path: str, do_quantize = False, quantization_typ
1117
1160
 
1118
1161
  return
1119
1162
 
1120
- def save_model(model, file_path, do_quantize = False, quantization_type = qint8, verboseLevel = -1 ):
1163
+ def save_model(model, file_path, do_quantize = False, quantizationType = qint8, verboseLevel = -1 ):
1121
1164
  """save the weights of a model and quantize them if requested
1122
1165
  These weights can be loaded again using 'load_model_data'
1123
1166
  """
@@ -1144,7 +1187,7 @@ def save_model(model, file_path, do_quantize = False, quantization_type = qint8,
1144
1187
  config= json.loads(text)
1145
1188
 
1146
1189
  if do_quantize:
1147
- _quantize(model, weights=quantization_type, model_id=file_path)
1190
+ _quantize(model, weights=quantizationType, model_id=file_path)
1148
1191
 
1149
1192
  quantization_map = getattr(model, "_quanto_map", None)
1150
1193
 
@@ -1157,7 +1200,7 @@ def save_model(model, file_path, do_quantize = False, quantization_type = qint8,
1157
1200
 
1158
1201
 
1159
1202
 
1160
- def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True, extraModelsToQuantize = None, budgets= 0, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, verboseLevel = -1):
1203
+ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, verboseLevel = -1):
1161
1204
  """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
1162
1205
  pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
1163
1206
  quantizeTransformer: set True by default will quantize on the fly the video / image model
@@ -1235,6 +1278,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1235
1278
  self.anyCompiledModule = compileAllModels or len(modelsToCompile)>0
1236
1279
  if self.anyCompiledModule:
1237
1280
  torch._dynamo.config.cache_size_limit = 10000
1281
+ torch.compiler.reset()
1282
+
1238
1283
  # torch._logging.set_logs(recompiles=True)
1239
1284
  # torch._inductor.config.realize_opcount_threshold = 100 # workaround bug "AssertionError: increase TRITON_MAX_BLOCK['X'] to 4096."
1240
1285
 
@@ -1249,19 +1294,31 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1249
1294
 
1250
1295
  # if the model has just been quantized so there is no need to quantize it again
1251
1296
  if model_id in models_to_quantize:
1252
- _quantize(current_model, weights=qint8, verboseLevel = self.verboseLevel, model_id=model_id)
1297
+ _quantize(current_model, weights=quantizationType, verboseLevel = self.verboseLevel, model_id=model_id)
1253
1298
 
1254
1299
  modelPinned = (pinAllModels or model_id in modelsToPin) and not hasattr(current_model,"_already_pinned")
1255
1300
 
1256
- current_model_size = 0
1257
- # load all the remaining unread lazy safetensors in RAM to free open cache files
1258
- for p in current_model.parameters():
1301
+ current_model_size = 0
1302
+
1303
+ for n, p in current_model.named_parameters():
1304
+ p.requires_grad = False
1305
+ p = p.detach()
1259
1306
  if isinstance(p, QTensor):
1260
1307
  # # fix quanto bug (seems to have been fixed)
1261
1308
  # if not modelPinned and p._scale.dtype == torch.float32:
1262
1309
  # p._scale = p._scale.to(torch.bfloat16)
1263
- current_model_size += torch.numel(p._scale) * p._scale.element_size()
1264
- current_model_size += torch.numel(p._data) * p._data.element_size()
1310
+ if p._qtype == qint4:
1311
+ if hasattr(p,"_scale_shift"):
1312
+ current_model_size += torch.numel(p._scale_shift) * p._scale_shift.element_size()
1313
+ else:
1314
+ current_model_size += torch.numel(p._scale) * p._shift.element_size() + torch.numel(p._scale) * p._shift.element_size()
1315
+
1316
+ current_model_size += torch.numel(p._data._data) * p._data._data.element_size()
1317
+
1318
+ else:
1319
+ current_model_size += torch.numel(p._scale) * p._scale.element_size()
1320
+ current_model_size += torch.numel(p._data) * p._data.element_size()
1321
+
1265
1322
  else:
1266
1323
  if p.data.dtype == torch.float32:
1267
1324
  # convert any left overs float32 weight to bloat16 to divide by 2 the model memory footprint
@@ -1269,7 +1326,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1269
1326
  current_model_size += torch.numel(p.data) * p.data.element_size()
1270
1327
 
1271
1328
  for b in current_model.buffers():
1272
- if b.data.dtype == torch.float32:
1329
+ if b.data.dtype == torch.float32:
1273
1330
  # convert any left overs float32 weight to bloat16 to divide by 2 the model memory footprint
1274
1331
  b.data = b.data.to(torch.bfloat16)
1275
1332
  current_model_size += torch.numel(b.data) * b.data.element_size()
@@ -1305,7 +1362,6 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1305
1362
  print(f"Potential iterative blocks found in model '{model_id}':{towers_names}")
1306
1363
  # compile main iterative modules stacks ("towers")
1307
1364
  if compileAllModels or model_id in modelsToCompile :
1308
- #torch.compiler.reset()
1309
1365
  if self.verboseLevel>=1:
1310
1366
  print(f"Pytorch compilation of model '{model_id}' is scheduled.")
1311
1367
  for tower in towers_modules:
@@ -1313,6 +1369,13 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1313
1369
  submodel.forward= torch.compile(submodel.forward, backend= "inductor", mode="default" ) # , fullgraph= True, mode= "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs",
1314
1370
  #dynamic=True,
1315
1371
 
1372
+ if pinAllModels or model_id in modelsToPin:
1373
+ if hasattr(current_model,"_already_pinned"):
1374
+ if self.verboseLevel >=1:
1375
+ print(f"Model '{model_id}' already pinned to reserved memory")
1376
+ else:
1377
+ _pin_to_memory(current_model, model_id, partialPinning= partialPinning, perc_reserved_mem_max=perc_reserved_mem_max, verboseLevel=verboseLevel)
1378
+
1316
1379
  for submodule_name, submodule in current_model.named_modules():
1317
1380
  # create a fake 'accelerate' parameter so that the _execution_device property returns always "cuda"
1318
1381
  # (it is queried in many pipelines even if offloading is not properly implemented)
@@ -1358,12 +1421,6 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, quantizeTransformer = Tru
1358
1421
 
1359
1422
  current_size = self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name)
1360
1423
 
1361
- if pinAllModels or model_id in modelsToPin:
1362
- if hasattr(current_model,"_already_pinned"):
1363
- if self.verboseLevel >=1:
1364
- print(f"Model '{model_id}' already pinned to reserved memory")
1365
- else:
1366
- _pin_to_memory(current_model, model_id, partialPinning= partialPinning, perc_reserved_mem_max=perc_reserved_mem_max, verboseLevel=verboseLevel)
1367
1424
 
1368
1425
 
1369
1426
 
@@ -1402,7 +1459,7 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
1402
1459
  modules= modules.components
1403
1460
 
1404
1461
  modules = {k: _remove_model_wrapper(v) for k, v in modules.items() if isinstance(v, torch.nn.Module)}
1405
- module_names = {k: v.__module__.lower() for k, v in modules.items() }
1462
+ module_names = {k: _get_module_name(v) for k, v in modules.items() }
1406
1463
 
1407
1464
  default_extraModelsToQuantize = []
1408
1465
  quantizeTransformer = True
@@ -1422,6 +1479,7 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
1422
1479
 
1423
1480
  default_budgets = { "transformer" : 600 , "text_encoder": 3000, "text_encoder_2": 3000 }
1424
1481
  extraModelsToQuantize = None
1482
+ asyncTransfers = True
1425
1483
 
1426
1484
  if profile_no == profile_type.HighRAM_HighVRAM:
1427
1485
  pinnedMemory= True
@@ -1439,7 +1497,6 @@ def profile(pipe_or_dict_of_modules, profile_no: profile_type = profile_type.Ve
1439
1497
  pinnedMemory= "transformer"
1440
1498
  extraModelsToQuantize = default_extraModelsToQuantize
1441
1499
  budgets=default_budgets
1442
- asyncTransfers = True
1443
1500
  info = "You have chosen a profile that requires at least 32 GB of RAM and 12 GB of VRAM. Some RAM is consumed to reduce VRAM consumption. "
1444
1501
  elif profile_no == profile_type.VerylowRAM_LowVRAM:
1445
1502
  pinnedMemory= False
mmgp/safetensors2.py CHANGED
@@ -155,7 +155,7 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):
155
155
  torch.bool : 'BOOL' , torch.float64 : 'F64' , torch.float32 : 'F32' , torch.float16 : 'F16', torch.float8_e5m2 : "F8_E5M2", torch.float8_e4m3fn: "F8_E4M3" }
156
156
  pos = 0
157
157
  i = 0
158
- mx = 1000000
158
+ mx = 100000
159
159
  for k , t in sd.items():
160
160
  entry = {}
161
161
  dtypestr= map[t.dtype]
@@ -186,8 +186,6 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):
186
186
 
187
187
  length_of_header_bytes = struct.pack('<Q', size_header)
188
188
 
189
- empty_tensor = b'\x80\x3f'
190
-
191
189
  with open(file_path, "wb") as writer:
192
190
  bytes_written = writer.write(length_of_header_bytes)
193
191
  bytes_written = writer.write(header_bytes)
@@ -195,12 +193,20 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):
195
193
  i = 0
196
194
  for k , t in sd.items():
197
195
  size = torch.numel(t) * t.element_size()
198
- if len(t.shape) == 0:
199
- bytes_written = writer.write(empty_tensor)
200
- else:
201
- buffer = t.view(torch.uint8).numpy().tobytes()
202
- bytes_written = writer.write(buffer)
203
- assert bytes_written == size
196
+ if size != 0:
197
+ if len(t.shape) == 0:
198
+ dtype = t.dtype
199
+ # convert in a friendly format, scalars types not supported by numpy
200
+ if dtype == torch.bfloat16:
201
+ t = t.view(torch.uint16)
202
+ elif dtype == torch.float8_e5m2 or dtype == torch.float8_e4m3fn:
203
+ t = t.view(torch.uint8)
204
+ buffer = t.numpy().tobytes()
205
+ else:
206
+ buffer = t.view(torch.uint8).numpy().tobytes()
207
+ bytes_written = writer.write(buffer)
208
+ assert bytes_written == size
209
+
204
210
  i+=1
205
211
  if i==mx:
206
212
  break
@@ -208,7 +214,7 @@ def torch_write_file(sd, file_path, quantization_map = None, config = None):
208
214
  class SafeTensorFile:
209
215
  """Main class for accessing safetensors files that provides memory-efficient access"""
210
216
 
211
- def __init__(self, file_path, metadata, catalog, skip_bytes):
217
+ def __init__(self, file_path, metadata, catalog, skip_bytes, lazy_loading = True):
212
218
  self._file_path = file_path
213
219
  self._metadata = metadata
214
220
  self._catalog = catalog
@@ -216,20 +222,30 @@ class SafeTensorFile:
216
222
  self._keys = None
217
223
  self.sd = None
218
224
  self.mtracker = None
225
+ self.lazy_loading = lazy_loading
219
226
 
220
227
  @classmethod
221
- def load_metadata(cls, file_path):
228
+ def load_metadata(cls, file_path, lazy_loading = True):
222
229
  with open(file_path, 'rb') as f:
223
230
  catalog, metadata, skip_bytes = _read_safetensors_header(file_path, f)
224
231
 
225
- return cls(file_path, metadata, catalog, skip_bytes)
232
+ return cls(file_path, metadata, catalog, skip_bytes, lazy_loading)
226
233
 
227
- def init_tensors(self):
234
+ def init_tensors(self, lazyTensors = True):
228
235
  if self.sd is None:
229
- self.sd = self.create_tensors()
236
+ self.lazy_loading = lazyTensors
237
+ if lazyTensors:
238
+ self.sd = self.create_tensors_with_mmap()
239
+ else:
240
+ self.sd = self.create_tensors_without_mmap()
241
+ # else:
242
+ # if not self.lazy_loading and lazyTensors:
243
+ # raise Exception("Every tensor should be either lazy loaded or not lazy loaded")
244
+
230
245
  return self.sd
231
246
 
232
- def create_tensors(self):
247
+
248
+ def create_tensors_with_mmap(self):
233
249
 
234
250
  self.mtracker = MmapTracker(self._file_path)
235
251
  import mmap
@@ -282,7 +298,12 @@ class SafeTensorFile:
282
298
  map_idx = next(iter_tensor_no)
283
299
  offset = current_pos - maps[map_idx][1]
284
300
  if len(shape) == 0:
285
- t = torch.ones((), dtype=dtype, device="cpu")
301
+ if length == 0:
302
+ t = torch.empty(0, dtype=dtype)
303
+ else:
304
+ # don't waste a memory view for a scalar
305
+ t = torch.frombuffer(bytearray(maps[map_idx][0][offset:offset + length]), dtype=torch.uint8)
306
+ t = t.view(dtype)
286
307
  else:
287
308
  mv = memoryview(maps[map_idx][0])[offset:offset + length]
288
309
  t = torch.frombuffer(mv, dtype=dtype)
@@ -293,8 +314,33 @@ class SafeTensorFile:
293
314
 
294
315
  return sd
295
316
 
317
+ def create_tensors_without_mmap(self):
318
+ sd = OrderedDict()
319
+
320
+ with open(self._file_path, 'rb') as f:
321
+ f.seek(self._skip_bytes, 0)
322
+ for k,v in self._catalog.items():
323
+ dtypestr = v["dtype"]
324
+ dtype= _map_to_dtype[dtypestr]
325
+ shape = v["shape"]
326
+ data_offsets = v["data_offsets"]
327
+ length = data_offsets[1]-data_offsets[0]
328
+ buffer = f.read(length)
329
+ if len(shape) == 0:
330
+ if length == 0:
331
+ t = torch.empty(0, dtype=dtype)
332
+ else:
333
+ t = torch.frombuffer(bytearray(buffer), dtype=torch.uint8)
334
+ t = t.view(dtype)
335
+ else:
336
+ t = torch.frombuffer(bytearray(buffer), dtype=dtype)
337
+ t = torch.reshape(t, shape)
338
+ sd[k] = t
339
+ return sd
340
+
296
341
  def get_tensor(self, name: str) -> torch.tensor:
297
342
  """Get a tensor by name"""
343
+ # To do : switch to a JIT tensor creation per tensor
298
344
  self.init_tensors()
299
345
  return self.sd[name]
300
346
 
@@ -310,7 +356,7 @@ class SafeTensorFile:
310
356
 
311
357
  def tensors(self) -> Dict[str, torch.tensor]:
312
358
  """Get dictionary of all tensors"""
313
- self.init_tensors()
359
+ self.init_tensors(self.lazy_loading)
314
360
  return self.sd
315
361
 
316
362
  def metadata(self) -> Optional[Dict[str, str]]:
@@ -319,7 +365,7 @@ class SafeTensorFile:
319
365
 
320
366
  def __len__(self) -> int:
321
367
  """Get number of tensors"""
322
- self.init_tensors()
368
+ self.init_tensors(self.lazy_loading)
323
369
  return len(self.keys())
324
370
 
325
371
  def __contains__(self, key: str) -> bool:
@@ -337,10 +383,9 @@ class SafeTensorFile:
337
383
  class _SafeTensorLoader:
338
384
  """Context manager for loading SafeTensorFile"""
339
385
 
340
- def __init__(self, filename: str):
386
+ def __init__(self, filename: str ):
341
387
  self.filename = Path(filename)
342
388
  self.sft = None
343
-
344
389
  if not self.filename.exists():
345
390
  raise FileNotFoundError(f"File not found: {filename}")
346
391
 
@@ -367,7 +412,6 @@ class _SafeTensorLoader:
367
412
 
368
413
  def safe_open(filename: str, framework: str = "pt",device = "cpu") -> _SafeTensorLoader:
369
414
  if device != "cpu" or framework !="pt":
370
- pass
371
415
  return _old_safe_open(filename =filename, framework=framework, device=device)
372
416
  return _SafeTensorLoader(filename)
373
417
 
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: mmgp
3
- Version: 3.0.1
3
+ Version: 3.0.9
4
4
  Summary: Memory Management for the GPU Poor
5
5
  Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
6
6
  License: GNU GENERAL PUBLIC LICENSE
@@ -13,10 +13,11 @@ Requires-Dist: optimum-quanto
13
13
  Requires-Dist: accelerate
14
14
  Requires-Dist: safetensors
15
15
  Requires-Dist: psutil
16
+ Requires-Dist: peft
16
17
 
17
18
 
18
19
  <p align="center">
19
- <H2>Memory Management 3.0 for the GPU Poor by DeepBeepMeep</H2>
20
+ <H2>Memory Management 3.0.9 for the GPU Poor by DeepBeepMeep</H2>
20
21
  </p>
21
22
 
22
23
 
@@ -38,8 +39,9 @@ Each profile may use a combination of the following:
38
39
  - Ability to pin models to reserved RAM to accelerate transfers to VRAM
39
40
  - Async transfers to VRAM to avoid a pause when loading a new slice of a model
40
41
  - Automated on the fly quantization or ability to load pre quantized models
41
- - support for pytorch compilation on Linux and WSL (not supported so far on pure Windows).
42
-
42
+ - Pretrained Lora support with low RAM requirements
43
+ - Support for pytorch compilation on Linux and WSL (supported on pure Windows but requires a complex Triton Installation).
44
+ -
43
45
  ## Installation
44
46
  First you need to install the module in your current project with:
45
47
  ```shell
@@ -69,7 +71,8 @@ You can choose between 5 profiles depending on your hardware:
69
71
  - VerylowRAM_LowVRAM (5): at least 24 GB of RAM and 10 GB of VRAM : if you don't have much it won't be fast but maybe it will work
70
72
 
71
73
  Profiles 2 (High RAM) and 4 (Low RAM) are the most recommended profiles since they are versatile (support for long videos at a slight performance cost).\
72
- However, a safe approach is to start from profile 5 (default profile) and then go down progressively to profile 4 and then to profile 2 as long as the app remains responsive or doesn't trigger any out of memory error.
74
+ If you use a Flux-derived application, profiles 1 and 3 will offer much faster generation times.
75
+ In any case, a safe approach is to start from profile 5 (default profile) and then go down progressively to profile 4 and then to profile 2 as long as the app remains responsive or doesn't trigger any out of memory error.
73
76
 
74
77
  By default the 'transformer' will be quantized to 8 bits for all profiles. If you don't want that you may specify the optional parameter *quantizeTransformer = False*.
75
78
 
@@ -80,6 +83,9 @@ Every parameter set automatically by a profile can be overridden with one or mul
80
83
  ```
81
84
  If you want to know which parameters are set by one specific profile you can use the parameter *verboseLevel=2*
82
85
 
86
+ **It is highly recommended to put *from mmgp import offload, profile_type* at the top of your main Python file (that is, as the first import) so that all existing safetensors calls are redirected to mmgp (see the sketch below).**
87
+
88
+
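A minimal sketch of this setup, assuming a diffusers pipeline named `pipe` (built from a placeholder model id) and that profile parameters such as *quantizeTransformer* and *verboseLevel* can be overridden by passing them as keyword arguments to *offload.profile*:

```python
from mmgp import offload, profile_type  # first import, so safetensors calls get redirected

import torch
from diffusers import DiffusionPipeline

# "some/model" is a placeholder for whatever pipeline your app actually uses
pipe = DiffusionPipeline.from_pretrained("some/model", torch_dtype=torch.bfloat16)

# start with the most conservative profile, then move up while no out-of-memory error occurs
offload.profile(
    pipe,
    profile_type.VerylowRAM_LowVRAM,
    quantizeTransformer=False,  # keep the transformer unquantized (it is quantized by default)
    verboseLevel=2,             # print which parameters the chosen profile sets
)
```

If the app stays responsive you can then retry with profile 4 and finally profile 2, as described above.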
83
89
  ## Alternatively you may want to create your own profile with specific parameters:
84
90
 
85
91
  For example:
@@ -101,20 +107,22 @@ If you are short on RAM and plan to work with quantized models, it is recommende
101
107
  ## Going further
102
108
 
103
109
  The module includes several tools to package a light version of your favorite video / image generator:
104
- - *save_model(model, file_path, do_quantize = False, quantization_type = qint8 )*\
110
+ - *save_model(model, file_path, do_quantize = False, quantizationType = qint8 )*\
105
111
  Save tensors of a model already loaded in memory in a safetensor format (much faster to reload). You can save it in a quantized format (default qint8 quantization recommended).
106
112
  The resulting safetensor file will contain extra fields in its metadata such as the quantization map and its configuration, so you will be able to move the file around without files such as *config.json* or *file_map.json*.
107
113
  You will need *load_model_data* or *fast_load_transformers_model* to read the file again. You may also load it using the default *safetensors* library; however, you will need to provide in the same directory any complementary files that are usually requested (for instance *config.json*).
108
114
 
109
- - *load_model_data(model, file_path: str, do_quantize = False, quantization_type = qint8, pinToRAM = False, partialPin = False)*\
115
+ - *load_model_data(model, file_path: str, do_quantize = False, quantizationType = qint8, pinToRAM = False, partialPin = False)*\
110
116
  Load into RAM the tensor data of a model that has already been initialized with no data. Quantized models previously saved with *save_model* are detected and handled. A model can also be quantized on the fly while being loaded, and it can be pinned to RAM while it is loaded, which is more RAM efficient than pinning tensors later using *offload.all* or *offload.profile*.
111
117
 
112
- - *fast_load_transformers_model(model_path: str, do_quantize = False, quantization_type = qint8, pinToRAM = False, partialPin = False)*\
118
+ - *fast_load_transformers_model(model_path: str, do_quantize = False, quantizationType = qint8, pinToRAM = False, partialPin = False)*\
113
119
  Initialize (build the model hierarchy in memory) and fast load the corresponding tensors of a 'transformers' or 'diffusers' library model.
114
120
  The advantage over the original *from_pretrained* method is that a full model can fit into a single file with a filename of your choosing (therefore you can have multiple 'transformers' versions of the same model in the same directory), and prequantized models are processed in a transparent way.
115
121
  Last but not least, you can also pin the whole model, or the most important part of it (partialPin = True), to RAM on the fly in a more efficient way (faster and requiring less RAM) than if you did it through *offload.all* or *offload.profile*.
116
122
 
117
-
123
+ - *load_loras_into_model(model, lora_path, lora_multi)*\
124
+ Load into a model a list of Loras described by a list of paths *lora_path* and a list of weight coefficients *lora_multi* (see the sketch after this list).
125
+ The Lora files must be in the *diffusers* format. This function also works on non-diffusers models. However, if there is already official Lora support for a model, it is recommended to use the official diffusers functions.
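A minimal end-to-end sketch of how these tools fit together, assuming a hypothetical pipeline object `pipe` with a `transformer` sub-model, illustrative file names, and the keyword names (`pinToMemory`, `partialPinning`) used by the `offload.py` signatures shown earlier in this diff:

```python
from mmgp import offload, profile_type

# 1) one-off: save an already loaded sub-model as a single quantized safetensors file
offload.save_model(pipe.transformer, "transformer_quanto_int8.safetensors", do_quantize=True)

# 2) later runs: rebuild the model and reload the prequantized weights directly,
#    optionally pinning them to reserved RAM while loading
transformer = offload.fast_load_transformers_model(
    "transformer_quanto_int8.safetensors",
    pinToMemory=True,
    partialPinning=False,
)

# 3) optionally apply one or more diffusers-format Loras with their multipliers
offload.load_loras_into_model(transformer, ["my_lora.safetensors"], lora_multi=[0.8])

# 4) plug the sub-model back into the pipeline and hook a profile as shown earlier
pipe.transformer = transformer
offload.profile(pipe, profile_type.VerylowRAM_LowVRAM)
```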
118
126
 
119
127
  The typical workflow will be:
120
128
  1) temporarily insert the *save_model* function just after a model has been fully loaded to save a copy of the model / quantized model.
@@ -0,0 +1,9 @@
1
+ __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
3
+ mmgp/offload.py,sha256=bYjpbAHbVX2Vf3nBJXYEc1u9B5JIYvJxv4eMS8L5Tco,64209
4
+ mmgp/safetensors2.py,sha256=G6uzvpGauJLPEvN74MX1ib4YK0E4wzNMyrZO5wOX2k0,15812
5
+ mmgp-3.0.9.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
6
+ mmgp-3.0.9.dist-info/METADATA,sha256=0vNt8lNKfMkyBrFUN8pOfkDRf8i_jmndgH2ePIekmdg,12570
7
+ mmgp-3.0.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
8
+ mmgp-3.0.9.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
9
+ mmgp-3.0.9.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.6.0)
2
+ Generator: setuptools (75.8.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,9 +0,0 @@
1
- __init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
3
- mmgp/offload.py,sha256=T9RBAibAyAnKV-8AiYmop_UOGl_N1l5EJo5ucCZfxK8,61611
4
- mmgp/safetensors2.py,sha256=CSv8HdrjURUzBazpaBDU1WNwUL1lhzpCyzG0GWygbGE,13602
5
- mmgp-3.0.1.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
6
- mmgp-3.0.1.dist-info/METADATA,sha256=uSsBc5pBaYBL4Ek3TR99J9hP7AQQlwnnUM_JQlkNwbE,11765
7
- mmgp-3.0.1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
8
- mmgp-3.0.1.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
9
- mmgp-3.0.1.dist-info/RECORD,,