mmgp 3.2.6__py3-none-any.whl → 3.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mmgp might be problematic.
- mmgp/offload.py +450 -267
- {mmgp-3.2.6.dist-info → mmgp-3.2.8.dist-info}/METADATA +2 -3
- mmgp-3.2.8.dist-info/RECORD +9 -0
- {mmgp-3.2.6.dist-info → mmgp-3.2.8.dist-info}/WHEEL +1 -1
- mmgp-3.2.6.dist-info/RECORD +0 -9
- {mmgp-3.2.6.dist-info → mmgp-3.2.8.dist-info}/LICENSE.md +0 -0
- {mmgp-3.2.6.dist-info → mmgp-3.2.8.dist-info}/top_level.txt +0 -0
mmgp/offload.py
CHANGED
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.2.6 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.2.8 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -63,25 +63,11 @@ import json
 import psutil
 from accelerate import init_empty_weights
 
-try:
-
-    from peft.tuners.tuners_utils import BaseTuner
-
-    from diffusers.utils.peft_utils import set_weights_and_activate_adapters, get_peft_kwargs
-except:
-    set_weights_and_activate_adapters = None
-    get_peft_kwargs = None
-    pass
-try:
-    from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict
-except:
-    inject_adapter_in_model = None
-    pass
 
 from mmgp import safetensors2
 from mmgp import profile_type
 
-from optimum.quanto import freeze, qfloat8, qint4 , qint8, quantize, QModuleMixin, QTensor, quantize_module, register_qmodule
+from optimum.quanto import freeze, qfloat8, qint4 , qint8, quantize, QModuleMixin, QLinear, QTensor, quantize_module, register_qmodule
 
 # support for Embedding module quantization that is not supported by default by quanto
 @register_qmodule(torch.nn.Embedding)
@@ -302,13 +288,115 @@ def _get_tensor_ref(p):
     return p.data_ptr()
 
 
-def _pin_to_memory(model, model_id, partialPinning = False, pinnedLora = True, verboseLevel = 1):
+# BIG_TENSOR_MAX_SIZE = 2**28 # 256 MB
+BIG_TENSOR_MAX_SIZE = 2**27 # 128 MB
+
+def _extract_tie_weights_from_sd(sd , sd_name, verboseLevel =1):
+    tied_weights = {}
+    tied_weights_count = 0
+    tied_weights_total = 0
+    tied_weights_last = None
+    ref_cache = {}
+
+    for n, p in sd.items():
+        ref = _get_tensor_ref(p)
+        match = ref_cache.get(ref, None)
+        if match != None:
+            match_name, match_size = match
+            tied_weights_count += 1
+            tied_weights_total += match_size
+            if verboseLevel >=1:
+                tied_weights_last = f"{match_name} <-> {n}"
+            tied_weights[n] = match_name
+        else:
+            length = torch.numel(p.data) * p.data.element_size()
+            ref_cache[ref] = (n, length)
+
+    if verboseLevel >=1 and tied_weights_count > 0:
+        if tied_weights_count == 1:
+            print(f"Tied weights of {tied_weights_total/ONE_MB:0.2f} MB detected: {tied_weights_last}")
+        else:
+            print(f"Found {tied_weights_count} tied weights for a total of {tied_weights_total/ONE_MB:0.2f} MB, last : {tied_weights_last}")
+
+def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
+    current_big_tensor_size = 0
+    big_tensor_no = 0
+    big_tensors_sizes = []
+    tensor_map_indexes = []
+    total_tensor_bytes = 0
+
+    for n, p in sd.items():
+        if tied_weights == None or not n in tied_weights :
+            length = torch.numel(p.data) * p.data.element_size()
+
+            if current_big_tensor_size + length > gig_tensor_size :
+                big_tensors_sizes.append(current_big_tensor_size)
+                current_big_tensor_size = 0
+                big_tensor_no += 1
+
+            itemsize = p.data.dtype.itemsize
+            if current_big_tensor_size % itemsize:
+                current_big_tensor_size += itemsize - current_big_tensor_size % itemsize
+            tensor_map_indexes.append((big_tensor_no, current_big_tensor_size, length ))
+            current_big_tensor_size += length
+
+            total_tensor_bytes += length
+
+    big_tensors_sizes.append(current_big_tensor_size)
+
+    big_tensors = []
+    last_big_tensor = 0
+    total = 0
+
+    for size in big_tensors_sizes:
+        try:
+            current_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True, device="cpu")
+            big_tensors.append(current_big_tensor)
+        except:
+            print(f"Unable to pin more tensors for '{sd_name}' as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
+            break
+
+        last_big_tensor += 1
+        total += size
+
+
+    tensor_no = 0
+    # prev_big_tensor = 0
+    q_name = None
+    for n, p in sd.items():
+        if tied_weights != None:
+            q_name = tied_weights.get(n,None)
+        if q_name != None:
+            q = sd[q_name]
+            p.data = q.data
+            assert p.data.is_pinned()
+            q = None
+        else:
+            big_tensor_no, offset, length = tensor_map_indexes[tensor_no]
+
+            if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
+                current_big_tensor = big_tensors[big_tensor_no]
+                length = torch.numel(p.data) * p.data.element_size()
+                q = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
+                torch.utils.swap_tensors(p, q)
+                del q
+            tensor_no += 1
+        del p
+    # global total_pinned_bytes
+    # total_pinned_bytes += total
+    gc.collect()
+
+    if verboseLevel >=1:
+        print(f"'{sd_name}' was pinned entirely to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
+
+    return
+
+
+def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = True, gig_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
     if partialPinning:
         towers_names, _ = _detect_main_towers(model)
 
 
-    # BIG_TENSOR_MAX_SIZE = 2**28 # 256 MB
-    BIG_TENSOR_MAX_SIZE = 2**27 # 128 MB
     current_big_tensor_size = 0
     big_tensor_no = 0
     big_tensors_sizes = []
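
The new _pin_sd_to_memory helper packs the tensors of a state dict into a handful of large page-locked buffers instead of pinning each tensor on its own, which keeps the number of pinned allocations small and makes later CPU-to-GPU transfers stream-friendly. The snippet below is a minimal standalone sketch of that packing idea using only public PyTorch calls; the function name pack_pinned, the block size and the sample tensors are illustrative and not part of mmgp.

import torch

def pack_pinned(tensors, block_size=2**27):
    # Copy CPU tensors into big pinned uint8 blocks and return typed views on them.
    views, block, offset = [], None, 0
    for t in tensors:
        nbytes = t.numel() * t.element_size()
        align = t.element_size()
        if offset % align:
            offset += align - offset % align        # keep each view aligned to its dtype size
        if block is None or offset + nbytes > block.numel():
            block = torch.empty(max(block_size, nbytes), dtype=torch.uint8,
                                pin_memory=True, device="cpu")  # page-locked backing store
            offset = 0
        view = block[offset:offset + nbytes].view(t.dtype).view(t.shape)
        view.copy_(t)                               # the data now lives in pinned memory
        views.append(view)
        offset += nbytes
    return views

pinned = pack_pinned([torch.randn(256, 256) for _ in range(4)])
print(all(v.is_pinned() for v in pinned))           # True
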
@@ -320,7 +408,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedLora = True, v
             include = True
             if partialPinning:
                 include = any(k.startswith(pre) for pre in towers_names) if partialPinning else True
-            if include and not pinnedLora and ".lora_" in k:
+            if include and not pinnedPEFTLora and ".lora_" in k:
                 include = False
 
             if include:
@@ -368,7 +456,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedLora = True, v
             length = torch.numel(p.data) * p.data.element_size()
 
             ref_cache[ref] = (n, length)
-            if current_big_tensor_size + length >
+            if current_big_tensor_size + length > gig_tensor_size :
                 big_tensors_sizes.append(current_big_tensor_size)
                 current_big_tensor_size = 0
                 big_tensor_no += 1
@@ -463,7 +551,6 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedLora = True, v
         else:
             length = torch.numel(p.data) * p.data.element_size()
             p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
-            p.aaaaa = n
         tensor_no += 1
         del p
     global total_pinned_bytes
@@ -488,7 +575,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.2.6) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.2.8) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def _extract_num_from_str(num_in_str):
     size = len(num_in_str)
@@ -771,167 +858,66 @@ def split_linear_modules(model, map ):
 
             delattr(parent_module, module_suffix)
 
-def _lora_linear_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
-    self._check_forward_args(x, *args, **kwargs)
-    adapter_names = kwargs.pop("adapter_names", None)
-    if self.disable_adapters:
-        if self.merged:
-            self.unmerge()
-        result = self.base_layer(x, *args, **kwargs)
-    elif adapter_names is not None:
-        result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs)
-    elif self.merged:
-        result = self.base_layer(x, *args, **kwargs)
-    else:
-        def get_scaling(active_adapter):
-            scaling_dict = shared_state.get("_lora_scaling", None)
-            if scaling_dict == None:
-                return self.scaling[active_adapter]
-            scaling_list = scaling_dict[active_adapter]
-            if isinstance(scaling_list, list):
-                step_no =shared_state.get("_lora_step_no", 0)
-                return scaling_list[step_no]
-            else:
-                return float(scaling_list)
-
-        base_weight = self.base_layer.weight
-        new_weights = not isinstance(self.base_layer, QModuleMixin)
-        if base_weight.shape[-1] < x.shape[-2] : # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
-            for active_adapter in self.active_adapters:
-                if active_adapter not in self.lora_A.keys():
-                    continue
-                if self.use_dora[active_adapter]:
-                    raise Exception("Dora not yet supported by mmgp")
-
-                lora_A = self.lora_A[active_adapter]
-                lora_B = self.lora_B[active_adapter]
-                dropout = self.lora_dropout[active_adapter]
-                scaling = get_scaling(active_adapter)
-                lora_A_weight = lora_A.weight
-                lora_B_weight = lora_B.weight
-                if new_weights or True:
-                    base_weight = torch.addmm(base_weight, lora_B_weight, lora_A_weight, alpha= scaling )
-                    # base_weight = base_weight + scaling * lora_B_weight @ lora_A_weight
-                else:
-                    base_weight.addmm_(lora_B_weight, lora_A_weight, alpha= scaling )
-                    # base_weight += scaling * lora_B_weight @ lora_A_weight
-                new_weights = False
-
-            if self.training:
-                result = torch.nn.functional.linear(dropout(x), base_weight, bias=self.base_layer.bias)
-            else:
-                result = torch.nn.functional.linear(x, base_weight, bias=self.base_layer.bias)
-            torch_result_dtype = result.dtype
-
-        else:
-            result = self.base_layer(x, *args, **kwargs)
-            torch_result_dtype = result.dtype
-            x = x.to(torch.bfloat16)
-
-            for active_adapter in self.active_adapters:
-                if active_adapter not in self.lora_A.keys():
-                    continue
-                lora_A = self.lora_A[active_adapter]
-                lora_B = self.lora_B[active_adapter]
-                dropout = self.lora_dropout[active_adapter]
-                scaling = get_scaling(active_adapter)
-                x = x.to(lora_A.weight.dtype)
-
-                if not self.use_dora[active_adapter]:
-                    if self.training:
-                        y = lora_A(dropout(x))
-                    else:
-                        y = lora_A(x)
-
-                    y = lora_B(y)
-                    y*= scaling
-                    result+= y
-                    del lora_A, lora_B, y
-                    # result = result + lora_B(lora_A(dropout(x))) * scaling
-                else:
-                    if isinstance(dropout, torch.nn.Identity) or not self.training:
-                        base_result = result
-                    else:
-                        x = dropout(x)
-                        base_result = None
-
-                    result = result + self.lora_magnitude_vector[active_adapter](
-                        x,
-                        lora_A=lora_A,
-                        lora_B=lora_B,
-                        scaling=scaling,
-                        base_layer=self.get_base_layer(),
-                        base_result=base_result,
-                    )
-
-    result = result.to(torch_result_dtype)
-    return result
-
-def _inject_adapter(
-    self, model: torch.nn.Module, adapter_name: str, autocast_adapter_dtype: bool = True, low_cpu_mem_usage: bool = False
-) -> None:
-
-    def _get_submodules(model, key):
-        parent = model.get_submodule(".".join(key.split(".")[:-1]))
-        target_name = key.split(".")[-1]
-        target = model.get_submodule(key)
-        return parent, target, target_name
-
-    peft_config = self.peft_config[adapter_name]
-    self._check_new_adapter_config(peft_config)
-
-    model_config = self.get_model_config(model)
-
-    peft_config = self._prepare_adapter_config(peft_config, model_config)
 
-
-
-    target_modules = peft_config.target_modules.copy()
-
-    # unexpected_modules = []
-    for key, target in model.named_modules():
-        if not key:
-            continue
-        if key in target_modules:
-            target_modules.remove(key)
-            self.targeted_module_names.append(key)
-            # pos = key.rfind(".")
-            # parent = key[:pos]
-            # target_name = key[pos+1:]
-            parent, target, target_name = _get_submodules(model, key)
-            with init_empty_weights():
-                self._create_and_replace(peft_config, adapter_name, target, target_name, parent, current_key=key)
-
-    self.set_adapter(self.active_adapters)
-    self._mark_only_adapters_as_trainable(model)
-
-    return target_modules
-
-def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, split_linear_modules_map = None, preprocess_sd = None, verboseLevel = -1,):
+def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, check_only = False, ignore_model_variations = False, pinnedLora = False, split_linear_modules_map = None, preprocess_sd = None, verboseLevel = -1,):
     verboseLevel = _compute_verbose_level(verboseLevel)
+    modules_dict = {k: v for k,v in model.named_modules()}
+
+    if not check_only:
+        loras_model_data = dict()
+        model._loras_model_data = loras_model_data
+        loras_active_adapters = set()
+        model._loras_active_adapters = loras_active_adapters
+        loras_scaling = dict()
+        model._loras_scaling = loras_scaling
+        loras_tied_weights = dict()
+        model._loras_tied_weights = loras_tied_weights
 
-
-
-
-
-
-
+    CrLf = '\r\n'
+    error_msg = ""
+    def append(source, text ):
+        if len(source) == 0:
+            return text
+        else:
+            return source + CrLf + text
+
+    def trunc(text, sz):
+        if len(text) < sz:
+            return str(text)
+        else:
+            return str(text)[0:sz] + '...'
 
     if not isinstance(lora_path, list):
         lora_path = [lora_path]
 
     if lora_multi is None:
         lora_multi = [1. for _ in lora_path]
-
+    loras_nos = []
+    loras_multi = []
+    new_lora_path = []
+    errors = []
+    adapters = {}
+    adapter_no = 0
     for i, path in enumerate(lora_path):
-        adapter_name = str(
-
+        adapter_name = str(adapter_no)
+        error_msg = ""
+        if not os.path.isfile(path):
+            error_msg = f"Lora '{path}' was not found"
+            errors.append((path, error_msg))
+            print(error_msg)
+            continue
+        fail = False
+        skip = False
         state_dict = safetensors2.torch_load_file(path)
+
+
+
+
        if preprocess_sd != None:
            state_dict = preprocess_sd(state_dict)
 
        if split_linear_modules_map != None:
-            new_state_dict =
+            new_state_dict = dict()
            targets_A = { "."+k+".lora_A.weight" : k for k in split_linear_modules_map }
            targets_B = { "."+k+".lora_B.weight" : k for k in split_linear_modules_map }
            for module_name, module_data in state_dict.items():
@@ -961,82 +947,158 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
                new_state_dict[module_name] = module_data
            state_dict = new_state_dict
            del new_state_dict
+        # tied_weights = _extract_tie_weights_from_sd(state_dict, path) # to do
+
+        clean_up = False
+        first_key = next(iter(state_dict), None)
+        if first_key == None:
+            msg = f"Empty Lora '{path}'"
+            error_msg = append(error_msg, msg)
+            fail = True
+
+        if not fail:
+            pos = first_key.find(".")
+            prefix = first_key[0:pos]
+            if prefix not in ["diffusion_model", "transformer"]:
+                msg = f"No compatible weight was found in Lora file '{path}'. Please check that it is compatible with the Diffusers format."
+                error_msg = append(error_msg, msg)
+                fail = True
+
+        if not fail:
+
+            state_dict = { k[ len(prefix) + 1:]: v for k, v in state_dict.items() if k.startswith(prefix) }
+            clean_up = True
+
+            keys = list(state_dict.keys())
+
+            lora_alphas = {}
+            for k in keys:
+                if "alpha" in k:
+                    alpha_value = state_dict.pop(k)
+                    if torch.is_tensor(alpha_value):
+                        alpha_value = float(alpha_value.item())
+                    lora_alphas[k] = alpha_value
+
+            invalid_keys = []
+            unexpected_keys = []
+            for k, v in state_dict.items():
+                pos = k.rfind(".lora_")
+                if pos <=0:
+                    invalid_keys.append(k)
+                    continue
+                module_name = k[ : pos]
+                lora_key = k[ pos+1:]
+                lora_A = None
+                lora_B = None
+                if lora_key == "lora_A.weight":
+                    lora_A = v
+                elif lora_key == "lora_B.weight":
+                    lora_B = v
+                else:
+                    invalid_keys.append(k)
+                    continue
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                module = modules_dict.get(module_name, None)
+                if module == None:
+                    unexpected_keys.append(k)
+                    continue
+                if not isinstance(module, (QLinear, torch.nn.Linear)):
+                    msg = f"Lora '{path}' contains a non linear layer '{k}'"
+                    error_msg = append(error_msg, msg)
+                    fail = True
+                    break
+                module_shape = module.weight.shape
+                if lora_A != None:
+                    if module_shape[1] != v.shape[1]:
+                        if ignore_model_variations:
+                            skip = True
+                        else:
+                            msg = f"Lora '{path}': Lora A dimension is not compatible with model '{_get_module_name(model)}' (model = {module_shape[1]}, lora A = {v.shape[1]}). It is likely this Lora has been made for another version of this model."
+                            error_msg = append(error_msg, msg)
+                            fail = True
+                            break
+                if lora_B != None:
+                    if module_shape[0] != v.shape[0]:
+                        if ignore_model_variations:
+                            skip = True
+                        else:
+                            msg = f"Lora '{path}': Lora B dimension is not compatible with model '{_get_module_name(model)}' (model = {module_shape[0]}, lora B = {v.shape[0]}). It is likely this Lora has been made for another version of this model."
+                            error_msg = append(error_msg, msg)
+                            fail = True
+                            break
+                if not check_only:
+                    loras_module_data = loras_model_data.get(module, None)
+                    if loras_module_data == None:
+                        loras_module_data = dict()
+                        loras_model_data[module] = loras_module_data
+                    loras_adapter_data = loras_module_data.get(adapter_name, None)
+                    lora_A = None if lora_A == None else lora_A.to(torch.bfloat16)
+                    lora_B = None if lora_B == None else lora_B.to(torch.bfloat16)
+                    if loras_adapter_data == None:
+                        alpha = lora_alphas.get(k[:-len("lora_X.weight")] + "alpha", 1.)
+                        loras_adapter_data = [lora_A, lora_B, alpha]
+                        loras_module_data[adapter_name] = loras_adapter_data
+                    elif lora_A != None:
+                        loras_adapter_data[0] = lora_A
+                    else:
+                        loras_adapter_data[1] = lora_B
+                lora_A, lora_B, v, loras_module_data, loras_adapter_data = None, None, None, None, None
+            lora_alphas = None
+
+            if len(invalid_keys) > 0:
+                msg = "Lora '{path}' contains non Lora keys '{trunc(invalid_keys,200)}'"
+                error_msg = append(error_msg, msg)
+                fail = True
+            if len(unexpected_keys) > 0:
+                msg = f"Lora '{path}' contains unexpected module keys, it is likely that this Lora is for a different model : '{trunc(unexpected_keys,200)}'"
+                error_msg = append(error_msg, msg)
+                fail = True
+        if fail or skip:
+            if fail:
+                errors.append((path, error_msg))
+                print(error_msg)
+            if clean_up and not check_only:
+                for m,loras_module_data in loras_model_data.items():
+                    if adapter_name in loras_module_data:
+                        del loras_module_data[adapter_name]
 
-
-
-
-
+        else:
+            if not check_only:
+                # model._loras_tied_weights[adapter_name] = tied_weights
+                if pinnedLora:
+                    _pin_sd_to_memory(state_dict, path)
 
-
-            for key, val in state_dict.items():
-                if "lora_B" in key:
-                    rank[key] = val.shape[1]
+            del state_dict
 
-            if network_alphas is not None and len(network_alphas) >= 1:
-                alpha_keys = [k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix]
-                network_alphas = {k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys}
 
-
-
-
-
-
-
-
-
-
-
-
-            # Check only for unexpected keys.
-            unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
-            if unexpected_keys:
-                raise Exception(f"Lora '{path}' contains invalid keys '{unexpected_keys}'")
+            adapters[adapter_name] = path
+            loras_nos.append(adapter_name)
+            new_lora_path.append(path)
+            loras_multi.append(1.0 if i > (len(lora_multi) -1) else lora_multi[i])
+            pass
+        adapter_no += 1
+        if verboseLevel >=1:
+            if check_only:
+                print(f"Lora '{path}' was found for model '{_get_module_name(model)}'")
+            else:
+                print(f"Lora '{path}' was loaded in model '{_get_module_name(model)}'")
 
-
-
+    model._loras_errors = errors
+    if not check_only:
+        model._loras_adapters = adapters
     if activate_all_loras:
-
+        activate_loras(model, loras_nos, loras_multi)
+    return new_lora_path
 
-def
-
+def unload_loras_from_model(model):
+    model._loras_model_data = None
+    model._loras_errors = None
+    model._loras_adapters = None
+    model._loras_active_adapters = None
+    model._loras_scaling = None
+
+def set_step_no_for_lora(model, step_no):
+    model._lora_step_no = step_no
 
 def activate_loras(model, lora_nos, lora_multi = None ):
     if not isinstance(lora_nos, list):
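
For reference, a hypothetical call of the rewritten loader using the keyword arguments introduced above; the transformer object and the file path are placeholders, only the function and parameter names come from the diff.

from mmgp import offload

new_paths = offload.load_loras_into_model(
    transformer,                          # placeholder: an already instantiated torch.nn.Module
    ["loras/my_style.safetensors"],       # placeholder path
    lora_multi=[0.8],
    check_only=False,                     # True would only validate compatibility
    ignore_model_variations=False,        # True skips shape-mismatched layers instead of failing
    pinnedLora=False,                     # True pins each Lora state dict to reserved RAM
)
print(transformer._loras_adapters)        # adapter number -> Lora file path
print(transformer._loras_errors)          # (path, error message) for rejected files
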
@@ -1046,15 +1108,13 @@ def activate_loras(model, lora_nos, lora_multi = None ):
     if lora_multi is None:
         lora_multi = [1. for _ in lora_nos]
 
-    lora_fake_scaling = [1. if isinstance(mult, list) else mult for mult in lora_multi ]
     lora_scaling_dict = {}
     for no, multi in zip(lora_nos, lora_multi):
         lora_scaling_dict[no] = multi
 
-
-
-
-    set_weights_and_activate_adapters(model, lora_nos, lora_fake_scaling)
+    model._lora_step_no = 0
+    model._loras_active_adapters = set(lora_nos)
+    model._loras_scaling = lora_scaling_dict
 
 
 def move_loras_to_device(model, device="cpu" ):
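
activate_loras now only records the active adapter set and a scaling dictionary on the model; a multiplier may also be a per-step list that get_scaling later indexes with model._lora_step_no. A hypothetical illustration (adapter names follow the str(adapter_no) convention of the loader; the loop stands in for a denoising loop and transformer is a placeholder model):

from mmgp import offload

offload.activate_loras(transformer, ["0", "1"], [1.0, [0.0, 0.5, 1.0]])
for step_no in range(3):                  # placeholder sampling loop
    offload.set_step_no_for_lora(transformer, step_no)
    # ... run one step: adapter "1" is scaled 0.0, then 0.5, then 1.0
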
@@ -1399,12 +1459,12 @@ class offload:
         self.loaded_blocks = {}
         self.prev_blocks_names = {}
         self.next_blocks_names = {}
-        self.lora_parents = {}
         self.preloaded_blocks_per_model = {}
         self.default_stream = torch.cuda.default_stream(torch.device("cuda")) # torch.cuda.current_stream()
         self.transfer_stream = torch.cuda.Stream()
         self.async_transfers = False
         self.parameters_ref = {}
+
         global last_offload_obj
         last_offload_obj = self
 
@@ -1428,15 +1488,12 @@ class offload:
             self.next_blocks_names[prev_entry_name] = entry_name
         bef = blocks_params_size
 
-        lora_name = None
-        if self.lora_parents.get(submodule, None) != None:
-            lora_name = str(submodule_name[ submodule_name.rfind(".") + 1: ] )
         for k,p in submodule.named_parameters(recurse=False):
             param_size = 0
             ref = _get_tensor_ref(p)
             tied_param = self.parameters_ref.get(ref, None)
             if isinstance(p, QTensor):
-                blocks_params.append( (submodule, k, p, False, tied_param
+                blocks_params.append( (submodule, k, p, False, tied_param ) )
 
                 if p._qtype == qint4:
                     if hasattr(p,"_scale_shift"):
@@ -1450,7 +1507,7 @@ class offload:
                     param_size += torch.numel(p._scale) * p._scale.element_size()
                     param_size += torch.numel(p._data) * p._data.element_size()
             else:
-                blocks_params.append( (submodule, k, p, False, tied_param
+                blocks_params.append( (submodule, k, p, False, tied_param) )
                 param_size += torch.numel(p.data) * p.data.element_size()
 
 
@@ -1459,7 +1516,7 @@ class offload:
                 self.parameters_ref[ref] = (submodule, k)
 
         for k, p in submodule.named_buffers(recurse=False):
-            blocks_params.append( (submodule, k, p, True, None
+            blocks_params.append( (submodule, k, p, True, None) )
             blocks_params_size += p.data.nbytes
 
         aft = blocks_params_size
@@ -1484,6 +1541,19 @@ class offload:
                 return False
         return True
 
+    def _move_loras(self, loras_active_adapters, loras_modules, to_GPU):
+        for name, lora_module in loras_modules.items():
+            for adapter in loras_active_adapters:
+                lora_data = lora_module.get(adapter, None)
+                if lora_data == None:
+                    continue
+                lora_A, lora_B, alpha = lora_data
+                key = adapter + '_GPU'
+                if to_GPU:
+                    lora_module[key] = [lora_A.cuda(), lora_B.cuda(), alpha]
+                elif key in lora_module:
+                    del lora_module[key]
+
     @torch.compiler.disable()
     def gpu_load_blocks(self, model_id, blocks_name, preload = False):
         # cl = clock.start()
@@ -1492,12 +1562,17 @@ class offload:
         entry_name = model_id if blocks_name is None else model_id + "/" + blocks_name
 
         def cpu_to_gpu(stream_to_use, blocks_params): #, record_for_stream = None
+            model = self.models[model_id]
+            loras_modules = {}
+            loras_active_adapters = getattr(model ,"_loras_active_adapters", None)
+            if loras_active_adapters == None or len(loras_active_adapters) == 0:
+                loras_model_data = None
+            else:
+                loras_model_data = getattr(model, "_loras_model_data", None)
+
             with torch.cuda.stream(stream_to_use):
                 for param in blocks_params:
-                    parent_module, n, p, is_buffer, tied_param
-                    if lora_name != None:
-                        if not lora_name in self.lora_parents[parent_module].active_adapters:
-                            continue
+                    parent_module, n, p, is_buffer, tied_param = param
 
                     if tied_param != None:
                         tied_p = getattr( tied_param[0], tied_param[1])
@@ -1515,6 +1590,12 @@ class offload:
                     if tied_param != None:
                         setattr( tied_param[0], tied_param[1], q)
                     del p, q
+                    if loras_model_data != None:
+                        lora_data = loras_model_data.get(parent_module, None)
+                        if lora_data != None:
+                            loras_modules[parent_module]= lora_data
+                if len(loras_modules) > 0:
+                    self._move_loras(loras_active_adapters, loras_modules, True)
 
         loaded_block = self.loaded_blocks[model_id]
 
@@ -1575,14 +1656,31 @@ class offload:
             print(f"Unloading model {blocks_name} ({model_name}) from GPU")
 
         blocks_params = self.blocks_of_modules[blocks_name]
+        model = self.models[model_id]
+        loras_modules = {}
+        loras_active_adapters = getattr(model ,"_loras_active_adapters", None)
+        if loras_active_adapters == None or len(loras_active_adapters) == 0 :
+            loras_model_data = None
+        else:
+            loras_model_data = getattr(model, "_loras_model_data", None)
+
         for param in blocks_params:
-            parent_module, n, p, is_buffer, _
+            parent_module, n, p, is_buffer, _ = param
             if is_buffer:
                 q = torch.nn.Buffer(p)
             else:
                 q = torch.nn.Parameter(p , requires_grad=False)
             setattr(parent_module, n , q)
             del p, q
+
+            if loras_model_data != None:
+                lora_data = loras_model_data.get(parent_module, None)
+                if lora_data != None:
+                    loras_modules[parent_module]= lora_data
+
+        if len(loras_modules) > 0:
+            self._move_loras(loras_active_adapters, loras_modules, False)
+
         # cl.stop()
         # print(f"unload time: {cl.format_time_gap()}")
 
@@ -1670,6 +1768,92 @@ class offload:
 
             return False
 
+    def _lora_linear_forward(self, model, submodule, loras_data, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+
+        def get_scaling(active_adapter):
+            scaling_list = loras_scaling[active_adapter]
+            if isinstance(scaling_list, list):
+                step_no =getattr(model, "_lora_step_no", 0)
+                return scaling_list[step_no]
+            else:
+                return float(scaling_list)
+
+        weight = submodule.weight
+
+        if loras_data == None:
+            return torch.nn.functional.linear(x, weight, bias=submodule.bias)
+
+        active_adapters = model._loras_active_adapters
+        loras_scaling = model._loras_scaling
+        training = False
+
+
+        if weight.shape[-1] < x.shape[-2] : # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
+            if len(active_adapters) > 0:
+                if isinstance(submodule, QModuleMixin):
+                    weight = weight.view(weight.shape) # get a persistent copy of the on the fly dequantized weights
+                else:
+                    weight = weight.clone()
+
+
+            for active_adapter in active_adapters:
+                data = loras_data.get(active_adapter + '_GPU', None)
+                if data == None:
+                    continue
+                lora_A_weight, lora_B_weight, alpha = data
+                scaling = get_scaling(active_adapter) * alpha
+                weight.addmm_(lora_B_weight, lora_A_weight, alpha= scaling )
+                # base_weight += scaling * lora_B_weight @ lora_A_weight
+
+            if training:
+                pass
+                # result = torch.nn.functional.linear(dropout(x), base_weight, bias=submodule.bias)
+            else:
+                result = torch.nn.functional.linear(x, weight, bias=submodule.bias)
+
+        else:
+            result = torch.nn.functional.linear(x, weight, bias=submodule.bias)
+
+            if len(active_adapters) > 0:
+                x = x.to(torch.bfloat16)
+
+            for active_adapter in active_adapters:
+                data = loras_data.get(active_adapter + '_GPU', None)
+                if data == None:
+                    continue
+                lora_A, lora_B, alpha = data
+                # dropout = self.lora_dropout[active_adapter]
+                scaling = get_scaling(active_adapter) * alpha
+                x = x.to(lora_A.dtype)
+
+                if training:
+                    pass
+                    # y = lora_A(dropout(x))
+                else:
+                    y = torch.nn.functional.linear(x, lora_A, bias=None)
+
+                y = torch.nn.functional.linear(y, lora_B, bias=None)
+                y*= scaling
+                result+= y
+                del y
+
+        return result
+
+
+    def hook_lora_linear(self, submodule, current_model, model_id, submodule_name):
+        old_forward = submodule.forward
+        def lora_linear_forward(module, *args, **kwargs):
+            loras_model_data = getattr(current_model, "_loras_model_data", None)
+            loras_data = None
+            if loras_model_data != None:
+                loras_data = loras_model_data.get(submodule, None)
+            if loras_data == None:
+                return old_forward(*args, **kwargs)
+            else:
+                return self._lora_linear_forward(current_model, submodule, loras_data, *args, **kwargs)
+
+        return functools.update_wrapper(functools.partial(lora_linear_forward, submodule), old_forward)
+
     def ensure_model_loaded(self, model_id):
         if model_id in self.active_models_ids:
             return
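
The hook above replaces the former PEFT layer wrappers: the original forward of each torch.nn.Linear is kept and a functools.partial that adds the Lora contribution is installed in its place. A minimal standalone sketch of that wrapping pattern, with an illustrative extra callback instead of the real Lora math:

import functools
import torch

def hook_linear(module: torch.nn.Linear, extra):
    old_forward = module.forward
    def wrapped(mod, x, *args, **kwargs):
        y = old_forward(x, *args, **kwargs)       # base projection
        return y + extra(mod, x)                  # e.g. lora_B(lora_A(x)) * scaling
    # keep the original callable's metadata, as hook_lora_linear does
    return functools.update_wrapper(functools.partial(wrapped, module), old_forward)

lin = torch.nn.Linear(8, 8)
lin.forward = hook_linear(lin, lambda m, x: torch.zeros_like(x @ m.weight.T))
print(lin(torch.randn(2, 8)).shape)               # torch.Size([2, 8])
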
@@ -1851,6 +2035,8 @@ class offload:
 
         for model_id, model in self.models.items():
             move_loras_to_device(model, "cpu")
+            if hasattr(model, "_loras_model_data"):
+                unload_loras_from_model(model)
 
         self.models = None
 
@@ -1860,7 +2046,7 @@ class offload:
 
 
 
-def all(pipe_or_dict_of_modules, pinnedMemory = False,
+def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
     """Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
     pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
     quantizeTransformer: set True by default will quantize on the fly the video / image model
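
A hypothetical invocation showing the two new keyword arguments of all(); pipe stands for a diffusers-style pipeline and "transformer" for the id of the model that should receive Lora hooks, both placeholders.

from mmgp import offload

offload.all(
    pipe,                        # placeholder pipeline exposing a .transformer module
    pinnedMemory=True,
    pinnedPEFTLora=False,        # new: also pin PEFT Lora weights when pinning a model
    loras="transformer",         # new: model id(s) whose nn.Linear forwards get the Lora hook
    quantizeTransformer=True,
)
offload.load_loras_into_model(pipe.transformer, ["loras/my_style.safetensors"])
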
@@ -1912,7 +2098,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedLora = False, quant
     _welcome()
     if coTenantsMap != None:
         self.cotenants_map = coTenantsMap
-
+    if loras != None and isinstance(loras, str):
+        loras = [loras]
     self.models = models
 
     extraModelsToQuantize = extraModelsToQuantize if extraModelsToQuantize is not None else []
@@ -2059,12 +2246,12 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedLora = False, quant
             if self.verboseLevel >=1:
                 print(f"Model '{model_id}' already pinned to reserved memory")
             else:
-                _pin_to_memory(current_model, model_id, partialPinning= partialPinning,
-
+                _pin_to_memory(current_model, model_id, partialPinning= partialPinning, pinnedPEFTLora = pinnedPEFTLora, verboseLevel=verboseLevel)
+
         current_budget = model_budgets[model_id]
         cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False
         self.loaded_blocks[model_id] = None
-
+        any_lora = loras !=None and model_id in loras or getattr(current_model, "_loras_model_data", False)
         for submodule_name, submodule in current_model.named_modules():
             # create a fake 'accelerate' parameter so that the _execution_device property returns always "cuda"
             # (it is queried in many pipelines even if offloading is not properly implemented)
@@ -2096,7 +2283,10 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedLora = False, quant
 
 
             if hasattr(submodule, "forward"):
-                submodule_method = getattr(submodule, "forward")
+                if any_lora and isinstance(submodule, torch.nn.Linear):
+                    submodule_method = self.hook_lora_linear(submodule, current_model, model_id, submodule_name)
+                else:
+                    submodule_method = getattr(submodule, "forward")
                 if callable(submodule_method):
                     if len(submodule_name.split("."))==1:
                         self.hook_change_module(submodule, current_model, model_id, submodule_name, submodule_method)
@@ -2107,13 +2297,6 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedLora = False, quant
 
             self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name, submodule_name)
 
-            if hasattr(submodule, "active_adapters"):
-                for dictmodule in ["lora_A","lora_B"]:
-                    ssubmod = getattr(submodule, dictmodule, None)
-                    if ssubmod !=None:
-                        for k, loramod in ssubmod._modules.items():
-                            self.lora_parents[loramod] = submodule
-
 
         self.tune_preloading(model_id, current_budget, towers_names)
 
{mmgp-3.2.6.dist-info → mmgp-3.2.8.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: mmgp
-Version: 3.2.6
+Version: 3.2.8
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -13,11 +13,10 @@ Requires-Dist: optimum-quanto
 Requires-Dist: accelerate
 Requires-Dist: safetensors
 Requires-Dist: psutil
-Requires-Dist: peft
 
 
 <p align="center">
-<H2>Memory Management 3.2.6 for the GPU Poor by DeepBeepMeep</H2>
+<H2>Memory Management 3.2.8 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
mmgp-3.2.8.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+mmgp/offload.py,sha256=Wwk3uV3ZJv3ApyX-vpzukOllkBOTkLwGm5qDadmqVqQ,105209
+mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
+mmgp-3.2.8.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+mmgp-3.2.8.dist-info/METADATA,sha256=_3nE_8-UHpItfJsJsb4KUIs_WdROc68SCTNTP5lj_ho,16131
+mmgp-3.2.8.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
+mmgp-3.2.8.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+mmgp-3.2.8.dist-info/RECORD,,
mmgp-3.2.6.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
-mmgp/offload.py,sha256=sN95BJAvdWOu36AWwJlACdxMDiOzeqL2HXLN90oaec4,98169
-mmgp/safetensors2.py,sha256=DCdlRH3769CTyraAmWAB3b0XrVua7z6ygQ-OyKgJN6A,16453
-mmgp-3.2.6.dist-info/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
-mmgp-3.2.6.dist-info/METADATA,sha256=F7LmNAvBTLEEfFT-Wbh7md4s1U4Vdnt4RrBfuBXpH_s,16151
-mmgp-3.2.6.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
-mmgp-3.2.6.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
-mmgp-3.2.6.dist-info/RECORD,,
{mmgp-3.2.6.dist-info → mmgp-3.2.8.dist-info}/LICENSE.md
File without changes
{mmgp-3.2.6.dist-info → mmgp-3.2.8.dist-info}/top_level.txt
File without changes
|