mmgp 3.2.6__tar.gz → 3.2.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mmgp might be problematic. Click here for more details.
- {mmgp-3.2.6/src/mmgp.egg-info → mmgp-3.2.7}/PKG-INFO +2 -3
- {mmgp-3.2.6 → mmgp-3.2.7}/README.md +1 -1
- {mmgp-3.2.6 → mmgp-3.2.7}/pyproject.toml +2 -3
- {mmgp-3.2.6 → mmgp-3.2.7}/src/mmgp/offload.py +447 -263
- {mmgp-3.2.6 → mmgp-3.2.7/src/mmgp.egg-info}/PKG-INFO +2 -3
- {mmgp-3.2.6 → mmgp-3.2.7}/src/mmgp.egg-info/requires.txt +0 -1
- {mmgp-3.2.6 → mmgp-3.2.7}/LICENSE.md +0 -0
- {mmgp-3.2.6 → mmgp-3.2.7}/setup.cfg +0 -0
- {mmgp-3.2.6 → mmgp-3.2.7}/src/__init__.py +0 -0
- {mmgp-3.2.6 → mmgp-3.2.7}/src/mmgp/__init__.py +0 -0
- {mmgp-3.2.6 → mmgp-3.2.7}/src/mmgp/safetensors2.py +0 -0
- {mmgp-3.2.6 → mmgp-3.2.7}/src/mmgp.egg-info/SOURCES.txt +0 -0
- {mmgp-3.2.6 → mmgp-3.2.7}/src/mmgp.egg-info/dependency_links.txt +0 -0
- {mmgp-3.2.6 → mmgp-3.2.7}/src/mmgp.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: mmgp
|
|
3
|
-
Version: 3.2.6
|
|
3
|
+
Version: 3.2.7
|
|
4
4
|
Summary: Memory Management for the GPU Poor
|
|
5
5
|
Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
|
|
6
6
|
License: GNU GENERAL PUBLIC LICENSE
|
|
@@ -13,11 +13,10 @@ Requires-Dist: optimum-quanto
|
|
|
13
13
|
Requires-Dist: accelerate
|
|
14
14
|
Requires-Dist: safetensors
|
|
15
15
|
Requires-Dist: psutil
|
|
16
|
-
Requires-Dist: peft
|
|
17
16
|
|
|
18
17
|
|
|
19
18
|
<p align="center">
|
|
20
|
-
<H2>Memory Management 3.2.6 for the GPU Poor by DeepBeepMeep</H2>
|
|
19
|
+
<H2>Memory Management 3.2.7 for the GPU Poor by DeepBeepMeep</H2>
|
|
21
20
|
</p>
|
|
22
21
|
|
|
23
22
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "mmgp"
|
|
3
|
-
version = "3.2.6"
|
|
3
|
+
version = "3.2.7"
|
|
4
4
|
authors = [
|
|
5
5
|
{ name = "deepbeepmeep", email = "deepbeepmeep@yahoo.com" },
|
|
6
6
|
]
|
|
@@ -13,7 +13,6 @@ dependencies = [
|
|
|
13
13
|
"optimum-quanto",
|
|
14
14
|
"accelerate",
|
|
15
15
|
"safetensors",
|
|
16
|
-
"psutil"
|
|
17
|
-
"peft"
|
|
16
|
+
"psutil"
|
|
18
17
|
]
|
|
19
18
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# ------------------ Memory Management 3.2.6 for the GPU Poor by DeepBeepMeep (mmgp)------------------
|
|
1
|
+
# ------------------ Memory Management 3.2.7 for the GPU Poor by DeepBeepMeep (mmgp)------------------
|
|
2
2
|
#
|
|
3
3
|
# This module contains multiple optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
|
|
4
4
|
# This is a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
|
|
@@ -63,25 +63,11 @@ import json
|
|
|
63
63
|
import psutil
|
|
64
64
|
from accelerate import init_empty_weights
|
|
65
65
|
|
|
66
|
-
try:
|
|
67
|
-
|
|
68
|
-
from peft.tuners.tuners_utils import BaseTuner
|
|
69
|
-
|
|
70
|
-
from diffusers.utils.peft_utils import set_weights_and_activate_adapters, get_peft_kwargs
|
|
71
|
-
except:
|
|
72
|
-
set_weights_and_activate_adapters = None
|
|
73
|
-
get_peft_kwargs = None
|
|
74
|
-
pass
|
|
75
|
-
try:
|
|
76
|
-
from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict
|
|
77
|
-
except:
|
|
78
|
-
inject_adapter_in_model = None
|
|
79
|
-
pass
|
|
80
66
|
|
|
81
67
|
from mmgp import safetensors2
|
|
82
68
|
from mmgp import profile_type
|
|
83
69
|
|
|
84
|
-
from optimum.quanto import freeze, qfloat8, qint4 , qint8, quantize, QModuleMixin, QTensor, quantize_module, register_qmodule
|
|
70
|
+
from optimum.quanto import freeze, qfloat8, qint4 , qint8, quantize, QModuleMixin, QLinear, QTensor, quantize_module, register_qmodule
|
|
85
71
|
|
|
86
72
|
# support for Embedding module quantization that is not supported by default by quanto
|
|
87
73
|
@register_qmodule(torch.nn.Embedding)
|
|
@@ -302,13 +288,115 @@ def _get_tensor_ref(p):
|
|
|
302
288
|
return p.data_ptr()
|
|
303
289
|
|
|
304
290
|
|
|
305
|
-
|
|
291
|
+
# BIG_TENSOR_MAX_SIZE = 2**28 # 256 MB
|
|
292
|
+
BIG_TENSOR_MAX_SIZE = 2**27 # 128 MB
|
|
293
|
+
|
|
294
|
+
def _extract_tie_weights_from_sd(sd , sd_name, verboseLevel =1):
|
|
295
|
+
tied_weights = {}
|
|
296
|
+
tied_weights_count = 0
|
|
297
|
+
tied_weights_total = 0
|
|
298
|
+
tied_weights_last = None
|
|
299
|
+
ref_cache = {}
|
|
300
|
+
|
|
301
|
+
for n, p in sd.items():
|
|
302
|
+
ref = _get_tensor_ref(p)
|
|
303
|
+
match = ref_cache.get(ref, None)
|
|
304
|
+
if match != None:
|
|
305
|
+
match_name, match_size = match
|
|
306
|
+
tied_weights_count += 1
|
|
307
|
+
tied_weights_total += match_size
|
|
308
|
+
if verboseLevel >=1:
|
|
309
|
+
tied_weights_last = f"{match_name} <-> {n}"
|
|
310
|
+
tied_weights[n] = match_name
|
|
311
|
+
else:
|
|
312
|
+
length = torch.numel(p.data) * p.data.element_size()
|
|
313
|
+
ref_cache[ref] = (n, length)
|
|
314
|
+
|
|
315
|
+
if verboseLevel >=1 and tied_weights_count > 0:
|
|
316
|
+
if tied_weights_count == 1:
|
|
317
|
+
print(f"Tied weights of {tied_weights_total/ONE_MB:0.2f} MB detected: {tied_weights_last}")
|
|
318
|
+
else:
|
|
319
|
+
print(f"Found {tied_weights_count} tied weights for a total of {tied_weights_total/ONE_MB:0.2f} MB, last : {tied_weights_last}")
|
|
320
|
+
|
|
321
|
+
def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
    """Relocate the tensors of a state dict into a few large pinned CPU buffers.

    The work is done in three passes:
      1. plan where each non-tied tensor goes (buffer index + byte offset),
         starting a new buffer whenever the current one would exceed
         `gig_tensor_size`;
      2. allocate the planned buffers as pinned uint8 CPU tensors, stopping
         at the first allocation failure (best effort);
      3. move every tensor whose buffer was allocated, and make tied entries
         share the data of the entry they are tied to.

    Args:
        sd: mapping of entry name -> tensor-like object exposing `.data`.
        sd_name: label used in the printed messages.
        tied_weights: optional mapping of entry name -> name of the entry
            whose data it must share; such entries get no space of their own.
        gig_tensor_size: maximum size in bytes targeted for one pinned buffer.
        verboseLevel: when >= 1, a summary line is printed at the end.

    Returns:
        None. The entries of `sd` are modified in place.
    """
    # ---- pass 1: layout planning -------------------------------------------
    current_big_tensor_size = 0
    big_tensor_no = 0
    big_tensors_sizes = []
    tensor_map_indexes = []  # one (buffer index, byte offset, byte length) per non-tied entry

    for n, p in sd.items():
        if tied_weights is not None and n in tied_weights:
            continue
        length = torch.numel(p.data) * p.data.element_size()

        if current_big_tensor_size + length > gig_tensor_size:
            big_tensors_sizes.append(current_big_tensor_size)
            current_big_tensor_size = 0
            big_tensor_no += 1

        # keep every tensor aligned on a multiple of its element size
        itemsize = p.data.dtype.itemsize
        if current_big_tensor_size % itemsize:
            current_big_tensor_size += itemsize - current_big_tensor_size % itemsize
        tensor_map_indexes.append((big_tensor_no, current_big_tensor_size, length))
        current_big_tensor_size += length

    big_tensors_sizes.append(current_big_tensor_size)

    # ---- pass 2: pinned buffer allocation (best effort) --------------------
    big_tensors = []
    total = 0
    for size in big_tensors_sizes:
        try:
            big_tensors.append(torch.empty(size, dtype=torch.uint8, pin_memory=True, device="cpu"))
        except Exception:
            # Was a bare `except:`; narrowed so that KeyboardInterrupt / SystemExit
            # are no longer swallowed while allocation failures still are.
            print(f"Unable to pin more tensors for '{sd_name}' as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
            break
        total += size
    last_big_tensor = len(big_tensors)

    # ---- pass 3: move the data ---------------------------------------------
    tensor_no = 0
    for n, p in sd.items():
        q_name = tied_weights.get(n, None) if tied_weights is not None else None
        if q_name is not None:
            # Share the data of the reference entry. No assert on is_pinned():
            # when allocation stopped early the reference entry may legitimately
            # still be unpinned, and pinning is best effort.
            p.data = sd[q_name].data
        else:
            big_tensor_no, offset, length = tensor_map_indexes[tensor_no]
            if big_tensor_no < last_big_tensor:
                # NOTE(review): _move_to_pinned_tensor is defined elsewhere; presumably it
                # copies the data into the buffer slice and returns a tensor backed by it.
                q = _move_to_pinned_tensor(p.data, big_tensors[big_tensor_no], offset, length)
                torch.utils.swap_tensors(p, q)
                del q
            tensor_no += 1
        del p
    # global total_pinned_bytes
    # total_pinned_bytes += total
    gc.collect()

    if verboseLevel >= 1:
        if last_big_tensor == len(big_tensors_sizes):
            print(f"'{sd_name}' was pinned entirely to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")
        else:
            # allocation stopped early: do not claim the whole state dict was pinned
            print(f"'{sd_name}' was partially pinned to reserved RAM: {last_big_tensor} large blocks spread across {total/ONE_MB:.2f} MB")

    return
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = True, gig_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
|
|
306
396
|
if partialPinning:
|
|
307
397
|
towers_names, _ = _detect_main_towers(model)
|
|
308
398
|
|
|
309
399
|
|
|
310
|
-
# BIG_TENSOR_MAX_SIZE = 2**28 # 256 MB
|
|
311
|
-
BIG_TENSOR_MAX_SIZE = 2**27 # 128 MB
|
|
312
400
|
current_big_tensor_size = 0
|
|
313
401
|
big_tensor_no = 0
|
|
314
402
|
big_tensors_sizes = []
|
|
@@ -320,7 +408,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedLora = True, v
|
|
|
320
408
|
include = True
|
|
321
409
|
if partialPinning:
|
|
322
410
|
include = any(k.startswith(pre) for pre in towers_names) if partialPinning else True
|
|
323
|
-
if include and not
|
|
411
|
+
if include and not pinnedPEFTLora and ".lora_" in k:
|
|
324
412
|
include = False
|
|
325
413
|
|
|
326
414
|
if include:
|
|
@@ -368,7 +456,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedLora = True, v
|
|
|
368
456
|
length = torch.numel(p.data) * p.data.element_size()
|
|
369
457
|
|
|
370
458
|
ref_cache[ref] = (n, length)
|
|
371
|
-
if current_big_tensor_size + length >
|
|
459
|
+
if current_big_tensor_size + length > gig_tensor_size :
|
|
372
460
|
big_tensors_sizes.append(current_big_tensor_size)
|
|
373
461
|
current_big_tensor_size = 0
|
|
374
462
|
big_tensor_no += 1
|
|
@@ -463,7 +551,6 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedLora = True, v
|
|
|
463
551
|
else:
|
|
464
552
|
length = torch.numel(p.data) * p.data.element_size()
|
|
465
553
|
p.data = _move_to_pinned_tensor(p.data, current_big_tensor, offset, length)
|
|
466
|
-
p.aaaaa = n
|
|
467
554
|
tensor_no += 1
|
|
468
555
|
del p
|
|
469
556
|
global total_pinned_bytes
|
|
@@ -488,7 +575,7 @@ def _welcome():
|
|
|
488
575
|
if welcome_displayed:
|
|
489
576
|
return
|
|
490
577
|
welcome_displayed = True
|
|
491
|
-
print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.2.
|
|
578
|
+
print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.2.7) by DeepBeepMeep ************{ENDC}{UNBOLD}")
|
|
492
579
|
|
|
493
580
|
def _extract_num_from_str(num_in_str):
|
|
494
581
|
size = len(num_in_str)
|
|
@@ -771,167 +858,63 @@ def split_linear_modules(model, map ):
|
|
|
771
858
|
|
|
772
859
|
delattr(parent_module, module_suffix)
|
|
773
860
|
|
|
774
|
-
def _lora_linear_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
|
|
775
|
-
self._check_forward_args(x, *args, **kwargs)
|
|
776
|
-
adapter_names = kwargs.pop("adapter_names", None)
|
|
777
|
-
if self.disable_adapters:
|
|
778
|
-
if self.merged:
|
|
779
|
-
self.unmerge()
|
|
780
|
-
result = self.base_layer(x, *args, **kwargs)
|
|
781
|
-
elif adapter_names is not None:
|
|
782
|
-
result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs)
|
|
783
|
-
elif self.merged:
|
|
784
|
-
result = self.base_layer(x, *args, **kwargs)
|
|
785
|
-
else:
|
|
786
|
-
def get_scaling(active_adapter):
|
|
787
|
-
scaling_dict = shared_state.get("_lora_scaling", None)
|
|
788
|
-
if scaling_dict == None:
|
|
789
|
-
return self.scaling[active_adapter]
|
|
790
|
-
scaling_list = scaling_dict[active_adapter]
|
|
791
|
-
if isinstance(scaling_list, list):
|
|
792
|
-
step_no =shared_state.get("_lora_step_no", 0)
|
|
793
|
-
return scaling_list[step_no]
|
|
794
|
-
else:
|
|
795
|
-
return float(scaling_list)
|
|
796
|
-
|
|
797
|
-
base_weight = self.base_layer.weight
|
|
798
|
-
new_weights = not isinstance(self.base_layer, QModuleMixin)
|
|
799
|
-
if base_weight.shape[-1] < x.shape[-2] : # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
|
|
800
|
-
for active_adapter in self.active_adapters:
|
|
801
|
-
if active_adapter not in self.lora_A.keys():
|
|
802
|
-
continue
|
|
803
|
-
if self.use_dora[active_adapter]:
|
|
804
|
-
raise Exception("Dora not yet supported by mmgp")
|
|
805
|
-
|
|
806
|
-
lora_A = self.lora_A[active_adapter]
|
|
807
|
-
lora_B = self.lora_B[active_adapter]
|
|
808
|
-
dropout = self.lora_dropout[active_adapter]
|
|
809
|
-
scaling = get_scaling(active_adapter)
|
|
810
|
-
lora_A_weight = lora_A.weight
|
|
811
|
-
lora_B_weight = lora_B.weight
|
|
812
|
-
if new_weights or True:
|
|
813
|
-
base_weight = torch.addmm(base_weight, lora_B_weight, lora_A_weight, alpha= scaling )
|
|
814
|
-
# base_weight = base_weight + scaling * lora_B_weight @ lora_A_weight
|
|
815
|
-
else:
|
|
816
|
-
base_weight.addmm_(lora_B_weight, lora_A_weight, alpha= scaling )
|
|
817
|
-
# base_weight += scaling * lora_B_weight @ lora_A_weight
|
|
818
|
-
new_weights = False
|
|
819
|
-
|
|
820
|
-
if self.training:
|
|
821
|
-
result = torch.nn.functional.linear(dropout(x), base_weight, bias=self.base_layer.bias)
|
|
822
|
-
else:
|
|
823
|
-
result = torch.nn.functional.linear(x, base_weight, bias=self.base_layer.bias)
|
|
824
|
-
torch_result_dtype = result.dtype
|
|
825
|
-
|
|
826
|
-
else:
|
|
827
|
-
result = self.base_layer(x, *args, **kwargs)
|
|
828
|
-
torch_result_dtype = result.dtype
|
|
829
|
-
x = x.to(torch.bfloat16)
|
|
830
|
-
|
|
831
|
-
for active_adapter in self.active_adapters:
|
|
832
|
-
if active_adapter not in self.lora_A.keys():
|
|
833
|
-
continue
|
|
834
|
-
lora_A = self.lora_A[active_adapter]
|
|
835
|
-
lora_B = self.lora_B[active_adapter]
|
|
836
|
-
dropout = self.lora_dropout[active_adapter]
|
|
837
|
-
scaling = get_scaling(active_adapter)
|
|
838
|
-
x = x.to(lora_A.weight.dtype)
|
|
839
|
-
|
|
840
|
-
if not self.use_dora[active_adapter]:
|
|
841
|
-
if self.training:
|
|
842
|
-
y = lora_A(dropout(x))
|
|
843
|
-
else:
|
|
844
|
-
y = lora_A(x)
|
|
845
|
-
|
|
846
|
-
y = lora_B(y)
|
|
847
|
-
y*= scaling
|
|
848
|
-
result+= y
|
|
849
|
-
del lora_A, lora_B, y
|
|
850
|
-
# result = result + lora_B(lora_A(dropout(x))) * scaling
|
|
851
|
-
else:
|
|
852
|
-
if isinstance(dropout, torch.nn.Identity) or not self.training:
|
|
853
|
-
base_result = result
|
|
854
|
-
else:
|
|
855
|
-
x = dropout(x)
|
|
856
|
-
base_result = None
|
|
857
|
-
|
|
858
|
-
result = result + self.lora_magnitude_vector[active_adapter](
|
|
859
|
-
x,
|
|
860
|
-
lora_A=lora_A,
|
|
861
|
-
lora_B=lora_B,
|
|
862
|
-
scaling=scaling,
|
|
863
|
-
base_layer=self.get_base_layer(),
|
|
864
|
-
base_result=base_result,
|
|
865
|
-
)
|
|
866
|
-
|
|
867
|
-
result = result.to(torch_result_dtype)
|
|
868
|
-
return result
|
|
869
|
-
|
|
870
|
-
def _inject_adapter(
|
|
871
|
-
self, model: torch.nn.Module, adapter_name: str, autocast_adapter_dtype: bool = True, low_cpu_mem_usage: bool = False
|
|
872
|
-
) -> None:
|
|
873
|
-
|
|
874
|
-
def _get_submodules(model, key):
|
|
875
|
-
parent = model.get_submodule(".".join(key.split(".")[:-1]))
|
|
876
|
-
target_name = key.split(".")[-1]
|
|
877
|
-
target = model.get_submodule(key)
|
|
878
|
-
return parent, target, target_name
|
|
879
861
|
|
|
880
|
-
|
|
881
|
-
self._check_new_adapter_config(peft_config)
|
|
882
|
-
|
|
883
|
-
model_config = self.get_model_config(model)
|
|
884
|
-
|
|
885
|
-
peft_config = self._prepare_adapter_config(peft_config, model_config)
|
|
886
|
-
|
|
887
|
-
self._prepare_model(peft_config, model)
|
|
888
|
-
|
|
889
|
-
target_modules = peft_config.target_modules.copy()
|
|
890
|
-
|
|
891
|
-
# unexpected_modules = []
|
|
892
|
-
for key, target in model.named_modules():
|
|
893
|
-
if not key:
|
|
894
|
-
continue
|
|
895
|
-
if key in target_modules:
|
|
896
|
-
target_modules.remove(key)
|
|
897
|
-
self.targeted_module_names.append(key)
|
|
898
|
-
# pos = key.rfind(".")
|
|
899
|
-
# parent = key[:pos]
|
|
900
|
-
# target_name = key[pos+1:]
|
|
901
|
-
parent, target, target_name = _get_submodules(model, key)
|
|
902
|
-
with init_empty_weights():
|
|
903
|
-
self._create_and_replace(peft_config, adapter_name, target, target_name, parent, current_key=key)
|
|
904
|
-
|
|
905
|
-
self.set_adapter(self.active_adapters)
|
|
906
|
-
self._mark_only_adapters_as_trainable(model)
|
|
907
|
-
|
|
908
|
-
return target_modules
|
|
909
|
-
|
|
910
|
-
def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, split_linear_modules_map = None, preprocess_sd = None, verboseLevel = -1,):
|
|
862
|
+
def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, check_only = False, ignore_model_variations = False, pinnedLora = False, split_linear_modules_map = None, preprocess_sd = None, verboseLevel = -1,):
|
|
911
863
|
verboseLevel = _compute_verbose_level(verboseLevel)
|
|
864
|
+
modules_dict = {k: v for k,v in model.named_modules()}
|
|
865
|
+
|
|
866
|
+
if not check_only:
|
|
867
|
+
loras_model_data = dict()
|
|
868
|
+
model._loras_model_data = loras_model_data
|
|
869
|
+
loras_active_adapters = set()
|
|
870
|
+
model._loras_active_adapters = loras_active_adapters
|
|
871
|
+
loras_scaling = dict()
|
|
872
|
+
model._loras_scaling = loras_scaling
|
|
873
|
+
loras_tied_weights = dict()
|
|
874
|
+
model._loras_tied_weights = loras_tied_weights
|
|
912
875
|
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
876
|
+
CrLf = '\r\n'
|
|
877
|
+
error_msg = ""
|
|
878
|
+
def append(source, text ):
    # Accumulate messages: an empty `source` yields `text` unchanged, otherwise
    # `text` is added after `source` with the CrLf separator in between.
    return text if len(source) == 0 else source + CrLf + text
|
|
883
|
+
|
|
884
|
+
def trunc(text, sz):
    """Return `str(text)`, cut to `sz` characters plus '...' when it is not shorter than `sz`.

    `text` may be any object (it is called with lists of keys). The length test
    is done on the string form; the original tested `len(text)` on the raw
    object, so a list with fewer than `sz` items was never truncated no matter
    how long its printed form was.
    """
    text = str(text)
    if len(text) < sz:
        return text
    return text[0:sz] + '...'
|
|
919
889
|
|
|
920
890
|
if not isinstance(lora_path, list):
|
|
921
891
|
lora_path = [lora_path]
|
|
922
892
|
|
|
923
893
|
if lora_multi is None:
|
|
924
894
|
lora_multi = [1. for _ in lora_path]
|
|
925
|
-
|
|
895
|
+
loras_nos = []
|
|
896
|
+
loras_multi = []
|
|
897
|
+
new_lora_path = []
|
|
898
|
+
errors = []
|
|
899
|
+
adapters = {}
|
|
900
|
+
adapter_no = 0
|
|
926
901
|
for i, path in enumerate(lora_path):
|
|
927
|
-
adapter_name = str(
|
|
928
|
-
|
|
902
|
+
adapter_name = str(adapter_no)
|
|
903
|
+
error_msg = ""
|
|
904
|
+
if not os.path.isfile(path):
|
|
905
|
+
error_msg = f"Lora '{path}' was not found"
|
|
906
|
+
errors.append((path, error_msg))
|
|
907
|
+
print(error_msg)
|
|
908
|
+
continue
|
|
909
|
+
fail = False
|
|
910
|
+
skip = False
|
|
929
911
|
state_dict = safetensors2.torch_load_file(path)
|
|
912
|
+
|
|
930
913
|
if preprocess_sd != None:
|
|
931
914
|
state_dict = preprocess_sd(state_dict)
|
|
932
915
|
|
|
933
916
|
if split_linear_modules_map != None:
|
|
934
|
-
new_state_dict =
|
|
917
|
+
new_state_dict = dict()
|
|
935
918
|
targets_A = { "."+k+".lora_A.weight" : k for k in split_linear_modules_map }
|
|
936
919
|
targets_B = { "."+k+".lora_B.weight" : k for k in split_linear_modules_map }
|
|
937
920
|
for module_name, module_data in state_dict.items():
|
|
@@ -961,82 +944,162 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
|
|
|
961
944
|
new_state_dict[module_name] = module_data
|
|
962
945
|
state_dict = new_state_dict
|
|
963
946
|
del new_state_dict
|
|
947
|
+
# tied_weights = _extract_tie_weights_from_sd(state_dict, path) # to do
|
|
964
948
|
|
|
949
|
+
clean_up = False
|
|
965
950
|
keys = list(state_dict.keys())
|
|
966
951
|
if len(keys) == 0:
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
952
|
+
msg = f"Empty Lora '{path}'"
|
|
953
|
+
error_msg = append(error_msg, msg)
|
|
954
|
+
fail = True
|
|
955
|
+
|
|
956
|
+
if not fail:
|
|
957
|
+
network_alphas = {}
|
|
958
|
+
for k in keys:
|
|
959
|
+
if "alpha" in k:
|
|
960
|
+
alpha_value = state_dict.pop(k)
|
|
961
|
+
if not ( (torch.is_tensor(alpha_value) and torch.is_floating_point(alpha_value)) or isinstance(
|
|
962
|
+
alpha_value, float
|
|
963
|
+
)):
|
|
964
|
+
network_alphas[k] = torch.tensor( float(alpha_value.item() ) )
|
|
965
|
+
|
|
966
|
+
pos = keys[0].find(".")
|
|
967
|
+
prefix = keys[0][0:pos]
|
|
968
|
+
if prefix not in ["diffusion_model", "transformer"]:
|
|
969
|
+
msg = f"No compatible weight was found in Lora file '{path}'. Please check that it is compatible with the Diffusers format."
|
|
970
|
+
error_msg = append(error_msg, msg)
|
|
971
|
+
fail = True
|
|
972
|
+
|
|
973
|
+
if not fail:
|
|
974
|
+
state_dict = { k[ len(prefix) + 1:]: v for k, v in state_dict.items() if k.startswith(prefix) }
|
|
975
|
+
rank = {}
|
|
976
|
+
clean_up = True
|
|
977
|
+
|
|
978
|
+
# for key, val in state_dict.items():
|
|
979
|
+
# if "lora_B" in key:
|
|
980
|
+
# rank[key] = val.shape[1]
|
|
981
|
+
|
|
982
|
+
# if network_alphas is not None and len(network_alphas) >= 1:
|
|
983
|
+
# alpha_keys = [k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix]
|
|
984
|
+
# network_alphas = {k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys}
|
|
985
|
+
network_alphas = None
|
|
986
|
+
|
|
987
|
+
invalid_keys = []
|
|
988
|
+
unexpected_keys = []
|
|
989
|
+
for k, v in state_dict.items():
|
|
990
|
+
pos = k.rfind(".lora_")
|
|
991
|
+
if pos <=0:
|
|
992
|
+
invalid_keys.append(k)
|
|
993
|
+
continue
|
|
994
|
+
module_name = k[ : pos]
|
|
995
|
+
lora_key = k[ pos+1:]
|
|
996
|
+
lora_A = None
|
|
997
|
+
lora_B = None
|
|
998
|
+
if lora_key == "lora_A.weight":
|
|
999
|
+
lora_A = v
|
|
1000
|
+
elif lora_key == "lora_B.weight":
|
|
1001
|
+
lora_B = v
|
|
1002
|
+
else:
|
|
1003
|
+
invalid_keys.append(k)
|
|
1004
|
+
continue
|
|
997
1005
|
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1006
|
+
module = modules_dict.get(module_name, None)
|
|
1007
|
+
if module == None:
|
|
1008
|
+
unexpected_keys.append(k)
|
|
1009
|
+
continue
|
|
1010
|
+
if not isinstance(module, (QLinear, torch.nn.Linear)):
|
|
1011
|
+
msg = f"Lora '{path}' contains a non linear layer '{k}'"
|
|
1012
|
+
error_msg = append(error_msg, msg)
|
|
1013
|
+
fail = True
|
|
1014
|
+
break
|
|
1015
|
+
module_shape = module.weight.shape
|
|
1016
|
+
if lora_A != None:
|
|
1017
|
+
if module_shape[1] != v.shape[1]:
|
|
1018
|
+
if ignore_model_variations:
|
|
1019
|
+
skip = True
|
|
1020
|
+
else:
|
|
1021
|
+
msg = f"Lora '{path}': Lora A dimension is not compatible with model '{_get_module_name(model)}' (model = {module_shape[1]}, lora A = {v.shape[1]}). It is likely this Lora has been made for another version of this model."
|
|
1022
|
+
error_msg = append(error_msg, msg)
|
|
1023
|
+
fail = True
|
|
1024
|
+
break
|
|
1025
|
+
if lora_B != None:
|
|
1026
|
+
if module_shape[0] != v.shape[0]:
|
|
1027
|
+
if ignore_model_variations:
|
|
1028
|
+
skip = True
|
|
1029
|
+
else:
|
|
1030
|
+
msg = f"Lora '{path}': Lora B dimension is not compatible with model '{_get_module_name(model)}' (model = {module_shape[0]}, lora B = {v.shape[0]}). It is likely this Lora has been made for another version of this model."
|
|
1031
|
+
error_msg = append(error_msg, msg)
|
|
1032
|
+
fail = True
|
|
1033
|
+
break
|
|
1034
|
+
if not check_only:
|
|
1035
|
+
loras_module_data = loras_model_data.get(module, None)
|
|
1036
|
+
if loras_module_data == None:
|
|
1037
|
+
loras_module_data = dict()
|
|
1038
|
+
loras_model_data[module] = loras_module_data
|
|
1039
|
+
loras_adapter_data = loras_module_data.get(adapter_name, None)
|
|
1040
|
+
if loras_adapter_data == None:
|
|
1041
|
+
loras_adapter_data = [lora_A, lora_B]
|
|
1042
|
+
loras_module_data[adapter_name] = loras_adapter_data
|
|
1043
|
+
elif lora_A != None:
|
|
1044
|
+
loras_adapter_data[0] = lora_A
|
|
1045
|
+
else:
|
|
1046
|
+
loras_adapter_data[1] = lora_B
|
|
1047
|
+
lora_A, lora_B, v, loras_module_data, loras_adapter_data = None, None, None, None, None
|
|
1048
|
+
|
|
1049
|
+
if len(invalid_keys) > 0:
|
|
1050
|
+
msg = "Lora '{path}' contains non Lora keys '{trunc(invalid_keys,200)}'"
|
|
1051
|
+
error_msg = append(error_msg, msg)
|
|
1052
|
+
fail = True
|
|
1053
|
+
if len(unexpected_keys) > 0:
|
|
1054
|
+
msg = f"Lora '{path}' contains unexpected module keys, it is likely that this Lora is for a different model : '{trunc(unexpected_keys,200)}'"
|
|
1055
|
+
error_msg = append(error_msg, msg)
|
|
1056
|
+
fail = True
|
|
1057
|
+
if fail or skip:
|
|
1058
|
+
if fail:
|
|
1059
|
+
errors.append((path, error_msg))
|
|
1060
|
+
print(error_msg)
|
|
1061
|
+
if clean_up and not check_only:
|
|
1062
|
+
for m,loras_module_data in loras_model_data.items():
|
|
1063
|
+
if adapter_name in loras_module_data:
|
|
1064
|
+
del loras_module_data[adapter_name]
|
|
1002
1065
|
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1066
|
+
else:
|
|
1067
|
+
if not check_only:
|
|
1068
|
+
# model._loras_tied_weights[adapter_name] = tied_weights
|
|
1069
|
+
if pinnedLora:
|
|
1070
|
+
_pin_sd_to_memory(state_dict, path)
|
|
1007
1071
|
|
|
1008
|
-
|
|
1009
|
-
for key, val in state_dict.items():
|
|
1010
|
-
if "lora_B" in key:
|
|
1011
|
-
rank[key] = val.shape[1]
|
|
1072
|
+
del state_dict
|
|
1012
1073
|
|
|
1013
|
-
if network_alphas is not None and len(network_alphas) >= 1:
|
|
1014
|
-
alpha_keys = [k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix]
|
|
1015
|
-
network_alphas = {k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys}
|
|
1016
1074
|
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
# Check only for unexpected keys.
|
|
1029
|
-
unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None)
|
|
1030
|
-
if unexpected_keys:
|
|
1031
|
-
raise Exception(f"Lora '{path}' contains invalid keys '{unexpected_keys}'")
|
|
1075
|
+
adapters[adapter_name] = path
|
|
1076
|
+
loras_nos.append(adapter_name)
|
|
1077
|
+
new_lora_path.append(path)
|
|
1078
|
+
loras_multi.append(1.0 if i > (len(lora_multi) -1) else lora_multi[i])
|
|
1079
|
+
pass
|
|
1080
|
+
adapter_no += 1
|
|
1081
|
+
if verboseLevel >=1:
|
|
1082
|
+
if check_only:
|
|
1083
|
+
print(f"Lora '{path}' was found for model '{_get_module_name(model)}'")
|
|
1084
|
+
else:
|
|
1085
|
+
print(f"Lora '{path}' was loaded in model '{_get_module_name(model)}'")
|
|
1032
1086
|
|
|
1033
|
-
|
|
1034
|
-
|
|
1087
|
+
model._loras_errors = errors
|
|
1088
|
+
if not check_only:
|
|
1089
|
+
model._loras_adapters = adapters
|
|
1035
1090
|
if activate_all_loras:
|
|
1036
|
-
|
|
1091
|
+
activate_loras(model, loras_nos, loras_multi)
|
|
1092
|
+
return new_lora_path
|
|
1037
1093
|
|
|
1038
|
-
def
|
|
1039
|
-
|
|
1094
|
+
def unload_loras_from_model(model):
    """Reset the Lora bookkeeping attributes of `model` to None.

    The attributes cleared are `_loras_model_data`, `_loras_errors`,
    `_loras_adapters`, `_loras_active_adapters` and `_loras_scaling`.
    """
    for attr_name in (
        "_loras_model_data",
        "_loras_errors",
        "_loras_adapters",
        "_loras_active_adapters",
        "_loras_scaling",
    ):
        setattr(model, attr_name, None)
|
|
1100
|
+
|
|
1101
|
+
def set_step_no_for_lora(model, step_no):
    """Store `step_no` on `model` as its `_lora_step_no` attribute."""
    setattr(model, "_lora_step_no", step_no)
|
|
1040
1103
|
|
|
1041
1104
|
def activate_loras(model, lora_nos, lora_multi = None ):
|
|
1042
1105
|
if not isinstance(lora_nos, list):
|
|
@@ -1046,15 +1109,13 @@ def activate_loras(model, lora_nos, lora_multi = None ):
|
|
|
1046
1109
|
if lora_multi is None:
|
|
1047
1110
|
lora_multi = [1. for _ in lora_nos]
|
|
1048
1111
|
|
|
1049
|
-
lora_fake_scaling = [1. if isinstance(mult, list) else mult for mult in lora_multi ]
|
|
1050
1112
|
lora_scaling_dict = {}
|
|
1051
1113
|
for no, multi in zip(lora_nos, lora_multi):
|
|
1052
1114
|
lora_scaling_dict[no] = multi
|
|
1053
1115
|
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
set_weights_and_activate_adapters(model, lora_nos, lora_fake_scaling)
|
|
1116
|
+
model._lora_step_no = 0
|
|
1117
|
+
model._loras_active_adapters = set(lora_nos)
|
|
1118
|
+
model._loras_scaling = lora_scaling_dict
|
|
1058
1119
|
|
|
1059
1120
|
|
|
1060
1121
|
def move_loras_to_device(model, device="cpu" ):
|
|
@@ -1399,12 +1460,12 @@ class offload:
|
|
|
1399
1460
|
self.loaded_blocks = {}
|
|
1400
1461
|
self.prev_blocks_names = {}
|
|
1401
1462
|
self.next_blocks_names = {}
|
|
1402
|
-
self.lora_parents = {}
|
|
1403
1463
|
self.preloaded_blocks_per_model = {}
|
|
1404
1464
|
self.default_stream = torch.cuda.default_stream(torch.device("cuda")) # torch.cuda.current_stream()
|
|
1405
1465
|
self.transfer_stream = torch.cuda.Stream()
|
|
1406
1466
|
self.async_transfers = False
|
|
1407
1467
|
self.parameters_ref = {}
|
|
1468
|
+
|
|
1408
1469
|
global last_offload_obj
|
|
1409
1470
|
last_offload_obj = self
|
|
1410
1471
|
|
|
@@ -1428,15 +1489,12 @@ class offload:
|
|
|
1428
1489
|
self.next_blocks_names[prev_entry_name] = entry_name
|
|
1429
1490
|
bef = blocks_params_size
|
|
1430
1491
|
|
|
1431
|
-
lora_name = None
|
|
1432
|
-
if self.lora_parents.get(submodule, None) != None:
|
|
1433
|
-
lora_name = str(submodule_name[ submodule_name.rfind(".") + 1: ] )
|
|
1434
1492
|
for k,p in submodule.named_parameters(recurse=False):
|
|
1435
1493
|
param_size = 0
|
|
1436
1494
|
ref = _get_tensor_ref(p)
|
|
1437
1495
|
tied_param = self.parameters_ref.get(ref, None)
|
|
1438
1496
|
if isinstance(p, QTensor):
|
|
1439
|
-
blocks_params.append( (submodule, k, p, False, tied_param
|
|
1497
|
+
blocks_params.append( (submodule, k, p, False, tied_param ) )
|
|
1440
1498
|
|
|
1441
1499
|
if p._qtype == qint4:
|
|
1442
1500
|
if hasattr(p,"_scale_shift"):
|
|
@@ -1450,7 +1508,7 @@ class offload:
|
|
|
1450
1508
|
param_size += torch.numel(p._scale) * p._scale.element_size()
|
|
1451
1509
|
param_size += torch.numel(p._data) * p._data.element_size()
|
|
1452
1510
|
else:
|
|
1453
|
-
blocks_params.append( (submodule, k, p, False, tied_param
|
|
1511
|
+
blocks_params.append( (submodule, k, p, False, tied_param) )
|
|
1454
1512
|
param_size += torch.numel(p.data) * p.data.element_size()
|
|
1455
1513
|
|
|
1456
1514
|
|
|
@@ -1459,7 +1517,7 @@ class offload:
|
|
|
1459
1517
|
self.parameters_ref[ref] = (submodule, k)
|
|
1460
1518
|
|
|
1461
1519
|
for k, p in submodule.named_buffers(recurse=False):
|
|
1462
|
-
blocks_params.append( (submodule, k, p, True, None
|
|
1520
|
+
blocks_params.append( (submodule, k, p, True, None) )
|
|
1463
1521
|
blocks_params_size += p.data.nbytes
|
|
1464
1522
|
|
|
1465
1523
|
aft = blocks_params_size
|
|
@@ -1484,6 +1542,19 @@ class offload:
|
|
|
1484
1542
|
return False
|
|
1485
1543
|
return True
|
|
1486
1544
|
|
|
1545
|
+
def _move_loras(self, loras_active_adapters, loras_modules, to_GPU):
|
|
1546
|
+
for name, lora_module in loras_modules.items():
|
|
1547
|
+
for adapter in loras_active_adapters:
|
|
1548
|
+
lora_data = lora_module.get(adapter, None)
|
|
1549
|
+
if lora_data == None:
|
|
1550
|
+
continue
|
|
1551
|
+
lora_A, lora_B = lora_data
|
|
1552
|
+
key = adapter + '_GPU'
|
|
1553
|
+
if to_GPU:
|
|
1554
|
+
lora_module[key] = [lora_A.cuda(), lora_B.cuda()]
|
|
1555
|
+
elif key in lora_module:
|
|
1556
|
+
del lora_module[key]
|
|
1557
|
+
|
|
1487
1558
|
@torch.compiler.disable()
|
|
1488
1559
|
def gpu_load_blocks(self, model_id, blocks_name, preload = False):
|
|
1489
1560
|
# cl = clock.start()
|
|
@@ -1492,12 +1563,17 @@ class offload:
|
|
|
1492
1563
|
entry_name = model_id if blocks_name is None else model_id + "/" + blocks_name
|
|
1493
1564
|
|
|
1494
1565
|
def cpu_to_gpu(stream_to_use, blocks_params): #, record_for_stream = None
|
|
1566
|
+
model = self.models[model_id]
|
|
1567
|
+
loras_modules = {}
|
|
1568
|
+
loras_active_adapters = getattr(model ,"_loras_active_adapters", None)
|
|
1569
|
+
if loras_active_adapters == None or len(loras_active_adapters) == 0:
|
|
1570
|
+
loras_model_data = None
|
|
1571
|
+
else:
|
|
1572
|
+
loras_model_data = getattr(model, "_loras_model_data", None)
|
|
1573
|
+
|
|
1495
1574
|
with torch.cuda.stream(stream_to_use):
|
|
1496
1575
|
for param in blocks_params:
|
|
1497
|
-
parent_module, n, p, is_buffer, tied_param
|
|
1498
|
-
if lora_name != None:
|
|
1499
|
-
if not lora_name in self.lora_parents[parent_module].active_adapters:
|
|
1500
|
-
continue
|
|
1576
|
+
parent_module, n, p, is_buffer, tied_param = param
|
|
1501
1577
|
|
|
1502
1578
|
if tied_param != None:
|
|
1503
1579
|
tied_p = getattr( tied_param[0], tied_param[1])
|
|
@@ -1515,6 +1591,12 @@ class offload:
|
|
|
1515
1591
|
if tied_param != None:
|
|
1516
1592
|
setattr( tied_param[0], tied_param[1], q)
|
|
1517
1593
|
del p, q
|
|
1594
|
+
if loras_model_data != None:
|
|
1595
|
+
lora_data = loras_model_data.get(parent_module, None)
|
|
1596
|
+
if lora_data != None:
|
|
1597
|
+
loras_modules[parent_module]= lora_data
|
|
1598
|
+
if len(loras_modules) > 0:
|
|
1599
|
+
self._move_loras(loras_active_adapters, loras_modules, True)
|
|
1518
1600
|
|
|
1519
1601
|
loaded_block = self.loaded_blocks[model_id]
|
|
1520
1602
|
|
|
@@ -1575,14 +1657,31 @@ class offload:
|
|
|
1575
1657
|
print(f"Unloading model {blocks_name} ({model_name}) from GPU")
|
|
1576
1658
|
|
|
1577
1659
|
blocks_params = self.blocks_of_modules[blocks_name]
|
|
1660
|
+
model = self.models[model_id]
|
|
1661
|
+
loras_modules = {}
|
|
1662
|
+
loras_active_adapters = getattr(model ,"_loras_active_adapters", None)
|
|
1663
|
+
if loras_active_adapters == None or len(loras_active_adapters) == 0 :
|
|
1664
|
+
loras_model_data = None
|
|
1665
|
+
else:
|
|
1666
|
+
loras_model_data = getattr(model, "_loras_model_data", None)
|
|
1667
|
+
|
|
1578
1668
|
for param in blocks_params:
|
|
1579
|
-
parent_module, n, p, is_buffer, _
|
|
1669
|
+
parent_module, n, p, is_buffer, _ = param
|
|
1580
1670
|
if is_buffer:
|
|
1581
1671
|
q = torch.nn.Buffer(p)
|
|
1582
1672
|
else:
|
|
1583
1673
|
q = torch.nn.Parameter(p , requires_grad=False)
|
|
1584
1674
|
setattr(parent_module, n , q)
|
|
1585
1675
|
del p, q
|
|
1676
|
+
|
|
1677
|
+
if loras_model_data != None:
|
|
1678
|
+
lora_data = loras_model_data.get(parent_module, None)
|
|
1679
|
+
if lora_data != None:
|
|
1680
|
+
loras_modules[parent_module]= lora_data
|
|
1681
|
+
|
|
1682
|
+
if len(loras_modules) > 0:
|
|
1683
|
+
self._move_loras(loras_active_adapters, loras_modules, False)
|
|
1684
|
+
|
|
1586
1685
|
# cl.stop()
|
|
1587
1686
|
# print(f"unload time: {cl.format_time_gap()}")
|
|
1588
1687
|
|
|
@@ -1670,6 +1769,92 @@ class offload:
|
|
|
1670
1769
|
|
|
1671
1770
|
return False
|
|
1672
1771
|
|
|
1772
|
+
def _lora_linear_forward(self, model, submodule, loras_data, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
|
|
1773
|
+
|
|
1774
|
+
def get_scaling(active_adapter):
|
|
1775
|
+
scaling_list = loras_scaling[active_adapter]
|
|
1776
|
+
if isinstance(scaling_list, list):
|
|
1777
|
+
step_no =getattr(model, "_lora_step_no", 0)
|
|
1778
|
+
return scaling_list[step_no]
|
|
1779
|
+
else:
|
|
1780
|
+
return float(scaling_list)
|
|
1781
|
+
|
|
1782
|
+
weight = submodule.weight
|
|
1783
|
+
|
|
1784
|
+
if loras_data == None:
|
|
1785
|
+
return torch.nn.functional.linear(x, weight, bias=submodule.bias)
|
|
1786
|
+
|
|
1787
|
+
active_adapters = model._loras_active_adapters
|
|
1788
|
+
loras_scaling = model._loras_scaling
|
|
1789
|
+
training = False
|
|
1790
|
+
|
|
1791
|
+
|
|
1792
|
+
if weight.shape[-1] < x.shape[-2] : # sum base weight and lora matrices instead of applying input on each sub lora matrice if input is too large. This will save a lot VRAM and compute
|
|
1793
|
+
if len(active_adapters) > 0:
|
|
1794
|
+
if isinstance(submodule, QModuleMixin):
|
|
1795
|
+
weight = weight.view(weight.shape) # get a persistent copy of the on the fly dequantized weights
|
|
1796
|
+
else:
|
|
1797
|
+
weight = weight.clone()
|
|
1798
|
+
|
|
1799
|
+
|
|
1800
|
+
for active_adapter in active_adapters:
|
|
1801
|
+
data = loras_data.get(active_adapter + '_GPU', None)
|
|
1802
|
+
if data == None:
|
|
1803
|
+
continue
|
|
1804
|
+
lora_A_weight, lora_B_weight = data
|
|
1805
|
+
scaling = get_scaling(active_adapter)
|
|
1806
|
+
weight.addmm_(lora_B_weight, lora_A_weight, alpha= scaling )
|
|
1807
|
+
# base_weight += scaling * lora_B_weight @ lora_A_weight
|
|
1808
|
+
|
|
1809
|
+
if training:
|
|
1810
|
+
pass
|
|
1811
|
+
# result = torch.nn.functional.linear(dropout(x), base_weight, bias=submodule.bias)
|
|
1812
|
+
else:
|
|
1813
|
+
result = torch.nn.functional.linear(x, weight, bias=submodule.bias)
|
|
1814
|
+
|
|
1815
|
+
else:
|
|
1816
|
+
result = torch.nn.functional.linear(x, weight, bias=submodule.bias)
|
|
1817
|
+
|
|
1818
|
+
if len(active_adapters) > 0:
|
|
1819
|
+
x = x.to(torch.bfloat16)
|
|
1820
|
+
|
|
1821
|
+
for active_adapter in active_adapters:
|
|
1822
|
+
data = loras_data.get(active_adapter + '_GPU', None)
|
|
1823
|
+
if data == None:
|
|
1824
|
+
continue
|
|
1825
|
+
lora_A, lora_B = data
|
|
1826
|
+
# dropout = self.lora_dropout[active_adapter]
|
|
1827
|
+
scaling = get_scaling(active_adapter)
|
|
1828
|
+
x = x.to(lora_A.dtype)
|
|
1829
|
+
|
|
1830
|
+
if training:
|
|
1831
|
+
pass
|
|
1832
|
+
# y = lora_A(dropout(x))
|
|
1833
|
+
else:
|
|
1834
|
+
y = torch.nn.functional.linear(x, lora_A, bias=None)
|
|
1835
|
+
|
|
1836
|
+
y = torch.nn.functional.linear(y, lora_B, bias=None)
|
|
1837
|
+
y*= scaling
|
|
1838
|
+
result+= y
|
|
1839
|
+
del y
|
|
1840
|
+
|
|
1841
|
+
return result
|
|
1842
|
+
|
|
1843
|
+
|
|
1844
|
+
def hook_lora_linear(self, submodule, current_model, model_id, submodule_name):
|
|
1845
|
+
old_forward = submodule.forward
|
|
1846
|
+
def lora_linear_forward(module, *args, **kwargs):
|
|
1847
|
+
loras_model_data = getattr(current_model, "_loras_model_data", None)
|
|
1848
|
+
loras_data = None
|
|
1849
|
+
if loras_model_data != None:
|
|
1850
|
+
loras_data = loras_model_data.get(submodule, None)
|
|
1851
|
+
if loras_data == None:
|
|
1852
|
+
return old_forward(*args, **kwargs)
|
|
1853
|
+
else:
|
|
1854
|
+
return self._lora_linear_forward(current_model, submodule, loras_data, *args, **kwargs)
|
|
1855
|
+
|
|
1856
|
+
return functools.update_wrapper(functools.partial(lora_linear_forward, submodule), old_forward)
|
|
1857
|
+
|
|
1673
1858
|
def ensure_model_loaded(self, model_id):
|
|
1674
1859
|
if model_id in self.active_models_ids:
|
|
1675
1860
|
return
|
|
@@ -1851,6 +2036,8 @@ class offload:
|
|
|
1851
2036
|
|
|
1852
2037
|
for model_id, model in self.models.items():
|
|
1853
2038
|
move_loras_to_device(model, "cpu")
|
|
2039
|
+
if hasattr(model, "_loras_model_data"):
|
|
2040
|
+
unload_loras_from_model(model)
|
|
1854
2041
|
|
|
1855
2042
|
self.models = None
|
|
1856
2043
|
|
|
@@ -1860,7 +2047,7 @@ class offload:
|
|
|
1860
2047
|
|
|
1861
2048
|
|
|
1862
2049
|
|
|
1863
|
-
def all(pipe_or_dict_of_modules, pinnedMemory = False,
|
|
2050
|
+
def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, loras = None, quantizeTransformer = True, extraModelsToQuantize = None, quantizationType = qint8, budgets= 0, workingVRAM = None, asyncTransfers = True, compile = False, perc_reserved_mem_max = 0, coTenantsMap = None, verboseLevel = -1):
|
|
1864
2051
|
"""Hook to a pipeline or a group of modules in order to reduce their VRAM requirements:
|
|
1865
2052
|
pipe_or_dict_of_modules : the pipeline object or a dictionary of modules of the model
|
|
1866
2053
|
quantizeTransformer: set True by default will quantize on the fly the video / image model
|
|
@@ -1912,7 +2099,8 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedLora = False, quant
|
|
|
1912
2099
|
_welcome()
|
|
1913
2100
|
if coTenantsMap != None:
|
|
1914
2101
|
self.cotenants_map = coTenantsMap
|
|
1915
|
-
|
|
2102
|
+
if loras != None and isinstance(loras, str):
|
|
2103
|
+
loras = [loras]
|
|
1916
2104
|
self.models = models
|
|
1917
2105
|
|
|
1918
2106
|
extraModelsToQuantize = extraModelsToQuantize if extraModelsToQuantize is not None else []
|
|
@@ -2059,12 +2247,12 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedLora = False, quant
|
|
|
2059
2247
|
if self.verboseLevel >=1:
|
|
2060
2248
|
print(f"Model '{model_id}' already pinned to reserved memory")
|
|
2061
2249
|
else:
|
|
2062
|
-
_pin_to_memory(current_model, model_id, partialPinning= partialPinning,
|
|
2250
|
+
_pin_to_memory(current_model, model_id, partialPinning= partialPinning, pinnedPEFTLora = pinnedPEFTLora, verboseLevel=verboseLevel)
|
|
2063
2251
|
|
|
2064
2252
|
current_budget = model_budgets[model_id]
|
|
2065
2253
|
cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False
|
|
2066
2254
|
self.loaded_blocks[model_id] = None
|
|
2067
|
-
|
|
2255
|
+
any_lora = loras !=None and model_id in loras or getattr(current_model, "_loras_model_data", False)
|
|
2068
2256
|
for submodule_name, submodule in current_model.named_modules():
|
|
2069
2257
|
# create a fake 'accelerate' parameter so that the _execution_device property returns always "cuda"
|
|
2070
2258
|
# (it is queried in many pipelines even if offloading is not properly implemented)
|
|
@@ -2096,7 +2284,10 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedLora = False, quant
|
|
|
2096
2284
|
|
|
2097
2285
|
|
|
2098
2286
|
if hasattr(submodule, "forward"):
|
|
2099
|
-
|
|
2287
|
+
if any_lora and isinstance(submodule, torch.nn.Linear):
|
|
2288
|
+
submodule_method = self.hook_lora_linear(submodule, current_model, model_id, submodule_name)
|
|
2289
|
+
else:
|
|
2290
|
+
submodule_method = getattr(submodule, "forward")
|
|
2100
2291
|
if callable(submodule_method):
|
|
2101
2292
|
if len(submodule_name.split("."))==1:
|
|
2102
2293
|
self.hook_change_module(submodule, current_model, model_id, submodule_name, submodule_method)
|
|
@@ -2107,13 +2298,6 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedLora = False, quant
|
|
|
2107
2298
|
|
|
2108
2299
|
self.add_module_to_blocks(model_id, cur_blocks_name, submodule, prev_blocks_name, submodule_name)
|
|
2109
2300
|
|
|
2110
|
-
if hasattr(submodule, "active_adapters"):
|
|
2111
|
-
for dictmodule in ["lora_A","lora_B"]:
|
|
2112
|
-
ssubmod = getattr(submodule, dictmodule, None)
|
|
2113
|
-
if ssubmod !=None:
|
|
2114
|
-
for k, loramod in ssubmod._modules.items():
|
|
2115
|
-
self.lora_parents[loramod] = submodule
|
|
2116
|
-
|
|
2117
2301
|
|
|
2118
2302
|
self.tune_preloading(model_id, current_budget, towers_names)
|
|
2119
2303
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: mmgp
|
|
3
|
-
Version: 3.2.
|
|
3
|
+
Version: 3.2.7
|
|
4
4
|
Summary: Memory Management for the GPU Poor
|
|
5
5
|
Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
|
|
6
6
|
License: GNU GENERAL PUBLIC LICENSE
|
|
@@ -13,11 +13,10 @@ Requires-Dist: optimum-quanto
|
|
|
13
13
|
Requires-Dist: accelerate
|
|
14
14
|
Requires-Dist: safetensors
|
|
15
15
|
Requires-Dist: psutil
|
|
16
|
-
Requires-Dist: peft
|
|
17
16
|
|
|
18
17
|
|
|
19
18
|
<p align="center">
|
|
20
|
-
<H2>Memory Management 3.2.
|
|
19
|
+
<H2>Memory Management 3.2.7 for the GPU Poor by DeepBeepMeep</H2>
|
|
21
20
|
</p>
|
|
22
21
|
|
|
23
22
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|