mmgp-3.3.2-py3-none-any.whl → mmgp-3.3.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mmgp might be problematic.
- mmgp/offload.py +36 -56
- {mmgp-3.3.2.dist-info → mmgp-3.3.4.dist-info}/METADATA +2 -2
- mmgp-3.3.4.dist-info/RECORD +9 -0
- mmgp-3.3.2.dist-info/RECORD +0 -9
- {mmgp-3.3.2.dist-info → mmgp-3.3.4.dist-info}/WHEEL +0 -0
- {mmgp-3.3.2.dist-info → mmgp-3.3.4.dist-info}/licenses/LICENSE.md +0 -0
- {mmgp-3.3.2.dist-info → mmgp-3.3.4.dist-info}/top_level.txt +0 -0
mmgp/offload.py
CHANGED
@@ -1,4 +1,4 @@
-# ------------------ Memory Management 3.3.2 for the GPU Poor by DeepBeepMeep (mmgp)------------------
+# ------------------ Memory Management 3.3.4 for the GPU Poor by DeepBeepMeep (mmgp)------------------
 #
 # This module contains multiples optimisations so that models such as Flux (and derived), Mochi, CogView, HunyuanVideo, ... can run smoothly on a 24 GB GPU limited card.
 # This a replacement for the accelerate library that should in theory manage offloading, but doesn't work properly with models that are loaded / unloaded several
@@ -401,7 +401,7 @@ def _pin_sd_to_memory(sd, sd_name, tied_weights = None, gig_tensor_size = BIG_TE
     return
 
 
-def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = True,
+def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = True, big_tensor_size = BIG_TENSOR_MAX_SIZE, verboseLevel = 1):
 
     global max_pinnable_bytes, total_pinned_bytes
     if max_pinnable_bytes > 0 and max_pinnable_bytes >= max_pinnable_bytes:
@@ -474,7 +474,7 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
         length = torch.numel(p.data) * p.data.element_size()
 
         ref_cache[ref] = (n, length)
-        if current_big_tensor_size + length >
+        if current_big_tensor_size + length > big_tensor_size and current_big_tensor_size !=0 :
             big_tensors_sizes.append(current_big_tensor_size)
             current_big_tensor_size = 0
             big_tensor_no += 1
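This hunk makes the packing threshold configurable through the new big_tensor_size parameter, and closes the current block only when it is non-empty, so a single oversized tensor still gets a block of its own. A minimal sketch of that packing rule, not mmgp's actual code; the default value assigned to BIG_TENSOR_MAX_SIZE below is invented for illustration:

# Sketch only: group tensor byte-lengths into "big tensor" blocks.
BIG_TENSOR_MAX_SIZE = 256 * 1024 * 1024  # hypothetical 256 MB default

def plan_big_tensors(tensor_lengths, big_tensor_size=BIG_TENSOR_MAX_SIZE):
    sizes, current = [], 0
    for length in tensor_lengths:
        # start a new block only if the current one is non-empty, mirroring
        # the "and current_big_tensor_size != 0" guard added in this hunk
        if current + length > big_tensor_size and current != 0:
            sizes.append(current)
            current = 0
        current += length
    if current != 0:
        sizes.append(current)
    return sizes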
@@ -498,28 +498,11 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
     big_tensors_sizes.append(current_big_tensor_size)
 
     big_tensors = []
-    last_big_tensor = 0
     total = 0
 
 
    failed_planned_allocation = False
 
-    # for size in big_tensors_sizes:
-    #     try:
-    #         # if total > 7000 * ONE_MB:
-    #         #     raise Exception ("test no more reserved RAM")
-    #         current_big_tensor = torch.empty( size, dtype= torch.uint8, pin_memory=True, device="cpu")
-    #         big_tensors.append(current_big_tensor)
-    #     except:
-    #         print(f"Unable to pin more tensors for this model as the maximum reservable memory has been reached ({total/ONE_MB:.2f})")
-    #         max_pinnable_bytes = total + total_pinned_bytes
-    #         failed_planned_allocation = True
-    #         break
-
-    #     last_big_tensor += 1
-    #     total += size
-
-
     gc.collect()
 
     last_allocated_big_tensor = -1
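The deleted commented-out loop documents the eager strategy this function experimented with: allocate each planned block as page-locked host memory up front and stop as soon as the OS refuses. A self-contained illustration of that allocate-with-fallback pattern (not mmgp's code; names are illustrative):

import torch

ONE_MB = 1024 * 1024

def try_alloc_pinned(size_bytes):
    # Page-locked ("pinned") host memory cannot be swapped out, so the OS
    # may refuse large reservations; treat failure as "stop pinning here".
    try:
        return torch.empty(size_bytes, dtype=torch.uint8, pin_memory=True, device="cpu")
    except RuntimeError:
        print(f"Unable to pin {size_bytes / ONE_MB:.2f} MB, falling back to pageable memory")
        return None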
@@ -561,13 +544,6 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
 
         total += size
 
-        # if big_tensor_no != prev_big_tensor:
-        #     gc.collect()
-        #     prev_big_tensor = big_tensor_no
-        # match_param, match_isbuffer = tied_weights.get(n, (None, False))
-        # if match_param != None:
-
-        # if big_tensor_no>=0 and big_tensor_no < last_big_tensor:
         current_big_tensor = big_tensors[big_tensor_no]
         if is_buffer :
             _force_load_buffer(p) # otherwise potential memory leak
@@ -600,9 +576,9 @@ def _pin_to_memory(model, model_id, partialPinning = False, pinnedPEFTLora = Tru
 
     if verboseLevel >=1:
         if partialPinning or failed_planned_allocation:
-            print(f"The model was partially pinned to reserved RAM: {
+            print(f"The model was partially pinned to reserved RAM: {last_allocated_big_tensor + 1} large blocks spread across {total/ONE_MB:.2f} MB")
         else:
-            print(f"The whole model was pinned to reserved RAM: {
+            print(f"The whole model was pinned to reserved RAM: {last_allocated_big_tensor + 1} large blocks spread across {total/ONE_MB:.2f} MB")
 
     model._already_pinned = True
 
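For context on why the function bothers reporting how much of the model ended up pinned: copies from pinned host memory are the ones that can run asynchronously. A tiny illustration, not part of mmgp, assuming a CUDA device is available:

import torch

if torch.cuda.is_available():
    pinned = torch.randn(1024, 1024, pin_memory=True)  # page-locked source
    # non_blocking only has an effect when the CPU source is pinned; the
    # host-to-device copy can then overlap with other queued GPU work
    on_gpu = pinned.to("cuda", non_blocking=True)
    torch.cuda.synchronize()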
@@ -615,7 +591,7 @@ def _welcome():
     if welcome_displayed:
         return
     welcome_displayed = True
-    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.3.2) by DeepBeepMeep ************{ENDC}{UNBOLD}")
+    print(f"{BOLD}{HEADER}************ Memory Management for the GPU Poor (mmgp 3.3.4) by DeepBeepMeep ************{ENDC}{UNBOLD}")
 
 def _extract_num_from_str(num_in_str):
     size = len(num_in_str)
@@ -901,17 +877,15 @@ def split_linear_modules(model, map ):
 
 def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_loras = True, check_only = False, ignore_model_variations = False, pinnedLora = False, split_linear_modules_map = None, preprocess_sd = None, verboseLevel = -1,):
     verboseLevel = _compute_verbose_level(verboseLevel)
-    modules_dict = {k: v for k,v in model.named_modules()}
 
+    loras_model_data = getattr(model, "_loras_model_data", None)
+    if loras_model_data == None:
+        raise Exception(f"No Loras has been declared for this model while creating the corresponding offload object")
+
     if not check_only:
-
-
-
-        model._loras_active_adapters = loras_active_adapters
-        loras_scaling = dict()
-        model._loras_scaling = loras_scaling
-        loras_tied_weights = dict()
-        model._loras_tied_weights = loras_tied_weights
+        unload_loras_from_model(model)
+
+    modules_dict = {k: v for k,v in model.named_modules()}
 
     CrLf = '\r\n'
     error_msg = ""
@@ -949,10 +923,7 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
             continue
         fail = False
         skip = False
-        state_dict = safetensors2.torch_load_file(path)
-
-
-
+        state_dict = safetensors2.torch_load_file(path, writable_tensors= False)
 
         if preprocess_sd != None:
             state_dict = preprocess_sd(state_dict)
@@ -1069,9 +1040,10 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
                     break
             if not check_only:
                 loras_module_data = loras_model_data.get(module, None)
-
-
-
+                assert loras_module_data != None
+                # if loras_module_data == None:
+                #     loras_module_data = dict()
+                #     loras_model_data[module] = loras_module_data
                 loras_adapter_data = loras_module_data.get(adapter_name, None)
                 lora_A = None if lora_A == None else lora_A.to(torch.bfloat16)
                 lora_B = None if lora_B == None else lora_B.to(torch.bfloat16)
@@ -1132,12 +1104,17 @@ def load_loras_into_model(model, lora_path, lora_multi = None, activate_all_lora
     return new_lora_path
 
 def unload_loras_from_model(model):
-    model._loras_model_data
+    for _, v in model._loras_model_data.items():
+        v.clear()
+
+    model._loras_active_adapters = set()
+    model._loras_scaling = dict()
+    model._loras_tied_weights = dict()
     model._loras_errors = None
     model._loras_adapters = None
-    model._loras_active_adapters = None
     model._loras_scaling = None
 
+
 def set_step_no_for_lora(model, step_no):
     model._lora_step_no = step_no
 
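The rewritten unload_loras_from_model clears each per-module dict in place (v.clear()) instead of rebinding model._loras_model_data. That matters because hook_lora_linear (next hunk) captures each module's dict in a closure: clearing in place empties the very dict the closure holds, so every hooked Linear falls back to its original forward without any re-hooking. A reduced sketch of the idea, with illustrative names:

def hook(module_dicts, submodule):
    data = {}                      # captured by the closure below
    module_dicts[submodule] = data
    old_forward = submodule.forward
    def forward(*args, **kwargs):
        if len(data) == 0:         # empty after unload -> plain forward
            return old_forward(*args, **kwargs)
        ...                        # otherwise apply the LoRA adapters
    return forward

def unload(module_dicts):
    for d in module_dicts.values():
        d.clear()                  # in place, so every closure observes it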
@@ -1881,14 +1858,14 @@ class offload:
             return result
 
 
-    def hook_lora_linear(self, submodule, current_model, model_id, submodule_name):
+    def hook_lora_linear(self, submodule, current_model, model_id, loras_model_data, submodule_name):
         old_forward = submodule.forward
+
+        loras_data = {}
+        loras_model_data[submodule] = loras_data
+
         def lora_linear_forward(module, *args, **kwargs):
-
-            loras_data = None
-            if loras_model_data != None:
-                loras_data = loras_model_data.get(submodule, None)
-            if loras_data == None:
+            if len(loras_data) == 0:
                 return old_forward(*args, **kwargs)
             else:
                 return self._lora_linear_forward(current_model, submodule, loras_data, *args, **kwargs)
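When loras_data is non-empty the hook dispatches to self._lora_linear_forward. Conceptually, a LoRA-augmented linear layer computes the frozen base output plus each active adapter's low-rank update; a sketch of that computation follows (not mmgp's implementation; the shapes are the conventional ones suggested by the lora_A / lora_B tensors loaded above):

import torch

def lora_linear(base: torch.nn.Linear, x: torch.Tensor, adapters: dict, scaling: dict):
    # adapters maps name -> (lora_A, lora_B), with A: (r, in_f), B: (out_f, r)
    y = base(x)
    for name, (lora_A, lora_B) in adapters.items():
        # low-rank update: x @ A^T -> (.., r), then @ B^T -> (.., out_f)
        y = y + scaling.get(name, 1.0) * (x @ lora_A.T @ lora_B.T)
    return y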
@@ -2295,7 +2272,10 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
             current_budget = model_budgets[model_id]
             cur_blocks_prefix, prev_blocks_name, cur_blocks_name,cur_blocks_seq, is_mod_seq = None, None, None, -1, False
             self.loaded_blocks[model_id] = None
-            any_lora = loras !=None and model_id in loras
+            any_lora = loras !=None and model_id in loras
+            if any_lora:
+                loras_model_data = {}
+                current_model._loras_model_data = loras_model_data
             for submodule_name, submodule in current_model.named_modules():
                 # create a fake 'accelerate' parameter so that the _execution_device property returns always "cuda"
                 # (it is queried in many pipelines even if offloading is not properly implemented)
@@ -2328,7 +2308,7 @@ def all(pipe_or_dict_of_modules, pinnedMemory = False, pinnedPEFTLora = False, p
 
                 if hasattr(submodule, "forward"):
                     if any_lora and isinstance(submodule, torch.nn.Linear):
-                        submodule_method = self.hook_lora_linear(submodule, current_model, model_id, submodule_name)
+                        submodule_method = self.hook_lora_linear(submodule, current_model, model_id, loras_model_data, submodule_name)
                     else:
                         submodule_method = getattr(submodule, "forward")
                     if callable(submodule_method):
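Taken together, 3.3.4 moves the LoRA bookkeeping from load time to hook time: offload.all(...) now attaches _loras_model_data while installing the Linear hooks, and load_loras_into_model refuses to run unless that setup happened. A hypothetical call sequence implied by these hunks; the argument values are assumptions, not documented API:

from mmgp import offload

# offload.all must declare LoRA support for a model first; per this diff it
# creates model._loras_model_data and hooks every torch.nn.Linear
offload.all(pipe, loras=["transformer"])          # assumed model_id

# loading now works; calling it on an undeclared model raises an Exception
lora_path = ["my_lora.safetensors"]               # hypothetical file
offload.load_loras_into_model(pipe.transformer, lora_path, lora_multi=[1.0])

# unloading clears the per-module dicts in place, reverting hooked Linears
offload.unload_loras_from_model(pipe.transformer)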
{mmgp-3.3.2.dist-info → mmgp-3.3.4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mmgp
-Version: 3.3.2
+Version: 3.3.4
 Summary: Memory Management for the GPU Poor
 Author-email: deepbeepmeep <deepbeepmeep@yahoo.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -17,7 +17,7 @@ Dynamic: license-file
 
 
 <p align="center">
-    <H2>Memory Management 3.3.2 for the GPU Poor by DeepBeepMeep</H2>
+    <H2>Memory Management 3.3.4 for the GPU Poor by DeepBeepMeep</H2>
 </p>
 
 
mmgp-3.3.4.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
+mmgp/offload.py,sha256=WpQK1af2g0qcAm32EguTX8oBHZGKumPX2EqYS-df69Y,106583
+mmgp/safetensors2.py,sha256=rmUbBmK3Dra5prUTTRSVi6-XUFAa9Mj6B5CNPgzt9To,17333
+mmgp-3.3.4.dist-info/licenses/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
+mmgp-3.3.4.dist-info/METADATA,sha256=Yk2eSpNITRDHK0lclsP6VXhW0_5hkUNVvXSfk25f7Ds,16154
+mmgp-3.3.4.dist-info/WHEEL,sha256=DK49LOLCYiurdXXOXwGJm6U4DkHkg4lcxjhqwRa0CP4,91
+mmgp-3.3.4.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
+mmgp-3.3.4.dist-info/RECORD,,
mmgp-3.3.2.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-mmgp/__init__.py,sha256=A9qBwyQMd1M7vshSTOBnFGP1MQvS2hXmTcTCMUcmyzE,509
-mmgp/offload.py,sha256=43FnFfWqwhh2qz0uykqEpxb_XP9Jx8MPGzN31PExT2w,107470
-mmgp/safetensors2.py,sha256=rmUbBmK3Dra5prUTTRSVi6-XUFAa9Mj6B5CNPgzt9To,17333
-mmgp-3.3.2.dist-info/licenses/LICENSE.md,sha256=HjzvY2grdtdduZclbZ46B2M-XpT4MDCxFub5ZwTWq2g,93
-mmgp-3.3.2.dist-info/METADATA,sha256=mVMLkutqhUihIeo8uo_LK71ithm84_AEaNvnyRnzmEA,16153
-mmgp-3.3.2.dist-info/WHEEL,sha256=DK49LOLCYiurdXXOXwGJm6U4DkHkg4lcxjhqwRa0CP4,91
-mmgp-3.3.2.dist-info/top_level.txt,sha256=waGaepj2qVfnS2yAOkaMu4r9mJaVjGbEi6AwOUogU_U,14
-mmgp-3.3.2.dist-info/RECORD,,
{mmgp-3.3.2.dist-info → mmgp-3.3.4.dist-info}/WHEEL
File without changes
{mmgp-3.3.2.dist-info → mmgp-3.3.4.dist-info}/licenses/LICENSE.md
File without changes
{mmgp-3.3.2.dist-info → mmgp-3.3.4.dist-info}/top_level.txt
File without changes